Netdev List
 help / color / mirror / Atom feed
* [PATCH v15 10/17]If device is in zero-copy mode first, bonding will fail.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

If a device is already in zero-copy mode when it is about to be enslaved,
bonding cannot handle it, so this patch makes bond_enslave() fail.

If bonding is created first and one of the slave devices is later put into
zero-copy mode, this is handled by the mp device. It first checks whether
all the slaves have the zero-copy capability. If not, it fails too.
Otherwise it puts all the slaves into zero-copy mode.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
---
 drivers/net/bonding/bond_main.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3b16f62..dfb6a2c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1428,6 +1428,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 			   bond_dev->name);
 	}
 
+	/* if the device is in zero-copy mode before bonding, fail it. */
+	if (dev_is_mpassthru(slave_dev))
+		return -EBUSY;
+
 	/* already enslaved */
 	if (slave_dev->flags & IFF_SLAVE) {
 		pr_debug("Error, Device was already enslaved\n");
-- 
1.7.3

^ permalink raw reply related

* [PATCH v15 08/17]Modify netdev_free_page() to release external buffer
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Currently, it can get external buffers from mp device.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/skbuff.h |    4 +++-
 net/core/skbuff.c      |   24 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6e1e991..6309ce6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1586,9 +1586,11 @@ static inline struct page *netdev_alloc_page(struct net_device *dev)
 	return __netdev_alloc_page(dev, GFP_ATOMIC);
 }
 
+extern void __netdev_free_page(struct net_device *dev, struct page *page);
+
 static inline void netdev_free_page(struct net_device *dev, struct page *page)
 {
-	__free_page(page);
+	__netdev_free_page(dev, page);
 }
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a1018bd..3d81113 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -298,6 +298,30 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+void netdev_free_ext_page(struct net_device *dev, struct page *page)
+{
+	struct skb_ext_page *ext_page = NULL;
+	if (dev_is_mpassthru(dev) && dev->mp_port->hash) {
+		ext_page = dev->mp_port->hash(dev, page);
+		if (ext_page)
+			ext_page->dtor(ext_page);
+		else
+			__free_page(page);
+	}
+}
+EXPORT_SYMBOL(netdev_free_ext_page);
+
+void __netdev_free_page(struct net_device *dev, struct page *page)
+{
+	if (dev_is_mpassthru(dev)) {
+		netdev_free_ext_page(dev, page);
+		return;
+	}
+
+	__free_page(page);
+}
+EXPORT_SYMBOL(__netdev_free_page);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		int size)
 {
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 07/17]Modify netdev_alloc_page() to get external buffer
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

    Currently, it can get external buffers from mp device.

    Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
    Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
    Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 net/core/skbuff.c |   27 +++++++++++++++++++++++++++
 1 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 68e197e..a1018bd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -261,11 +261,38 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 }
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
+struct page *netdev_alloc_ext_pages(struct net_device *dev, int npages)
+{
+	struct mp_port *port;
+	struct skb_ext_page *ext_page = NULL;
+
+	port = dev->mp_port;
+	if (!port)
+		goto out;
+	ext_page = port->ctor(port, NULL, npages);
+	if (ext_page)
+		return ext_page->page;
+out:
+	return NULL;
+
+}
+EXPORT_SYMBOL(netdev_alloc_ext_pages);
+
+struct page *netdev_alloc_ext_page(struct net_device *dev)
+{
+	return netdev_alloc_ext_pages(dev, 1);
+
+}
+EXPORT_SYMBOL(netdev_alloc_ext_page);
+
 struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 {
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct page *page;
 
+	if (dev_is_mpassthru(dev))
+		return netdev_alloc_ext_page(dev);
+
 	page = alloc_pages_node(node, gfp_mask, 0);
 	return page;
 }
-- 
1.7.3

^ permalink raw reply related

* [PATCH v15 06/17] Use callback to deal with skb_release_data() specially.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

If buffer is external, then use the callback to destruct
buffers.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/skbuff.h |    7 ++++---
 net/core/skbuff.c      |    7 +++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 696e690..6e1e991 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -199,14 +199,15 @@ struct skb_shared_info {
 	struct sk_buff	*frag_list;
 	struct skb_shared_hwtstamps hwtstamps;
 
+	/* Intermediate layers must ensure that destructor_arg
+	 * remains valid until skb destructor */
+	void *		destructor_arg;
+
 	/*
 	 * Warning : all fields before dataref are cleared in __alloc_skb()
 	 */
 	atomic_t	dataref;
 
-	/* Intermediate layers must ensure that destructor_arg
-	 * remains valid until skb destructor */
-	void *		destructor_arg;
 	/* must be last field, see pskb_expand_head() */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c83b421..68e197e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -343,6 +343,13 @@ static void skb_release_data(struct sk_buff *skb)
 		if (skb_has_frags(skb))
 			skb_drop_fraglist(skb);
 
+		if (skb->dev && dev_is_mpassthru(skb->dev)) {
+			struct skb_ext_page *ext_page =
+				skb_shinfo(skb)->destructor_arg;
+			if (ext_page && ext_page->dtor)
+				ext_page->dtor(ext_page);
+		}
+
 		kfree(skb->head);
 	}
 }
-- 
1.7.3

^ permalink raw reply related

* [PATCH v15 02/17]Add a new struct for device to manipulate external buffer.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

    Add a new field named mp_port to struct net_device. It is for
    mediate passthru (zero-copy). It contains the capability of the
    net device driver, a socket, and an external buffer creator.
    Here "external" means that the skb buffers belonging to the
    device may not be allocated from kernel space.

    Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
    Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
    Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/netdevice.h |   25 ++++++++++++++++++++++++-
 1 files changed, 24 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46c36ff..f6b1870 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -325,6 +325,28 @@ enum netdev_state_t {
 	__LINK_STATE_DORMANT,
 };
 
+/*The structure for mediate passthru(zero-copy). */
+struct mp_port	{
+	/* the header len */
+	int		hdr_len;
+	/* the max payload len for one descriptor */
+	int		data_len;
+	/* the pages for DMA in one time */
+	int		npages;
+	/* the socket bind to */
+	struct socket	*sock;
+	/* the header len for virtio-net */
+	int		vnet_hlen;
+	/* the external buffer page creator */
+	struct skb_ext_page *(*ctor)(struct mp_port *,
+				struct sk_buff *, int);
+	/* the hash function attached to find according
+	 * backend ring descriptor info for one external
+	 * buffer page.
+	 */
+	struct skb_ext_page *(*hash)(struct net_device *,
+				struct page *);
+};
 
 /*
  * This structure holds at boot time configured netdevice settings. They
@@ -1045,7 +1067,8 @@ struct net_device {
 
 	/* GARP */
 	struct garp_port	*garp_port;
-
+	/* mpassthru */
+	struct mp_port		*mp_port;
 	/* class/net/name entry */
 	struct device		dev;
 	/* space for optional device, statistics, and wireless sysfs groups */
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 17/17] An example how to alloc user buffer based on napi_gro_frags() interface.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

This example is based on the ixgbe driver, which uses napi_gro_frags().
It can get buffers from the guest side directly using netdev_alloc_page()
and release guest buffers using netdev_free_page().

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
---
 drivers/net/ixgbe/ixgbe_main.c |   37 +++++++++++++++++++++++++++++++++----
 1 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index a4a5263..9f5598b 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1032,7 +1032,14 @@ static inline void ixgbe_release_rx_desc(struct ixgbe_hw *hw,
 static bool is_rx_buffer_mapped_as_page(struct ixgbe_rx_buffer *bi,
 					struct net_device *dev)
 {
-	return true;
+	return dev_is_mpassthru(dev);
+}
+
+static u32 get_page_skb_offset(struct net_device *dev)
+{
+	if (!dev_is_mpassthru(dev))
+		return 0;
+	return dev->mp_port->vnet_hlen;
 }
 
 /**
@@ -1105,7 +1112,8 @@ static void ixgbe_alloc_rx_buffers(struct ixgbe_adapter *adapter,
 				adapter->alloc_rx_page_failed++;
 				goto no_buffers;
 			}
-			bi->page_skb_offset = 0;
+			bi->page_skb_offset =
+				get_page_skb_offset(adapter->netdev);
 			bi->dma = dma_map_page(&pdev->dev, bi->page_skb,
 					bi->page_skb_offset,
 					(PAGE_SIZE / 2),
@@ -1242,8 +1250,10 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			len = le16_to_cpu(rx_desc->wb.upper.length);
 		}
 
-		if (is_no_buffer(rx_buffer_info))
+		if (is_no_buffer(rx_buffer_info)) {
+			printk("no buffers\n");
 			break;
+		}
 		cleaned = true;
 
 		if (!rx_buffer_info->mapped_as_page) {
@@ -1299,6 +1309,11 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 						rx_buffer_info->page_skb,
 						rx_buffer_info->page_skb_offset,
 						len);
+				if (dev_is_mpassthru(netdev) &&
+						netdev->mp_port->hash)
+					skb_shinfo(skb)->destructor_arg =
+						netdev->mp_port->hash(netdev,
+						rx_buffer_info->page_skb);
 				rx_buffer_info->page_skb = NULL;
 				skb->len += len;
 				skb->data_len += len;
@@ -1316,7 +1331,8 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			                   upper_len);
 
 			if ((rx_ring->rx_buf_len > (PAGE_SIZE / 2)) ||
-			    (page_count(rx_buffer_info->page) != 1))
+			    (page_count(rx_buffer_info->page) != 1) ||
+				dev_is_mpassthru(netdev))
 				rx_buffer_info->page = NULL;
 			else
 				get_page(rx_buffer_info->page);
@@ -6529,6 +6545,16 @@ static void ixgbe_netpoll(struct net_device *netdev)
 }
 #endif
 
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+static int ixgbe_ndo_mp_port_prep(struct net_device *dev, struct mp_port *port)
+{
+	port->hdr_len = 128;
+	port->data_len = 2048;
+	port->npages = 1;
+	return 0;
+}
+#endif
+
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open 		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
@@ -6548,6 +6574,9 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_set_vf_vlan	= ixgbe_ndo_set_vf_vlan,
 	.ndo_set_vf_tx_rate	= ixgbe_ndo_set_vf_bw,
 	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+	.ndo_mp_port_prep	= ixgbe_ndo_mp_port_prep,
+#endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 09/17] Don't do skb recycle, if device use external buffer.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 net/core/skbuff.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3d81113..075f4c5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -557,6 +557,12 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
 	if (skb_shared(skb) || skb_cloned(skb))
 		return false;
 
+	/* if the device wants to do mediate passthru, the skb may
+	 * get external buffer, so don't recycle
+	 */
+	if (dev_is_mpassthru(skb->dev))
+		return 0;
+
 	skb_release_head_state(skb);
 
 	shinfo = skb_shinfo(skb);
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 05/17] Add a function to indicate if device use external buffer.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/netdevice.h |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8dcf6de..f91d9bb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1739,6 +1739,11 @@ extern gro_result_t	napi_gro_frags(struct napi_struct *napi);
 extern int netdev_mp_port_prep(struct net_device *dev,
 				struct mp_port *port);
 
+static inline bool dev_is_mpassthru(struct net_device *dev)
+{
+	return dev && dev->mp_port;
+}
+
 static inline void napi_free_frags(struct napi_struct *napi)
 {
 	kfree_skb(napi->skb);
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 04/17]Add a function make external buffer owner to query capability.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The external buffer owner can use the functions to get
the capability of the underlying NIC driver.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhaonew@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/netdevice.h |    2 ++
 net/core/dev.c            |   41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 575777f..8dcf6de 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1736,6 +1736,8 @@ extern gro_result_t	napi_frags_finish(struct napi_struct *napi,
 					  gro_result_t ret);
 extern struct sk_buff *	napi_frags_skb(struct napi_struct *napi);
 extern gro_result_t	napi_gro_frags(struct napi_struct *napi);
+extern int netdev_mp_port_prep(struct net_device *dev,
+				struct mp_port *port);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 660dd41..84fbb83 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2942,6 +2942,47 @@ out:
 	return ret;
 }
 
+/* To support meidate passthru(zero-copy) with NIC driver,
+ * we'd better query NIC driver for the capability it can
+ * provide, especially for packet split mode, now we only
+ * query for the header size, and the payload a descriptor
+ * may carry.
+ * Now, it's only called by mpassthru device.
+ */
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+int netdev_mp_port_prep(struct net_device *dev,
+		struct mp_port *port)
+{
+	int rc;
+	int npages, data_len;
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_mp_port_prep) {
+		rc = ops->ndo_mp_port_prep(dev, port);
+		if (rc)
+			return rc;
+	} else
+		return -EINVAL;
+
+	if (port->hdr_len <= 0)
+		goto err;
+
+	npages = port->npages;
+	data_len = port->data_len;
+	if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+			(data_len < PAGE_SIZE * (npages - 1) ||
+			 data_len > PAGE_SIZE * npages))
+		goto err;
+
+	return 0;
+err:
+	dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_mp_port_prep);
+#endif
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 03/17]Add a ndo_mp_port_prep pointer to net_device_ops.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1289280885.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

    If the driver wants to allocate external buffers,
    then it can export its capability, such as the skb
    buffer header length, the page length that can be DMAed, etc.
    The external buffer owner may utilize this.

    Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
    Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
    Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/netdevice.h |   10 ++++++++++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f6b1870..575777f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -723,6 +723,12 @@ struct netdev_rx_queue {
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ *
+ * int (*ndo_mp_port_prep)(struct net_device *dev, struct mp_port *port);
+ *	If the driver want to allocate external buffers,
+ *	then it can export it's capability, as the skb
+ *	buffer header length, the page length can be DMA, etc.
+ *	The external buffers owner may utilize this.
  */
 #define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
@@ -795,6 +801,10 @@ struct net_device_ops {
 	int			(*ndo_fcoe_get_wwn)(struct net_device *dev,
 						    u64 *wwn, int type);
 #endif
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+	int			(*ndo_mp_port_prep)(struct net_device *dev,
+						struct mp_port *port);
+#endif
 };
 
 /*
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 01/17] Add a new structure for skb buffer from external.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1289293402-4791-1-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/skbuff.h |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 77eb60d..696e690 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -211,6 +211,15 @@ struct skb_shared_info {
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
 
+/* The structure is for a skb which pages may point to
+ * an external buffer, which is not allocated from kernel space.
+ * It also contains a destructor for itself.
+ */
+struct skb_ext_page {
+	struct		page *page;
+	void		(*dtor)(struct skb_ext_page *);
+};
+
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
-- 
1.7.3


^ permalink raw reply related

* [PATCH v15 00/17] Provide a zero-copy method on KVM virtio-net.
From: xiaohui.xin @ 2010-11-09  9:03 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike

We provide a zero-copy method by which the driver side may get external
buffers to DMA into. Here external means the driver doesn't use kernel space
to allocate skb buffers. Currently the external buffers can come from the
guest virtio-net driver.

The idea is simple: just pin the guest VM user space and then
let the host NIC driver have the chance to directly DMA to it.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops such as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer through the guest virtio-net frontend.

patch 01-11:  	net core and kernel changes.
patch 12-14:  	new device as interface to manipulate external buffers.
patch 15: 	for vhost-net.
patch 16:	An example of modifying a NIC driver to use napi_gro_frags().
patch 17:	An example of how to get guest buffers, based on a driver
		that uses napi_gro_frags().

The guest virtio-net driver submits multiple requests thru vhost-net
backend driver to the kernel. And the requests are queued and then
completed after corresponding actions in h/w are done.

For read, user space buffers are dispensed to the NIC driver for rx when
a page constructor API is invoked. This means NICs can allocate user buffers
from a page constructor. We add a hook in the netif_receive_skb() function
to intercept the incoming packets and notify the zero-copy device.

For write, the zero-copy device may allocate a new host skb, put the
payload on the skb_shinfo(skb)->frags, and copy the header to skb->data.
The request remains pending until the skb is transmitted by h/w.

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later.

What we have not done yet:
	Performance tuning

what we have done in v1:
	polish the RCU usage
	deal with write logging in asynchronous mode in vhost
	add notifier block for mp device
	rename page_ctor to mp_port in netdevice.h to make it looks generic
	add mp_dev_change_flags() for mp device to change NIC state
	add CONFIG_VHOST_MPASSTHRU to limit the usage when module is not loaded
	a small fix for missing dev_put when fail
	using dynamic minor instead of static minor number
	a __KERNEL__ protect to mp_get_sock()

what we have done in v2:
	
	remove most of the RCU usage, since the ctor pointer is only
	changed by BIND/UNBIND ioctl, and during that time, NIC will be
	stopped to get good cleanup(all outstanding requests are finished),
	so the ctor pointer cannot be raced into wrong situation.

	Remove the struct vhost_notifier with struct kiocb.
	Let vhost-net backend to alloc/free the kiocb and transfer them
	via sendmsg/recvmsg.

	use get_user_pages_fast() and set_page_dirty_lock() when read.

	Add some comments for netdev_mp_port_prep() and handle_mpassthru().

what we have done in v3:
	the async write logging is rewritten 
	a drafted synchronous write function for qemu live migration
	a limit for locked pages from get_user_pages_fast() to prevent Dos
	by using RLIMIT_MEMLOCK
	

what we have done in v4:
	add iocb completion callback from vhost-net to queue iocb in mp device
	replace vq->receiver by mp_sock_data_ready()
	remove stuff in mp device which access structures from vhost-net
	modify skb_reserve() to ignore host NIC driver reserved space
	rebase to the latest vhost tree
	split large patches into small pieces, especially for net core part.
	

what we have done in v5:
	address Arnd Bergmann's comments
		-remove IFF_MPASSTHRU_EXCL flag in mp device
		-Add CONFIG_COMPAT macro
		-remove mp_release ops
	move dev_is_mpassthru() as inline func
	fix a bug in memory relinquish
	Apply to current git (2.6.34-rc6) tree.

what we have done in v6:
	move create_iocb() out of page_dtor, which may run in interrupt context
	-This removes the potential issue of a lock being taken in interrupt context
	make the caches used by mp and vhost static, created/destroyed in the
	module init/exit functions.
	-This allows multiple mp guests to be created at the same time.

what we have done in v7:
	some cleanup in preparation to support PS mode

what we have done in v8:
	discarding the modifications to point skb->data to guest buffer directly.
	Add code to modify driver to support napi_gro_frags() with Herbert's comments.
	To support PS mode.
	Add mergeable buffer support in mp device.
	Add GSO/GRO support in mp device.
	Address comments from Eric Dumazet about cache line and rcu usage.

what we have done in v9:
	v8 patch is based on a fix in dev_gro_receive().
	But Herbert did not agree with the fix we had sent out,
	and he suggested another fix. v9 is modified to be based on that fix.
	

what we have done in v10:
	Fix a partial csum error.
	Cleanup some unused fields with struct page_info{} in mp device.
	Modify kmem_cache_zalloc() to kmem_cache_alloc() based on comments from Michael S. Tsirkin.

what we have done in v11:
	Address comments from Michael S. Tsirkin to add two new ioctls in mp device.
	But still need to revise.

what we have done in v12:
	Address most comments from Ben Hutchings, except the compat ioctls.
	As the comments are sparse, we did not make a split patch.
	Change struct mpassthru_port to struct mp_port, and struct page_ctor
	to struct page_pool.

what we have done in v13:
	Export functions to other drivers like macvtap, in case they want to reuse them to
	get zero-copy.
	Rebase on 2.6.36-rc7.

what we have done in v14:
	Address the comments from David Miller for bonding device issue.
	Currently, we treat it in two cases. One case is that bonding is created before
	zero-copy mode is enabled for a device. The code will check if all the slaves are
	capable of zero-copy. If yes, it will force all the slaves into zero-copy mode.
	If not, enabling zero-copy fails. The other case is that zero-copy is enabled before
	bonding is created; in that case bonding simply fails.

what we have done in v15:
	Address comments from Eric Dumazet about how to clear destructor_arg field of shinfo.

Performance:
	We have seen the request for performance data on the mailing list,
	and we are now looking into it.

^ permalink raw reply

* [e1000e] BUG triggered when triggering LED blinking
From: Holger Eitzenberger @ 2010-11-09  8:39 UTC (permalink / raw)
  To: e1000-devel; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 2673 bytes --]

Hi,

using e1000e driver version 1.2.10 and kernel version 2.6.32.24 I see
the kernel go BUG() sporadically at the time 'ethtool -p eth0 3' comes
back.

Network hardware is four times 'Intel Corporation 82583V Gigabit Network
Connection' (0x8086:0x150c) on Atom N450.

kernel BUG at kernel/workqueue.c:287!
invalid opcode: 0000 [#1] SMP
last sysfs file:
/sys/devices/pci0000:00/0000:00:1d.0/usb2/2-1/2-1:1.1/input/input2/event2/dev
Modules linked in: nls_utf8 isofs edd ide_cd_mod sr_mod cdrom sg sd_mod
pata_acpi ata_generic usb_storage ppdev ide_pci_generic ata_piix libata
evdev rtc_cmos uhci_hcd parport_pc scsi_mod i2c_i801 ehci_hcd rtc_core
rtc_lib e1000e parport ftdi_sio usbhid usbserial

Pid: 8, comm: events/1 Not tainted (2.6.32.24-62.gce8dff6-ai #1) To Be
Filled By O.E.M.
EIP: 0060:[<c102ed4f>] EFLAGS: 00010206 CPU: 1
EIP is at worker_thread+0xc5/0x144
EAX: c1c052e0 EBX: c1d052e0 ECX: f70cac1c EDX: f70cac1c
ESI: f81c75b0 EDI: f70cac18 EBP: f705e3b0 ESP: f7093f90
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
Process events/1 (pid: 8, ti=f7092000 task=f705e3b0 task.ti=f7092000)
Stack:
 f705e5a0 c1d052ec c1d052e4 00000000 f705e3b0 c103151e f7093fa8 f7093fa8
<0> f7061f68 c1d052e0 c102ec8a 00000000 c103132b 00000000 00000000
00000000
<0> f7093fd0 f7093fd0 c10312ca 00000000 00000000 c100329f f7061f5c
00000000
Call Trace:
 [<c103151e>] ? autoremove_wake_function+0x0/0x2d
 [<c102ec8a>] ? worker_thread+0x0/0x144
 [<c103132b>] ? kthread+0x61/0x66
 [<c10312ca>] ? kthread+0x0/0x66
 [<c100329f>] ? kernel_thread_helper+0x7/0x10
Code: e9 85 00 00 00 8d 79 fc 8b 77 0c 89 7b 18 8b 11 8b 41 04 89 42 04
89 10 89 09 89 49 04 f0 fe 03 fb 8b 41 fc 83 e0 fc 39 c3 74 04 <0f> 0b
eb fe f0 80 61 fc fe 89 f8 ff d6 89 e0 25 00 e0 ff ff 8b
EIP: [<c102ed4f>] worker_thread+0xc5/0x144 SS:ESP 0068:f7093f90
---[ end trace e297b781eb382c2f ]---

The full trace is attached, it may become clearer from that.

After taking a look I think this may be caused by initializing
adapter->led_blink_task several times in e1000_phys_id(), while possibly
led_blink_task is running:

	if ((hw->phy.type == e1000_phy_ife) ||
	    (hw->mac.type == e1000_pchlan) ||
	    (hw->mac.type == e1000_82574)) {
		INIT_WORK(&adapter->led_blink_task, e1000e_led_blink_task);
		if (!adapter->blink_timer.function) {

I can't reproduce it after moving it inside the following if block,
but I'm not quite sure if this catches all races in there.  Especially
the msleep_interruptible() may be too optimistic because it may
actually not wait long enough.  Someone with more knowledge of the
driver should take a look.

I've attached a proposed fix for the double initialization, please check.

 /holger


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: putty.log --]
[-- Type: text/plain; charset=unknown-8bit, Size: 4366 bytes --]

=~=~=~=~=~=~=~=~=~=~=~= PuTTY log 2010.11.03 16:51:28 =~=~=~=~=~=~=~=~=~=~=~=
ÿ------------[ cut here ]------------
kernel BUG at kernel/workqueue.c:287!
invalid opcode: 0000 [#1] SMP 
last sysfs file: /sys/devices/pci0000:00/0000:00:1d.0/usb2/2-1/2-1:1.1/input/input2/event2/dev
Modules linked in: nls_utf8 isofs edd ide_cd_mod sr_mod cdrom sg sd_mod pata_acpi ata_generic usb_storage ppdev ide_pci_generic ata_piix libata evdev rtc_cmos uhci_hcd parport_pc scsi_mod i2c_i801 ehci_hcd rtc_core rtc_lib e1000e parport ftdi_sio usbhid usbserial

Pid: 8, comm: events/1 Not tainted (2.6.32.24-62.gce8dff6-ai #1) To Be Filled By O.E.M.
EIP: 0060:[<c102ed4f>] EFLAGS: 00010206 CPU: 1
EIP is at worker_thread+0xc5/0x144
EAX: c1c052e0 EBX: c1d052e0 ECX: f70cac1c EDX: f70cac1c
ESI: f81c75b0 EDI: f70cac18 EBP: f705e3b0 ESP: f7093f90
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
Process events/1 (pid: 8, ti=f7092000 task=f705e3b0 task.ti=f7092000)
Stack:
 f705e5a0 c1d052ec c1d052e4 00000000 f705e3b0 c103151e f7093fa8 f7093fa8
<0> f7061f68 c1d052e0 c102ec8a 00000000 c103132b 00000000 00000000 00000000
<0> f7093fd0 f7093fd0 c10312ca 00000000 00000000 c100329f f7061f5c 00000000
Call Trace:
 [<c103151e>] ? autoremove_wake_function+0x0/0x2d
 [<c102ec8a>] ? worker_thread+0x0/0x144
 [<c103132b>] ? kthread+0x61/0x66
 [<c10312ca>] ? kthread+0x0/0x66
 [<c100329f>] ? kernel_thread_helper+0x7/0x10
Code: e9 85 00 00 00 8d 79 fc 8b 77 0c 89 7b 18 8b 11 8b 41 04 89 42 04 89 10 89 09 89 49 04 f0 fe 03 fb 8b 41 fc 83 e0 fc 39 c3 74 04 <0f> 0b eb fe f0 80 61 fc fe 89 f8 ff d6 89 e0 25 00 e0 ff ff 8b 
EIP: [<c102ed4f>] worker_thread+0xc5/0x144 SS:ESP 0068:f7093f90
---[ end trace e297b781eb382c2f ]---
------------[ cut here ]------------
kernel BUG at kernel/workqueue.c:191!
invalid opcode: 0000 [#2] SMP 
last sysfs file: /sys/devices/pci0000:00/0000:00:1d.0/usb2/2-1/2-1:1.1/input/input2/event2/dev
Modules linked in: nls_utf8 isofs edd ide_cd_mod sr_mod cdrom sg sd_mod pata_acpi ata_generic usb_storage ppdev ide_pci_generic ata_piix libata evdev rtc_cmos uhci_hcd parport_pc scsi_mod i2c_i801 ehci_hcd rtc_core rtc_lib e1000e parport ftdi_sio usbhid usbserial

Pid: 1012, comm: klogd Tainted: G      D    (2.6.32.24-62.gce8dff6-ai #1) To Be Filled By O.E.M.
EIP: 0060:[<c102eeac>] EFLAGS: 00010283 CPU: 0
EIP is at queue_work_on+0x1b/0x44
EAX: f70cac1c EBX: 00000000 ECX: f70cac18 EDX: 00000000
ESI: f7001280 EDI: f81c7920 EBP: f71dbf60 ESP: f71dbf3c
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process klogd (pid: 1012, ti=f71da000 task=f71d87a0 task.ti=f71da000)
Stack:
 f70c8330 c133ca00 f81c7931 00000100 c1029710 c133d810 c133d610 c133d410
<0> c133d210 f71dbf60 f71dbf60 00000001 00000004 00000100 00000141 c102652e
<0> c1318010 0000000a 00000000 00000046 00000000 099251d0 bfb6e968 c10265be
Call Trace:
 [<f81c7931>] ? e1000e_set_ethtool_ops+0x1d1/0x38b0 [e1000e]
 [<c1029710>] ? run_timer_softirq+0x116/0x16a
 [<c102652e>] ? __do_softirq+0x78/0xe5
 [<c10265be>] ? do_softirq+0x23/0x27
 [<c102668d>] ? irq_exit+0x26/0x58
 [<c100f9ea>] ? smp_apic_timer_interrupt+0x6c/0x76
 [<c10030f6>] ? apic_timer_interrupt+0x2a/0x30
Code: 2c c1 8b 00 03 04 8d c0 bb 2c c1 e9 3d ff ff ff 56 89 d6 53 89 c3 f0 0f ba 29 00 19 c0 31 d2 85 c0 75 2c 8d 41 04 39 41 04 74 04 <0f> 0b eb fe 83 7e 10 00 89 ca 0f 45 1d b4 bc 2c c1 8b 06 03 04 
EIP: [<c102eeac>] queue_work_on+0x1b/0x44 SS:ESP 0068:f71dbf3c
---[ end trace e297b781eb382c30 ]---
Kernel panic - not syncing: Fatal exception in interrupt
Pid: 1012, comm: klogd Tainted: G      D    2.6.32.24-62.gce8dff6-ai #1
Call Trace:
 [<c11e0a50>] ? panic+0x38/0xda
 [<c11e3371>] ? oops_end+0x89/0x94
 [<c1003628>] ? do_invalid_op+0x0/0x70
 [<c100368f>] ? do_invalid_op+0x67/0x70
 [<c102eeac>] ? queue_work_on+0x1b/0x44
 [<c117bfb3>] ? sys_sendto+0xfc/0x127
 [<c101c3cc>] ? dequeue_task_fair+0x3f/0x1bc
 [<c1018e0b>] ? sched_slice+0x6d/0x79
 [<c11e2ac6>] ? error_code+0x66/0x6c
 [<f81c7920>] ? e1000e_set_ethtool_ops+0x1c0/0x38b0 [e1000e]
 [<c1003628>] ? do_invalid_op+0x0/0x70
 [<c102eeac>] ? queue_work_on+0x1b/0x44
 [<f81c7931>] ? e1000e_set_ethtool_ops+0x1d1/0x38b0 [e1000e]
 [<c1029710>] ? run_timer_softirq+0x116/0x16a
 [<c102652e>] ? __do_softirq+0x78/0xe5
 [<c10265be>] ? do_softirq+0x23/0x27
 [<c102668d>] ? irq_exit+0x26/0x58
 [<c100f9ea>] ? smp_apic_timer_interrupt+0x6c/0x76
 [<c10030f6>] ? apic_timer_interrupt+0x2a/0x30

[-- Attachment #3: e1000e-fix.diff --]
[-- Type: text/x-diff, Size: 708 bytes --]

Index: linux-2.6.32.y/drivers/net/e1000e/ethtool.c
===================================================================
--- linux-2.6.32.y.orig/drivers/net/e1000e/ethtool.c	2010-11-08 15:34:35.000000000 +0100
+++ linux-2.6.32.y/drivers/net/e1000e/ethtool.c	2010-11-08 17:32:27.000000000 +0100
@@ -1833,8 +1833,8 @@
 	if ((hw->phy.type == e1000_phy_ife) ||
 	    (hw->mac.type == e1000_pchlan) ||
 	    (hw->mac.type == e1000_82574)) {
-		INIT_WORK(&adapter->led_blink_task, e1000e_led_blink_task);
 		if (!adapter->blink_timer.function) {
+			INIT_WORK(&adapter->led_blink_task, e1000e_led_blink_task);
 			init_timer(&adapter->blink_timer);
 			adapter->blink_timer.function =
 				e1000_led_blink_callback;

^ permalink raw reply

* Re: [PATCH] Fix CAN info leak/minor heap overflow
From: Oliver Hartkopp @ 2010-11-09  7:52 UTC (permalink / raw)
  To: David Miller
  Cc: Urs Thuermann, netdev, Dan Rosenberg, security, Linus Torvalds
In-Reply-To: <ygfiq0bsjry.fsf@janus.isnogud.escape.de>

On 05.11.2010 19:33, Urs Thuermann wrote:
> This patch removes the leakage of kernel space addresses to userspace.
> Instead, socket inode numbers are used to create unique proc file
> names for CAN_BCM sockets and for referring to sockets in filter
> lists.  In addition, this makes debugging easier, since inode numbers
> are also shown in ls -l /proc/<pid>/fd/<fd> and lsof(8) output.
> 
> BTW, if kernel space addresses are considered security critical
> information one should also take a look and possibly change
> 
>     /proc/net/{tcp,tcp6,udp,udp6,raw,raw6,unix}
> 
> and maybe some others.
> 
> The change of the procfs content leads to a new version string
> 20101105.
> 
> Signed-off-by: Urs Thuermann <urs@isnogud.escape.de>
> Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>

Besides the ongoing(?) discussion about the exposed kernel addresses in procfs
- what are your plans about this patch that already moves the kernel addresses
to inode numbers?

Is it something for net-2.6 / net-next-2.6 / stable ?

In this case especially, we do not see any problems with userspace tools that
could break, as there would be for some other /proc/net entries.

Once this patch is applied (and the procfs layout is changed anyway), I'd also
like to send a patch from my backlog that would extend the procfs output for
can-bcm with an additional drop counter.

Best regards,
Oliver


> CC: Dan Rosenberg <drosenberg@vsecurity.com>
> CC: Linus Torvalds <torvalds@linux-foundation.org>
> 
> ---
> 
> diff --git a/include/linux/can/core.h b/include/linux/can/core.h
> index 6c507be..e20a841 100644
> --- a/include/linux/can/core.h
> +++ b/include/linux/can/core.h
> @@ -19,7 +19,7 @@
>  #include <linux/skbuff.h>
>  #include <linux/netdevice.h>
>  
> -#define CAN_VERSION "20090105"
> +#define CAN_VERSION "20101105"
>  
>  /* increment this number each time you change some user-space interface */
>  #define CAN_ABI_VERSION "8"
> diff --git a/net/can/bcm.c b/net/can/bcm.c
> index 08ffe9e..0e81e04 100644
> --- a/net/can/bcm.c
> +++ b/net/can/bcm.c
> @@ -86,6 +86,12 @@ MODULE_LICENSE("Dual BSD/GPL");
>  MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
>  MODULE_ALIAS("can-proto-2");
>  
> +/*
> + * Point to the sockets inode number inside the bcm ident string.
> + * We skip the string length of "bcm " (== 4) created in bcm_init().
> + */
> +#define INODENUM(bo) (bo->ident + 4)
> +
>  /* easy access to can_frame payload */
>  static inline u64 GET_U64(const struct can_frame *cp)
>  {
> @@ -125,7 +131,7 @@ struct bcm_sock {
>  	struct list_head tx_ops;
>  	unsigned long dropped_usr_msgs;
>  	struct proc_dir_entry *bcm_proc_read;
> -	char procname [9]; /* pointer printed in ASCII with \0 */
> +	char ident[32];
>  };
>  
>  static inline struct bcm_sock *bcm_sk(const struct sock *sk)
> @@ -165,9 +171,7 @@ static int bcm_proc_show(struct seq_file *m, void *v)
>  	struct bcm_sock *bo = bcm_sk(sk);
>  	struct bcm_op *op;
>  
> -	seq_printf(m, ">>> socket %p", sk->sk_socket);
> -	seq_printf(m, " / sk %p", sk);
> -	seq_printf(m, " / bo %p", bo);
> +	seq_printf(m, ">>> socket inode %s", INODENUM(bo));
>  	seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs);
>  	seq_printf(m, " / bound %s", bcm_proc_getifname(ifname, bo->ifindex));
>  	seq_printf(m, " <<<\n");
> @@ -1168,7 +1172,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
>  				err = can_rx_register(dev, op->can_id,
>  						      REGMASK(op->can_id),
>  						      bcm_rx_handler, op,
> -						      "bcm");
> +						      bo->ident);
>  
>  				op->rx_reg_dev = dev;
>  				dev_put(dev);
> @@ -1177,7 +1181,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
>  		} else
>  			err = can_rx_register(NULL, op->can_id,
>  					      REGMASK(op->can_id),
> -					      bcm_rx_handler, op, "bcm");
> +					      bcm_rx_handler, op, bo->ident);
>  		if (err) {
>  			/* this bcm rx op is broken -> remove it */
>  			list_del(&op->list);
> @@ -1402,6 +1406,8 @@ static int bcm_init(struct sock *sk)
>  {
>  	struct bcm_sock *bo = bcm_sk(sk);
>  
> +	snprintf(bo->ident, sizeof(bo->ident), "bcm %lu", sock_i_ino(sk));
> +
>  	bo->bound            = 0;
>  	bo->ifindex          = 0;
>  	bo->dropped_usr_msgs = 0;
> @@ -1466,7 +1472,7 @@ static int bcm_release(struct socket *sock)
>  
>  	/* remove procfs entry */
>  	if (proc_dir && bo->bcm_proc_read)
> -		remove_proc_entry(bo->procname, proc_dir);
> +		remove_proc_entry(INODENUM(bo), proc_dir);
>  
>  	/* remove device reference */
>  	if (bo->bound) {
> @@ -1519,13 +1525,11 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
>  
>  	bo->bound = 1;
>  
> -	if (proc_dir) {
> -		/* unique socket address as filename */
> -		sprintf(bo->procname, "%p", sock);
> -		bo->bcm_proc_read = proc_create_data(bo->procname, 0644,
> +	/* use unique socket inode number as filename */
> +	if (proc_dir)
> +		bo->bcm_proc_read = proc_create_data(INODENUM(bo), 0644,
>  						     proc_dir,
>  						     &bcm_proc_fops, sk);
> -	}
>  
>  	return 0;
>  }
> diff --git a/net/can/proc.c b/net/can/proc.c
> index f4265cc..15bed1c 100644
> --- a/net/can/proc.c
> +++ b/net/can/proc.c
> @@ -204,23 +204,17 @@ static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list,
>  
>  	hlist_for_each_entry_rcu(r, n, rx_list, list) {
>  		char *fmt = (r->can_id & CAN_EFF_FLAG)?
> -			"   %-5s  %08X  %08x  %08x  %08x  %8ld  %s\n" :
> -			"   %-5s     %03X    %08x  %08lx  %08lx  %8ld  %s\n";
> +			"   %-5s  %08X  %08x  %8ld   %s\n" :
> +			"   %-5s     %03X    %08x  %8ld   %s\n";
>  
>  		seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask,
> -				(unsigned long)r->func, (unsigned long)r->data,
>  				r->matches, r->ident);
>  	}
>  }
>  
>  static void can_print_recv_banner(struct seq_file *m)
>  {
> -	/*
> -	 *                  can1.  00000000  00000000  00000000
> -	 *                 .......          0  tp20
> -	 */
> -	seq_puts(m, "  device   can_id   can_mask  function"
> -			"  userdata   matches  ident\n");
> +	seq_puts(m, "  device   can_id   can_mask   matches   ident\n");
>  }
>  
>  static int can_stats_proc_show(struct seq_file *m, void *v)
> diff --git a/net/can/raw.c b/net/can/raw.c
> index e88f610..e057f0d 100644
> --- a/net/can/raw.c
> +++ b/net/can/raw.c
> @@ -88,6 +88,7 @@ struct raw_sock {
>  	struct can_filter dfilter; /* default/single filter */
>  	struct can_filter *filter; /* pointer to filter(s) */
>  	can_err_mask_t err_mask;
> +	char ident[32];
>  };
>  
>  /*
> @@ -154,13 +155,14 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
>  static int raw_enable_filters(struct net_device *dev, struct sock *sk,
>  			      struct can_filter *filter, int count)
>  {
> +	struct raw_sock *ro = raw_sk(sk);
>  	int err = 0;
>  	int i;
>  
>  	for (i = 0; i < count; i++) {
>  		err = can_rx_register(dev, filter[i].can_id,
>  				      filter[i].can_mask,
> -				      raw_rcv, sk, "raw");
> +				      raw_rcv, sk, ro->ident);
>  		if (err) {
>  			/* clean up successfully registered filters */
>  			while (--i >= 0)
> @@ -177,11 +179,12 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk,
>  static int raw_enable_errfilter(struct net_device *dev, struct sock *sk,
>  				can_err_mask_t err_mask)
>  {
> +	struct raw_sock *ro = raw_sk(sk);
>  	int err = 0;
>  
>  	if (err_mask)
>  		err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG,
> -				      raw_rcv, sk, "raw");
> +				      raw_rcv, sk, ro->ident);
>  
>  	return err;
>  }
> @@ -281,6 +284,8 @@ static int raw_init(struct sock *sk)
>  {
>  	struct raw_sock *ro = raw_sk(sk);
>  
> +	snprintf(ro->ident, sizeof(ro->ident), "raw %lu", sock_i_ino(sk));
> +
>  	ro->bound            = 0;
>  	ro->ifindex          = 0;
>  


^ permalink raw reply

* Re: Loopback performance from kernel 2.6.12 to 2.6.37
From: Eric Dumazet @ 2010-11-09  6:42 UTC (permalink / raw)
  To: Andrew Hendry; +Cc: Jesper Dangaard Brouer, netdev
In-Reply-To: <1289284715.2790.87.camel@edumazet-laptop>

Le mardi 09 novembre 2010 à 07:38 +0100, Eric Dumazet a écrit :

> Hmm, your clock source is HPET, that might explain the problem on a
> scheduler intensive workload.
> 

And if a packet sniffer (dhclient for example) causes all packets to be
timestamped, that can also explain a slowdown, even if there are no
scheduler artifacts.

cat /proc/net/packet

> My HP dev machine
> # grep . /sys/devices/system/clocksource/clocksource0/*
> /sys/devices/system/clocksource/clocksource0/available_clocksource:tsc hpet acpi_pm 
> /sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
> 
> My laptop:
> $ grep . /sys/devices/system/clocksource/clocksource0/*
> /sys/devices/system/clocksource/clocksource0/available_clocksource:tsc hpet acpi_pm 
> /sys/devices/system/clocksource/clocksource0/current_clocksource:tsc
> 




^ permalink raw reply

* Re: Loopback performance from kernel 2.6.12 to 2.6.37
From: Eric Dumazet @ 2010-11-09  6:38 UTC (permalink / raw)
  To: Andrew Hendry; +Cc: Jesper Dangaard Brouer, netdev
In-Reply-To: <AANLkTikAPaU_2=wS_T3V-8xFZm-G3qutJBxY8yb0QCYL@mail.gmail.com>

Le mardi 09 novembre 2010 à 17:30 +1100, Andrew Hendry a écrit :
> most my slowdown was kmemleak left on.
> 
> After fixing its is still a lot slower than your dev system
> .
> # time dd if=/dev/zero bs=1M count=10000 | netcat  127.0.0.1 9999
> 10000+0 records in
> 10000+0 records out
> 10485760000 bytes (10 GB) copied, 25.8182 s, 406 MB/s
> 
> real	0m25.821s
> user	0m1.502s
> sys	0m33.463s
> 
> ------------------------------------------------------------------------------------------------------------------
>    PerfTop:     241 irqs/sec  kernel:56.8%  exact:  0.0% [1000Hz
> cycles],  (all, 8 CPUs)
> ------------------------------------------------------------------------------------------------------------------
> 
>              samples  pcnt function                    DSO
>              _______ _____ ___________________________
> ______________________________________
> 
>              1255.00  8.7% hpet_msi_next_event
> /lib/modules/2.6.37-rc1+/build/vmlinux
>              1081.00  7.5% copy_user_generic_string
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               863.00  6.0% __ticket_spin_lock
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               498.00  3.5% do_sys_poll
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               455.00  3.2% system_call
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               409.00  2.8% fget_light
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               348.00  2.4% tcp_sendmsg
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               269.00  1.9% fsnotify
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               258.00  1.8% _raw_spin_unlock_irqrestore
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               223.00  1.6% _raw_spin_lock_irqsave
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               203.00  1.4% __clear_user
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               184.00  1.3% tcp_poll
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               178.00  1.2% vfs_write
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               165.00  1.1% tcp_recvmsg
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               152.00  1.1% pipe_read
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               149.00  1.0% schedule
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               135.00  0.9% rw_verify_area
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               135.00  0.9% __pollwait
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               130.00  0.9% __write
> /lib/libc-2.12.1.so
>               127.00  0.9% __ticket_spin_unlock
> /lib/modules/2.6.37-rc1+/build/vmlinux
>               126.00  0.9% __poll
> /lib/libc-2.12.1.so
> 
> 


Hmm, your clock source is HPET, that might explain the problem on a
scheduler intensive workload.

My HP dev machine
# grep . /sys/devices/system/clocksource/clocksource0/*
/sys/devices/system/clocksource/clocksource0/available_clocksource:tsc hpet acpi_pm 
/sys/devices/system/clocksource/clocksource0/current_clocksource:tsc

My laptop:
$ grep . /sys/devices/system/clocksource/clocksource0/*
/sys/devices/system/clocksource/clocksource0/available_clocksource:tsc hpet acpi_pm 
/sys/devices/system/clocksource/clocksource0/current_clocksource:tsc



^ permalink raw reply

* Re: Loopback performance from kernel 2.6.12 to 2.6.37
From: Andrew Hendry @ 2010-11-09  6:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, netdev
In-Reply-To: <1289283797.2790.84.camel@edumazet-laptop>

Most of my slowdown was due to kmemleak being left on.

After fixing that, it is still a lot slower than your dev system.
# time dd if=/dev/zero bs=1M count=10000 | netcat  127.0.0.1 9999
10000+0 records in
10000+0 records out
10485760000 bytes (10 GB) copied, 25.8182 s, 406 MB/s

real	0m25.821s
user	0m1.502s
sys	0m33.463s

------------------------------------------------------------------------------------------------------------------
   PerfTop:     241 irqs/sec  kernel:56.8%  exact:  0.0% [1000Hz
cycles],  (all, 8 CPUs)
------------------------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________
______________________________________

             1255.00  8.7% hpet_msi_next_event
/lib/modules/2.6.37-rc1+/build/vmlinux
             1081.00  7.5% copy_user_generic_string
/lib/modules/2.6.37-rc1+/build/vmlinux
              863.00  6.0% __ticket_spin_lock
/lib/modules/2.6.37-rc1+/build/vmlinux
              498.00  3.5% do_sys_poll
/lib/modules/2.6.37-rc1+/build/vmlinux
              455.00  3.2% system_call
/lib/modules/2.6.37-rc1+/build/vmlinux
              409.00  2.8% fget_light
/lib/modules/2.6.37-rc1+/build/vmlinux
              348.00  2.4% tcp_sendmsg
/lib/modules/2.6.37-rc1+/build/vmlinux
              269.00  1.9% fsnotify
/lib/modules/2.6.37-rc1+/build/vmlinux
              258.00  1.8% _raw_spin_unlock_irqrestore
/lib/modules/2.6.37-rc1+/build/vmlinux
              223.00  1.6% _raw_spin_lock_irqsave
/lib/modules/2.6.37-rc1+/build/vmlinux
              203.00  1.4% __clear_user
/lib/modules/2.6.37-rc1+/build/vmlinux
              184.00  1.3% tcp_poll
/lib/modules/2.6.37-rc1+/build/vmlinux
              178.00  1.2% vfs_write
/lib/modules/2.6.37-rc1+/build/vmlinux
              165.00  1.1% tcp_recvmsg
/lib/modules/2.6.37-rc1+/build/vmlinux
              152.00  1.1% pipe_read
/lib/modules/2.6.37-rc1+/build/vmlinux
              149.00  1.0% schedule
/lib/modules/2.6.37-rc1+/build/vmlinux
              135.00  0.9% rw_verify_area
/lib/modules/2.6.37-rc1+/build/vmlinux
              135.00  0.9% __pollwait
/lib/modules/2.6.37-rc1+/build/vmlinux
              130.00  0.9% __write
/lib/libc-2.12.1.so
              127.00  0.9% __ticket_spin_unlock
/lib/modules/2.6.37-rc1+/build/vmlinux
              126.00  0.9% __poll
/lib/libc-2.12.1.so


On Tue, Nov 9, 2010 at 5:23 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 09 novembre 2010 à 06:22 +0100, Eric Dumazet a écrit :
>> Le mardi 09 novembre 2010 à 11:05 +1100, Andrew Hendry a écrit :
>> > results on an i7 860 @ 2.80Ghz machine, no virtualization involved. 2.6.37-rc1+
>> >
>> > # time dd if=/dev/zero bs=1M count=10000 | netcat  127.0.0.1 9999
>> > 10000+0 records in
>> > 10000+0 records out
>> > 10485760000 bytes (10 GB) copied, 50.2022 s, 209 MB/s
>> >
>> > real        0m50.210s
>> > user        0m1.094s
>> > sys 0m57.589s
>>
>> Thanks !
>>
>> Could you take a pef snapshot during the test ?
>>
>> # perf record -a -g sleep 10
>> # perf report
>>
>>
>
> On my laptop
> Intel(R) Core(TM)2 Duo CPU     T8300  @ 2.40GHz
> (2.6.35-22-generic #35-Ubuntu SMP Sat Oct 16 20:45:36 UTC 2010 x86_64
> GNU/Linux) :
>
> time dd if=/dev/zero bs=1M count=10000|netcat 127.0.0.1 9999
> 10000+0 enregistrements lus
> 10000+0 enregistrements écrits
> 10485760000 octets (10 GB) copiés, 38,2691 s, 274 MB/s
>
> real    0m38.274s
> user    0m1.870s
> sys     0m38.370s
>
>
> perf top result :
>
> -------------------------------------------------------------------------------------------------
>   PerfTop:    1948 irqs/sec  kernel:90.7%  exact:  0.0% [1000Hz cycles],  (all, 2 CPUs)
> -------------------------------------------------------------------------------------------------
>
>             samples  pcnt function                    DSO
>             _______ _____ ___________________________ ___________________
>
>             1867.00 12.4% copy_user_generic_string    [kernel.kallsyms]
>             1166.00  7.7% __ticket_spin_lock          [kernel.kallsyms]
>              744.00  4.9% __clear_user                [kernel.kallsyms]
>              667.00  4.4% system_call                 [kernel.kallsyms]
>              329.00  2.2% tcp_sendmsg                 [kernel.kallsyms]
>              304.00  2.0% schedule                    [kernel.kallsyms]
>              257.00  1.7% _raw_spin_unlock_irqrestore [kernel.kallsyms]
>              231.00  1.5% fget_light                  [kernel.kallsyms]
>              216.00  1.4% do_poll                     [kernel.kallsyms]
>              203.00  1.3% __read_chk                  /lib/libc-2.12.1.so
>              202.00  1.3% __pollwait                  [kernel.kallsyms]
>              201.00  1.3% __poll                      /lib/libc-2.12.1.so
>              187.00  1.2% system_call_after_swapgs    [kernel.kallsyms]
>              176.00  1.2% __write                     /lib/libc-2.12.1.so
>              173.00  1.1% _raw_spin_lock_irqsave      [kernel.kallsyms]
>              163.00  1.1% tcp_recvmsg                 [kernel.kallsyms]
>              158.00  1.0% do_sys_poll                 [kernel.kallsyms]
>              153.00  1.0% vfs_write                   [kernel.kallsyms]
>              143.00  0.9% pipe_read                   [kernel.kallsyms]
>              141.00  0.9% fput                        [kernel.kallsyms]
>              121.00  0.8% common_file_perm            [kernel.kallsyms]
>              120.00  0.8% _cond_resched               [kernel.kallsyms]
>
>
> # vmstat 1
> procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
>  r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
>  2  0   1456 120056  51572 2606876    0    0   158    41  254  190  9  2 88  0
>  2  0   1456 120140  51580 2606868    0    0    12     0  758 158309 11 76 13  0
>  2  0   1456 119520  51588 2606896    0    0     0   176  778 160749  8 80 12  0
>  2  0   1456 120388  51588 2606896    0    0     0     0  730 158201  9 76 16  0
>  3  0   1456 120388  51588 2606896    0    0     0     0  745 158490  8 76 16  0
>  2  0   1456 120520  51588 2606896    0    0     0     0  991 159120  9 78 13  0
>  2  0   1456 120024  51588 2606896    0    0     0     0  653 160023 10 79 11  0
>  3  0   1456 120520  51588 2606896    0    0     0     0  659 160614  8 78 14  0
>  2  0   1456 120272  51596 2606896    0    0     0    80  695 159922 10 75 14  0
>  4  0   1456 120272  51596 2606896    0    0     0     0  675 158010  7 79 14  0
>
>
> # powertop
>     PowerTOP version 1.13      (C) 2007 Intel Corporation
>
> < Detailed C-state information is not P-states (frequencies)
>                                      Turbo Mode    43.1%
>                                        2.40 Ghz    48.0%
>                                        2.00 Ghz     8.2%
>                                        1.60 Ghz     0.7%
>                                        1200 Mhz     0.1%
>
> Wakeups-from-idle per second : 542.9    interval: 10.0s
> no ACPI power usage estimate available
>
> Top causes for wakeups:
>  21.9% (196.5)   [kernel scheduler] Load balancing tick
>  21.2% (190.7)   [Rescheduling interrupts] <kernel IPI>
>  12.7% (114.0)   PS/2 keyboard/mouse/touchpad interrupt
>  12.0% (107.9)   plugin-containe
>  11.1% ( 99.3)   alsa-sink
>   6.0% ( 53.8)   firefox-bin
>   4.4% ( 39.7)   fping
>   3.9% ( 35.2)   Xorg
>   1.3% ( 11.3)   [b43] <interrupt>
>   1.1% ( 10.0)   ksoftirqd/0
>   0.4% (  4.0)D  nagios3
>   0.2% (  1.9)D  gnome-terminal
>   0.7% (  6.4)   [Thermal event interrupts] <kernel IPI>
>
>
>
>

^ permalink raw reply

* Re: Loopback performance from kernel 2.6.12 to 2.6.37
From: Eric Dumazet @ 2010-11-09  6:23 UTC (permalink / raw)
  To: Andrew Hendry; +Cc: Jesper Dangaard Brouer, netdev
In-Reply-To: <1289280152.2790.23.camel@edumazet-laptop>

Le mardi 09 novembre 2010 à 06:22 +0100, Eric Dumazet a écrit :
> Le mardi 09 novembre 2010 à 11:05 +1100, Andrew Hendry a écrit :
> > results on an i7 860 @ 2.80Ghz machine, no virtualization involved. 2.6.37-rc1+
> > 
> > # time dd if=/dev/zero bs=1M count=10000 | netcat  127.0.0.1 9999
> > 10000+0 records in
> > 10000+0 records out
> > 10485760000 bytes (10 GB) copied, 50.2022 s, 209 MB/s
> > 
> > real	0m50.210s
> > user	0m1.094s
> > sys	0m57.589s
> 
> Thanks !
> 
> Could you take a pef snapshot during the test ?
> 
> # perf record -a -g sleep 10
> # perf report
> 
> 

On my laptop 
Intel(R) Core(TM)2 Duo CPU     T8300  @ 2.40GHz
(2.6.35-22-generic #35-Ubuntu SMP Sat Oct 16 20:45:36 UTC 2010 x86_64
GNU/Linux) :

time dd if=/dev/zero bs=1M count=10000|netcat 127.0.0.1 9999
10000+0 enregistrements lus
10000+0 enregistrements écrits
10485760000 octets (10 GB) copiés, 38,2691 s, 274 MB/s

real	0m38.274s
user	0m1.870s
sys	0m38.370s


perf top result :

-------------------------------------------------------------------------------------------------
   PerfTop:    1948 irqs/sec  kernel:90.7%  exact:  0.0% [1000Hz cycles],  (all, 2 CPUs)
-------------------------------------------------------------------------------------------------

             samples  pcnt function                    DSO
             _______ _____ ___________________________ ___________________

             1867.00 12.4% copy_user_generic_string    [kernel.kallsyms]  
             1166.00  7.7% __ticket_spin_lock          [kernel.kallsyms]  
              744.00  4.9% __clear_user                [kernel.kallsyms]  
              667.00  4.4% system_call                 [kernel.kallsyms]  
              329.00  2.2% tcp_sendmsg                 [kernel.kallsyms]  
              304.00  2.0% schedule                    [kernel.kallsyms]  
              257.00  1.7% _raw_spin_unlock_irqrestore [kernel.kallsyms]  
              231.00  1.5% fget_light                  [kernel.kallsyms]  
              216.00  1.4% do_poll                     [kernel.kallsyms]  
              203.00  1.3% __read_chk                  /lib/libc-2.12.1.so
              202.00  1.3% __pollwait                  [kernel.kallsyms]  
              201.00  1.3% __poll                      /lib/libc-2.12.1.so
              187.00  1.2% system_call_after_swapgs    [kernel.kallsyms]  
              176.00  1.2% __write                     /lib/libc-2.12.1.so
              173.00  1.1% _raw_spin_lock_irqsave      [kernel.kallsyms]  
              163.00  1.1% tcp_recvmsg                 [kernel.kallsyms]  
              158.00  1.0% do_sys_poll                 [kernel.kallsyms]  
              153.00  1.0% vfs_write                   [kernel.kallsyms]  
              143.00  0.9% pipe_read                   [kernel.kallsyms]  
              141.00  0.9% fput                        [kernel.kallsyms]  
              121.00  0.8% common_file_perm            [kernel.kallsyms]  
              120.00  0.8% _cond_resched               [kernel.kallsyms]  


# vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   1456 120056  51572 2606876    0    0   158    41  254  190  9  2 88  0
 2  0   1456 120140  51580 2606868    0    0    12     0  758 158309 11 76 13  0
 2  0   1456 119520  51588 2606896    0    0     0   176  778 160749  8 80 12  0
 2  0   1456 120388  51588 2606896    0    0     0     0  730 158201  9 76 16  0
 3  0   1456 120388  51588 2606896    0    0     0     0  745 158490  8 76 16  0
 2  0   1456 120520  51588 2606896    0    0     0     0  991 159120  9 78 13  0
 2  0   1456 120024  51588 2606896    0    0     0     0  653 160023 10 79 11  0
 3  0   1456 120520  51588 2606896    0    0     0     0  659 160614  8 78 14  0
 2  0   1456 120272  51596 2606896    0    0     0    80  695 159922 10 75 14  0
 4  0   1456 120272  51596 2606896    0    0     0     0  675 158010  7 79 14  0


# powertop
     PowerTOP version 1.13      (C) 2007 Intel Corporation

< Detailed C-state information is not P-states (frequencies)
                                      Turbo Mode    43.1%
                                        2.40 Ghz    48.0%
                                        2.00 Ghz     8.2%
                                        1.60 Ghz     0.7%
                                        1200 Mhz     0.1%

Wakeups-from-idle per second : 542.9    interval: 10.0s
no ACPI power usage estimate available

Top causes for wakeups:
  21.9% (196.5)   [kernel scheduler] Load balancing tick
  21.2% (190.7)   [Rescheduling interrupts] <kernel IPI>
  12.7% (114.0)   PS/2 keyboard/mouse/touchpad interrupt
  12.0% (107.9)   plugin-containe
  11.1% ( 99.3)   alsa-sink
   6.0% ( 53.8)   firefox-bin
   4.4% ( 39.7)   fping
   3.9% ( 35.2)   Xorg
   1.3% ( 11.3)   [b43] <interrupt>
   1.1% ( 10.0)   ksoftirqd/0
   0.4% (  4.0)D  nagios3
   0.2% (  1.9)D  gnome-terminal
   0.7% (  6.4)   [Thermal event interrupts] <kernel IPI>




^ permalink raw reply

* Re: [PATCH] via-rhine: hardware VLAN support
From: Roger Luethi @ 2010-11-09  6:18 UTC (permalink / raw)
  To: Jesse Gross; +Cc: netdev, David S. Miller
In-Reply-To: <AANLkTim40QH2AWz8YtW_y3=WjEU0_Rom9-CPFj-O5MCt@mail.gmail.com>

On Mon, 08 Nov 2010 12:53:57 -0800, Jesse Gross wrote:
> On Mon, Nov 8, 2010 at 8:21 AM, Roger Luethi <rl@hellgate.ch> wrote:
> > On Fri, 05 Nov 2010 11:31:56 -0700, Jesse Gross wrote:
> >> On Fri, Nov 5, 2010 at 3:43 AM, Roger Luethi <rl@hellgate.ch> wrote:
> >> > This patch adds VLAN hardware support for Rhine chips.
> >>
> >> This uses the old interfaces for vlan acceleration.  We're working to
> >> switch drivers over to use the new methods and the old ones will be
> >> going away in the future.  It would be great if we can avoid adding
> >> more code that uses those interfaces.
> >
> > Can you point me to a driver that has been switched to use the new methods
> > already? Is there some other form of documentation?
> 
> bnx2 is an example of a driver that has been converted.  The commit
> that actually made the change was
> 7d0fd2117e3d0550d7987b3aff2bfbc0244cf7c6, which should highlight the
> differences.  A key point is that drivers should no longer reference
> vlan groups at all.

Thank you. I will take a look and submit a revised patch.

^ permalink raw reply

* Re: Takes > 1 second to delete macvlan with global IPv6 address on it.
From: Eric Dumazet @ 2010-11-09  6:15 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev
In-Reply-To: <4CD893C6.2030803@candelatech.com>

Le lundi 08 novembre 2010 à 16:20 -0800, Ben Greear a écrit :
> This is on an otherwise lightly loaded 2.6.36 + hacks system, 12 physical interfaces,
> and two VETH interfaces.
> 
> It's much faster to delete an interface when it has no IPv6 address:
> 
> [root@ct503-60 lanforge]# time ip link add link eth5 up name eth5#0 address 00:00:00:00:00:01 type macvlan
> 
> real	0m0.005s
> user	0m0.001s
> sys	0m0.004s
> [root@ct503-60 lanforge]# time ip link delete eth5#0
> 
> real	0m0.033s
> user	0m0.001s
> sys	0m0.005s
> [root@ct503-60 lanforge]# ip link add link eth5 up name eth5#0 address 00:00:00:00:00:01 type macvlan
> 
> [root@ct503-60 lanforge]# ip -6 addr add 2002::1/64 dev eth5#0
> [root@ct503-60 lanforge]# time ip link delete eth5#0
> 
> real	0m1.030s
> user	0m0.000s
> sys	0m0.013s
> 
> 
> Funny enough, if you explicitly remove the IPv6 addr first it seems
> to run at normal speed (adding both operation's times together)
> 
> [root@ct503-60 lanforge]# ip link add link eth5 up name eth5#0 address 00:00:00:00:00:01 type macvlan
> [root@ct503-60 lanforge]# ip -6 addr add 2002::1/64 dev eth5#0
> [root@ct503-60 lanforge]# time ip -6 addr delete 2002::1/64 dev eth5#0
> 
> real	0m0.001s
> user	0m0.000s
> sys	0m0.001s
> [root@ct503-60 lanforge]# time ip link delete eth5#0
> 
> real	0m0.028s
> user	0m0.001s
> sys	0m0.005s
> 

The key here is that you have to wait a bit (2 seconds) between
"ip -6 addr add..." and "ip link delete"; otherwise the delete is fast.

So IPv6 is missing a cleanup somewhere, and a device refcount is still held.

Here is a debugging patch against current kernels:

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 072652d..820d9ed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1799,6 +1799,7 @@ extern void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
+	WARN_ON(dev->reg_state == NETREG_UNREGISTERED);
 	irqsafe_cpu_dec(*dev->pcpu_refcnt);
 }
 
gives :

[  418.614227] ------------[ cut here ]------------
[  418.614281] WARNING: at include/linux/netdevice.h:1802 in6_dev_finish_destroy+0xc9/0xf0()
[  418.614348] Hardware name: ProLiant BL460c G6
[  418.614392] Modules linked in: macvlan ipmi_devintf ipmi_si ipmi_msghandler dm_mod tg3 libphy sg [last unloaded: x_tables]
[  418.614804] Pid: 5403, comm: ip Tainted: G        W   2.6.37-rc1-00186-g5c6f178-dirty #271
[  418.614857] Call Trace:
[  418.614901]  [<ffffffff814ecac9>] ? in6_dev_finish_destroy+0xc9/0xf0
[  418.614952]  [<ffffffff81046440>] warn_slowpath_common+0x90/0xc0
[  418.615002]  [<ffffffff8104648a>] warn_slowpath_null+0x1a/0x20
[  418.615051]  [<ffffffff814ecac9>] in6_dev_finish_destroy+0xc9/0xf0
[  418.615101]  [<ffffffff814f469e>] ip6_dst_ifdown+0x5e/0x60
[  418.615150]  [<ffffffff81448318>] dst_ifdown+0x38/0x110
[  418.615198]  [<ffffffff81448457>] dst_dev_event+0x67/0x130
[  418.615247]  [<ffffffff815d2888>] notifier_call_chain+0x58/0x80
[  418.615298]  [<ffffffff8106b86e>] __raw_notifier_call_chain+0xe/0x10
[  418.615348]  [<ffffffff8106b886>] raw_notifier_call_chain+0x16/0x20
[  418.615432]  [<ffffffff814408d7>] call_netdevice_notifiers+0x37/0x70
[  418.615496]  [<ffffffff81440a47>] netdev_run_todo+0x137/0x260
[  418.615560]  [<ffffffff8144f11e>] rtnl_unlock+0xe/0x10
[  418.615621]  [<ffffffff8144f18a>] rtnetlink_rcv+0x2a/0x40
[  418.615684]  [<ffffffff8148b043>] netlink_unicast+0x2c3/0x2d0
[  418.615747]  [<ffffffff81438a8b>] ? memcpy_fromiovec+0x7b/0xa0
[  418.615810]  [<ffffffff8148bddd>] netlink_sendmsg+0x24d/0x380
[  418.615874]  [<ffffffff8142dad0>] sock_sendmsg+0xc0/0xf0
[  418.615938]  [<ffffffff81458370>] ? verify_compat_iovec+0x80/0x130
[  418.616002]  [<ffffffff8142e894>] sys_sendmsg+0x1a4/0x340
[  418.616065]  [<ffffffff810dad46>] ? handle_mm_fault+0x676/0x8b0
[  418.616129]  [<ffffffff815d2610>] ? do_page_fault+0x2a0/0x4c0
[  418.616192]  [<ffffffff8142df09>] ? sys_recvmsg+0x49/0x70
[  418.616254]  [<ffffffff81457f14>] compat_sys_sendmsg+0x14/0x20
[  418.616317]  [<ffffffff81458cbf>] compat_sys_socketcall+0x1cf/0x220
[  418.616380]  [<ffffffff815cf1e5>] ? page_fault+0x25/0x30
[  418.616443]  [<ffffffff8102ec60>] sysenter_dispatch+0x7/0x2e
[  418.616520] ---[ end trace c2d75997b525ef59 ]---



^ permalink raw reply related

* Re: Loopback performance from kernel 2.6.12 to 2.6.37
From: Eric Dumazet @ 2010-11-09  5:22 UTC (permalink / raw)
  To: Andrew Hendry; +Cc: Jesper Dangaard Brouer, netdev
In-Reply-To: <AANLkTi=HhouZymj0R7JsDy-X1LDbfT_WL0x10EMhdOho@mail.gmail.com>

Le mardi 09 novembre 2010 à 11:05 +1100, Andrew Hendry a écrit :
> results on an i7 860 @ 2.80Ghz machine, no virtualization involved. 2.6.37-rc1+
> 
> # time dd if=/dev/zero bs=1M count=10000 | netcat  127.0.0.1 9999
> 10000+0 records in
> 10000+0 records out
> 10485760000 bytes (10 GB) copied, 50.2022 s, 209 MB/s
> 
> real	0m50.210s
> user	0m1.094s
> sys	0m57.589s

Thanks !

Could you take a perf snapshot during the test ?

# perf record -a -g sleep 10
# perf report




^ permalink raw reply

* Re: [v3 RFC PATCH 0/4] Implement multiqueue virtio-net
From: Krishna Kumar2 @ 2010-11-09  4:38 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: anthony, arnd, avi, davem, eric.dumazet, kvm, netdev, rusty
In-Reply-To: <20101026085709.GC23530@redhat.com>

"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/26/2010 02:27:09 PM:

> Re: [v3 RFC PATCH 0/4] Implement multiqueue virtio-net
>
> On Mon, Oct 25, 2010 at 09:20:38PM +0530, Krishna Kumar2 wrote:
> > > Krishna Kumar2/India/IBM@IBMIN wrote on 10/20/2010 02:24:52 PM:
> >
> > Any feedback, comments, objections, issues or bugs about the
> > patches? Please let me know if something needs to be done.
> >
> > Some more test results:
> > _____________________________________________________
> >          Host->Guest BW (numtxqs=2)
> > #       BW%     CPU%    RCPU%   SD%     RSD%
> > _____________________________________________________
>
> I think we discussed the need for external to guest testing
> over 10G. For large messages we should not see any change
> but you should be able to get better numbers for small messages
> assuming a MQ NIC card.

I had to make a few changes to qemu (and a minor change in macvtap
driver) to get multiple TXQ support using macvtap working. The NIC
is an ixgbe card.

__________________________________________________________________________
            Org vs New (I/O: 512 bytes, #numtxqs=2, #vhosts=3)
#      BW1     BW2 (%)       SD1    SD2 (%)        RSD1    RSD2 (%)
__________________________________________________________________________
1      14367   13142 (-8.5)  56     62 (10.7)      8        8 (0)
2      3652    3855 (5.5)    37     35 (-5.4)      7        6 (-14.2)
4      12529   12059 (-3.7)  65     77 (18.4)      35       35 (0)
8      13912   14668 (5.4)   288    332 (15.2)     175      184 (5.1)
16     13433   14455 (7.6)   1218   1321 (8.4)     920      943 (2.5)
24     12750   13477 (5.7)   2876   2985 (3.7)     2514     2348 (-6.6)
32     11729   12632 (7.6)   5299   5332 (.6)      4934     4497 (-8.8)
40     11061   11923 (7.7)   8482   8364 (-1.3)    8374     7495 (-10.4)
48     10624   11267 (6.0)   12329  12258 (-.5)    12762    11538 (-9.5)
64     10524   10596 (.6)    21689  22859 (5.3)    23626    22403 (-5.1)
80     9856    10284 (4.3)   35769  36313 (1.5)    39932    36419 (-8.7)
96     9691    10075 (3.9)   52357  52259 (-.1)    58676    53463 (-8.8)
128    9351    9794 (4.7)    114707 94275 (-17.8)  114050   97337 (-14.6)
__________________________________________________________________________
Avg:      BW: (3.3)      SD: (-7.3)      RSD: (-11.0)

__________________________________________________________________________
            Org vs New (I/O: 1K, #numtxqs=8, #vhosts=5)
#      BW1      BW2 (%)       SD1   SD2 (%)        RSD1   RSD2 (%)
__________________________________________________________________________
1      16509    15985 (-3.1)  45    47 (4.4)       7       7 (0)
2      6963     4499 (-35.3)  17    51 (200.0)     7       7 (0)
4      12932    11080 (-14.3) 49    74 (51.0)      35      35 (0)
8      13878    14095 (1.5)   223   292 (30.9)     175     181 (3.4)
16     13440    13698 (1.9)   980   1131 (15.4)    926     942 (1.7)
24     12680    12927 (1.9)   2387  2463 (3.1)     2526    2342 (-7.2)
32     11714    12261 (4.6)   4506  4486 (-.4)     4941    4463 (-9.6)
40     11059    11651 (5.3)   7244  7081 (-2.2)    8349    7437 (-10.9)
48     10580    11095 (4.8)   10811 10500 (-2.8)   12809   11403 (-10.9)
64     10569    10566 (0)     19194 19270 (.3)     23648   21717 (-8.1)
80     9827     10753 (9.4)   31668 29425 (-7.0)   39991   33824 (-15.4)
96     10043    10150 (1.0)   45352 44227 (-2.4)   57766   51131 (-11.4)
128    9360     9979 (6.6)    92058 79198 (-13.9)  114381  92873 (-18.8)
__________________________________________________________________________
Avg:      BW: (-.5)      SD: (-7.5)      RSD: (-14.7)

Is there anything else you would like me to test/change, or shall
I submit the next version (with the above macvtap changes)?

Thanks,

- KK


^ permalink raw reply

* Re: [PATCH] virtio_net: Fix queue full check
From: Krishna Kumar2 @ 2010-11-09  4:26 UTC (permalink / raw)
  To: Rusty Russell; +Cc: davem, Michael S. Tsirkin, netdev, yvugenfi
In-Reply-To: <201011080938.47938.rusty@rustcorp.com.au>

Rusty Russell <rusty@rustcorp.com.au> wrote on 11/08/2010 04:38:47 AM:

> Re: [PATCH] virtio_net: Fix queue full check
>
> On Thu, 4 Nov 2010 10:54:24 pm Michael S. Tsirkin wrote:
> > I thought about this some more.  I think the original
> > code is actually correct in returning ENOSPC: indirect
> > buffers are nice, but it's a mistake
> > to rely on them as a memory allocation might fail.
> >
> > And if you look at virtio-net, it is dropping packets
> > under memory pressure which is not really a happy outcome:
> > the packet will get freed, reallocated and we get another one,
> > adding pressure on the allocator instead of releasing it
> > until we free up some buffers.
> >
> > So I now think we should calculate the capacity
> > assuming non-indirect entries, and if we manage to
> > use indirect, all the better.
>
> I've long said it's a weakness in the network stack that it insists
> drivers stop the tx queue before they *might* run out of room, leading to
> worst-case assumptions and underutilization of the tx ring.
>
> However, I lost that debate, and so your patch is the way it's supposed to
> work.  The other main indirect user (block) doesn't care as its queue
> allows for post-attempt blocking.
>
> I enhanced your commentary a little:
>
> Subject: virtio: return correct capacity to users
> Date: Thu, 4 Nov 2010 14:24:24 +0200
> From: "Michael S. Tsirkin" <mst@redhat.com>
>
> We can't rely on indirect buffers for capacity
> calculations because they need a memory allocation
> which might fail.  In particular, virtio_net can get
> into this situation under stress, and it drops packets
> and performs badly.
>
> So return the number of buffers we can guarantee users.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> Reported-By: Krishna Kumar2 <krkumar2@in.ibm.com>

I have tested this patch for 3-4 hours but so far I have not got the tx full
error. I am not sure if "Tested-By" applies to this situation, but just in case:

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reported-By: Krishna Kumar2 <krkumar2@in.ibm.com>
Tested-By: Krishna Kumar2 <krkumar2@in.ibm.com>

I think both this patch and the original patch I submitted
are needed? That patch removes ENOMEM check and the increment
of dev->stats.tx_fifo_errors, and reports "memory failure".

Thanks,

- KK


^ permalink raw reply

* Re: [PATCH] inet: fix ip_mc_drop_socket()
From: Miles Lane @ 2010-11-09  4:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Markus Trippelsdorf, David Miller, paulmck, ilpo.jarvinen, LKML,
	Len Brown, netdev
In-Reply-To: <1289250954.2790.11.camel@edumazet-laptop>

Looks good here.

On Mon, Nov 8, 2010 at 4:15 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Hmm, I believe I found the bug.
>
> Thanks guys !
>
> [PATCH] inet: fix ip_mc_drop_socket()
>
> commit 8723e1b4ad9be4444 (inet: RCU changes in inetdev_by_index())
> forgot one call site in ip_mc_drop_socket()
>
> We should not decrease idev refcount after inetdev_by_index() call,
> since refcount is not increased anymore.
>
> Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
> Reported-by: Miles Lane <miles.lane@gmail.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  net/ipv4/igmp.c |    4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> index c8877c6..3c53c2d 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -2306,10 +2306,8 @@ void ip_mc_drop_socket(struct sock *sk)
>
>                in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
>                (void) ip_mc_leave_src(sk, iml, in_dev);
> -               if (in_dev != NULL) {
> +               if (in_dev != NULL)
>                        ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
> -                       in_dev_put(in_dev);
> -               }
>                /* decrease mem now to avoid the memleak warning */
>                atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
>                call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
>
>
>

^ permalink raw reply

* Re: [Bugme-new] [Bug 22142] New: skge module doesn't work in 2.6.37-rc1
From: David Miller @ 2010-11-09  2:46 UTC (permalink / raw)
  To: akpm; +Cc: shemminger, bugzilla-daemon, bugme-daemon, netdev, jtmettala
In-Reply-To: <20101108154306.0f93eddb.akpm@linux-foundation.org>

From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 8 Nov 2010 15:43:06 -0800

> skge_devinit() did a nearly-NULL deref.

Fixed in net-2.6:

--------------------
skge: Remove tx queue stopping in skge_devinit()

After e6484930d7c73d324bccda7d43d131088da697b9: net: allocate tx queues in register_netdevice
It causes an Oops at skge_probe() time.

Signed-off-by: Guillaume Chazarain <guichaz@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/skge.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index bfec2e0..220e039 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -3858,7 +3858,6 @@ static struct net_device *skge_devinit(struct skge_hw *hw, int port,
 
 	/* device is off until link detection */
 	netif_carrier_off(dev);
-	netif_stop_queue(dev);
 
 	return dev;
 }
-- 
1.7.3.2


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox