Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v6 4/6] net-sysfs: add support for device-specific rx queue sysfs attributes
From: Michael Dalton @ 2014-01-17  6:23 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Jason Wang, Ben Hutchings, virtualization, Michael Dalton
In-Reply-To: <1389939810-14998-1-git-send-email-mwdalton@google.com>

Extend existing support for netdevice receive queue sysfs attributes to
permit a device-specific attribute group. Initial use case for this
support will be to allow the virtio-net device to export per-receive
queue mergeable receive buffer size.

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
v4->v5: Handle sysfs_create_group failure. Call sysfs_remove_group when
        removing a RX queue kobj if a device-specific group exists.
v3->v4: Simplify by removing loop in get_netdev_rx_queue_index.

 include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++----
 net/core/dev.c            | 12 ++++++------
 net/core/net-sysfs.c      | 50 +++++++++++++++++++++++++++--------------------
 3 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d7668b88..e985231 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -668,15 +668,28 @@ extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
 #endif
+#endif /* CONFIG_RPS */
 
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
+#ifdef CONFIG_RPS
 	struct rps_map __rcu		*rps_map;
 	struct rps_dev_flow_table __rcu	*rps_flow_table;
+#endif
 	struct kobject			kobj;
 	struct net_device		*dev;
 } ____cacheline_aligned_in_smp;
-#endif /* CONFIG_RPS */
+
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
 
 #ifdef CONFIG_XPS
 /*
@@ -1313,7 +1326,7 @@ struct net_device {
 						   unicast) */
 
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	struct netdev_rx_queue	*_rx;
 
 	/* Number of RX queues allocated at register_netdev() time */
@@ -1424,6 +1437,8 @@ struct net_device {
 	struct device		dev;
 	/* space for optional device, statistics, and wireless sysfs groups */
 	const struct attribute_group *sysfs_groups[4];
+	/* space for optional per-rx queue attributes */
+	const struct attribute_group *sysfs_rx_queue_group;
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
@@ -2375,7 +2390,7 @@ static inline bool netif_is_multiqueue(const struct net_device *dev)
 
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
 #else
 static inline int netif_set_real_num_rx_queues(struct net_device *dev,
@@ -2394,7 +2409,7 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 					   from_dev->real_num_tx_queues);
 	if (err)
 		return err;
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	return netif_set_real_num_rx_queues(to_dev,
 					    from_dev->real_num_rx_queues);
 #else
@@ -2402,6 +2417,18 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev,
 #endif
 }
 
+#ifdef CONFIG_SYSFS
+static inline unsigned int get_netdev_rx_queue_index(
+		struct netdev_rx_queue *queue)
+{
+	struct net_device *dev = queue->dev;
+	int index = queue - dev->_rx;
+
+	BUG_ON(index >= dev->num_rx_queues);
+	return index;
+}
+#endif
+
 #define DEFAULT_MAX_NUM_RSS_QUEUES	(8)
 int netif_get_num_default_rss_queues(void);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index f87bedd..288df62 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2083,7 +2083,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 }
 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 /**
  *	netif_set_real_num_rx_queues - set actual number of RX queues used
  *	@dev: Network device
@@ -5764,7 +5764,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
@@ -6309,7 +6309,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	if (rxqs < 1) {
 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 		return NULL;
@@ -6365,7 +6365,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (netif_alloc_netdev_queues(dev))
 		goto free_all;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	dev->num_rx_queues = rxqs;
 	dev->real_num_rx_queues = rxqs;
 	if (netif_alloc_rx_queues(dev))
@@ -6385,7 +6385,7 @@ free_all:
 free_pcpu:
 	free_percpu(dev->pcpu_refcnt);
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	kfree(dev->_rx);
 #endif
 
@@ -6410,7 +6410,7 @@ void free_netdev(struct net_device *dev)
 	release_net(dev_net(dev));
 
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	kfree(dev->_rx);
 #endif
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 49843bf..7eeadee 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -498,17 +498,7 @@ static struct attribute_group wireless_group = {
 #define net_class_groups	NULL
 #endif /* CONFIG_SYSFS */
 
-#ifdef CONFIG_RPS
-/*
- * RX queue sysfs structures and functions.
- */
-struct rx_queue_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, char *buf);
-	ssize_t (*store)(struct netdev_rx_queue *queue,
-	    struct rx_queue_attribute *attr, const char *buf, size_t len);
-};
+#ifdef CONFIG_SYSFS
 #define to_rx_queue_attr(_attr) container_of(_attr,		\
     struct rx_queue_attribute, attr)
 
@@ -543,6 +533,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = {
 	.store = rx_queue_attr_store,
 };
 
+#ifdef CONFIG_RPS
 static ssize_t show_rps_map(struct netdev_rx_queue *queue,
 			    struct rx_queue_attribute *attribute, char *buf)
 {
@@ -718,16 +709,20 @@ static struct rx_queue_attribute rps_cpus_attribute =
 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
 	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
 	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
 
 static struct attribute *rx_queue_default_attrs[] = {
+#ifdef CONFIG_RPS
 	&rps_cpus_attribute.attr,
 	&rps_dev_flow_table_cnt_attribute.attr,
+#endif
 	NULL
 };
 
 static void rx_queue_release(struct kobject *kobj)
 {
 	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
 	struct rps_map *map;
 	struct rps_dev_flow_table *flow_table;
 
@@ -743,6 +738,7 @@ static void rx_queue_release(struct kobject *kobj)
 		RCU_INIT_POINTER(queue->rps_flow_table, NULL);
 		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
 	}
+#endif
 
 	memset(kobj, 0, sizeof(*kobj));
 	dev_put(queue->dev);
@@ -763,25 +759,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
 	kobj->kset = net->queues_kset;
 	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
 	    "rx-%u", index);
-	if (error) {
-		kobject_put(kobj);
-		return error;
+	if (error)
+		goto exit;
+
+	if (net->sysfs_rx_queue_group) {
+		error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+		if (error)
+			goto exit;
 	}
 
 	kobject_uevent(kobj, KOBJ_ADD);
 	dev_hold(queue->dev);
 
 	return error;
+exit:
+	kobject_put(kobj);
+	return error;
 }
-#endif /* CONFIG_RPS */
+#endif /* CONFIG_SYSFS */
 
 int
 net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 {
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	int i;
 	int error = 0;
 
+#ifndef CONFIG_RPS
+	if (!net->sysfs_rx_queue_group)
+		return 0;
+#endif
 	for (i = old_num; i < new_num; i++) {
 		error = rx_queue_add_kobject(net, i);
 		if (error) {
@@ -790,8 +797,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 		}
 	}
 
-	while (--i >= new_num)
+	while (--i >= new_num) {
+		if (net->sysfs_rx_queue_group)
+			sysfs_remove_group(&net->_rx[i].kobj,
+					   net->sysfs_rx_queue_group);
 		kobject_put(&net->_rx[i].kobj);
+	}
 
 	return error;
 #else
@@ -1155,9 +1166,6 @@ static int register_queue_kobjects(struct net_device *net)
 	    NULL, &net->dev.kobj);
 	if (!net->queues_kset)
 		return -ENOMEM;
-#endif
-
-#ifdef CONFIG_RPS
 	real_rx = net->real_num_rx_queues;
 #endif
 	real_tx = net->real_num_tx_queues;
@@ -1184,7 +1192,7 @@ static void remove_queue_kobjects(struct net_device *net)
 {
 	int real_rx = 0, real_tx = 0;
 
-#ifdef CONFIG_RPS
+#ifdef CONFIG_SYSFS
 	real_rx = net->real_num_rx_queues;
 #endif
 	real_tx = net->real_num_tx_queues;
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v6 3/6] virtio-net: auto-tune mergeable rx buffer size for improved performance
From: Michael Dalton @ 2014-01-17  6:23 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Jason Wang, Ben Hutchings, virtualization, Michael Dalton
In-Reply-To: <1389939810-14998-1-git-send-email-mwdalton@google.com>

Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
allocators") changed the mergeable receive buffer size from PAGE_SIZE to
MTU-size, introducing a single-stream regression for benchmarks with large
average packet size. There is no single optimal buffer size for all
workloads.  For workloads with packet size <= MTU bytes, MTU + virtio-net
header-sized buffers are preferred as larger buffers reduce the TCP window
due to SKB truesize. However, single-stream workloads with large average
packet sizes have higher throughput if larger (e.g., PAGE_SIZE) buffers
are used.

This commit auto-tunes the mergeable receive buffer size by choosing the
packet buffer size based on an EWMA of the recent packet sizes for the
receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net
header len to PAGE_SIZE. This improves throughput for large packet
workloads, as any workload with average packet size >= PAGE_SIZE will
use PAGE_SIZE buffers.

These optimizations interact positively with recent commit
ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
optimizations benefit buffers of any size.

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs
with all offloads & vhost enabled. All VMs and vhost threads run in a
single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
in the system will not be scheduled on the benchmark CPUs. Trunk includes
SKB rx frag coalescing.

net-next w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Mb/s
net-next (MTU-size bufs):  13170.01Mb/s
net-next + auto-tune: 14555.94Mb/s

Jason Wang also reported a throughput increase on mlx4 from 22Gb/s
using MTU-sized buffers to about 26Gb/s using auto-tuning.

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
v5->v6: Fix merge conflict. Subtract 1 before encoding the scaled truesize
        for a mergeable buffer ctx to support 64KB PAGE_SIZE.
v2->v3: Remove per-receive queue metadata ring. Encode packet buffer
        base address and truesize into an unsigned long by requiring a
        minimum packet size alignment of 256. Permit attempts to fill
        an already-full RX ring (reverting the change in v2).
v1->v2: Add per-receive queue metadata ring to track precise truesize for
        mergeable receive buffers. Remove all truesize approximation. Never
        try to fill a full RX ring (required for metadata ring in v2).
 drivers/net/virtio_net.c | 100 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 75 insertions(+), 25 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5ee71dc..dacd43b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -26,6 +26,7 @@
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/average.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -36,11 +37,18 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-                                L1_CACHE_BYTES))
 #define GOOD_COPY_LEN	128
 
+/* Weight used for the RX packet size EWMA. The average packet size is used to
+ * determine the packet buffer size when refilling RX rings. As the entire RX
+ * ring may be refilled at once, the weight is chosen so that the EWMA will be
+ * insensitive to short-term, transient changes in packet size.
+ */
+#define RECEIVE_AVG_WEIGHT 64
+
+/* Minimum alignment for mergeable packet buffers. */
+#define MERGEABLE_BUFFER_ALIGN max(L1_CACHE_BYTES, 256)
+
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
 struct virtnet_stats {
@@ -75,6 +83,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Average packet length for mergeable receive buffers. */
+	struct ewma mrg_avg_pkt_len;
+
 	/* Page frag for packet buffer allocation. */
 	struct page_frag alloc_frag;
 
@@ -216,6 +227,24 @@ static void skb_xmit_done(struct virtqueue *vq)
 	netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
+static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx)
+{
+	unsigned int truesize = mrg_ctx & (MERGEABLE_BUFFER_ALIGN - 1);
+	return (truesize + 1) * MERGEABLE_BUFFER_ALIGN;
+}
+
+static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
+{
+	return (void *)(mrg_ctx & -MERGEABLE_BUFFER_ALIGN);
+
+}
+
+static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
+{
+	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
+	return (unsigned long)buf | (size - 1);
+}
+
 /* Called from bottom half context */
 static struct sk_buff *page_to_skb(struct receive_queue *rq,
 				   struct page *page, unsigned int offset,
@@ -324,31 +353,33 @@ err:
 
 static struct sk_buff *receive_mergeable(struct net_device *dev,
 					 struct receive_queue *rq,
-					 void *buf,
+					 unsigned long ctx,
 					 unsigned int len)
 {
+	void *buf = mergeable_ctx_to_buf_address(ctx);
 	struct skb_vnet_hdr *hdr = buf;
 	int num_buf = hdr->mhdr.num_buffers;
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	unsigned int truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
+	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+
 	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
-
 	while (--num_buf) {
 		int num_skb_frags;
 
-		buf = virtqueue_get_buf(rq->vq, &len);
-		if (unlikely(!buf)) {
+		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
+		if (unlikely(!ctx)) {
 			pr_debug("%s: rx error: %d buffers out of %d missing\n",
 				 dev->name, num_buf, hdr->mhdr.num_buffers);
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
 
+		buf = mergeable_ctx_to_buf_address(ctx);
 		page = virt_to_head_page(buf);
 
 		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
@@ -365,7 +396,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
-		truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
+		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
@@ -382,19 +413,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		}
 	}
 
+	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return head_skb;
 
 err_skb:
 	put_page(page);
 	while (--num_buf) {
-		buf = virtqueue_get_buf(rq->vq, &len);
-		if (unlikely(!buf)) {
+		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
+		if (unlikely(!ctx)) {
 			pr_debug("%s: rx error: %d buffers missing\n",
 				 dev->name, num_buf);
 			dev->stats.rx_length_errors++;
 			break;
 		}
-		page = virt_to_head_page(buf);
+		page = virt_to_head_page(mergeable_ctx_to_buf_address(ctx));
 		put_page(page);
 	}
 err_buf:
@@ -414,17 +446,20 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
-		if (vi->mergeable_rx_bufs)
-			put_page(virt_to_head_page(buf));
-		else if (vi->big_packets)
+		if (vi->mergeable_rx_bufs) {
+			unsigned long ctx = (unsigned long)buf;
+			void *base = mergeable_ctx_to_buf_address(ctx);
+			put_page(virt_to_head_page(base));
+		} else if (vi->big_packets) {
 			give_pages(rq, buf);
-		else
+		} else {
 			dev_kfree_skb(buf);
+		}
 		return;
 	}
 
 	if (vi->mergeable_rx_bufs)
-		skb = receive_mergeable(dev, rq, buf, len);
+		skb = receive_mergeable(dev, rq, (unsigned long)buf, len);
 	else if (vi->big_packets)
 		skb = receive_big(dev, rq, buf, len);
 	else
@@ -567,25 +602,36 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
+	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	struct page_frag *alloc_frag = &rq->alloc_frag;
 	char *buf;
+	unsigned long ctx;
 	int err;
 	unsigned int len, hole;
 
-	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
+	len = hdr_len + clamp_t(unsigned int, ewma_read(&rq->mrg_avg_pkt_len),
+				GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	len = ALIGN(len, MERGEABLE_BUFFER_ALIGN);
+	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
 		return -ENOMEM;
+
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	ctx = mergeable_buf_to_ctx(buf, len);
 	get_page(alloc_frag->page);
-	len = MERGE_BUFFER_LEN;
 	alloc_frag->offset += len;
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < MERGE_BUFFER_LEN) {
+	if (hole < len) {
+		/* To avoid internal fragmentation, if there is very likely not
+		 * enough space for another buffer, add the remaining space to
+		 * the current buffer. This extra space is not included in
+		 * the truesize stored in ctx.
+		 */
 		len += hole;
 		alloc_frag->offset += hole;
 	}
 
 	sg_init_one(rq->sg, buf, len);
-	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
 
@@ -1385,12 +1431,15 @@ static void free_unused_bufs(struct virtnet_info *vi)
 		struct virtqueue *vq = vi->rq[i].vq;
 
 		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-			if (vi->mergeable_rx_bufs)
-				put_page(virt_to_head_page(buf));
-			else if (vi->big_packets)
+			if (vi->mergeable_rx_bufs) {
+				unsigned long ctx = (unsigned long)buf;
+				void *base = mergeable_ctx_to_buf_address(ctx);
+				put_page(virt_to_head_page(base));
+			} else if (vi->big_packets) {
 				give_pages(&vi->rq[i], buf);
-			else
+			} else {
 				dev_kfree_skb(buf);
+			}
 		}
 	}
 }
@@ -1498,6 +1547,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 			       napi_weight);
 
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
 		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
 	}
 
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v6 1/6] net: allow > 0 order atomic page alloc in skb_page_frag_refill
From: Michael Dalton @ 2014-01-17  6:23 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Jason Wang, Ben Hutchings, virtualization, Michael Dalton
In-Reply-To: <1389939810-14998-1-git-send-email-mwdalton@google.com>

skb_page_frag_refill currently permits only order-0 page allocs
unless __GFP_WAIT is set. Change skb_page_frag_refill to attempt
higher-order page allocations whether or not __GFP_WAIT is set. If
memory cannot be allocated, the allocator will fall back to
successively smaller page allocs (down to order-0 page allocs).

This change brings skb_page_frag_refill in line with the existing
page allocation strategy employed by netdev_alloc_frag, which attempts
higher-order page allocations whether or not __GFP_WAIT is set, falling
back to successively lower-order page allocations on failure. Part
of the migration of virtio-net to per-receive queue page frag allocators.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 net/core/sock.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 85ad6f0..b3f7ee3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1836,9 +1836,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
 		put_page(pfrag->page);
 	}

-	/* We restrict high order allocations to users that can afford to wait */
-	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
-
+	order = SKB_FRAG_PAGE_ORDER;
 	do {
 		gfp_t gfp = prio;

-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v6 2/6] virtio-net: use per-receive queue page frag alloc for mergeable bufs
From: Michael Dalton @ 2014-01-17  6:23 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, virtualization,
	Eric Dumazet, Ben Hutchings
In-Reply-To: <1389939810-14998-1-git-send-email-mwdalton@google.com>

The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
mergeable rx buffer allocations. This commit migrates virtio-net to use
per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
mergeable rx buffer memory allocation, which will now use
skb_page_frag_refill() for both atomic and __GFP_WAIT buffer allocations.

To address fragmentation concerns, if after buffer allocation there
is too little space left in the page frag to allocate a subsequent
buffer, the remaining space is added to the current allocated buffer
so that the remaining space can be used to store packet data.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
v1->v2: Use GFP_COLD for RX buffer allocations (as in netdev_alloc_frag()).
        Remove per-netdev GFP_KERNEL page_frag allocator.

 drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9bd70aa..5ee71dc 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -75,6 +75,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Page frag for packet buffer allocation. */
+	struct page_frag alloc_frag;
+
 	/* RX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
@@ -123,11 +126,6 @@ struct virtnet_info {
 	/* Lock for config space updates */
 	struct mutex config_lock;
 
-	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
-	 * low on memory.
-	 */
-	struct page_frag alloc_frag;
-
 	/* Does the affinity hint is set for virtqueues? */
 	bool affinity_hint_set;
 
@@ -333,8 +331,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	int num_buf = hdr->mhdr.num_buffers;
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
-	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len,
-					       MERGE_BUFFER_LEN);
+	unsigned int truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
+	struct sk_buff *head_skb = page_to_skb(rq, page, offset, len, truesize);
 	struct sk_buff *curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
@@ -350,11 +348,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
-		if (unlikely(len > MERGE_BUFFER_LEN)) {
-			pr_debug("%s: rx error: merge buffer too long\n",
-				 dev->name);
-			len = MERGE_BUFFER_LEN;
-		}
 
 		page = virt_to_head_page(buf);
 
@@ -372,19 +365,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
+		truesize = max_t(unsigned int, len, MERGE_BUFFER_LEN);
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += MERGE_BUFFER_LEN;
+			head_skb->truesize += truesize;
 		}
 		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, MERGE_BUFFER_LEN);
+					     len, truesize);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, MERGE_BUFFER_LEN);
+					offset, len, truesize);
 		}
 	}
 
@@ -573,25 +567,24 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
-	struct virtnet_info *vi = rq->vq->vdev->priv;
-	char *buf = NULL;
+	struct page_frag *alloc_frag = &rq->alloc_frag;
+	char *buf;
 	int err;
+	unsigned int len, hole;
 
-	if (gfp & __GFP_WAIT) {
-		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
-					 gfp)) {
-			buf = (char *)page_address(vi->alloc_frag.page) +
-			      vi->alloc_frag.offset;
-			get_page(vi->alloc_frag.page);
-			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
-		}
-	} else {
-		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
-	}
-	if (!buf)
+	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
 		return -ENOMEM;
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	get_page(alloc_frag->page);
+	len = MERGE_BUFFER_LEN;
+	alloc_frag->offset += len;
+	hole = alloc_frag->size - alloc_frag->offset;
+	if (hole < MERGE_BUFFER_LEN) {
+		len += hole;
+		alloc_frag->offset += hole;
+	}
 
-	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
+	sg_init_one(rq->sg, buf, len);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
@@ -612,6 +605,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 	int err;
 	bool oom;
 
+	gfp |= __GFP_COLD;
 	do {
 		if (vi->mergeable_rx_bufs)
 			err = add_recvbuf_mergeable(rq, gfp);
@@ -1368,6 +1362,14 @@ static void free_receive_bufs(struct virtnet_info *vi)
 	}
 }
 
+static void free_receive_page_frags(struct virtnet_info *vi)
+{
+	int i;
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (vi->rq[i].alloc_frag.page)
+			put_page(vi->rq[i].alloc_frag.page);
+}
+
 static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
@@ -1695,9 +1697,8 @@ free_recv_bufs:
 	unregister_netdev(dev);
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
+	free_receive_page_frags(vi);
 	virtnet_del_vqs(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1714,6 +1715,8 @@ static void remove_vq_common(struct virtnet_info *vi)
 
 	free_receive_bufs(vi);
 
+	free_receive_page_frags(vi);
+
 	virtnet_del_vqs(vi);
 }
 
@@ -1731,8 +1734,6 @@ static void virtnet_remove(struct virtio_device *vdev)
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
 
 	flush_work(&vi->config_work);
 
-- 
1.8.5.2

^ permalink raw reply related

* [PATCH net-next v6 0/6] virtio-net: mergeable rx buffer size auto-tuning
From: Michael Dalton @ 2014-01-17  6:23 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, virtualization,
	Eric Dumazet, Ben Hutchings

The virtio-net device currently uses aligned MTU-sized mergeable receive
packet buffers. Network throughput for workloads with large average
packet size can be improved by posting larger receive packet buffers.
However, due to SKB truesize effects, posting large (e.g., PAGE_SIZE)
buffers reduces the throughput of workloads that do not benefit from GRO
and have no large inbound packets.

This patchset introduces virtio-net mergeable buffer size auto-tuning,
with buffer sizes ranging from aligned MTU-size to PAGE_SIZE. Packet
buffer size is chosen based on a per-receive queue EWMA of incoming
packet size.

To unify mergeable receive buffer memory allocation and improve
SKB frag coalescing, all mergeable buffer memory allocation is
migrated to per-receive queue page frag allocators.

The per-receive queue mergeable packet buffer size is exported via
sysfs, and the network device sysfs layer has been extended to add
support for device-specific per-receive queue sysfs attribute groups.

Michael Dalton (6):
  net: allow > 0 order atomic page alloc in skb_page_frag_refill
  virtio-net: use per-receive queue page frag alloc for mergeable bufs
  virtio-net: auto-tune mergeable rx buffer size for improved
    performance
  net-sysfs: add support for device-specific rx queue sysfs attributes
  lib: Ensure EWMA does not store wrong intermediate values
  virtio-net: initial rx sysfs support, export mergeable rx buffer size

 drivers/net/virtio_net.c  | 197 +++++++++++++++++++++++++++++++++-------------
 include/linux/netdevice.h |  35 +++++++-
 lib/average.c             |   6 +-
 net/core/dev.c            |  12 +--
 net/core/net-sysfs.c      |  50 +++++++-----
 net/core/sock.c           |   4 +-
 6 files changed, 214 insertions(+), 90 deletions(-)

-- 
1.8.5.2

^ permalink raw reply

* Re: [PATCH v4 net-next 2/4] sh_eth: Add support for r7s72100
From: Simon Horman @ 2014-01-17  6:13 UTC (permalink / raw)
  To: Sergei Shtylyov
  Cc: David S. Miller, netdev, linux-sh, linux-arm-kernel, Magnus Damm
In-Reply-To: <52D7FC8D.7000800@cogentembedded.com>

On Thu, Jan 16, 2014 at 07:36:45PM +0400, Sergei Shtylyov wrote:
> Hello.
> 
> On 16-01-2014 4:49, Simon Horman wrote:
> 
> >>>>>This is a fast ethernet controller.
> 
> >>>>>Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
> 
> >>>>[...]
> 
> >>>>>diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
> >>>>>index 4b38533..cc6d4af 100644
> >>>>>--- a/drivers/net/ethernet/renesas/sh_eth.c
> >>>>>+++ b/drivers/net/ethernet/renesas/sh_eth.c
> >>>>>@@ -190,6 +190,59 @@ static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = {
> >>[...]
> >>>>>@@ -701,6 +762,35 @@ static struct sh_eth_cpu_data r8a7740_data = {
> >>>>>  	.shift_rd0	= 1,
> >>>>>  };
> >>>>>
> >>>>>+/* R7S72100 */
> >>>>>+static struct sh_eth_cpu_data r7s72100_data = {
> >>>>>+	.chip_reset	= sh_eth_chip_reset,
> >>>>>+	.set_duplex	= sh_eth_set_duplex,
> >>>>>+
> >>>>>+	.register_type	= SH_ETH_REG_FAST_RZ,
> >>>>>+
> >>>>>+	.ecsr_value	= ECSR_ICD,
> >>>>>+	.ecsipr_value	= ECSIPR_ICDIP,
> >>>>>+	.eesipr_value	= 0xff7f009f,
> >>>>>+
> >>>>>+	.tx_check	= EESR_TC1 | EESR_FTC,
> >>>>>+	.eesr_err_check	= EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT |
> >>>>>+			  EESR_RFE | EESR_RDE | EESR_RFRMER | EESR_TFE |
> >>>>>+			  EESR_TDE | EESR_ECI,
> >>>>>+	.fdr_value	= 0x0000070f,
> >>>>>+	.rmcr_value	= RMCR_RNC,
> >>>>>+
> >>>>>+	.apr		= 1,
> >>>>>+	.mpr		= 1,
> >>>>>+	.tpauser	= 1,
> >>>>>+	.hw_swap	= 1,
> >>>>>+	.rpadir		= 1,
> >>>>>+	.rpadir_value   = 2 << 16,
> >>>>>+	.no_trimd	= 1,
> >>>>>+	.tsu		= 1,
> >>>>>+	.shift_rd0	= 1,
> >>
> >>>>    Perhaps this field should be renamed to something talking about
> >>>>check summing support (since bits 0..15 of RD0 contain a frame check
> >>>>sum for those SoCs). Or maybe it should be just merged with the
> >>>>'hw_crc' field...
> 
> >>>I have no feelings about that one way or another.
> 
> >>    Do you happen to have R8A7740 manual by chance? If so, does it
> >>talk about RX check summing support and using RD0 for that?
> 
> >Yes and yes.
> 
> >I have taken a quick look and the documentation for RX checksumming on the
> >R8A7740 appears to be very similar if not the same as that of the R7S72100.
> 
> >In particular both refer to using the bottom 16 bits of RD0 as
> >containing the packet checksum.
> 
>    OK, now if you had SH7734 manual to completely confirm that check
> sum is stored in the same place there... most probably it is, of
> course, and we should merge 'hw_crc' and 'shift_rd0' into a single
> field.

Unfortunately I don't have access to that manual.

> 
> [...]
> >>>>>diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
> >>>>>index 0fe35b7..0bcde90 100644
> >>>>>--- a/drivers/net/ethernet/renesas/sh_eth.h
> >>>>>+++ b/drivers/net/ethernet/renesas/sh_eth.h
> >>[...]
> >>>>>@@ -191,6 +192,7 @@ enum DMAC_M_BIT {
> >>>>>  /* EDTRR */
> >>>>>  enum DMAC_T_BIT {
> >>>>>  	EDTRR_TRNS_GETHER = 0x03,
> >>>>>+	EDTRR_TRNS_RZ_ETHER = 0x03,
> 
> >>>>    I doubt we need a special case here. You didn't introduce one for
> >>>>the software reset bits.
> 
> >>>True, but RZ is not Gigabit. So I think we either need two names
> >>>or to choose a more generic name.
> 
> >>    Well, R7S72100 manual calls these bits just TR[1:0]. Don't know
> >>what SoCs having Gigabit call it in the manuals...
> 
> >>>>>  	EDTRR_TRNS_ETHER = 0x01,
> 
> >>    R-Car manuals seem to call the bit TRNS (as well as the
> >>prehistoric SH manuals probably). Perhaps we could use that
> >>difference, TRNS vs TR, don't know...
> 
> >Perhaps we should just leave it as-is, using EDTRR_TRNS_GETHER and
> >EDTRR_TRNS_RZ_ETHER, after all.
> 
>    No, I liked your last version more. At least it's more
> consistent, not adding separate values for either TR[1:0] or soft
> reset bits.
> 
> >At least until we can think of a better names :)
> 
>    I doubt we can come up with something better.
> 
> WBR, Sergei
> 

^ permalink raw reply

* Re: [PATCH net-next v2 3/3] reciprocal_divide: correction/update of the algorithm
From: Eric Dumazet @ 2014-01-17  5:42 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: netdev, Austin S Hemmelgarn, linux-kernel, Jesse Gross,
	Jamal Hadi Salim, Stephen Hemminger, Matt Mackall, Pekka Enberg,
	Christoph Lameter, Andy Gospodarek, Veaceslav Falico,
	Jay Vosburgh, Jakub Zawadzki, Daniel Borkmann
In-Reply-To: <20140117042901.GB26022@order.stressinduktion.org>

On Fri, 2014-01-17 at 05:29 +0100, Hannes Frederic Sowa wrote:

> Also I doubt the performance drop for SLAB will be that massive. Also it was
> already replaced by SLUB as the default SLAB allocator, which doesn't use
> reciprocal_divide.

Google servers use SLAB, not SLUB, for various reasons, and performance
is one of them.

^ permalink raw reply

* Re: Fwd: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
From: Stefan Hajnoczi @ 2014-01-17  5:22 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Zhi Yong Wu, Linux Netdev List, Eric Dumazet, David S. Miller,
	Zhi Yong Wu, Michael S. Tsirkin, Rusty Russell, Jason Wang
In-Reply-To: <CA+mtBx9PBtYurdnhCKL0MLL8i+_+3yPNWFVj5h6SPJH+YDBCjw@mail.gmail.com>

On Thu, Jan 16, 2014 at 09:12:29AM -0800, Tom Herbert wrote:
> On Thu, Jan 16, 2014 at 12:52 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > On Thu, Jan 16, 2014 at 04:34:10PM +0800, Zhi Yong Wu wrote:
> >> CC: stefanha, MST, Rusty Russel
> >>
> >> ---------- Forwarded message ----------
> >> From: Jason Wang <jasowang@redhat.com>
> >> Date: Thu, Jan 16, 2014 at 12:23 PM
> >> Subject: Re: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
> >> To: Zhi Yong Wu <zwu.kernel@gmail.com>
> >> Cc: netdev@vger.kernel.org, therbert@google.com, edumazet@google.com,
> >> davem@davemloft.net, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> >>
> >>
> >> On 01/15/2014 10:20 PM, Zhi Yong Wu wrote:
> >> >
> >> > From: Zhi Yong Wu<wuzhy@linux.vnet.ibm.com>
> >> >
> >> > HI, folks
> >> >
> >> > The patchset is trying to integrate aRFS support to virtio_net. In this case,
> >> > aRFS will be used to select the RX queue. To make sure that it's going ahead
> >> > in the correct direction, although it is still one RFC and isn't tested, it's
> >> > post out ASAP. Any comment are appreciated, thanks.
> >> >
> >> > If anyone is interested in playing with it, you can get this patchset from my
> >> > dev git on github:
> >> >    git://github.com/wuzhy/kernel.git virtnet_rfs
> >> >
> >> > Zhi Yong Wu (3):
> >> >    virtio_pci: Introduce one new config api vp_get_vq_irq()
> >> >    virtio_net: Introduce one dummy function virtnet_filter_rfs()
> >> >    virtio-net: Add accelerated RFS support
> >> >
> >> >   drivers/net/virtio_net.c      |   67 ++++++++++++++++++++++++++++++++++++++++-
> >> >   drivers/virtio/virtio_pci.c   |   11 +++++++
> >> >   include/linux/virtio_config.h |   12 +++++++
> >> >   3 files changed, 89 insertions(+), 1 deletions(-)
> >> >
> >>
> >> Please run get_maintainer.pl before sending the patch. You'd better
> >> at least cc virtio maintainer/list for this.
> >>
> >> The core aRFS method is a noop in this RFC which make this series no
> >> much sense to discuss. You should at least mention the big picture
> >> here in the cover letter. I suggest you should post a RFC which can
> >> run and has expected result or you can just raise a thread for the
> >> design discussion.
> >>
> >> And this method has been discussed before, you can search "[net-next
> >> RFC PATCH 5/5] virtio-net: flow director support" in netdev archive
> >> for a very old prototype implemented by me. It can work and looks like
> >> most of this RFC have already done there.
> >>
> >> A basic question is whether or not we need this, not all the mq cards
> >> use aRFS (see ixgbe ATR). And whether or not it can bring extra
> >> overheads? For virtio, we want to reduce the vmexits as much as
> >> possible but this aRFS seems introduce a lot of more of this. Making a
> >> complex interfaces just for an virtual device may not be good, simple
> >> method may works for most of the cases.
> >>
> >> We really should consider to offload this to real nic. VMDq and L2
> >> forwarding offload may help in this case.
> >
> Adding flow director support would be a good step, Zhi's patches for
> support in tun have been merged, so support in virtio-net would be a
> good follow on. But, flow-director does have some limitations and
> performance issues of its own (forced pairing between TX and RX
> queues, lookup on every TX packet). In the case of virtualization,
> aRFS, RSS, ntuple filtering, LRO, etc. can be implemented as software
> emulations and so far seems to be wins in most cases. Extending these
> down into the stack so that they can leverage HW mechanisms is a good
> goal for best performance. It's probably generally true that most of
> the offloads commonly available for NICs we'll want in virtualization
> path. Of course, we need to demonstrate that they provide real
> performance benefit in this use case.
> 
> I believe tying in aRFS (or flow director) into a real aRFS is just a
> matter of programming the RFS table properly. This is not the complex
> side of the interface, I believe this already works with the tun
> patches.
> 
> > Zhi Yong and I had an IRC chat.  I wanted to post my questions on the
> > list - it's still the same concern I had in the old email thread that
> > Jason mentioned.
> >
> > In order for virtio-net aRFS to make sense there needs to be an overall
> > plan for pushing flow mapping information down to the physical NIC.
> > That's the only way to actually achieve the benefit of steering:
> > processing the packet on the CPU where the application is running.
> >
> I don't think this is necessarily true. Per flow steering amongst
> virtual queues should be beneficial in itself. virtio-net can leverage
> RFS or aRFS where it's available.

I guess we need to see benchmark results :)

> > If it's not possible or too hard to implement aRFS down the entire
> > stack, we won't be able to process the packet on the right CPU.
> > Then we might as well not bother with aRFS and just distribute uniformly
> > across the rx virtqueues.
> >
> > Please post an outline of how rx packets will be steered up the stack so
> > we can discuss whether aRFS can bring any benefit.
> >
> 1. The aRFS interface for the guest to specify which virtual queue to
> receive a packet on is fairly straight forward.
> 2. To hook into RFS, we need to match the virtual queue to the real
> CPU it will be processed on, and then program the RFS table for that flow
> and CPU.
> 3. NIC aRFS keys off the RFS tables so it can program the HW with the
> correct queue for the CPU.

There are a lot of details that are not yet worked out:

If you want to implement aRFS down the vhost_net + macvtap path
(probably easiest?) how will Step 2 work?  Do the necessary kernel
interfaces exist to take the flow information in vhost_net, give them to
macvtap, and finally push them down to the physical NIC?

Not sure if aRFS will work down the full stack with vhost_net + tap +
bridge.  Any ideas?

At the QEMU level it is currently pointless to implement virtio-net aRFS
emulation since the QEMU global mutex is taken and virtio-net emulation
is not multi-threaded.

I think aRFS is a good thing, we just need to see performance results
and know that this won't be a dead end after merging changes to
virtio-net and the virtio specification.

Stefan

^ permalink raw reply

* Re: Fwd: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
From: Tom Herbert @ 2014-01-17  5:08 UTC (permalink / raw)
  To: Jason Wang
  Cc: Stefan Hajnoczi, Zhi Yong Wu, Linux Netdev List, Eric Dumazet,
	David S. Miller, Zhi Yong Wu, Michael S. Tsirkin, Rusty Russell
In-Reply-To: <52D8A2D5.4040807@redhat.com>

On Thu, Jan 16, 2014 at 7:26 PM, Jason Wang <jasowang@redhat.com> wrote:
> On 01/17/2014 01:12 AM, Tom Herbert wrote:
>> On Thu, Jan 16, 2014 at 12:52 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> On Thu, Jan 16, 2014 at 04:34:10PM +0800, Zhi Yong Wu wrote:
>>>> CC: stefanha, MST, Rusty Russel
>>>>
>>>> ---------- Forwarded message ----------
>>>> From: Jason Wang <jasowang@redhat.com>
>>>> Date: Thu, Jan 16, 2014 at 12:23 PM
>>>> Subject: Re: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
>>>> To: Zhi Yong Wu <zwu.kernel@gmail.com>
>>>> Cc: netdev@vger.kernel.org, therbert@google.com, edumazet@google.com,
>>>> davem@davemloft.net, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>>
>>>>
>>>> On 01/15/2014 10:20 PM, Zhi Yong Wu wrote:
>>>>> From: Zhi Yong Wu<wuzhy@linux.vnet.ibm.com>
>>>>>
>>>>> HI, folks
>>>>>
>>>>> The patchset is trying to integrate aRFS support to virtio_net. In this case,
>>>>> aRFS will be used to select the RX queue. To make sure that it's going ahead
>>>>> in the correct direction, although it is still one RFC and isn't tested, it's
>>>>> post out ASAP. Any comment are appreciated, thanks.
>>>>>
>>>>> If anyone is interested in playing with it, you can get this patchset from my
>>>>> dev git on github:
>>>>>    git://github.com/wuzhy/kernel.git virtnet_rfs
>>>>>
>>>>> Zhi Yong Wu (3):
>>>>>    virtio_pci: Introduce one new config api vp_get_vq_irq()
>>>>>    virtio_net: Introduce one dummy function virtnet_filter_rfs()
>>>>>    virtio-net: Add accelerated RFS support
>>>>>
>>>>>   drivers/net/virtio_net.c      |   67 ++++++++++++++++++++++++++++++++++++++++-
>>>>>   drivers/virtio/virtio_pci.c   |   11 +++++++
>>>>>   include/linux/virtio_config.h |   12 +++++++
>>>>>   3 files changed, 89 insertions(+), 1 deletions(-)
>>>>>
>>>> Please run get_maintainer.pl before sending the patch. You'd better
>>>> at least cc virtio maintainer/list for this.
>>>>
>>>> The core aRFS method is a noop in this RFC which make this series no
>>>> much sense to discuss. You should at least mention the big picture
>>>> here in the cover letter. I suggest you should post a RFC which can
>>>> run and has expected result or you can just raise a thread for the
>>>> design discussion.
>>>>
>>>> And this method has been discussed before, you can search "[net-next
>>>> RFC PATCH 5/5] virtio-net: flow director support" in netdev archive
>>>> for a very old prototype implemented by me. It can work and looks like
>>>> most of this RFC have already done there.
>>>>
>>>> A basic question is whether or not we need this, not all the mq cards
>>>> use aRFS (see ixgbe ATR). And whether or not it can bring extra
>>>> overheads? For virtio, we want to reduce the vmexits as much as
>>>> possible but this aRFS seems introduce a lot of more of this. Making a
>>>> complex interfaces just for an virtual device may not be good, simple
>>>> method may works for most of the cases.
>>>>
>>>> We really should consider to offload this to real nic. VMDq and L2
>>>> forwarding offload may help in this case.
>> Adding flow director support would be a good step, Zhi's patches for
>> support in tun have been merged, so support in virtio-net would be a
>> good follow on. But, flow-director does have some limitations and
>> performance issues of its own (forced pairing between TX and RX
>> queues, lookup on every TX packet).
>
> True. But the pairing was designed to work without guest involving since
> we really want to reduce the vmexits from guest. And lookup on every TX
> packets could be released to every N packets. But I agree exposing the
> API to guest may bring lots of flexibility.
>> In the case of virtualization,
>> aRFS, RSS, ntuple filtering, LRO, etc. can be implemented as software
>> emulations and so far seems to be wins in most cases. Extending these
>> down into the stack so that they can leverage HW mechanisms is a good
>> goal for best performance. It's probably generally true that most of
>> the offloads commonly available for NICs we'll want in virtualization
>> path. Of course, we need to demonstrate that they provide real
>> performance benefit in this use case.
>
> Yes, we need a prototype to see how much it can help.
>>
>> I believe tying in aRFS (or flow director) into a real aRFS is just a
>> matter of programming the RFS table properly. This is not the complex
>> side of the interface, I believe this already works with the tun
>> patches.
>
> Right, what we may need is
>
> - exposing new tun ioctls for qemu adding or removing a flow
> - new virtqueue command for guest driver to adding or removing a flow
> (btw, current control virtqueue is really slow, we may need to improve it).
> - an agreement of host and guest to use the same hash method, or just
> compute software hash in host and pass it to guest (which needs extra
> API to do)

The model to get RX hash from a device is well known, the guest can
use that to reflect information about a flow back to the host, and for
performance we might piggyback RX queue selection on the TX
descriptors of a flow. Probably some limitations with real HW, but I
assume would have less issues in SW.

IMO, if we have a flow state on the host we should *never* need to
perform any hash computation on TX (a host is not a switch :-) ), we
may want to have some mirrored flow state in the kernel for these
flows which are indexed by the hash provided in TX.

> - change guest driver to use aRFS
>
> Some of the above has been implemented in my old RFC.

Looks pretty similar to Zhi's tun work. Are you planning to refresh
those patches?

>>
>>> Zhi Yong and I had an IRC chat.  I wanted to post my questions on the
>>> list - it's still the same concern I had in the old email thread that
>>> Jason mentioned.
>>>
>>> In order for virtio-net aRFS to make sense there needs to be an overall
>>> plan for pushing flow mapping information down to the physical NIC.
>>> That's the only way to actually achieve the benefit of steering:
>>> processing the packet on the CPU where the application is running.
>>>
>> I don't think this is necessarily true. Per flow steering amongst
>> virtual queues should be beneficial in itself. virtio-net can leverage
>> RFS or aRFS where it's available.
>>
>>> If it's not possible or too hard to implement aRFS down the entire
>>> stack, we won't be able to process the packet on the right CPU.
>>> Then we might as well not bother with aRFS and just distribute uniformly
>>> across the rx virtqueues.
>>>
>>> Please post an outline of how rx packets will be steered up the stack so
>>> we can discuss whether aRFS can bring any benefit.
>>>
>> 1. The aRFS interface for the guest to specify which virtual queue to
>> receive a packet on is fairly straight forward.
>> 2. To hook into RFS, we need to match the virtual queue to the real
>> CPU it will be processed on, and then program the RFS table for that flow
>> and CPU.
>> 3. NIC aRFS keys off the RFS tables so it can program the HW with the
>> correct queue for the CPU.
>>
>>> Stefan
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH] netdevice.7: document SIOCGIFCONF case ifc_req==NULL
From: Michael Kerrisk (man-pages) @ 2014-01-16  4:23 UTC (permalink / raw)
  To: Tilman Schmidt; +Cc: mtk.manpages, linux-man, netdev
In-Reply-To: <52D04B8B.3040306@imap.cc>

On 01/11/2014 08:35 AM, Tilman Schmidt wrote:
> Hello Michael,
> 
> Am 10.01.2014 18:52, schrieb Michael Kerrisk (man-pages):
>> On 01/10/2014 12:30 PM, Tilman Schmidt wrote:
>>> Add the missing description of the possibility to call SIOCGIFCONF
>>> with ifc_req==NULL to determine the needed buffer size, as described
>>> in http://lkml.indiana.edu/hypermail/linux/kernel/0110.1/0506.html
>>> and verified against source files net/core/dev_ioctl.c and
>>> net/ipv4/devinet.c in the current kernel git tree.
> [...]>
>> Thanks for the patch. I'm trying to verify this from the code, but 
>> am having some trouble finding the relevant pieces. Could you point
>> me more specifically at the points in the kernel source where this
>> case is handled?
> 
> Gladly.
> 
> Function dev_ifconf() [net/core/dev_ioctl.c line 67ff.] is the main
> handler for SIOCGIFCONF. It calls the registered protocol specific
> handlers via the table gifconf_list[]. The current kernel has only
> one such handler, inet_gifconf() [net/ipv4/devinet.c line 1115ff.]
> 
> If ifc.ifc_buf is NULL, dev_ifconf() calls the protocol specific
> handlers with NULL as second argument. [net/core/dev_ioctl.c line 96]
> 
> If inet_gifconf() is called with NULL as second argument it just
> adds up the data sizes, skipping the size check and data transfer.
> [net/ipv4/devinet.c line 1127f.]

Thanks. Patch applied.

Cheers,

Michael




-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* Re: [PATCH net-next v2 3/3] reciprocal_divide: correction/update of the algorithm
From: Hannes Frederic Sowa @ 2014-01-17  4:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, Austin S Hemmelgarn, linux-kernel, Jesse Gross,
	Jamal Hadi Salim, Stephen Hemminger, Matt Mackall, Pekka Enberg,
	Christoph Lameter, Andy Gospodarek, Veaceslav Falico,
	Jay Vosburgh, Jakub Zawadzki, Daniel Borkmann
In-Reply-To: <1389926017.31367.464.camel@edumazet-glaptop2.roam.corp.google.com>

On Thu, Jan 16, 2014 at 06:33:37PM -0800, Eric Dumazet wrote:
> On Fri, 2014-01-17 at 01:28 +0100, Hannes Frederic Sowa wrote:
> > Jakub Zawadzki noticed that some divisions by reciprocal_divide()
> > were not correct [1][2], which he could also show with BPF code after
> > divisions are transformed into reciprocal_value() for runtime invariant
> > which can be passed to reciprocal_divide() later on; reverse in BPF dump
> > ended up with a different, off-by-one K.
> > 
> > This has been fixed by Eric Dumazet in commit aee636c4809fa5 ("bpf: do not
> > use reciprocal divide"). This follow-up patch improves reciprocal_value()
> > and reciprocal_divide() to work in all cases, so future use is safe.
> > 
> > Known problems with the old implementation were that division by 1 always
> > returned 0 and some off-by-ones when the dividend and divisor were
> > very large.  This seemed to not be problematic with its current users
> > in networking, mm/slab.c and lib/flex_array.c, but future users would
> > need to check for this specifically and it might not be obvious at first.
> > 
> > In order to fix that, we propose an extension from the original
> > implementation from commit 6a2d7a955d8d resp. [3][4], by using
> > the algorithm proposed in "Division by Invariant Integers Using
> > Multiplication" [5], Torbjörn Granlund and Peter L. Montgomery, that is,
> > pseudocode for q = n/d where q,n,d is in u32 universe:
> > 
> > 1) Initialization:
> > 
> >   int l = ceil(log_2 d)
> >   uword m' = floor((1<<32)*((1<<l)-d)/d)+1
> >   int sh_1 = min(l,1)
> >   int sh_2 = max(l-1,0)
> > 
> > 2) For q = n/d, all uword:
> > 
> >   uword t = (n*m')>>32
> >   q = (t+((n-t)>>sh_1))>>sh_2
> > 
> > The assembler implementation from Agner Fog [6] also helped a lot
> > while implementing. We have tested the implementation on x86_64,
> > ppc64, i686, s390x; on x86_64/haswell we're still half the latency
> > compared to normal divide.
> > 
> > Joint work with Daniel Borkmann.
> > 
> >   [1] http://www.wireshark.org/~darkjames/reciprocal-buggy.c
> >   [2] http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c
> >   [3] https://gmplib.org/~tege/division-paper.pdf
> >   [4] http://homepage.cs.uiowa.edu/~jones/bcd/divide.html
> >   [5] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
> >   [6] http://www.agner.org/optimize/asmlib.zip
> > 
> > Fixes: 6a2d7a955d8d ("SLAB: use a multiply instead of a divide in obj_to_index()")
> 
> 
> I already demonstrated this slab patch was fine.
> 
> The current algo works well (no off-by-one error) when the dividend is
> a multiple of the divisor.

Sure, so did we state in the commit message.

> You are adding extra overhead, while we know its not necessary.
> 
> By using "Fixes: ... " you are asking a backport to stable branches,
> which seems really silly in this case, especially with this monolithic
> patch changing 12 files in different subsystems.

We can drop the Fixes tags, no problem.

> If you believe flex_array has a problem, please fix flex_array only,
> by a small patch (Maybe a revert ?)

I really doubt it is helpful to have an implementation of reciprocal_divide
which has some known (and maybe unknown) problems in the long term.

This implementation still has a performance benefit compared to regular
division while calculating correct results in any case.

We clearly didn't intend stable inclusion, in fact this patch has been posted
for net-next inclusion as an improvement and not as a bugfix. The Fixes tags
were just lingering on this patch from my first attempt where the situation
was not that clear (at least for me).

Also I doubt the performance drop for SLAB will be that massive. Also it was
already replaced by SLUB as the default SLAB allocator, which doesn't use
reciprocal_divide.

Greetings,

  Hannes

^ permalink raw reply

* Re: [Patch net-next] vxlan: do not use vxlan_net before checking event type
From: Fan Du @ 2014-01-17  4:28 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, Daniel Borkmann, David S. Miller
In-Reply-To: <1389932410-21080-1-git-send-email-xiyou.wangcong@gmail.com>

Hi, Cong

On 2014年01月17日 12:20, Cong Wang wrote:
> When cloning a netns, loopback device will be registered
> and therefore an event will be notified. Of course
> vxlan doesn't care about it, therefore should check if it
> is NETDEV_UNREGISTER before getting the vxlan_net struct.
> Otherwise, vxlan_net is probably not initialized yet at
> this point.

I'm a bit new to vxlan, but in vxlan_init_module

register_pernet_device is called before register_netdevice_notifier.
By my understanding once register_pernet_device is called,
then subsequent vxlan_notifier_block callback see a valid vxlan_net_id.
I mean, executing the vxlan_notifier_block callback implies a valid vxlan_net_id,
unless I am missing something somewhere else.

-- 
浮沉随浪只记今朝笑

--fan

^ permalink raw reply

* Re: PANIC in vxlan <debugging now>
From: Cong Wang @ 2014-01-17  4:24 UTC (permalink / raw)
  To: Jesse Brandeburg; +Cc: netdev, Daniel Borkmann
In-Reply-To: <CAHA+R7N-FbNVSY-fjObfSrzc3UFMdAT-h-TPnX09EuA4R79U0g@mail.gmail.com>

On Thu, Jan 16, 2014 at 7:47 PM, Cong Wang <cwang@twopensource.com> wrote:
> On Thu, Jan 16, 2014 at 6:03 PM, Jesse Brandeburg
> <jesse.brandeburg@intel.com> wrote:
>>
>> It appears that the bug is in acaf4e70997f (net: vxlan: when lower dev
>> unregisters remove vxlan dev as well).
>>
>> reverting that patch avoids the panic.  I wasn't able to see
>> immediately what was wrong in the patch.
>
> It seems vxlan pernet ops is run after loopback register.
>
> Please try the attached (totally untested) patch?

Never mind, I already verified it by myself.
A formal patch was just sent out.

^ permalink raw reply

* [Patch net-next] vxlan: do not use vxlan_net before checking event type
From: Cong Wang @ 2014-01-17  4:20 UTC (permalink / raw)
  To: netdev; +Cc: Daniel Borkmann, David S. Miller, Cong Wang

When cloning a netns, loopback device will be registered
and therefore an event will be notified. Of course
vxlan doesn't care about it, therefore should check if it
is NETDEV_UNREGISTER before getting the vxlan_net struct.
Otherwise, vxlan_net is probably not initialized yet at
this point.

Fixes: commit acaf4e70997ff5ef3588f5a 
Reported-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Daniel Borkmann <dborkman@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

---
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index a2dee80..2812559 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2655,36 +2655,30 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.fill_info	= vxlan_fill_info,
 };
 
-static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
-					     struct net_device *dev)
-{
-	struct vxlan_dev *vxlan, *next;
-	LIST_HEAD(list_kill);
-
-	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
-		struct vxlan_rdst *dst = &vxlan->default_dst;
-
-		/* In case we created vxlan device with carrier
-		 * and we loose the carrier due to module unload
-		 * we also need to remove vxlan device. In other
-		 * cases, it's not necessary and remote_ifindex
-		 * is 0 here, so no matches.
-		 */
-		if (dst->remote_ifindex == dev->ifindex)
-			vxlan_dellink(vxlan->dev, &list_kill);
-	}
-
-	unregister_netdevice_many(&list_kill);
-}
-
 static int vxlan_lowerdev_event(struct notifier_block *unused,
 				unsigned long event, void *ptr)
 {
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	if (event == NETDEV_UNREGISTER) {
+		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+		struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+		struct vxlan_dev *vxlan, *next;
+		LIST_HEAD(list_kill);
+
+		list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
+			struct vxlan_rdst *dst = &vxlan->default_dst;
+
+			/* In case we created vxlan device with carrier
+			 * and we loose the carrier due to module unload
+			 * we also need to remove vxlan device. In other
+			 * cases, it's not necessary and remote_ifindex
+			 * is 0 here, so no matches.
+			 */
+			if (dst->remote_ifindex == dev->ifindex)
+				vxlan_dellink(vxlan->dev, &list_kill);
+		}
 
-	if (event == NETDEV_UNREGISTER)
-		vxlan_handle_lowerdev_unregister(vn, dev);
+		unregister_netdevice_many(&list_kill);
+	}
 
 	return NOTIFY_DONE;
 }

^ permalink raw reply related

* Re: [PATCH] ipv6: don't call addrconf_dst_alloc again when enable lo
From: Hannes Frederic Sowa @ 2014-01-17  4:09 UTC (permalink / raw)
  To: chenweilong; +Cc: Gao feng, David Miller, kumaran.4353, netdev
In-Reply-To: <52D88F29.5010404@huawei.com>

On Fri, Jan 17, 2014 at 10:02:17AM +0800, chenweilong wrote:
> It's quite a long time, How's your patch going on?

I apologize, it has gotten a bit off my radar lately, but I have the branch
still around and will resurrect it asap. I think it is a problem which really
should be solved.

Thanks,

  Hannes

^ permalink raw reply

* Re: PANIC in vxlan <debugging now>
From: Cong Wang @ 2014-01-17  3:47 UTC (permalink / raw)
  To: Jesse Brandeburg; +Cc: netdev, Daniel Borkmann
In-Reply-To: <20140116180318.00004f53@unknown>

[-- Attachment #1: Type: text/plain, Size: 420 bytes --]

On Thu, Jan 16, 2014 at 6:03 PM, Jesse Brandeburg
<jesse.brandeburg@intel.com> wrote:
>
> It appears that the bug is in acaf4e70997f (net: vxlan: when lower dev
> unregisters remove vxlan dev as well).
>
> reverting that patch avoids the panic.  I wasn't able to see
> immediately what was wrong in the patch.

It seems vxlan pernet ops is run after loopback register.

Please try the attached (totally untested) patch?

[-- Attachment #2: vxlan.diff --]
[-- Type: text/plain, Size: 2002 bytes --]

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index a2dee80..2812559 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2655,36 +2655,30 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.fill_info	= vxlan_fill_info,
 };
 
-static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
-					     struct net_device *dev)
-{
-	struct vxlan_dev *vxlan, *next;
-	LIST_HEAD(list_kill);
-
-	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
-		struct vxlan_rdst *dst = &vxlan->default_dst;
-
-		/* In case we created vxlan device with carrier
-		 * and we loose the carrier due to module unload
-		 * we also need to remove vxlan device. In other
-		 * cases, it's not necessary and remote_ifindex
-		 * is 0 here, so no matches.
-		 */
-		if (dst->remote_ifindex == dev->ifindex)
-			vxlan_dellink(vxlan->dev, &list_kill);
-	}
-
-	unregister_netdevice_many(&list_kill);
-}
-
 static int vxlan_lowerdev_event(struct notifier_block *unused,
 				unsigned long event, void *ptr)
 {
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	if (event == NETDEV_UNREGISTER) {
+		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+		struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+		struct vxlan_dev *vxlan, *next;
+		LIST_HEAD(list_kill);
+
+		list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
+			struct vxlan_rdst *dst = &vxlan->default_dst;
+
+			/* In case we created vxlan device with carrier
+			 * and we loose the carrier due to module unload
+			 * we also need to remove vxlan device. In other
+			 * cases, it's not necessary and remote_ifindex
+			 * is 0 here, so no matches.
+			 */
+			if (dst->remote_ifindex == dev->ifindex)
+				vxlan_dellink(vxlan->dev, &list_kill);
+		}
 
-	if (event == NETDEV_UNREGISTER)
-		vxlan_handle_lowerdev_unregister(vn, dev);
+		unregister_netdevice_many(&list_kill);
+	}
 
 	return NOTIFY_DONE;
 }

^ permalink raw reply related

* Re: [PATCH v2 01/16] reset: add non CONFIG_RESET_CONTROLLER routines
From: Chen-Yu Tsai @ 2014-01-17  3:46 UTC (permalink / raw)
  To: Philipp Zabel
  Cc: Srinivas Kandagatla, Giuseppe Cavallaro, Maxime Ripard, netdev,
	linux-arm-kernel, linux-sunxi, linux-kernel, Rob Herring,
	Emilio Lopez, Mike Turquette, Ivan T. Ivanov, Barry Song,
	Stephen Warren
In-Reply-To: <1389360637.5854.53.camel@pizza.hi.pengutronix.de>

Hi,

On Fri, Jan 10, 2014 at 9:30 PM, Philipp Zabel <p.zabel@pengutronix.de> wrote:
> Hi,
>
> [Added Ivan, Stephen and Barry to Cc:]
>
> Am Freitag, den 10.01.2014, 15:00 +0800 schrieb Chen-Yu Tsai:
>> Some drivers are shared between platforms that may or may not
>> have RESET_CONTROLLER selected for them.
>
> I expected that drivers compiled for platforms without reset controllers
> but use the reset API should select or depend on RESET_CONTROLLER.
> Stubbing out device_reset and reset_control_get will turn a compile time
> error into a runtime error for everyone forgetting to do this when
> writing a new driver that needs the reset.

Since this was the intended behavior, I'll drop this patch and select
RESET_CONTROLLER for the stmmac driver for now.


Thanks
ChenYu

>
>>  Add dummy functions
>> when RESET_CONTROLLER is not selected, thereby eliminating the
>> need for drivers to enclose reset_control_*() calls within
>> #ifdef CONFIG_RESET_CONTROLLER, #endif
>>
>> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
>
> This was already proposed by Ivan and Barry earlier, and so far we
> didn't get to a proper conclusion:
>
> https://lkml.org/lkml/2013/10/10/179
> http://lists.infradead.org/pipermail/linux-arm-kernel/2013-June/174758.html
>
> If included, the stubs should at least return an error to indicate a
> reset couldn't be performed, but then I lose the compile time error for
> drivers which should select RESET_CONTROLLER but don't.
>
> Also this alone won't help you if you build multi-arch kernels where one
> platform enables RESET_CONTROLLER and the other expects it to be
> disabled.
>
> regards
> Philipp
>
>> ---
>>  include/linux/reset.h | 39 +++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 39 insertions(+)
>>
>> diff --git a/include/linux/reset.h b/include/linux/reset.h
>> index 6082247..38aa616 100644
>> --- a/include/linux/reset.h
>> +++ b/include/linux/reset.h
>> @@ -4,6 +4,8 @@
>>  struct device;
>>  struct reset_control;
>>
>> +#ifdef CONFIG_RESET_CONTROLLER
>> +
>>  int reset_control_reset(struct reset_control *rstc);
>>  int reset_control_assert(struct reset_control *rstc);
>>  int reset_control_deassert(struct reset_control *rstc);
>> @@ -14,4 +16,41 @@ struct reset_control *devm_reset_control_get(struct device *dev, const char *id)
>>
>>  int device_reset(struct device *dev);
>>
>> +#else /* !CONFIG_RESET_CONTROLLER */
>> +
>> +static inline int reset_control_reset(struct reset_control *rstc)
>> +{
>> +     return 0;
>> +}
>> +
>> +static inline int reset_control_assert(struct reset_control *rstc)
>> +{
>> +     return 0;
>> +}
>> +
>> +static inline int reset_control_deassert(struct reset_control *rstc)
>> +{
>> +     return 0;
>> +}
>
> Those should probably have a WARN_ON(1) like the GPIO API stubs.
>
>> +
>> +static inline struct reset_control *reset_control_get(struct device *dev,
>> +             const char *id)
>> +{
>> +     return NULL;
>> +}
> [...]
>> +static inline struct reset_control *devm_reset_control_get(struct device *dev,
>> +             const char *id)
>> +{
>> +     return NULL;
>> +}
>
> These should return ERR_PTR(-ENOSYS).
>
>> +
>> +static inline int device_reset(struct device *dev)
>> +{
>> +     return 0;
>> +}
>
> And this should return -ENOSYS.
>
> Drivers that also need to run on platforms with CONFIG_RESET_CONTROLLER
> disabled (and that don't need the reset) should ignore -ENOSYS and
> -ENOENT return values from device_reset/(devm_)reset_control_get.
>
> I wonder if it might be a good idea to add a RESET_CONTROLLER_OPTIONAL
> that drivers need to select to enable the API stubs? That way we could
> keep the compile time error for new drivers that need resets but forget
> to select RESET_CONTROLLER.
> Or add a
> #warning If this driver can work without reset, please select CONFIG_RESET_CONTROLLER_OPTIONAL
>
> Another possibility would be to add device_reset_optional and
> (devm_)reset_control_get_optional variants and only provide stubs for
> those, but not for device_reset/(devm_)reset_control_get. Then drivers
> that need to work on platforms without the reset controller API enabled
> could explicitly use the _optional variants, and all other drivers would
> still be checked at compile time.
>
> regards
> Philipp
>

^ permalink raw reply

* [PATCH iproute2] PIE: Add man page
From: Vijay Subramanian @ 2014-01-17  3:39 UTC (permalink / raw)
  To: netdev; +Cc: shemminger, Vijay Subramanian, Vijay Subramanian, Dave Taht

From: Mythili Prabhu <mysuryan@cisco.com>

This adds the manpage for  PIE: Proportional Integral controller Enhanced AQM
scheme.

Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Vijay Subramanian <vijaynsu@cisco.com>
CC: Dave Taht <dave.taht@bufferbloat.net>
---
Targeted for net-next-3.13 branch 

 man/man8/Makefile |    2 +-
 man/man8/tc-pie.8 |  131 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 man/man8/tc-pie.8

diff --git a/man/man8/Makefile b/man/man8/Makefile
index ff80c98..cef0912 100644
--- a/man/man8/Makefile
+++ b/man/man8/Makefile
@@ -2,7 +2,7 @@ TARGETS = ip-address.8 ip-link.8 ip-route.8
 
 MAN8PAGES = $(TARGETS) ip.8 arpd.8 lnstat.8 routel.8 rtacct.8 rtmon.8 ss.8 \
 	tc.8 tc-bfifo.8 tc-cbq.8 tc-cbq-details.8 tc-choke.8 tc-codel.8 \
-	tc-drr.8 tc-ematch.8 tc-fq_codel.8 tc-hfsc.8 tc-htb.8 \
+	tc-drr.8 tc-ematch.8 tc-fq_codel.8 tc-hfsc.8 tc-htb.8 tc-pie.8 \
 	tc-netem.8 tc-pfifo.8 tc-pfifo_fast.8 tc-prio.8 tc-red.8 \
 	tc-sfb.8 tc-sfq.8 tc-stab.8 tc-tbf.8 \
 	bridge.8 rtstat.8 ctstat.8 nstat.8 routef.8 \
diff --git a/man/man8/tc-pie.8 b/man/man8/tc-pie.8
new file mode 100644
index 0000000..536c381
--- /dev/null
+++ b/man/man8/tc-pie.8
@@ -0,0 +1,131 @@
+.TH PIE 8 "16 January 2014" "iproute2" "Linux"
+.SH NAME
+PIE \- Proportional Integral controller-Enhanced AQM algorithm
+.SH SYNOPSIS
+.B tc qdisc ... pie
+[
+.B limit
+PACKETS ] [
+.B target
+TIME ] [
+.B tupdate
+TIME ] [
+.B alpha
+int ] [
+.B beta
+int ] [
+.B ecn
+|
+.B noecn
+] [
+.B bytemode
+|
+.B nobytemode
+]
+
+.SH DESCRIPTION
+Proportional Integral controller-Enhanced (PIE) is a control theoretic active
+queue management scheme. It is based on the proportional integral controller but
+aims to control delay. The main design goals are
+ o Low latency control
+ o High link utilization
+ o Simple implementation
+ o Guaranteed stability and fast responsiveness
+
+.SH ALGORITHM
+PIE is designed to control delay effectively. First, an average dequeue rate is
+estimated based on the standing queue. The rate is used to calculate the current
+delay. Then, on a periodic basis, the delay is used to calculate the dropping
+probability. Finally, on arrival, a packet is dropped (or marked) based on this
+probability.
+
+PIE makes adjustments to the probability based on the trend of the delay i.e.
+whether it is going up or down. The delay converges quickly to the target value
+specified.
+
+alpha and beta are statically chosen parameters that control the drop probability
+growth and are determined through control theoretic approaches. alpha determines how
+the deviation between the current and target latency changes probability. beta exerts
+additional adjustments depending on the latency trend.
+
+The drop probability is used to mark packets in ecn mode. However, as in RED,
+beyond 10% packets are dropped based on this probability.  The bytemode is used
+to drop packets proportional to the packet size.
+
+Additional details can be found in the paper cited below.
+
+.SH PARAMETERS
+.SS limit
+limit on the queue size in packets. Incoming packets are dropped when this limit
+is reached. Default is 1000 packets.
+
+.SS target
+is the expected queue delay. The default target delay is 20ms.
+
+.SS tupdate
+is the frequency at which the system drop probability is calculated. The default is 30ms.
+
+.SS alpha
+.SS beta
+alpha and beta are parameters chosen to control the drop probability. These
+should be in the range between 0 and 32.
+
+.SS ecn | noecn
+is used to mark packets instead of dropping
+.B ecn
+to turn on ecn mode,
+.B noecn
+to turn off ecn mode. By default,
+.B ecn
+is turned off.
+
+.SS bytemode | nobytemode
+is used to scale drop probability proportional to packet size
+.B bytemode
+to turn on bytemode,
+.B nobytemode
+to turn off bytemode. By default,
+.B bytemode
+is turned off.
+
+.SH EXAMPLES
+ # tc qdisc add dev eth0 root pie
+ # tc -s qdisc show
+   qdisc pie 8034: dev eth0 root refcnt 2 limit 200p target 19000us tupdate 29000us alpha 2 beta 20
+   Sent 7443524 bytes 7204 pkt (dropped 900, overlimits 0 requeues 0)
+   backlog 38998b 37p requeues 0
+   prob 0.123384 delay 25000us avg_dq_rate 1464840
+   pkts_in 7241 overlimit 900 dropped 0 maxq 186 ecn_mark 0
+
+ # tc qdisc add dev eth0 root pie limit 100 target 20ms tupdate 30ms ecn
+ # tc -s qdisc show
+   qdisc pie 8036: dev eth0 root refcnt 2 limit 200p target 19000 tupdate 29000 alpha 2 beta 20 ecn
+   Sent 2491922 bytes 2507 pkt (dropped 214, overlimits 0 requeues 0)
+   backlog 33728b 32p requeues 0
+   prob 0.102262 delay 24000us avg_dq_rate 1464840
+   pkts_in 2468 overlimit 214 dropped 0 maxq 192 ecn_mark 71
+
+
+ # tc qdisc add dev eth0 root pie limit 100 target 50ms tupdate 30ms bytemode
+ # tc -s qdisc show
+   qdisc pie 8036: dev eth0 root refcnt 2 limit 200p target 19000 tupdate 29000 alpha 2 beta 20 ecn
+   Sent 2491922 bytes 2507 pkt (dropped 214, overlimits 0 requeues 0)
+   backlog 33728b 32p requeues 0
+   prob 0.102262 delay 24000us avg_dq_rate 1464840
+   pkts_in 2468 overlimit 214 dropped 0 maxq 192 ecn_mark 71
+
+
+.SH SEE ALSO
+.BR tc (8),
+.BR tc-codel (8),
+.BR tc-red (8)
+
+.SH SOURCES
+ o IETF draft submission is at http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+ o IEEE  Conference on High Performance Switching and Routing 2013 : "PIE: A
+Lightweight Control Scheme to Address the Bufferbloat Problem"
+
+.SH AUTHORS
+PIE was implemented by Vijay Subramanian and Mythili Prabhu, also the authors of
+this man page. Please report bugs and corrections to the Linux networking
+development mailing list at <netdev@vger.kernel.org>.
-- 
1.7.9.5

^ permalink raw reply related

* [Patch 2/2 net-next v6] ixgbe: set driver_max_VFs should be done before enabling SRIOV
From: Aaron Brown @ 2014-01-17  3:41 UTC (permalink / raw)
  To: davem; +Cc: ethan.zhao, netdev, gospo, sassmann, ethan.kernel, Aaron Brown
In-Reply-To: <1389930065-3330-1-git-send-email-aaron.f.brown@intel.com>

From: "ethan.zhao" <ethan.zhao@oracle.com>

Commit 43dc4e01 ("Limit number of reported VFs to device
specific value") doesn't work: pci_sriov_set_totalvfs() always returns
-EBUSY because VFs are already enabled.

ixgbe_enable_sriov()
        pci_enable_sriov()
                sriov_enable()
                {
                ... ..
                iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
                pci_cfg_access_lock(dev);
                ... ...
                }

pci_sriov_set_totalvfs()
{
... ...
if (dev->sriov->ctrl & PCI_SRIOV_CTRL_VFE)
                return -EBUSY;
...
}

So driver_max_VFs should be set with pci_sriov_set_totalvfs() before
enabling VFs with ixgbe_enable_sriov().

V2: revised for net-next tree.

Signed-off-by: Ethan Zhao <ethan.kernel@gmail.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 3fd4d3f..61d985c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8019,8 +8019,8 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Mailbox */
 	ixgbe_init_mbx_params_pf(hw);
 	memcpy(&hw->mbx.ops, ii->mbx_ops, sizeof(hw->mbx.ops));
-	ixgbe_enable_sriov(adapter);
 	pci_sriov_set_totalvfs(pdev, IXGBE_MAX_VFS_DRV_LIMIT);
+	ixgbe_enable_sriov(adapter);
 skip_sriov:
 
 #endif
-- 
1.8.5.GIT

^ permalink raw reply related

* [Patch v6 net-next 0/2] Intel Wired LAN Driver Updates
From: Aaron Brown @ 2014-01-17  3:41 UTC (permalink / raw)
  To: davem; +Cc: Aaron Brown, netdev, gospo, sassmann, ethan.kernel

This series contains updates to ixgbe from Ethan Zhao.  The first one replaces
the magic number "63" with a macro, IXGBE_MAX_VFS_DRV_LIMIT; the second
moves the call to set driver_max_VFS to before SRIOV is enabled.

The code of these patches matches the v3 (1/2) and v2 (2/2) versions sent
to the e1000-devel and netdev mailing lists.  The intermediate versions
(v4, v5) are from sorting out style issues, mostly tabs to spaces and
split lines probably introduced via mailer.

ethan.zhao (2):
  1/2 ixgbe: define IXGBE_MAX_VFS_DRV_LIMIT macro and cleanup const 63
  2/2 ixgbe: set driver_max_VFs should be done before enabling SRIOV  

 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 5 +++++
 3 files changed, 9 insertions(+), 4 deletions(-)

-- 
1.8.5.GIT

^ permalink raw reply

* [PATCH v6 net-next 1/2] ixgbe: define IXGBE_MAX_VFS_DRV_LIMIT macro and cleanup const 63
From: Aaron Brown @ 2014-01-17  3:41 UTC (permalink / raw)
  To: davem; +Cc: ethan.zhao, netdev, gospo, sassmann, ethan.kernel, Aaron Brown
In-Reply-To: <1389930065-3330-1-git-send-email-aaron.f.brown@intel.com>

From: "ethan.zhao" <ethan.zhao@oracle.com>

Because the ixgbe driver limits the maximum number of VFs that
can be enabled to 63, define a macro IXGBE_MAX_VFS_DRV_LIMIT
and clean up the constant 63 in the code.

v3: revised for net-next tree.

Signed-off-by: Ethan Zhao <ethan.kernel@gmail.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Aaron Brown <aaron.f.brown@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 5 +++++
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b445ad1..3fd4d3f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5067,7 +5067,7 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter)
 
 	/* assign number of SR-IOV VFs */
 	if (hw->mac.type != ixgbe_mac_82598EB) {
-		if (max_vfs > 63) {
+		if (max_vfs > IXGBE_MAX_VFS_DRV_LIMIT) {
 			adapter->num_vfs = 0;
 			e_dev_warn("max_vfs parameter out of range. Not assigning any SR-IOV VFs\n");
 		} else {
@@ -8020,7 +8020,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	ixgbe_init_mbx_params_pf(hw);
 	memcpy(&hw->mbx.ops, ii->mbx_ops, sizeof(hw->mbx.ops));
 	ixgbe_enable_sriov(adapter);
-	pci_sriov_set_totalvfs(pdev, 63);
+	pci_sriov_set_totalvfs(pdev, IXGBE_MAX_VFS_DRV_LIMIT);
 skip_sriov:
 
 #endif
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 0558c71..dff0977 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -148,7 +148,7 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter)
 		 * physical function.  If the user requests greater thn
 		 * 63 VFs then it is an error - reset to default of zero.
 		 */
-		adapter->num_vfs = min_t(unsigned int, adapter->num_vfs, 63);
+		adapter->num_vfs = min_t(unsigned int, adapter->num_vfs, IXGBE_MAX_VFS_DRV_LIMIT);
 
 		err = pci_enable_sriov(adapter->pdev, adapter->num_vfs);
 		if (err) {
@@ -257,7 +257,7 @@ static int ixgbe_pci_sriov_enable(struct pci_dev *dev, int num_vfs)
 	 * PF.  The PCI bus driver already checks for other values out of
 	 * range.
 	 */
-	if (num_vfs > 63) {
+	if (num_vfs > IXGBE_MAX_VFS_DRV_LIMIT) {
 		err = -EPERM;
 		goto err_out;
 	}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
index 4713f9f..8bd2919 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
@@ -28,6 +28,11 @@
 #ifndef _IXGBE_SRIOV_H_
 #define _IXGBE_SRIOV_H_
 
+/*  ixgbe driver limit the max number of VFs could be enabled to
+ *  63 (IXGBE_MAX_VF_FUNCTIONS - 1)
+ */
+#define IXGBE_MAX_VFS_DRV_LIMIT  (IXGBE_MAX_VF_FUNCTIONS - 1)
+
 void ixgbe_restore_vf_multicasts(struct ixgbe_adapter *adapter);
 void ixgbe_msg_task(struct ixgbe_adapter *adapter);
 int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask);
-- 
1.8.5.GIT

^ permalink raw reply related

* Re: PANIC in vxlan <debugging now>
From: Eric Dumazet @ 2014-01-17  3:37 UTC (permalink / raw)
  To: Fan Du; +Cc: Jesse Brandeburg, netdev
In-Reply-To: <52D89EAE.2080308@windriver.com>

On Fri, 2014-01-17 at 11:08 +0800, Fan Du wrote:
> 
> On 2014年01月17日 09:14, Jesse Brandeburg wrote:
> > I'm currently debugging this but given where the kernel release cycle
> > is I wanted to let the list know.
> 
> How you reproduce it? I use below two methods, both works.
> And my net-next head is at: abfce3ef58b6a6c95de389f9d20047a05b10e484, pretty new.
> 
> Method 1:
> ip link add vxlan0 type vxlan id 42 group ff0e::110
> ip link set vxlan0 up
> ip link set vxlan0 down
> 
> Method 2:
> ip link add vxlan0 type vxlan id 42 group ff0e::110
> ip link set vxlan0 up
> ip link delete vxlan0
> 

Given the stack trace, it seems you need to setup a network namespace.

Try 

ip netns add foo


> 
> > It may well be a bug in our code, and if it is we'll find it, but here is
> > the panic, it doesn't occur when vxlan is not enabled.
> >
> > Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331010] cgroup: libvirtd (1387) created nested cgroup for controller "memory" which has incomplete hierarchy supp
> > ort. Nested cgroups may change behavior in the future.
> > Jan 16 13:46:44 jbrandeb-cp2 kernel: [   17.331014] cgroup: "memory" requires setting use_hierarchy to 1 on the root.
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.576568] ------------[ cut here ]------------
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.586411] kernel BUG at include/net/netns/generic.h:45!
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.596336] invalid opcode: 0000 [#1] SMP
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.606268] Modules linked in: lockd sunrpc i40e igb iTCO_wdt iTCO_vendor_support sb_edac ioatdma ptp microcode lpc_ich edac_core i2c_i801 mfd_core dca pps_core wmi kvm uinput isci firewire_ohci libsas firewire_core crc_itu_t scsi_transport_sas mgag200 drm_kms_helper ttm
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.637923] CPU: 0 PID: 1387 Comm: libvirtd Not tainted 3.13.0-rc7+ #30
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.648599] Hardware name: Intel Corporation S2600CO ........../S2600CO, BIOS SE5C600.86B.01.08.6003.062420131549 06/24/2013
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.659612] task: ffff88063b5c6000 ti: ffff8806333ca000 task.ti: ffff8806333ca000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.670661] RIP: 0010:[<ffffffff816df92f>]  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.681738] RSP: 0018:ffff8806333cbb80  EFLAGS: 00010246
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.692536] RAX: 0000000000000000 RBX: 00000000ffffffed RCX: 0000000000000010
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.703577] RDX: ffff88063d03d380 RSI: 0000000000000010 RDI: ffffffff81cfd9f0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.714612] RBP: ffff8806333cbb80 R08: 0000000000000000 R09: ffffffff81cfd9f0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.725531] R10: 00000000000002cc R11: 0000000000000004 R12: 0000000000000000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.736448] R13: ffff880639118000 R14: ffff8806333cbc68 R15: 0000000000000000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.747292] FS:  00007f6381830700(0000) GS:ffff880647600000(0000) knlGS:0000000000000000
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.758248] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.769263] CR2: 00007f637c04b000 CR3: 0000000c3aa1f000 CR4: 00000000000407f0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.780402] Stack:
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.791386]  ffff8806333cbbc0 ffffffff814d0865 ffff8806333cbc40 00000000ffffffef
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.802702]  00000000ffffffed ffffffff81cc67d0 0000000000000010 ffff8806333cbc68
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.814021]  ffff8806333cbc00 ffffffff816e9e5d 0000000000000004 ffff8806333cbc68
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.825185] Call Trace:
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.836106]  [<ffffffff814d0865>] vxlan_lowerdev_event+0xf5/0x100
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.847254]  [<ffffffff816e9e5d>] notifier_call_chain+0x4d/0x70
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.858457]  [<ffffffff810912be>] __raw_notifier_call_chain+0xe/0x10
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.869696]  [<ffffffff810912d6>] raw_notifier_call_chain+0x16/0x20
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.880896]  [<ffffffff815d9610>] call_netdevice_notifiers_info+0x40/0x70
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.892063]  [<ffffffff815d9656>] call_netdevice_notifiers+0x16/0x20
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.903107]  [<ffffffff815e1bce>] register_netdevice+0x1be/0x3a0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.914128]  [<ffffffff815e1dce>] register_netdev+0x1e/0x30
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.925072]  [<ffffffff814cb94a>] loopback_net_init+0x4a/0xb0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.936048]  [<ffffffffa016ed6e>] ? lockd_init_net+0x6e/0xb0 [lockd]
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.947081]  [<ffffffff815d6bac>] ops_init+0x4c/0x150
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.958070]  [<ffffffff815d6d23>] setup_net+0x73/0x110
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.969006]  [<ffffffff815d725b>] copy_net_ns+0x7b/0x100
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.979897]  [<ffffffff81090e11>] create_new_namespaces+0x101/0x1b0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   17.990855]  [<ffffffff81090f45>] copy_namespaces+0x85/0xb0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.001656]  [<ffffffff810693d5>] copy_process.part.26+0x935/0x1500
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.012370]  [<ffffffff811d5186>] ? mntput+0x26/0x40
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.022924]  [<ffffffff8106a15c>] do_fork+0xbc/0x2e0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.033331]  [<ffffffff811b7f2e>] ? ____fput+0xe/0x10
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.043622]  [<ffffffff81089c5c>] ? task_work_run+0xac/0xe0
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.053905]  [<ffffffff8106a406>] SyS_clone+0x16/0x20
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.064265]  [<ffffffff816ee689>] stub_clone+0x69/0x90
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.074600]  [<ffffffff816ee329>] ? system_call_fastpath+0x16/0x1b
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.084879] Code: 00 75 1d 55 be 2f 00 00 00 48 c7 c7 65 93 a2 81 48 89 e5 e8 f4 b5 98 ff 5d c6 05 30 aa 5f 00 01 c3 55 48 89 e5 0f 0b 55 48 89 e5<0f>  0b 55 48 89 e5 0f 0b 66 66 66 66 90 55 48 c7 c7 c0 4c cb 81
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.105818] RIP  [<ffffffff816df92f>] net_generic.isra.34.part.35+0x4/0x6
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.116106]  RSP<ffff8806333cbb80>
> > Jan 16 13:46:45 jbrandeb-cp2 kernel: [   18.172366] ---[ end trace 0bb84cf9aa76a384 ]---
> > Jan 16 13:46:47 jbrandeb-cp2 systemd[1]: Startup finished in 4s 918ms 164us (kernel) + 3s 548ms 460us (initrd) + 11s 2ms 474us (userspace) = 19s 469ms 98us.
> > Jan 16 13:46:47 jbrandeb-cp2 dbus-daemon[989]: dbus[989]: [system] Activating via systemd: service name='org.freedesktop.Accounts' unit='accounts-daemon.service'
> >
> > code says:
> > (gdb) l *(vxlan_lowerdev_event+0xf5)
> > 0xffffffff814d0865 is at include/net/netns/generic.h:41.
> > 34      static inline void *net_generic(const struct net *net, int id)
> > 35      {
> > 36              struct net_generic *ng;
> > 37              void *ptr;
> > 38
> > 39              rcu_read_lock();
> > 40              ng = rcu_dereference(net->gen);
> > 41              BUG_ON(id == 0 || id>  ng->len);
> > 42              ptr = ng->ptr[id - 1];
> > 43              rcu_read_unlock();
> > 44
> >>>>> 45              BUG_ON(!ptr);
> > 46              return ptr;
> > 47      }
> > 48      #endif
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> 

^ permalink raw reply

* Re: Fwd: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
From: Jason Wang @ 2014-01-17  3:26 UTC (permalink / raw)
  To: Tom Herbert, Stefan Hajnoczi
  Cc: Zhi Yong Wu, Linux Netdev List, Eric Dumazet, David S. Miller,
	Zhi Yong Wu, Michael S. Tsirkin, Rusty Russell
In-Reply-To: <CA+mtBx9PBtYurdnhCKL0MLL8i+_+3yPNWFVj5h6SPJH+YDBCjw@mail.gmail.com>

On 01/17/2014 01:12 AM, Tom Herbert wrote:
> On Thu, Jan 16, 2014 at 12:52 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>> On Thu, Jan 16, 2014 at 04:34:10PM +0800, Zhi Yong Wu wrote:
>>> CC: stefanha, MST, Rusty Russel
>>>
>>> ---------- Forwarded message ----------
>>> From: Jason Wang <jasowang@redhat.com>
>>> Date: Thu, Jan 16, 2014 at 12:23 PM
>>> Subject: Re: [RFC PATCH net-next 0/3] virtio_net: add aRFS support
>>> To: Zhi Yong Wu <zwu.kernel@gmail.com>
>>> Cc: netdev@vger.kernel.org, therbert@google.com, edumazet@google.com,
>>> davem@davemloft.net, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>
>>>
>>> On 01/15/2014 10:20 PM, Zhi Yong Wu wrote:
>>>> From: Zhi Yong Wu<wuzhy@linux.vnet.ibm.com>
>>>>
>>>> HI, folks
>>>>
>>>> This patchset tries to integrate aRFS support into virtio_net. In this case,
>>>> aRFS will be used to select the RX queue. To make sure that it is heading
>>>> in the correct direction, although it is still an RFC and isn't tested, it is
>>>> being posted ASAP. Any comments are appreciated, thanks.
>>>>
>>>> If anyone is interested in playing with it, you can get this patchset from my
>>>> dev git on github:
>>>>    git://github.com/wuzhy/kernel.git virtnet_rfs
>>>>
>>>> Zhi Yong Wu (3):
>>>>    virtio_pci: Introduce one new config api vp_get_vq_irq()
>>>>    virtio_net: Introduce one dummy function virtnet_filter_rfs()
>>>>    virtio-net: Add accelerated RFS support
>>>>
>>>>   drivers/net/virtio_net.c      |   67 ++++++++++++++++++++++++++++++++++++++++-
>>>>   drivers/virtio/virtio_pci.c   |   11 +++++++
>>>>   include/linux/virtio_config.h |   12 +++++++
>>>>   3 files changed, 89 insertions(+), 1 deletions(-)
>>>>
>>> Please run get_maintainer.pl before sending the patch. You'd better
>>> at least cc the virtio maintainer/list for this.
>>>
>>> The core aRFS method is a noop in this RFC, which makes this series not
>>> worth much discussion. You should at least mention the big picture
>>> here in the cover letter. I suggest you post an RFC which can
>>> run and has the expected result, or you can just start a thread for the
>>> design discussion.
>>>
>>> And this method has been discussed before; you can search "[net-next
>>> RFC PATCH 5/5] virtio-net: flow director support" in the netdev archive
>>> for a very old prototype implemented by me. It works, and it looks like
>>> most of this RFC has already been done there.
>>>
>>> A basic question is whether or not we need this; not all the mq cards
>>> use aRFS (see ixgbe ATR). And does it bring extra
>>> overhead? For virtio, we want to reduce the vmexits as much as
>>> possible, but this aRFS seems to introduce a lot more of them. Making a
>>> complex interface just for a virtual device may not be good; a simple
>>> method may work for most of the cases.
>>>
>>> We really should consider offloading this to a real NIC. VMDq and L2
>>> forwarding offload may help in this case.
> Adding flow director support would be a good step, Zhi's patches for
> support in tun have been merged, so support in virtio-net would be a
> good follow on. But, flow-director does have some limitations and
> performance issues of its own (forced pairing between TX and RX
> queues, lookup on every TX packet). 

True. But the pairing was designed to work without guest involvement since
we really want to reduce the vmexits from the guest. And the lookup on every TX
packet could be relaxed to every N packets. But I agree exposing the
API to the guest may bring lots of flexibility.
> In the case of virtualization,
> aRFS, RSS, ntuple filtering, LRO, etc. can be implemented as software
> emulations and so far seems to be wins in most cases. Extending these
> down into the stack so that they can leverage HW mechanisms is a good
> goal for best performance. It's probably generally true that most of
> the offloads commonly available for NICs we'll want in virtualization
> path. Of course, we need to demonstrate that they provide real
> performance benefit in this use case.

Yes, we need a prototype to see how much it can help.
>
> I believe tying in aRFS (or flow director) into a real aRFS is just a
> matter of programming the RFS table properly. This is not the complex
> side of the interface, I believe this already works with the tun
> patches.

Right, what we may need is

- exposing new tun ioctls for qemu to add or remove a flow
- a new virtqueue command for the guest driver to add or remove a flow
(btw, the current control virtqueue is really slow, we may need to improve it).
- an agreement between host and guest to use the same hash method, or just
compute a software hash in the host and pass it to the guest (which needs an
extra API)
- changing the guest driver to use aRFS

Some of the above has been implemented in my old RFC.
>
>> Zhi Yong and I had an IRC chat.  I wanted to post my questions on the
>> list - it's still the same concern I had in the old email thread that
>> Jason mentioned.
>>
>> In order for virtio-net aRFS to make sense there needs to be an overall
>> plan for pushing flow mapping information down to the physical NIC.
>> That's the only way to actually achieve the benefit of steering:
>> processing the packet on the CPU where the application is running.
>>
> I don't think this is necessarily true. Per flow steering amongst
> virtual queues should be beneficial in itself. virtio-net can leverage
> RFS or aRFS where it's available.
>
>> If it's not possible or too hard to implement aRFS down the entire
>> stack, we won't be able to process the packet on the right CPU.
>> Then we might as well not bother with aRFS and just distribute uniformly
>> across the rx virtqueues.
>>
>> Please post an outline of how rx packets will be steered up the stack so
>> we can discuss whether aRFS can bring any benefit.
>>
> 1. The aRFS interface for the guest to specify which virtual queue to
> receive a packet on is fairly straight forward.
> 2. To hook into RFS, we need to match the virtual queue to the real
> CPU it will be processed on, and then program the RFS table for that flow
> and CPU.
> 3. NIC aRFS keys off the RFS tables so it can program the HW with the
> correct queue for the CPU.
>
>> Stefan
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next 0/5] tipc: align TIPC behaviours of waiting for events with other stacks
From: Jon Maloy @ 2014-01-17  3:20 UTC (permalink / raw)
  To: David Miller, ying.xue@windriver.com
  Cc: Paul.Gortmaker@windriver.com,
	tipc-discussion@lists.sourceforge.net, netdev@vger.kernel.org
In-Reply-To: <20140116.191140.184722722665650663.davem@davemloft.net>

Nice job, Ying.
///jon

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: January-16-14 10:12 PM
> To: ying.xue@windriver.com
> Cc: Paul.Gortmaker@windriver.com; maloy@donjonn.com; Jon Maloy; Erik
> Hugne; netdev@vger.kernel.org; tipc-discussion@lists.sourceforge.net
> Subject: Re: [PATCH net-next 0/5] tipc: align TIPC behaviours of waiting for
> events with other stacks
> 
> From: Ying Xue <ying.xue@windriver.com>
> Date: Fri, 17 Jan 2014 09:50:02 +0800
> 
> > Comparing the current implementations of waiting for events in TIPC
> > socket layer with other stacks, TIPC's behaviour is very different
> > because wait_event_interruptible_timeout()/wait_event_interruptible()
> > are always used by TIPC to wait for events while relevant socket or
> > port variables are fed to them as their arguments. As socket lock has
> > to be released temporarily before the two routines of waiting for
> > events are called, their arguments associated with socket or port
> > structures are out of socket lock protection. This might cause serious
> > issues where the process calling a socket syscall such as sendmsg(),
> > connect(), accept(), and recvmsg(), cannot be woken up at all even if
> > the proper event arrives, or is improperly woken up although the condition
> > for waking up the process is not satisfied in practice.
> >
> > Therefore, aligning its behaviours with similar functions implemented
> > in other stacks, for instance, sk_stream_wait_connect() and
> > inet_csk_wait_for_connect() etc, can avoid above risks for us.
> 
> Series applied, thank you.

------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk

^ permalink raw reply

* Re: [PATCH -next] net_sched: fix error return code in fw_change_attrs()
From: David Miller @ 2014-01-17  3:12 UTC (permalink / raw)
  To: weiyj.lk; +Cc: jhs, xiyou.wangcong, yongjun_wei, netdev
In-Reply-To: <CAPgLHd_cjvdEcYZZNGg1rwKRBQcUS74vGvsdmTNkSWpPqR+fMQ@mail.gmail.com>

From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Fri, 17 Jan 2014 09:53:20 +0800

> From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
> 
> The error code was not set if changing indev fails, so the error
> condition wasn't reflected in the return value. Fix to return a
> negative error code from this error handling case instead of 0.
> 
> Fixes: 2519a602c273 ('net_sched: optimize tcf_match_indev()')
> Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Applied, thank you.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox