Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next rfc v7 3/3] virtio-net: change the number of queues through ethtool
From: Jason Wang @ 2012-11-27 10:16 UTC (permalink / raw)
  To: rusty, mst, krkumar2, virtualization, netdev, linux-kernel
  Cc: bhutchings, jwhan, shiyer, kvm
In-Reply-To: <1354011360-39479-1-git-send-email-jasowang@redhat.com>

This patch implement the {set|get}_channels method of ethool to allow user to
change the number of queues dymaically when the device is running. This would
let the user to configure it on demand.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c |   41 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index bcaa6e5..f08ec2a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1578,10 +1578,51 @@ static struct virtio_driver virtio_net_driver = {
 #endif
 };
 
+/* TODO: Eliminate OOO packets during switching */
+static int virtnet_set_channels(struct net_device *dev,
+				struct ethtool_channels *channels)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	u16 queue_pairs = channels->combined_count;
+
+	/* We don't support separate rx/tx channels.
+	 * We don't allow setting 'other' channels.
+	 */
+	if (channels->rx_count || channels->tx_count || channels->other_count)
+		return -EINVAL;
+
+	/* Only two modes were support currently */
+	if (queue_pairs != vi->max_queue_pairs && queue_pairs != 1)
+		return -EINVAL;
+
+	vi->curr_queue_pairs = queue_pairs;
+	BUG_ON(virtnet_set_queues(vi));
+
+	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
+	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
+
+	return 0;
+}
+
+static void virtnet_get_channels(struct net_device *dev,
+				 struct ethtool_channels *channels)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+
+	channels->combined_count = vi->curr_queue_pairs;
+	channels->max_combined = vi->max_queue_pairs;
+	channels->max_other = 0;
+	channels->rx_count = 0;
+	channels->tx_count = 0;
+	channels->other_count = 0;
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
 	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
+	.set_channels = virtnet_set_channels,
+	.get_channels = virtnet_get_channels,
 };
 
 static int __init init(void)
-- 
1.7.1

^ permalink raw reply related

* [net-next rfc v7 2/3] virtio_net: multiqueue support
From: Jason Wang @ 2012-11-27 10:15 UTC (permalink / raw)
  To: rusty, mst, krkumar2, virtualization, netdev, linux-kernel
  Cc: bhutchings, jwhan, shiyer, kvm
In-Reply-To: <1354011360-39479-1-git-send-email-jasowang@redhat.com>

This addes multiqueue support to virtio_net driver. In multiple queue modes, the
driver expects the number of queue paris is equal to the number of vcpus. To
eliminate the contention bettwen vcpus and virtqueues, per-cpu virtqueue pairs
were implemented through:

- select the txq based on the smp processor id.
- smp affinity hint were set to the vcpu that owns the queue pairs.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c        |  454 ++++++++++++++++++++++++++++++---------
 include/uapi/linux/virtio_net.h |   16 ++
 2 files changed, 371 insertions(+), 99 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7975133..bcaa6e5 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -84,17 +84,25 @@ struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *cvq;
 	struct net_device *dev;
-	struct napi_struct napi;
-	struct send_queue sq;
-	struct receive_queue rq;
+	struct send_queue *sq;
+	struct receive_queue *rq;
 	unsigned int status;
 
+	/* Max # of queue pairs supported by the device */
+	u16 max_queue_pairs;
+
+	/* # of queue pairs currently used by the driver */
+	u16 curr_queue_pairs;
+
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
 	/* Host will merge rx buffers for big packets (shake it! shake it!) */
 	bool mergeable_rx_bufs;
 
+	/* Has control virtqueue */
+	bool has_cvq;
+
 	/* enable config space updates */
 	bool config_enable;
 
@@ -126,6 +134,34 @@ struct padded_vnet_hdr {
 	char padding[6];
 };
 
+static const struct ethtool_ops virtnet_ethtool_ops;
+
+/*
+ * Converting between virtqueue no. and kernel tx/rx queue no.
+ * 0:rx0 1:tx0 2:cvq 3:rx1 4:tx1 ... 2N+1:rxN 2N+2:txN
+ */
+static int vq2txq(struct virtqueue *vq)
+{
+	int index = virtqueue_get_queue_index(vq);
+	return index == 1 ? 0 : (index - 2) / 2;
+}
+
+static int txq2vq(int txq)
+{
+	return txq ? 2 * txq + 2 : 1;
+}
+
+static int vq2rxq(struct virtqueue *vq)
+{
+	int index = virtqueue_get_queue_index(vq);
+	return index ? (index - 1) / 2 : 0;
+}
+
+static int rxq2vq(int rxq)
+{
+	return rxq ? 2 * rxq + 1 : 0;
+}
+
 static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
 {
 	return (struct skb_vnet_hdr *)skb->cb;
@@ -166,7 +202,7 @@ static void skb_xmit_done(struct virtqueue *vq)
 	virtqueue_disable_cb(vq);
 
 	/* We were probably waiting for more output buffers. */
-	netif_wake_queue(vi->dev);
+	netif_wake_subqueue(vi->dev, vq2txq(vq));
 }
 
 static void set_skb_frag(struct sk_buff *skb, struct page *page,
@@ -503,7 +539,7 @@ static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
-	struct receive_queue *rq = &vi->rq;
+	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
 
 	/* Schedule NAPI, Suppress further interrupts if successful. */
 	if (napi_schedule_prep(&rq->napi)) {
@@ -650,7 +686,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	struct send_queue *sq = &vi->sq;
+	int qnum = skb_get_queue_mapping(skb);
+	struct send_queue *sq = &vi->sq[qnum];
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
@@ -664,13 +701,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (likely(capacity == -ENOMEM)) {
 			if (net_ratelimit())
 				dev_warn(&dev->dev,
-					 "TX queue failure: out of memory\n");
+					 "TXQ (%d) failure: out of memory\n",
+					 qnum);
 		} else {
 			dev->stats.tx_fifo_errors++;
 			if (net_ratelimit())
 				dev_warn(&dev->dev,
-					 "Unexpected TX queue failure: %d\n",
-					 capacity);
+					 "Unexpected TXQ (%d) failure: %d\n",
+					 qnum, capacity);
 		}
 		dev->stats.tx_dropped++;
 		kfree_skb(skb);
@@ -685,12 +723,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Apparently nice girls don't return TX_BUSY; stop the queue
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
+		netif_stop_subqueue(dev, qnum);
 		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
 			capacity += free_old_xmit_skbs(sq);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
+				netif_start_subqueue(dev, qnum);
 				virtqueue_disable_cb(sq->vq);
 			}
 		}
@@ -758,23 +796,13 @@ static struct rtnl_link_stats64 *virtnet_stats(struct net_device *dev,
 static void virtnet_netpoll(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
-	napi_schedule(&vi->rq.napi);
+	for (i = 0; i < vi->curr_queue_pairs; i++)
+		napi_schedule(&vi->rq[i].napi);
 }
 #endif
 
-static int virtnet_open(struct net_device *dev)
-{
-	struct virtnet_info *vi = netdev_priv(dev);
-
-	/* Make sure we have some buffers: if oom use wq. */
-	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
-		schedule_delayed_work(&vi->rq.refill, 0);
-
-	virtnet_napi_enable(&vi->rq);
-	return 0;
-}
-
 /*
  * Send command via the control virtqueue and check status.  Commands
  * supported by the hypervisor, as indicated by feature bits, should
@@ -830,13 +858,53 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
 	rtnl_unlock();
 }
 
+static int virtnet_set_queues(struct virtnet_info *vi)
+{
+	struct scatterlist sg;
+	struct virtio_net_ctrl_rfs s;
+	struct net_device *dev = vi->dev;
+
+	s.virtqueue_pairs = vi->curr_queue_pairs;
+	sg_init_one(&sg, &s, sizeof(s));
+
+	if (!vi->has_cvq)
+		return -EINVAL;
+
+	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RFS,
+				  VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET, &sg, 1, 0)){
+		dev_warn(&dev->dev, "Fail to set the number of queue pairs to"
+			 " %d\n", vi->curr_queue_pairs);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int virtnet_open(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		/* Make sure we have some buffers: if oom use wq. */
+		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			schedule_delayed_work(&vi->rq[i].refill, 0);
+		virtnet_napi_enable(&vi->rq[i]);
+	}
+
+	return 0;
+}
+
 static int virtnet_close(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
 
 	/* Make sure refill_work doesn't re-enable napi! */
-	cancel_delayed_work_sync(&vi->rq.refill);
-	napi_disable(&vi->rq.napi);
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		cancel_delayed_work_sync(&vi->rq[i].refill);
+		napi_disable(&vi->rq[i].napi);
+	}
 
 	return 0;
 }
@@ -948,8 +1016,8 @@ static void virtnet_get_ringparam(struct net_device *dev,
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
 	ring->rx_pending = ring->rx_max_pending;
 	ring->tx_pending = ring->tx_max_pending;
 }
@@ -967,12 +1035,6 @@ static void virtnet_get_drvinfo(struct net_device *dev,
 
 }
 
-static const struct ethtool_ops virtnet_ethtool_ops = {
-	.get_drvinfo = virtnet_get_drvinfo,
-	.get_link = ethtool_op_get_link,
-	.get_ringparam = virtnet_get_ringparam,
-};
-
 #define MIN_MTU 68
 #define MAX_MTU 65535
 
@@ -984,6 +1046,20 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+/* To avoid contending a lock hold by a vcpu who would exit to host, select the
+ * txq based on the processor id.
+ */
+static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
+		  smp_processor_id();
+
+	while (unlikely(txq >= dev->real_num_tx_queues))
+		txq -= dev->real_num_tx_queues;
+
+	return txq;
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -995,6 +1071,7 @@ static const struct net_device_ops virtnet_netdev = {
 	.ndo_get_stats64     = virtnet_stats,
 	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
+	.ndo_select_queue     = virtnet_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = virtnet_netpoll,
 #endif
@@ -1030,10 +1107,10 @@ static void virtnet_config_changed_work(struct work_struct *work)
 
 	if (vi->status & VIRTIO_NET_S_LINK_UP) {
 		netif_carrier_on(vi->dev);
-		netif_wake_queue(vi->dev);
+		netif_tx_wake_all_queues(vi->dev);
 	} else {
 		netif_carrier_off(vi->dev);
-		netif_stop_queue(vi->dev);
+		netif_tx_stop_all_queues(vi->dev);
 	}
 done:
 	mutex_unlock(&vi->config_lock);
@@ -1046,41 +1123,212 @@ static void virtnet_config_changed(struct virtio_device *vdev)
 	schedule_work(&vi->config_work);
 }
 
-static int init_vqs(struct virtnet_info *vi)
+static void free_receive_bufs(struct virtnet_info *vi)
+{
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		while (vi->rq[i].pages)
+			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
+	}
+}
+
+/* Free memory allocated for send and receive queues */
+static void virtnet_free_queues(struct virtnet_info *vi)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
-	const char *names[] = { "input", "output", "control" };
-	int nvqs, err;
+	kfree(vi->rq);
+	vi->rq = NULL;
+	kfree(vi->sq);
+	vi->sq = NULL;
+}
 
-	/* We expect two virtqueues, receive then send,
-	 * and optionally control. */
-	nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
+static void free_unused_bufs(struct virtnet_info *vi)
+{
+	void *buf;
+	int i;
 
-	err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks, names);
-	if (err)
-		return err;
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct virtqueue *vq = vi->sq[i].vq;
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
+			dev_kfree_skb(buf);
+	}
 
-	vi->rq.vq = vqs[0];
-	vi->sq.vq = vqs[1];
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct virtqueue *vq = vi->rq[i].vq;
 
-	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
-		vi->cvq = vqs[2];
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
+			if (vi->mergeable_rx_bufs || vi->big_packets)
+				give_pages(&vi->rq[i], buf);
+			else
+				dev_kfree_skb(buf);
+			--vi->rq[i].num;
+		}
+		BUG_ON(vi->rq[i].num != 0);
+	}
+}
 
+static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
+{
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		int cpu = set ? i : -1;
+		virtqueue_set_affinity(vi->rq[i].vq, cpu);
+		virtqueue_set_affinity(vi->sq[i].vq, cpu);
+	}
+}
+
+static void virtnet_del_vqs(struct virtnet_info *vi)
+{
+	struct virtio_device *vdev = vi->vdev;
+
+	virtnet_set_affinity(vi, false);
+
+	vdev->config->del_vqs(vdev);
+
+	virtnet_free_queues(vi);
+}
+
+static int virtnet_find_vqs(struct virtnet_info *vi)
+{
+	vq_callback_t **callbacks;
+	struct virtqueue **vqs;
+	int ret = -ENOMEM;
+	int i, total_vqs;
+	char **names;
+
+	/*
+	 * We expect 1 RX virtqueue followed by 1 TX virtqueue, followd by
+	 * possible control virtqueue, followed by RX/TX N-1 queue pairs used
+	 * in multiqueue mode.
+	 */
+	total_vqs = vi->max_queue_pairs * 2 +
+		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
+
+	/* Allocate space for find_vqs parameters */
+	vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
+	callbacks = kzalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
+	if (!vqs || !callbacks)
+		goto err_mem;
+	names = kzalloc(total_vqs * sizeof(*names), GFP_KERNEL);
+	if (!names)
+		goto err_mem;
+
+	/* Parameters for control virtqueue, if any */
+	if (vi->has_cvq) {
+		callbacks[2] = NULL;
+		names[2] = kasprintf(GFP_KERNEL, "control");
+	}
+
+	/* Allocate/initialize parameters for send/receive virtqueues */
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		callbacks[rxq2vq(i)] = skb_recv_done;
+		callbacks[txq2vq(i)] = skb_xmit_done;
+		names[rxq2vq(i)] = kasprintf(GFP_KERNEL, "input.%d", i);
+		names[txq2vq(i)] = kasprintf(GFP_KERNEL, "output.%d", i);
+	}
+
+	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
+					 (const char **)names);
+	if (ret)
+		goto err_names;
+
+	if (vi->has_cvq) {
+		vi->cvq = vqs[2];
 		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
 			vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
 	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		vi->rq[i].vq = vqs[rxq2vq(i)];
+		vi->sq[i].vq = vqs[txq2vq(i)];
+	}
+
+	kfree(callbacks);
+	kfree(vqs);
+
+	return 0;
+
+err_names:
+	for (i = 0; i < total_vqs * 2; i ++)
+		kfree(names[i]);
+	kfree(names);
+
+err_mem:
+	kfree(callbacks);
+	kfree(vqs);
+
+	return ret;
+}
+
+static int virtnet_alloc_queues(struct virtnet_info *vi)
+{
+	int i;
+
+	vi->sq = kzalloc(sizeof(vi->sq[0]) * vi->max_queue_pairs, GFP_KERNEL);
+	vi->rq = kzalloc(sizeof(vi->rq[0]) * vi->max_queue_pairs, GFP_KERNEL);
+	if (!vi->rq || !vi->sq)
+		goto err;
+
+	/* setup initial receive and send queue parameters */
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		vi->rq[i].pages = NULL;
+		INIT_DELAYED_WORK(&vi->rq[i].refill, refill_work);
+		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
+			       napi_weight);
+
+		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
+	}
+
+
 	return 0;
+
+err:
+	virtnet_free_queues(vi);
+	return -ENOMEM;
+}
+
+static int init_vqs(struct virtnet_info *vi)
+{
+	int ret;
+
+	/* Allocate send & receive queues */
+	ret = virtnet_alloc_queues(vi);
+	if (ret)
+		goto err;
+
+	ret = virtnet_find_vqs(vi);
+	if (ret)
+		goto err_free;
+
+	virtnet_set_affinity(vi, true);
+	return 0;
+
+err_free:
+	virtnet_free_queues(vi);
+err:
+	return ret;
 }
 
 static int virtnet_probe(struct virtio_device *vdev)
 {
-	int err;
+	int i, err;
 	struct net_device *dev;
 	struct virtnet_info *vi;
+	u16 curr_queue_pairs;
+
+	/* Find if host supports multiqueue virtio_net device */
+	err = virtio_config_val(vdev, VIRTIO_NET_F_RFS,
+				offsetof(struct virtio_net_config,
+				max_virtqueue_pairs), &curr_queue_pairs);
+
+	/* We need at least 2 queue's */
+	if (err)
+		curr_queue_pairs = 1;
 
 	/* Allocate ourselves a network device with room for our info */
-	dev = alloc_etherdev(sizeof(struct virtnet_info));
+	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), curr_queue_pairs);
 	if (!dev)
 		return -ENOMEM;
 
@@ -1126,22 +1374,17 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
-	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->rq.pages = NULL;
 	vi->stats = alloc_percpu(struct virtnet_stats);
 	err = -ENOMEM;
 	if (vi->stats == NULL)
 		goto free;
 
-	INIT_DELAYED_WORK(&vi->rq.refill, refill_work);
 	mutex_init(&vi->config_lock);
 	vi->config_enable = true;
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
-	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
-	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1152,10 +1395,21 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
+		vi->has_cvq = true;
+
+	/* Use single tx/rx queue pair as default */
+	vi->curr_queue_pairs = 1;
+	vi->max_queue_pairs = curr_queue_pairs;
+
+	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
 		goto free_stats;
 
+	netif_set_real_num_tx_queues(dev, 1);
+	netif_set_real_num_rx_queues(dev, 1);
+
 	err = register_netdev(dev);
 	if (err) {
 		pr_debug("virtio_net: registering device failed\n");
@@ -1163,12 +1417,15 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	/* Last of all, set up some receive buffers. */
-	try_fill_recv(&vi->rq, GFP_KERNEL);
-
-	/* If we didn't even get one input buffer, we're useless. */
-	if (vi->rq.num == 0) {
-		err = -ENOMEM;
-		goto unregister;
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		try_fill_recv(&vi->rq[i], GFP_KERNEL);
+
+		/* If we didn't even get one input buffer, we're useless. */
+		if (vi->rq[i].num == 0) {
+			free_unused_bufs(vi);
+			err = -ENOMEM;
+			goto free_recv_bufs;
+		}
 	}
 
 	/* Assume link up if device can't report link status,
@@ -1181,13 +1438,20 @@ static int virtnet_probe(struct virtio_device *vdev)
 		netif_carrier_on(dev);
 	}
 
-	pr_debug("virtnet: registered device %s\n", dev->name);
+	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
+		 dev->name, curr_queue_pairs);
+
 	return 0;
 
-unregister:
+free_recv_bufs:
+	free_receive_bufs(vi);
 	unregister_netdev(dev);
+
 free_vqs:
-	vdev->config->del_vqs(vdev);
+	for (i = 0; i <curr_queue_pairs; i++)
+		cancel_delayed_work_sync(&vi->rq[i].refill);
+	virtnet_del_vqs(vi);
+
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1195,28 +1459,6 @@ free:
 	return err;
 }
 
-static void free_unused_bufs(struct virtnet_info *vi)
-{
-	void *buf;
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->sq.vq);
-		if (!buf)
-			break;
-		dev_kfree_skb(buf);
-	}
-	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->rq.vq);
-		if (!buf)
-			break;
-		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(&vi->rq, buf);
-		else
-			dev_kfree_skb(buf);
-		--vi->rq.num;
-	}
-	BUG_ON(vi->rq.num != 0);
-}
-
 static void remove_vq_common(struct virtnet_info *vi)
 {
 	vi->vdev->config->reset(vi->vdev);
@@ -1224,10 +1466,9 @@ static void remove_vq_common(struct virtnet_info *vi)
 	/* Free unused buffers in both send and recv, if any. */
 	free_unused_bufs(vi);
 
-	vi->vdev->config->del_vqs(vi->vdev);
+	free_receive_bufs(vi);
 
-	while (vi->rq.pages)
-		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
+	virtnet_del_vqs(vi);
 }
 
 static void __devexit virtnet_remove(struct virtio_device *vdev)
@@ -1253,6 +1494,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
 static int virtnet_freeze(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
+	int i;
 
 	/* Prevent config work handler from accessing the device */
 	mutex_lock(&vi->config_lock);
@@ -1260,10 +1502,14 @@ static int virtnet_freeze(struct virtio_device *vdev)
 	mutex_unlock(&vi->config_lock);
 
 	netif_device_detach(vi->dev);
-	cancel_delayed_work_sync(&vi->rq.refill);
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		cancel_delayed_work_sync(&vi->rq[i].refill);
 
 	if (netif_running(vi->dev))
-		napi_disable(&vi->rq.napi);
+		for (i = 0; i < vi->max_queue_pairs; i++) {
+			napi_disable(&vi->rq[i].napi);
+			netif_napi_del(&vi->rq[i].napi);
+		}
 
 	remove_vq_common(vi);
 
@@ -1275,24 +1521,28 @@ static int virtnet_freeze(struct virtio_device *vdev)
 static int virtnet_restore(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
-	int err;
+	int err, i;
 
 	err = init_vqs(vi);
 	if (err)
 		return err;
 
 	if (netif_running(vi->dev))
-		virtnet_napi_enable(&vi->rq);
+		for (i = 0; i < vi->max_queue_pairs; i++)
+			virtnet_napi_enable(&vi->rq[i]);
 
 	netif_device_attach(vi->dev);
 
-	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
-		schedule_delayed_work(&vi->rq.refill, 0);
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+			schedule_delayed_work(&vi->rq[i].refill, 0);
 
 	mutex_lock(&vi->config_lock);
 	vi->config_enable = true;
 	mutex_unlock(&vi->config_lock);
 
+	BUG_ON(virtnet_set_queues(vi));
+
 	return 0;
 }
 #endif
@@ -1310,7 +1560,7 @@ static unsigned int features[] = {
 	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
 	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
 	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
-	VIRTIO_NET_F_GUEST_ANNOUNCE,
+	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_RFS,
 };
 
 static struct virtio_driver virtio_net_driver = {
@@ -1328,6 +1578,12 @@ static struct virtio_driver virtio_net_driver = {
 #endif
 };
 
+static const struct ethtool_ops virtnet_ethtool_ops = {
+	.get_drvinfo = virtnet_get_drvinfo,
+	.get_link = ethtool_op_get_link,
+	.get_ringparam = virtnet_get_ringparam,
+};
+
 static int __init init(void)
 {
 	return register_virtio_driver(&virtio_net_driver);
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 2470f54..6056cec 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -51,6 +51,7 @@
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20	/* Extra RX mode control support */
 #define VIRTIO_NET_F_GUEST_ANNOUNCE 21	/* Guest can announce device on the
 					 * network */
+#define VIRTIO_NET_F_RFS	22	/* Device supports multiple TXQ/RXQ */
 
 #define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
@@ -60,6 +61,8 @@ struct virtio_net_config {
 	__u8 mac[6];
 	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
 	__u16 status;
+	/* Total number of RX/TX queues */
+	__u16 max_virtqueue_pairs;
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
@@ -166,4 +169,17 @@ struct virtio_net_ctrl_mac {
 #define VIRTIO_NET_CTRL_ANNOUNCE       3
  #define VIRTIO_NET_CTRL_ANNOUNCE_ACK         0
 
+/*
+ * Control multiqueue
+ *
+ */
+struct virtio_net_ctrl_rfs {
+	u16 virtqueue_pairs;
+};
+
+#define VIRTIO_NET_CTRL_RFS   4
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET        0
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN        1
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX        0x8000
+
 #endif /* _LINUX_VIRTIO_NET_H */
-- 
1.7.1

^ permalink raw reply related

* [net-next rfc v7 1/3] virtio-net: separate fields of sending/receiving queue from virtnet_info
From: Jason Wang @ 2012-11-27 10:15 UTC (permalink / raw)
  To: rusty, mst, krkumar2, virtualization, netdev, linux-kernel
  Cc: bhutchings, jwhan, shiyer, kvm
In-Reply-To: <1354011360-39479-1-git-send-email-jasowang@redhat.com>

To support multiqueue transmitq/receiveq, the first step is to separate queue
related structure from virtnet_info. This patch introduce send_queue and
receive_queue structure and use the pointer to them as the parameter in
functions handling sending/receiving.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c |  289 +++++++++++++++++++++++++---------------------
 1 files changed, 158 insertions(+), 131 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 26c502e..7975133 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -51,16 +51,44 @@ struct virtnet_stats {
 	u64 rx_packets;
 };
 
+/* Internal representation of a send virtqueue */
+struct send_queue {
+	/* Virtqueue associated with this send _queue */
+	struct virtqueue *vq;
+
+	/* TX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+};
+
+/* Internal representation of a receive virtqueue */
+struct receive_queue {
+	/* Virtqueue associated with this receive_queue */
+	struct virtqueue *vq;
+
+        struct napi_struct napi;
+
+        /* Number of input buffers, and max we've ever had. */
+        unsigned int num, max;
+
+	/* Work struct for refilling if we run low on memory. */
+	struct delayed_work refill;
+
+	/* Chain pages by the private ptr. */
+	struct page *pages;
+
+	/* RX: fragments + linear part + virtio header */
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+};
+
 struct virtnet_info {
 	struct virtio_device *vdev;
-	struct virtqueue *rvq, *svq, *cvq;
+	struct virtqueue *cvq;
 	struct net_device *dev;
 	struct napi_struct napi;
+	struct send_queue sq;
+	struct receive_queue rq;
 	unsigned int status;
 
-	/* Number of input buffers, and max we've ever had. */
-	unsigned int num, max;
-
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
@@ -73,21 +101,11 @@ struct virtnet_info {
 	/* Active statistics */
 	struct virtnet_stats __percpu *stats;
 
-	/* Work struct for refilling if we run low on memory. */
-	struct delayed_work refill;
-
 	/* Work struct for config space updates */
 	struct work_struct config_work;
 
 	/* Lock for config space updates */
 	struct mutex config_lock;
-
-	/* Chain pages by the private ptr. */
-	struct page *pages;
-
-	/* fragments + linear part + virtio header */
-	struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
-	struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
 };
 
 struct skb_vnet_hdr {
@@ -117,22 +135,22 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
  * private is used to chain pages for big packets, put the whole
  * most recent used list in the beginning for reuse
  */
-static void give_pages(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct receive_queue *rq, struct page *page)
 {
 	struct page *end;
 
-	/* Find end of list, sew whole thing into vi->pages. */
+	/* Find end of list, sew whole thing into vi->rq.pages. */
 	for (end = page; end->private; end = (struct page *)end->private);
-	end->private = (unsigned long)vi->pages;
-	vi->pages = page;
+	end->private = (unsigned long)rq->pages;
+	rq->pages = page;
 }
 
-static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 {
-	struct page *p = vi->pages;
+	struct page *p = rq->pages;
 
 	if (p) {
-		vi->pages = (struct page *)p->private;
+		rq->pages = (struct page *)p->private;
 		/* clear private here, it is used to chain pages */
 		p->private = 0;
 	} else
@@ -140,12 +158,12 @@ static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
 	return p;
 }
 
-static void skb_xmit_done(struct virtqueue *svq)
+static void skb_xmit_done(struct virtqueue *vq)
 {
-	struct virtnet_info *vi = svq->vdev->priv;
+	struct virtnet_info *vi = vq->vdev->priv;
 
 	/* Suppress further interrupts. */
-	virtqueue_disable_cb(svq);
+	virtqueue_disable_cb(vq);
 
 	/* We were probably waiting for more output buffers. */
 	netif_wake_queue(vi->dev);
@@ -167,9 +185,10 @@ static void set_skb_frag(struct sk_buff *skb, struct page *page,
 }
 
 /* Called from bottom half context */
-static struct sk_buff *page_to_skb(struct virtnet_info *vi,
+static struct sk_buff *page_to_skb(struct receive_queue *rq,
 				   struct page *page, unsigned int len)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	unsigned int copy, hdr_len, offset;
@@ -224,12 +243,12 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	}
 
 	if (page)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return skb;
 }
 
-static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	struct page *page;
@@ -243,7 +262,7 @@ static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
 			skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		page = virtqueue_get_buf(vi->rvq, &len);
+		page = virtqueue_get_buf(rq->vq, &len);
 		if (!page) {
 			pr_debug("%s: rx error: %d buffers missing\n",
 				 skb->dev->name, hdr->mhdr.num_buffers);
@@ -256,14 +275,15 @@ static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
 
 		set_skb_frag(skb, page, 0, &len);
 
-		--vi->num;
+		--rq->num;
 	}
 	return 0;
 }
 
-static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
+static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 {
-	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtnet_info *vi = rq->vq->vdev->priv;
+	struct net_device *dev = vi->dev;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	struct sk_buff *skb;
 	struct page *page;
@@ -273,7 +293,7 @@ static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
 		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
+			give_pages(rq, buf);
 		else
 			dev_kfree_skb(buf);
 		return;
@@ -285,14 +305,14 @@ static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
 		skb_trim(skb, len);
 	} else {
 		page = buf;
-		skb = page_to_skb(vi, page, len);
+		skb = page_to_skb(rq, page, len);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
-			give_pages(vi, page);
+			give_pages(rq, page);
 			return;
 		}
 		if (vi->mergeable_rx_bufs)
-			if (receive_mergeable(vi, skb)) {
+			if (receive_mergeable(rq, skb)) {
 				dev_kfree_skb(skb);
 				return;
 			}
@@ -359,8 +379,9 @@ frame_err:
 	dev_kfree_skb(skb);
 }
 
-static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	struct sk_buff *skb;
 	struct skb_vnet_hdr *hdr;
 	int err;
@@ -372,77 +393,77 @@ static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
 	skb_put(skb, MAX_PACKET_LEN);
 
 	hdr = skb_vnet_hdr(skb);
-	sg_set_buf(vi->rx_sg, &hdr->hdr, sizeof hdr->hdr);
+	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
 
-	skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len);
+	skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
 
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 2, skb, gfp);
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
 	if (err < 0)
 		dev_kfree_skb(skb);
 
 	return err;
 }
 
-static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *first, *list = NULL;
 	char *p;
 	int i, err, offset;
 
-	/* page in vi->rx_sg[MAX_SKB_FRAGS + 1] is list tail */
+	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
 	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
-		first = get_a_page(vi, gfp);
+		first = get_a_page(rq, gfp);
 		if (!first) {
 			if (list)
-				give_pages(vi, list);
+				give_pages(rq, list);
 			return -ENOMEM;
 		}
-		sg_set_buf(&vi->rx_sg[i], page_address(first), PAGE_SIZE);
+		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
 
 		/* chain new page in list head to match sg */
 		first->private = (unsigned long)list;
 		list = first;
 	}
 
-	first = get_a_page(vi, gfp);
+	first = get_a_page(rq, gfp);
 	if (!first) {
-		give_pages(vi, list);
+		give_pages(rq, list);
 		return -ENOMEM;
 	}
 	p = page_address(first);
 
-	/* vi->rx_sg[0], vi->rx_sg[1] share the same page */
-	/* a separated vi->rx_sg[0] for virtio_net_hdr only due to QEMU bug */
-	sg_set_buf(&vi->rx_sg[0], p, sizeof(struct virtio_net_hdr));
+	/* rq->sg[0], rq->sg[1] share the same page */
+	/* a separated rq->sg[0] for virtio_net_hdr only due to QEMU bug */
+	sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));
 
-	/* vi->rx_sg[1] for data packet, from offset */
+	/* rq->sg[1] for data packet, from offset */
 	offset = sizeof(struct padded_vnet_hdr);
-	sg_set_buf(&vi->rx_sg[1], p + offset, PAGE_SIZE - offset);
+	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
 
 	/* chain first in list head */
 	first->private = (unsigned long)list;
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
 				first, gfp);
 	if (err < 0)
-		give_pages(vi, first);
+		give_pages(rq, first);
 
 	return err;
 }
 
-static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct page *page;
 	int err;
 
-	page = get_a_page(vi, gfp);
+	page = get_a_page(rq, gfp);
 	if (!page)
 		return -ENOMEM;
 
-	sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE);
+	sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
 
-	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 1, page, gfp);
+	err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
 	if (err < 0)
-		give_pages(vi, page);
+		give_pages(rq, page);
 
 	return err;
 }
@@ -454,97 +475,101 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
  * before we're receiving packets, or from refill_work which is
  * careful to disable receiving (using napi_disable).
  */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
 {
+	struct virtnet_info *vi = rq->vq->vdev->priv;
 	int err;
 	bool oom;
 
 	do {
 		if (vi->mergeable_rx_bufs)
-			err = add_recvbuf_mergeable(vi, gfp);
+			err = add_recvbuf_mergeable(rq, gfp);
 		else if (vi->big_packets)
-			err = add_recvbuf_big(vi, gfp);
+			err = add_recvbuf_big(rq, gfp);
 		else
-			err = add_recvbuf_small(vi, gfp);
+			err = add_recvbuf_small(rq, gfp);
 
 		oom = err == -ENOMEM;
 		if (err < 0)
 			break;
-		++vi->num;
+		++rq->num;
 	} while (err > 0);
-	if (unlikely(vi->num > vi->max))
-		vi->max = vi->num;
-	virtqueue_kick(vi->rvq);
+	if (unlikely(rq->num > rq->max))
+		rq->max = rq->num;
+	virtqueue_kick(rq->vq);
 	return !oom;
 }
 
 static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
+	struct receive_queue *rq = &vi->rq;
+
 	/* Schedule NAPI, Suppress further interrupts if successful. */
-	if (napi_schedule_prep(&vi->napi)) {
+	if (napi_schedule_prep(&rq->napi)) {
 		virtqueue_disable_cb(rvq);
-		__napi_schedule(&vi->napi);
+		__napi_schedule(&rq->napi);
 	}
 }
 
-static void virtnet_napi_enable(struct virtnet_info *vi)
+static void virtnet_napi_enable(struct receive_queue *rq)
 {
-	napi_enable(&vi->napi);
+	napi_enable(&rq->napi);
 
 	/* If all buffers were filled by other side before we napi_enabled, we
 	 * won't get another interrupt, so process any outstanding packets
 	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
 	 * We synchronize against interrupts via NAPI_STATE_SCHED */
-	if (napi_schedule_prep(&vi->napi)) {
-		virtqueue_disable_cb(vi->rvq);
+	if (napi_schedule_prep(&rq->napi)) {
+		virtqueue_disable_cb(rq->vq);
 		local_bh_disable();
-		__napi_schedule(&vi->napi);
+		__napi_schedule(&rq->napi);
 		local_bh_enable();
 	}
 }
 
 static void refill_work(struct work_struct *work)
 {
-	struct virtnet_info *vi;
+	struct receive_queue *rq =
+		container_of(work, struct receive_queue, refill.work);
 	bool still_empty;
 
-	vi = container_of(work, struct virtnet_info, refill.work);
-	napi_disable(&vi->napi);
-	still_empty = !try_fill_recv(vi, GFP_KERNEL);
-	virtnet_napi_enable(vi);
+	napi_disable(&rq->napi);
+	still_empty = !try_fill_recv(rq, GFP_KERNEL);
+	virtnet_napi_enable(rq);
 
 	/* In theory, this can happen: if we don't get any buffers in
 	 * we will *never* try to fill again. */
 	if (still_empty)
-		schedule_delayed_work(&vi->refill, HZ/2);
+		schedule_delayed_work(&rq->refill, HZ/2);
 }
 
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
-	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
+	struct receive_queue *rq =
+		container_of(napi, struct receive_queue, napi);
 	void *buf;
 	unsigned int len, received = 0;
 
 again:
 	while (received < budget &&
-	       (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) {
-		receive_buf(vi->dev, buf, len);
-		--vi->num;
+	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
+		receive_buf(rq, buf, len);
+		--rq->num;
 		received++;
 	}
 
-	if (vi->num < vi->max / 2) {
-		if (!try_fill_recv(vi, GFP_ATOMIC))
-			schedule_delayed_work(&vi->refill, 0);
+	if (rq->num < rq->max / 2) {
+		if (!try_fill_recv(rq, GFP_ATOMIC))
+			schedule_delayed_work(&rq->refill, 0);
 	}
 
 	/* Out of packets? */
 	if (received < budget) {
 		napi_complete(napi);
-		if (unlikely(!virtqueue_enable_cb(vi->rvq)) &&
+		if (unlikely(!virtqueue_enable_cb(rq->vq)) &&
 		    napi_schedule_prep(napi)) {
-			virtqueue_disable_cb(vi->rvq);
+			virtqueue_disable_cb(rq->vq);
 			__napi_schedule(napi);
 			goto again;
 		}
@@ -553,13 +578,14 @@ again:
 	return received;
 }
 
-static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
+static unsigned int free_old_xmit_skbs(struct send_queue *sq)
 {
 	struct sk_buff *skb;
 	unsigned int len, tot_sgs = 0;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 
-	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
+	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 		pr_debug("Sent skb %p\n", skb);
 
 		u64_stats_update_begin(&stats->tx_syncp);
@@ -573,10 +599,11 @@ static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
 	return tot_sgs;
 }
 
-static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
+static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
+	struct virtnet_info *vi = sq->vq->vdev->priv;
 
 	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
 
@@ -611,25 +638,26 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 
 	/* Encode metadata header at front. */
 	if (vi->mergeable_rx_bufs)
-		sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
+		sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr);
 	else
-		sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
+		sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
 
-	hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
-	return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
+	hdr->num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
+	return virtqueue_add_buf(sq->vq, sq->sg, hdr->num_sg,
 				 0, skb, GFP_ATOMIC);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	struct send_queue *sq = &vi->sq;
 	int capacity;
 
 	/* Free up any pending old buffers before queueing new ones. */
-	free_old_xmit_skbs(vi);
+	free_old_xmit_skbs(sq);
 
 	/* Try to transmit */
-	capacity = xmit_skb(vi, skb);
+	capacity = xmit_skb(sq, skb);
 
 	/* This can happen with OOM and indirect buffers. */
 	if (unlikely(capacity < 0)) {
@@ -648,7 +676,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		kfree_skb(skb);
 		return NETDEV_TX_OK;
 	}
-	virtqueue_kick(vi->svq);
+	virtqueue_kick(sq->vq);
 
 	/* Don't wait up for transmitted skbs to be freed. */
 	skb_orphan(skb);
@@ -658,12 +686,12 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * before it gets out of hand.  Naturally, this wastes entries. */
 	if (capacity < 2+MAX_SKB_FRAGS) {
 		netif_stop_queue(dev);
-		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
+		if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
-			capacity += free_old_xmit_skbs(vi);
+			capacity += free_old_xmit_skbs(sq);
 			if (capacity >= 2+MAX_SKB_FRAGS) {
 				netif_start_queue(dev);
-				virtqueue_disable_cb(vi->svq);
+				virtqueue_disable_cb(sq->vq);
 			}
 		}
 	}
@@ -731,7 +759,7 @@ static void virtnet_netpoll(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	napi_schedule(&vi->napi);
+	napi_schedule(&vi->rq.napi);
 }
 #endif
 
@@ -740,10 +768,10 @@ static int virtnet_open(struct net_device *dev)
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	/* Make sure we have some buffers: if oom use wq. */
-	if (!try_fill_recv(vi, GFP_KERNEL))
-		schedule_delayed_work(&vi->refill, 0);
+	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
+		schedule_delayed_work(&vi->rq.refill, 0);
 
-	virtnet_napi_enable(vi);
+	virtnet_napi_enable(&vi->rq);
 	return 0;
 }
 
@@ -807,8 +835,8 @@ static int virtnet_close(struct net_device *dev)
 	struct virtnet_info *vi = netdev_priv(dev);
 
 	/* Make sure refill_work doesn't re-enable napi! */
-	cancel_delayed_work_sync(&vi->refill);
-	napi_disable(&vi->napi);
+	cancel_delayed_work_sync(&vi->rq.refill);
+	napi_disable(&vi->rq.napi);
 
 	return 0;
 }
@@ -920,11 +948,10 @@ static void virtnet_get_ringparam(struct net_device *dev,
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rvq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->svq);
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq.vq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq.vq);
 	ring->rx_pending = ring->rx_max_pending;
 	ring->tx_pending = ring->tx_max_pending;
-
 }
 
 
@@ -1034,8 +1061,8 @@ static int init_vqs(struct virtnet_info *vi)
 	if (err)
 		return err;
 
-	vi->rvq = vqs[0];
-	vi->svq = vqs[1];
+	vi->rq.vq = vqs[0];
+	vi->sq.vq = vqs[1];
 
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
 		vi->cvq = vqs[2];
@@ -1099,22 +1126,22 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	/* Set up our device-specific information */
 	vi = netdev_priv(dev);
-	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
+	netif_napi_add(dev, &vi->rq.napi, virtnet_poll, napi_weight);
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->pages = NULL;
+	vi->rq.pages = NULL;
 	vi->stats = alloc_percpu(struct virtnet_stats);
 	err = -ENOMEM;
 	if (vi->stats == NULL)
 		goto free;
 
-	INIT_DELAYED_WORK(&vi->refill, refill_work);
+	INIT_DELAYED_WORK(&vi->rq.refill, refill_work);
 	mutex_init(&vi->config_lock);
 	vi->config_enable = true;
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
-	sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg));
-	sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg));
+	sg_init_table(vi->rq.sg, ARRAY_SIZE(vi->rq.sg));
+	sg_init_table(vi->sq.sg, ARRAY_SIZE(vi->sq.sg));
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -1136,10 +1163,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	/* Last of all, set up some receive buffers. */
-	try_fill_recv(vi, GFP_KERNEL);
+	try_fill_recv(&vi->rq, GFP_KERNEL);
 
 	/* If we didn't even get one input buffer, we're useless. */
-	if (vi->num == 0) {
+	if (vi->rq.num == 0) {
 		err = -ENOMEM;
 		goto unregister;
 	}
@@ -1172,22 +1199,22 @@ static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
 	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->svq);
+		buf = virtqueue_detach_unused_buf(vi->sq.vq);
 		if (!buf)
 			break;
 		dev_kfree_skb(buf);
 	}
 	while (1) {
-		buf = virtqueue_detach_unused_buf(vi->rvq);
+		buf = virtqueue_detach_unused_buf(vi->rq.vq);
 		if (!buf)
 			break;
 		if (vi->mergeable_rx_bufs || vi->big_packets)
-			give_pages(vi, buf);
+			give_pages(&vi->rq, buf);
 		else
 			dev_kfree_skb(buf);
-		--vi->num;
+		--vi->rq.num;
 	}
-	BUG_ON(vi->num != 0);
+	BUG_ON(vi->rq.num != 0);
 }
 
 static void remove_vq_common(struct virtnet_info *vi)
@@ -1199,8 +1226,8 @@ static void remove_vq_common(struct virtnet_info *vi)
 
 	vi->vdev->config->del_vqs(vi->vdev);
 
-	while (vi->pages)
-		__free_pages(get_a_page(vi, GFP_KERNEL), 0);
+	while (vi->rq.pages)
+		__free_pages(get_a_page(&vi->rq, GFP_KERNEL), 0);
 }
 
 static void __devexit virtnet_remove(struct virtio_device *vdev)
@@ -1233,10 +1260,10 @@ static int virtnet_freeze(struct virtio_device *vdev)
 	mutex_unlock(&vi->config_lock);
 
 	netif_device_detach(vi->dev);
-	cancel_delayed_work_sync(&vi->refill);
+	cancel_delayed_work_sync(&vi->rq.refill);
 
 	if (netif_running(vi->dev))
-		napi_disable(&vi->napi);
+		napi_disable(&vi->rq.napi);
 
 	remove_vq_common(vi);
 
@@ -1255,12 +1282,12 @@ static int virtnet_restore(struct virtio_device *vdev)
 		return err;
 
 	if (netif_running(vi->dev))
-		virtnet_napi_enable(vi);
+		virtnet_napi_enable(&vi->rq);
 
 	netif_device_attach(vi->dev);
 
-	if (!try_fill_recv(vi, GFP_KERNEL))
-		schedule_delayed_work(&vi->refill, 0);
+	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
+		schedule_delayed_work(&vi->rq.refill, 0);
 
 	mutex_lock(&vi->config_lock);
 	vi->config_enable = true;
-- 
1.7.1

^ permalink raw reply related

* [net-next rfc v7 0/3] Multiqueue virtio-net
From: Jason Wang @ 2012-11-27 10:15 UTC (permalink / raw)
  To: rusty, mst, krkumar2, virtualization, netdev, linux-kernel
  Cc: bhutchings, jwhan, shiyer, kvm

Hi all:

This series is an update version of multiqueue virtio-net driver based on
Krishna Kumar's work to let virtio-net use multiple rx/tx queues to do the
packets reception and transmission. Please review and comments.

A protype implementation of qemu-kvm support could by found in
git://github.com/jasowang/qemu-kvm-mq.git. To start a guest with two queues, you
could specify the queues parameters to both tap and virtio-net like:

./qemu-kvm -netdev tap,queues=2,... -device virtio-net-pci,queues=2,...

then enable the multiqueue through ethtool by:

ethtool -L eth0 combined 2

Changes from V6:
- Align the implementation with the RFC spec update v5
- Addressing Rusty's comments:
  * split the patches
  * rename to max_queue_pairs and curr_queue_pairs
  * remove the useless status
  * fix the hibernation bug
- Addressing Ben's comments:
  * check other parameters in ethtool_set_queues

Changes from v5:
- Align the implementation with the RFC spec update v4
- Switch the mode between single mode and multiqueue mode without reset
- Remove the 256 limitation of queues
- Use helpers to do the mapping between virtqueues and tx/rx queues
- Use commbined channels instead of separated rx/tx queus when do the queue
number configuartion
- Other coding style comments from Michael

Changes from V4:
- Add ability to negotiate the number of queues through control virtqueue
- Ethtool -{L|l} support and default the tx/rx queue number to 1
- Expose the API to set irq affinity instead of irq itself

Changes from V3:
- Rebase to the net-next
- Let queue 2 to be the control virtqueue to obey the spec
- Prodives irq affinity
- Choose txq based on processor id

Reference:
- Virtio spec RFC: http://patchwork.ozlabs.org/patch/201303/
- V6: https://lkml.org/lkml/2012/10/30/127
- V5: http://lwn.net/Articles/505388/
- V4: https://lkml.org/lkml/2012/6/25/120
- V2: http://lwn.net/Articles/467283/


Perf Numbers:

Will do some basic test and post as a reply to this mail.

Jason Wang (3):
  virtio-net: separate fields of sending/receiving queue from
    virtnet_info
  virtio_net: multiqueue support
  virtio-net: change the number of queues through ethtool

 drivers/net/virtio_net.c        |  716 ++++++++++++++++++++++++++++-----------
 include/uapi/linux/virtio_net.h |   16 +
 2 files changed, 536 insertions(+), 196 deletions(-)

^ permalink raw reply

* [RFC PATCH iproute2] Add mdb command to bridge
From: Cong Wang @ 2012-11-27  9:49 UTC (permalink / raw)
  To: netdev
  Cc: bridge, Cong Wang, Herbert Xu, Stephen Hemminger, David S. Miller,
	Thomas Graf, Jesper Dangaard Brouer
In-Reply-To: <1354009785-20014-1-git-send-email-amwang@redhat.com>

Warning: it is still a draft! :)

The output is ugly, I just want to prove it works.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 bridge/Makefile           |    2 +-
 bridge/br_common.h        |    1 +
 bridge/bridge.c           |    1 +
 bridge/mdb.c              |  200 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtnetlink.h |    3 +
 5 files changed, 206 insertions(+), 1 deletions(-)
 create mode 100644 bridge/mdb.c

diff --git a/bridge/Makefile b/bridge/Makefile
index 9a6743e..67aceb4 100644
--- a/bridge/Makefile
+++ b/bridge/Makefile
@@ -1,4 +1,4 @@
-BROBJ = bridge.o fdb.o monitor.o link.o
+BROBJ = bridge.o fdb.o monitor.o link.o mdb.o
 
 include ../Config
 
diff --git a/bridge/br_common.h b/bridge/br_common.h
index 718ecb9..67fd75c 100644
--- a/bridge/br_common.h
+++ b/bridge/br_common.h
@@ -5,6 +5,7 @@ extern int print_fdb(const struct sockaddr_nl *who,
 		     struct nlmsghdr *n, void *arg);
 
 extern int do_fdb(int argc, char **argv);
+extern int do_mdb(int argc, char **argv);
 extern int do_monitor(int argc, char **argv);
 
 extern int preferred_family;
diff --git a/bridge/bridge.c b/bridge/bridge.c
index e2c33b0..1fcd365 100644
--- a/bridge/bridge.c
+++ b/bridge/bridge.c
@@ -43,6 +43,7 @@ static const struct cmd {
 	int (*func)(int argc, char **argv);
 } cmds[] = {
 	{ "fdb", 	do_fdb },
+	{ "mdb", 	do_mdb },
 	{ "monitor",	do_monitor },
 	{ "help",	do_help },
 	{ 0 }
diff --git a/bridge/mdb.c b/bridge/mdb.c
new file mode 100644
index 0000000..7873128
--- /dev/null
+++ b/bridge/mdb.c
@@ -0,0 +1,200 @@
+/*
+ * Get mdb table with netlink
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <linux/if_bridge.h>
+#include <linux/if_ether.h>
+#include <linux/neighbour.h>
+#include <string.h>
+#include <arpa/inet.h>
+
+#include "libnetlink.h"
+#include "br_common.h"
+#include "rt_names.h"
+#include "utils.h"
+
+int filter_index;
+
+static void usage(void)
+{
+	fprintf(stderr, "       bridge mdb {show} [ dev DEV ]\n");
+	exit(-1);
+}
+
+/* Bridge multicast database attributes
+ * [MDBA_MDB] = {
+ *    [MDBA_MCADDR]
+ *    [MDBA_BRPORT] = {
+ *        [MDBA_BRPORT_NO]
+ *    }
+ * }
+ * [MDBA_ROUTER] = {
+ *    [MDBA_BRPORT] = {
+ *        [MDBA_BRPORT_NO]
+ *    }
+ * }
+ */
+enum {
+        MDBA_UNSPEC,
+        MDBA_MDB,
+        MDBA_ROUTER,
+        __MDBA_MAX,
+};
+#define MDBA_MAX (__MDBA_MAX - 1)
+
+enum {
+        MDBA_MDB_UNSPEC,
+        MDBA_MDB_MCADDR,
+        MDBA_MDB_BRPORT,
+        __MDBA_MDB_MAX,
+};
+#define MDBA_MDB_MAX (__MDBA_MDB_MAX - 1)
+
+enum {
+        MDBA_BRPORT_UNSPEC,
+        MDBA_BRPORT_NO,
+        __MDBA_BRPORT_MAX,
+};
+#define MDBA_BRPORT_MAX (__MDBA_BRPORT_MAX - 1)
+
+
+struct br_port_msg {
+        int ifindex;
+};
+
+#ifndef MDBA_RTA
+#define MDBA_RTA(r) \
+	((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct br_port_msg))))
+#endif
+
+static void br_print_ports(FILE *f, struct rtattr *attr)
+{
+	uint16_t *port_no;
+	struct rtattr *i;
+	int rem;
+
+	fprintf(f, "\n      { ");
+
+	rem = RTA_PAYLOAD(attr);
+	for (i = RTA_DATA(attr); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
+		port_no = RTA_DATA(i);
+		fprintf(f, "%u ", *port_no);
+	}
+	fprintf(f, "} ");
+}
+
+int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = arg;
+	struct br_port_msg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[MDBA_MAX+1];
+	struct rtattr * ports[MDBA_MDB_MAX+1];
+	SPRINT_BUF(abuf);
+
+	if (n->nlmsg_type != RTM_GETMDB) {
+		fprintf(stderr, "Not RTM_MDB: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+
+		return 0;
+	}
+
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	if (filter_index && filter_index != r->ifindex)
+		return 0;
+
+	if (!filter_index && r->ifindex)
+		fprintf(fp, "bridge dev %s ", ll_index_to_name(r->ifindex));
+
+	parse_rtattr(tb, MDBA_MAX, MDBA_RTA(r), n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
+
+	if (tb[MDBA_MDB]) {
+		parse_rtattr_nested(ports, MDBA_MDB_MAX, tb[MDBA_MDB]);
+
+		if (ports[MDBA_MDB_MCADDR]) {
+			uint32_t addr = rta_getattr_u32(ports[MDBA_MDB_MCADDR]);
+			fprintf(fp, "multicast addr: %s ", inet_ntop(AF_INET, &addr, abuf, sizeof(abuf)));
+		}
+
+		if (ports[MDBA_MDB_BRPORT])
+			br_print_ports(fp, ports[MDBA_MDB_BRPORT]);
+	}
+
+	if (tb[MDBA_ROUTER]) {
+		parse_rtattr_nested(ports, MDBA_MDB_MAX, tb[MDBA_ROUTER]);
+
+		if (ports[MDBA_MDB_BRPORT])
+			br_print_ports(fp, ports[MDBA_MDB_BRPORT]);
+	}
+
+	fprintf(fp, "\n");
+	return 0;
+}
+
+static int mdb_show(int argc, char **argv)
+{
+	char *filter_dev = NULL;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (filter_dev)
+				duparg("dev", *argv);
+			filter_dev = *argv;
+		}
+		argc--; argv++;
+	}
+
+	if (filter_dev) {
+		filter_index = if_nametoindex(filter_dev);
+		if (filter_index == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n",
+				filter_dev);
+			return -1;
+		}
+	}
+
+	if (rtnl_wilddump_request(&rth, PF_BRIDGE, RTM_GETMDB) < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	if (rtnl_dump_filter(&rth, print_mdb, stdout) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	return 0;
+}
+
+int do_mdb(int argc, char **argv)
+{
+	ll_init_map(&rth);
+
+	if (argc > 0) {
+		if (matches(*argv, "show") == 0 ||
+		    matches(*argv, "lst") == 0 ||
+		    matches(*argv, "list") == 0)
+			return mdb_show(argc-1, argv+1);
+		if (matches(*argv, "help") == 0)
+			usage();
+	} else
+		return mdb_show(0, NULL);
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"bridge mdb help\".\n", *argv);
+	exit(-1);
+}
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0e3e0c1..f400b5e 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -120,6 +120,9 @@ enum {
 	RTM_SETDCB,
 #define RTM_SETDCB RTM_SETDCB
 
+	RTM_GETMDB = 86,
+#define RTM_GETMDB RTM_GETMDB
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
-- 
1.7.7.6

^ permalink raw reply related

* [RFC PATCH 2/2] bridge: export multicast database via netlink
From: Cong Wang @ 2012-11-27  9:49 UTC (permalink / raw)
  To: netdev
  Cc: bridge, Cong Wang, Herbert Xu, Stephen Hemminger, David S. Miller,
	Thomas Graf, Jesper Dangaard Brouer
In-Reply-To: <1354009785-20014-1-git-send-email-amwang@redhat.com>

Based on net-next.

Warning: this patch is still a draft! :)

This patch exports bridge multicast database via netlink
message type RTM_GETMDB. Similar to fdb, but currently bridge-specific.
We may need to support modify multicast database too (RTM_{ADD,DEL}MDB).

So, the questions are:

1) Is this design okay?
2) Do we need to make it generic like fdb?
3) Do we need to support RTM_{ADD,DEL}MDB?

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 include/uapi/linux/if_bridge.h |   37 +++++++++
 include/uapi/linux/rtnetlink.h |    3 +
 net/bridge/Makefile            |    2 +-
 net/bridge/br_mdb.c            |  174 ++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_multicast.c      |    1 +
 net/bridge/br_private.h        |    1 +
 6 files changed, 217 insertions(+), 1 deletions(-)
 create mode 100644 net/bridge/br_mdb.c

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index b388579..f5655ea 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -116,4 +116,41 @@ enum {
 	__IFLA_BRIDGE_MAX,
 };
 #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1)
+
+/* Bridge multicast database attributes
+ * [MDBA_MDB] = {
+ *    [MDBA_MCADDR]
+ *    [MDBA_BRPORT] = {
+ *        [MDBA_BRPORT_NO]
+ *    }
+ * }
+ * [MDBA_ROUTER] = {
+ *    [MDBA_BRPORT] = {
+ *        [MDBA_BRPORT_NO]
+ *    }
+ * }
+ */
+enum {
+	MDBA_UNSPEC,
+	MDBA_MDB,
+	MDBA_ROUTER,
+	__MDBA_MAX,
+};
+#define MDBA_MAX (__MDBA_MAX - 1)
+
+enum {
+	MDBA_MDB_UNSPEC,
+	MDBA_MDB_MCADDR,
+	MDBA_MDB_BRPORT,
+	__MDBA_MDB_MAX,
+};
+#define MDBA_MDB_MAX (__MDBA_MDB_MAX - 1)
+
+enum {
+	MDBA_BRPORT_UNSPEC,
+	MDBA_BRPORT_NO,
+	__MDBA_BRPORT_MAX,
+};
+#define MDBA_BRPORT_MAX (__MDBA_BRPORT_MAX - 1)
+
 #endif /* _UAPI_LINUX_IF_BRIDGE_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 3dee071..0df623f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -125,6 +125,9 @@ enum {
 	RTM_GETNETCONF = 82,
 #define RTM_GETNETCONF RTM_GETNETCONF
 
+	RTM_GETMDB = 86,
+#define RTM_GETMDB RTM_GETMDB
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index d0359ea..e859098 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -12,6 +12,6 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
 bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
 
-bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o
+bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
 
 obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
new file mode 100644
index 0000000..dc73091
--- /dev/null
+++ b/net/bridge/br_mdb.c
@@ -0,0 +1,174 @@
+#include <linux/err.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+#endif
+
+#include "br_private.h"
+
+struct br_port_msg {
+	int ifindex;
+};
+
+static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+			       u32 seq, struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *p;
+	struct hlist_node *n;
+	struct nlattr *nest, *nest2;
+
+	if (!br->multicast_router || hlist_empty(&br->router_list)) {
+		printk(KERN_INFO "no router on bridge\n");
+		return 0;
+	}
+
+	nest = nla_nest_start(skb, MDBA_ROUTER);
+	if (nest == NULL)
+		return -EMSGSIZE;
+	nest2 = nla_nest_start(skb, MDBA_MDB_BRPORT);
+	if (nest2 == NULL)
+		goto fail;
+
+	hlist_for_each_entry_rcu(p, n, &br->router_list, rlist) {
+		if (p && nla_put_u16(skb, MDBA_BRPORT_NO, p->port_no)) {
+			nla_nest_cancel(skb, nest2);
+			goto fail;
+		}
+	}
+
+	nla_nest_end(skb, nest2);
+	nla_nest_end(skb, nest);
+	return 0;
+fail:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+			    u32 seq, struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_mdb_htable *mdb;
+	struct nlattr *nest, *nest2;
+	unsigned int i;
+
+	if (br->multicast_disabled) {
+		printk(KERN_INFO "multicast is disabled on bridge\n");
+		return 0;
+	}
+
+	mdb = rcu_dereference(br->mdb);
+	if (!mdb) {
+		printk(KERN_INFO "no mdb on bridge\n");
+		return 0;
+	}
+
+	nest = nla_nest_start(skb, MDBA_MDB);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	for (i = 0; i < mdb->max; i++) {
+		struct hlist_node *h;
+		struct net_bridge_mdb_entry *mp;
+		struct net_bridge_port_group *p, **pp;
+		struct net_bridge_port *port;
+
+		hlist_for_each_entry_rcu(mp, h, &mdb->mhash[i], hlist[mdb->ver]) {
+			if (nla_put_be32(skb, MDBA_MDB_MCADDR, mp->addr.u.ip4))
+				goto fail;
+
+			nest2 = nla_nest_start(skb, MDBA_MDB_BRPORT);
+			if (nest2 == NULL)
+				goto fail;
+
+			for (pp = &mp->ports;
+			     (p = rcu_dereference(*pp)) != NULL;
+			      pp = &p->next) {
+				port = p->port;
+				if (port) {
+					printk(KERN_INFO "port %u, mcaddr: %pI4\n", port->port_no, &mp->addr.u.ip4);
+					if (nla_put_u16(skb, MDBA_BRPORT_NO, port->port_no)) {
+						goto fail;
+					}
+				}
+			}
+
+			nla_nest_end(skb, nest2);
+		}
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+fail:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net_device *dev;
+	struct net *net = sock_net(skb->sk);
+	struct nlmsghdr *nlh;
+	u32 seq = cb->nlh->nlmsg_seq;
+	int idx = 0, s_idx;
+
+	s_idx = cb->args[0];
+
+	rcu_read_lock();
+	cb->seq = net->dev_base_seq;
+
+	for_each_netdev_rcu(net, dev) {
+		if (dev->priv_flags & IFF_EBRIDGE) {
+			struct br_port_msg *bpm;
+
+			if (idx < s_idx)
+				goto cont;
+
+			nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+					seq, RTM_GETMDB,
+					sizeof(*bpm), NLM_F_MULTI);
+			if (nlh == NULL)
+				break;
+
+			bpm = nlmsg_data(nlh);
+			bpm->ifindex = dev->ifindex;
+			if (br_mdb_fill_info(skb, cb, seq, dev) < 0) {
+				printk(KERN_INFO "br_mdb_fill_info failed\n");
+				goto fail;
+			}
+			if (br_rports_fill_info(skb, cb, seq, dev) < 0) {
+				printk(KERN_INFO "br_rports_fill_info failed\n");
+				goto fail;
+			}
+
+			nlmsg_end(skb, nlh);
+cont:
+			idx++;
+		}
+	}
+
+	rcu_read_unlock();
+	cb->args[0] = idx;
+	return skb->len;
+
+fail:
+	rcu_read_unlock();
+	nlmsg_cancel(skb, nlh);
+	return skb->len;
+}
+
+void br_mdb_init(void)
+{
+	rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
+}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 2417434..be69cd4 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1584,6 +1584,7 @@ void br_multicast_init(struct net_bridge *br)
 		    br_multicast_querier_expired, (unsigned long)br);
 	setup_timer(&br->multicast_query_timer, br_multicast_query_expired,
 		    (unsigned long)br);
+	br_mdb_init();
 }
 
 void br_multicast_open(struct net_bridge *br)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index eb9cd42..bf0f6d5 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -432,6 +432,7 @@ extern int br_multicast_set_port_router(struct net_bridge_port *p,
 extern int br_multicast_toggle(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+extern void br_mdb_init(void);
 
 static inline bool br_multicast_is_router(struct net_bridge *br)
 {
-- 
1.7.7.6

^ permalink raw reply related

* [RFC PATCH 1/2] bridge: export port_no and port_id via IFA_INFO_DATA
From: Cong Wang @ 2012-11-27  9:49 UTC (permalink / raw)
  To: netdev
  Cc: Cong Wang, bridge, Herbert Xu, Jesper Dangaard Brouer,
	Thomas Graf, Stephen Hemminger, David S. Miller

Based on net-next.

This patch exports port->port_no port->port_id in the end of IFA_INFO_DATA.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>

---
 include/uapi/linux/if_link.h |    2 ++
 net/bridge/br_netlink.c      |    8 +++++++-
 2 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bb58aeb..9cd91a9 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -218,6 +218,8 @@ enum {
 	IFLA_BRPORT_MODE,	/* mode (hairpin)          */
 	IFLA_BRPORT_GUARD,	/* bpdu guard              */
 	IFLA_BRPORT_PROTECT,	/* root port protection    */
+	IFLA_BRPORT_NO,		/* port no                 */
+	IFLA_BRPORT_ID,		/* port id                 */
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 65429b9..7b7414e 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -28,6 +28,8 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_MODE */
 		+ nla_total_size(1)	/* IFLA_BRPORT_GUARD */
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
+		+ nla_total_size(2)	/* IFLA_BRPORT_NO */
+		+ nla_total_size(2)	/* IFLA_BRPORT_ID */
 		+ 0;
 }
 
@@ -53,7 +55,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
 	    nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
 	    nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)))
+	    nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) ||
+	    nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
+	    nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id))
 		return -EMSGSIZE;
 
 	return 0;
@@ -168,6 +172,8 @@ static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_MODE]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_GUARD]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_PROTECT]	= { .type = NLA_U8 },
+	[IFLA_BRPORT_NO]	= { .type = NLA_U16 },
+	[IFLA_BRPORT_ID]	= { .type = NLA_U16 },
 };
 
 /* Change the state of the port and notify spanning tree */
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 2/2] ewrk3: remove outdated comment
From: Paul Bolle @ 2012-11-27  9:48 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, richard -rw- weinberger

Remove an outdated comment, that should have been removed in the
patch named "MODULE_PARM conversions" from early 2005.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
---
 drivers/net/ethernet/dec/ewrk3.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/dec/ewrk3.c b/drivers/net/ethernet/dec/ewrk3.c
index 2ee1c47..9f992b9 100644
--- a/drivers/net/ethernet/dec/ewrk3.c
+++ b/drivers/net/ethernet/dec/ewrk3.c
@@ -1910,7 +1910,6 @@ static struct net_device *ewrk3_devs[MAX_NUM_EWRK3S];
 static int ndevs;
 static int io[MAX_NUM_EWRK3S+1] = { 0x300, 0, };
 
-/* '21' below should really be 'MAX_NUM_EWRK3S' */
 module_param_array(io, int, NULL, 0);
 module_param_array(irq, byte, NULL, 0);
 MODULE_PARM_DESC(io, "EtherWORKS 3 I/O base address(es)");
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 1/2] ewrk3: silence GCC warning
From: Paul Bolle @ 2012-11-27  9:47 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, richard -rw- weinberger

Building ewrk3.o triggers this GCC warning:
    drivers/net/ethernet/dec/ewrk3.c: In function '__check_irq':
    drivers/net/ethernet/dec/ewrk3.c:1915:1: warning: return from incompatible pointer type [enabled by default]

This can be trivially fixed by changing the 'irq' parameter from int to
byte (which is an alias for unsigned char for module parameters).

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
---
Only compile tested.

 drivers/net/ethernet/dec/ewrk3.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/dec/ewrk3.c b/drivers/net/ethernet/dec/ewrk3.c
index 17ae8c6..2ee1c47 100644
--- a/drivers/net/ethernet/dec/ewrk3.c
+++ b/drivers/net/ethernet/dec/ewrk3.c
@@ -1912,7 +1912,7 @@ static int io[MAX_NUM_EWRK3S+1] = { 0x300, 0, };
 
 /* '21' below should really be 'MAX_NUM_EWRK3S' */
 module_param_array(io, int, NULL, 0);
-module_param_array(irq, int, NULL, 0);
+module_param_array(irq, byte, NULL, 0);
 MODULE_PARM_DESC(io, "EtherWORKS 3 I/O base address(es)");
 MODULE_PARM_DESC(irq, "EtherWORKS 3 IRQ number(s)");
 
-- 
1.7.7.6

^ permalink raw reply related

* Re: BQL support in gianfar causes network hickup
From: Keitel, Tino (ALC NetworX GmbH) @ 2012-11-27  9:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tino Keitel, Paul Gortmaker, netdev@vger.kernel.org
In-Reply-To: <1353950255.7553.6.camel@edumazet-glaptop>

On Mo, 2012-11-26 at 09:17 -0800, Eric Dumazet wrote:
> On Mon, 2012-11-26 at 18:08 +0100, Keitel, Tino (ALC NetworX GmbH)
> wrote:
> > On Mo, 2012-11-26 at 08:34 -0800, Eric Dumazet wrote:
> > > On Mon, 2012-11-26 at 11:01 +0100, Tino Keitel wrote:
> > > > On Sat, Nov 24, 2012 at 15:43:36 -0800, Eric Dumazet wrote:
> > > > 
> > > > [...]
> > > > 
> > > > > Hmm, I wonder if BQL makes a particular bug showing more often.
> > > > > 
> > > > > I see gianfar uses a very small watchdog_timeo of 1 second, while many
> > > > > drivers use 5 seconds.
> > > > > 
> > > > > What happens if you change this to 5 seconds ?
> > > > 
> > > > I still got the trace and a failing ptp client.
> > > > 
> > > 
> > > Thanks. Is this bug easy to trigger ?
> > > 
> > > I suspect a core issue and a race, likely to happen on your (non x86)
> > > hardware
> > > 
> > > Could you add the following debugging patch ?
> > 
> > No visible difference:
> 
> OK it seems you trigger the problem fast !
> 
> Please try the following as well :

Hi,

yes, it can be triggered within 2 minutes.

The patch makes no difference:

NETDEV WATCHDOG: eth1 (fsl-gianfar): transmit queue 0 timed out
------------[ cut here ]------------
WARNING:
at /home/keitelt1/src/git/linux-stable/net/sched/sch_generic.c:255
Modules linked in:
NIP: c02448e0 LR: c02448e0 CTR: c01c19b8
REGS: c7ffbe40 TRAP: 0700   Not tainted  (3.7.0-rc6-dirty)
MSR: 00029032 <EE,ME,IR,DR,RI>  CR: 22002044  XER: 20000000
TASK = c03dd370[0] 'swapper' THREAD: c03fe000
GPR00: c02448e0 c7ffbef0 c03dd370 0000003f 00000001 c001aea8 00000000
00000001
GPR08: 00000001 c03e0000 00000000 0000009d 22002084 1008eb5c 07ffb000
ffffffff
GPR16: 00000004 c0362c7c c03dfbf8 00200000 c0411ed0 c0411cd0 c0411ad0
ffffffff
GPR24: 00000000 c749e1d8 00000004 c783c330 c0400000 c03e0000 c749e000
00000000
NIP [c02448e0] dev_watchdog+0x288/0x298
LR [c02448e0] dev_watchdog+0x288/0x298
Call Trace:
[c7ffbef0] [c02448e0] dev_watchdog+0x288/0x298 (unreliable)
[c7ffbf20] [c00267f8] call_timer_fn+0x6c/0xd8
[c7ffbf50] [c00269e4] run_timer_softirq+0x180/0x1f8
[c7ffbfa0] [c0021144] __do_softirq+0xc4/0x160
[c7ffbff0] [c000d0b8] call_do_softirq+0x14/0x24
[c03ffe90] [c00058e8] do_softirq+0x8c/0xb8
[c03ffeb0] [c0021358] irq_exit+0x98/0xb4
[c03ffec0] [c0009fb0] timer_interrupt+0x158/0x170
[c03ffee0] [c000f02c] ret_from_except+0x0/0x14
--- Exception: 901 at cpu_idle+0x94/0x100
    LR = cpu_idle+0x94/0x100
[c03fffa0] [c00088ec] cpu_idle+0x5c/0x100 (unreliable)
[c03fffc0] [c03b37b0] start_kernel+0x2dc/0x2f0
[c03ffff0] [00003438] 0x3438
Instruction dump:
7d2903a6 4e800421 80fe01fc 4bffff74 7fc3f378 4bfecb7d 7fc4f378 7fe6fb78
7c651b78 3c60c038 38637280 48090e69 <0fe00000> 39200001 993cc7c9
4bffffb8
---[ end trace 26a9da9c2717d65b ]---

Regards,
Tino



^ permalink raw reply

* TCP and reordering
From: Saku Ytti @ 2012-11-27  9:32 UTC (permalink / raw)
  To: netdev

Today due to fast retransmit performance on links which cause
reordering is appalling.

Is it too esoteric situation to handle gracefully? Couldn't we
maintain 'reorder' counter in socket, which is increment when we get
two copies of same packet after duplicate ack, if this counter is
sufficiently high in relation to packet loss, we could start delaying
duplicate acks as we'd expect to receive the sequence very soon.

--
  ++ytti

^ permalink raw reply

* Re: [RFC net-next PATCH V1 7/9] net: frag queue locking per hash bucket
From: Jesper Dangaard Brouer @ 2012-11-27  9:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Florian Westphal, netdev, Pablo Neira Ayuso,
	Thomas Graf, Cong Wang, Patrick McHardy, Paul E. McKenney,
	Herbert Xu
In-Reply-To: <20121123130836.18764.9297.stgit@dragon>

On Fri, 2012-11-23 at 14:08 +0100, Jesper Dangaard Brouer wrote:
> DO NOT apply - patch not finished, can cause on OOPS/PANIC during hash rebuild

Think I have fixed this issue.  And its even fixed in this patch, as the
oops occurred, when testing, with a slightly older version of this
patch.


> This patch implements per hash bucket locking for the frag queue
> hash.  This removes two write locks, and the only remaining write
> lock is for protecting hash rebuild.  This essentially reduce the
> readers-writer lock to a rebuild lock.

[... cut ...]
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 1620a21..447423f 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
[...]
> @@ -102,9 +112,17 @@ EXPORT_SYMBOL(inet_frags_exit_net);
>  
>  static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
>  {
> -	write_lock(&f->lock);
> +	struct inet_frag_bucket *hb;
> +	unsigned int hash;
> +
> +	read_lock(&f->lock);
> +	hash = f->hashfn(fq);
> +	hb = &f->hash[hash];

Before the read_lock, were _here_ below then hash calc.  Which is wrong,
and caused the oops, as the hash result can change, when rnd is changed,
during hash rebuild.

> +
> +	spin_lock_bh(&hb->chain_lock);
>  	hlist_del(&fq->list);
> -	write_unlock(&f->lock);
> +	spin_unlock_bh(&hb->chain_lock);
> +	read_unlock(&f->lock);
>  	inet_frag_lru_del(fq);
>  }

[... cut ...]

^ permalink raw reply

* pull-request: can-next 2012-11-27
From: Marc Kleine-Budde @ 2012-11-27  8:57 UTC (permalink / raw)
  To: netdev; +Cc: linux-can@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 2876 bytes --]

Hello David,

this is pull request is for net-next. Contains a patch by Andreas
Larsson, which enables the sja1000 of driver to work under sparc.
AnilKumar Ch contributed a patch to improve the c_can support under
omap, Olivier Sobrie's patch brings support for the CAN/USB dongles
from Kvaser. In a bunch of patches by me missing MODULE_ALIAS and/or
MODULE_DEVICE_TABLE entries were added to the CAN drivers.

regards,
Marc

---

The following changes since commit 03f52a0a554210d5049eeed9f1bb29047dc807cb:

  ip6mr: Add sizeof verification to MRT6_ASSERT and MT6_PIM (2012-11-26 17:35:58 -0500)

are available in the git repository at:

  git://gitorious.org/linux-can/linux-can-next.git for-davem

for you to fetch changes up to fc8f40b10e58f37ec07e7b0cf85d19e899bdf23f:

  can: mpc5xxx_can: add MODULE_DEVICE_TABLE (2012-11-27 09:49:36 +0100)

----------------------------------------------------------------
Andreas Larsson (1):
      can: sja1000: Make sja1000_of_platform selectable and compilable on SPARC

AnilKumar Ch (1):
      can: c_can: Add d_can raminit support

Marc Kleine-Budde (8):
      can: bfin_can: add MODULE_ALIAS
      can: ti_hecc: add MODULE_ALIAS
      can: sja1000_platform: add MODULE_ALIAS
      can: cc770_platform: add MODULE_ALIAS and MODULE_DEVICE_TABLE
      can: flexcan: add MODULE_DEVICE_TABLE
      can: at91_can: add MODULE_DEVICE_TABLE
      can: c_can_platform: add MODULE_DEVICE_TABLE
      can: mpc5xxx_can: add MODULE_DEVICE_TABLE

Olivier Sobrie (1):
      can: kvaser_usb: Add support for Kvaser CAN/USB devices

 drivers/net/can/at91_can.c                    |    1 +
 drivers/net/can/bfin_can.c                    |    1 +
 drivers/net/can/c_can/c_can.c                 |   12 +
 drivers/net/can/c_can/c_can.h                 |    3 +
 drivers/net/can/c_can/c_can_platform.c        |   30 +-
 drivers/net/can/cc770/cc770_platform.c        |    2 +
 drivers/net/can/flexcan.c                     |    2 +
 drivers/net/can/mscan/mpc5xxx_can.c           |    1 +
 drivers/net/can/sja1000/Kconfig               |    2 +-
 drivers/net/can/sja1000/sja1000_of_platform.c |    6 +-
 drivers/net/can/sja1000/sja1000_platform.c    |    1 +
 drivers/net/can/ti_hecc.c                     |    1 +
 drivers/net/can/usb/Kconfig                   |   29 +
 drivers/net/can/usb/Makefile                  |    1 +
 drivers/net/can/usb/kvaser_usb.c              | 1627 +++++++++++++++++++++++++
 15 files changed, 1715 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/can/usb/kvaser_usb.c

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 261 bytes --]

^ permalink raw reply

* Re: [net-next 06/10] ixgbe: eliminate Smatch warnings in ixgbe_debugfs.c
From: Dan Carpenter @ 2012-11-27  7:18 UTC (permalink / raw)
  To: Hay, Joshua A
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	gospo@redhat.com, sassmann@redhat.com
In-Reply-To: <4329753313F5A742A09953EE6CC1B3CE28C7D984@ORSMSX108.amr.corp.intel.com>

On Tue, Nov 27, 2012 at 01:06:17AM +0000, Hay, Joshua A wrote:
> The return value will be changed to len to preserve error codes returned from simple_write_to_buffer.
> 
> However, changing the logic preceding this return breaks these functions.  If simple_write_to_buffer returns a positive value, other actions are performed with this value.  With this patch, the function will return immediately with that value instead.  This will effectively break the ixgbe_debugfs write operations.
> 
> So ultimately, the change should be: 
> > +	len = simple_write_to_buffer(ixgbe_dbg_reg_ops_buf,
> > +				     sizeof(ixgbe_dbg_reg_ops_buf)-1,
> > +				     ppos,
> > +				     buffer,
> > +				     count);
> > +	if (len < 0)
> > +		return -EFAULT;
> 	
> 	if (len < 0)
> 		return len;
> 

Yes.  Sorry, I wasn't reading carefully before.  That looks fine.

regards,
dan carpenter

^ permalink raw reply

* Re: private netdev flags into UAPI?
From: Or Gerlitz @ 2012-11-27  6:51 UTC (permalink / raw)
  To: David Howells; +Cc: Or Gerlitz, netdev
In-Reply-To: <990.1353981789@warthog.procyon.org.uk>

On 27/11/2012 04:03, David Howells wrote:
> Or Gerlitz <or.gerlitz@gmail.com> wrote:
>
>> On Mon, Nov 26, 2012 at 11:22 AM, David Howells <dhowells@redhat.com> wrote:
>>> They were exposed to userspace already
>> So the script carries the bug into a new directory... why? AFAIK,
>> intentionally there's no way to read private flags from user space, so
>> what's the point in defining them there?
> How should the script know what's private and what's not?  By the
> encapsulation of code inside __KERNEL__ blocks.  In their absence, everything
> is assumed to be public - given it is already part of the UAPI.  I don't know
> that the code is private rather than the comment is wrong.
>
>

makes sense, but I have pointed on a bug in the final result, so this 
way or another, the fact that the bug
existed before doesn't mean we should carry it over.

Or.

^ permalink raw reply

* Re: performance regression on HiperSockets depending on MTU size
From: Eric Dumazet @ 2012-11-27  6:46 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev
In-Reply-To: <k91m5l$77a$1@ger.gmane.org>

On Tue, 2012-11-27 at 06:21 +0000, Cong Wang wrote:

> Eric,
> 
> Do you have a full list of such commits? I am trying to backport TSQ
> to 2.6.32, and of course I don't want to miss these commits either.

I dont think there are other known issues.

mlx4 had a 'problem' because only recently we removed the skb_orphan()
call it used to do in its start_xmit() function.

I remember David had to revert BQL on NIU driver, but NIU does the
skb_orphan() call as well so TSQ is basically disabled.

^ permalink raw reply

* Re: [net-next RFC] pktgen: don't wait for the device who doesn't free skb immediately after sent
From: Jason Wang @ 2012-11-27  6:45 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: mst, netdev, linux-kernel, virtualization, davem
In-Reply-To: <20121126093728.0a10f97f@nehalam.linuxnetplumber.net>

On 11/27/2012 01:37 AM, Stephen Hemminger wrote:
> On Mon, 26 Nov 2012 15:56:52 +0800
> Jason Wang <jasowang@redhat.com> wrote:
>
>> Some deivces do not free the old tx skbs immediately after it has been sent
>> (usually in tx interrupt). One such example is virtio-net which optimizes for
>> virt and only free the possible old tx skbs during the next packet sending. This
>> would lead the pktgen to wait forever in the refcount of the skb if no other
>> pakcet will be sent afterwards.
>>
>> Solving this issue by introducing a new flag IFF_TX_SKB_FREE_DELAY which could
>> notify the pktgen that the device does not free skb immediately after it has
>> been sent and let it not to wait for the refcount to be one.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Another alternative would be using skb_orphan() and skb->destructor.
> There are other cases where skb's are not freed right away.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hi Stephen:

Do you mean registering a skb->destructor for pktgen then set and check 
bits in skb->tx_flag?

^ permalink raw reply

* Re: linux-next: manual merge of the net-next tree with the infiniband tree
From: Or Gerlitz @ 2012-11-27  6:43 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: David Miller, netdev, linux-next, linux-kernel, Roland Dreier,
	linux-rdma, Ben Hutchings, Amir Vadai
In-Reply-To: <20121127114751.6961da02245ed6851190aca2@canb.auug.org.au>

On 27/11/2012 02:47, Stephen Rothwell wrote:
> Hi all,
>
> Today's linux-next merge of the net-next tree got a conflict in
> drivers/net/ethernet/mellanox/mlx4/en_rx.c between commit 08ff32352d6f
> ("mlx4: 64-byte CQE/EQE support") from the infiniband tree and commit
> f1d29a3fa68b ("mlx4_en: Remove remnants of LRO support") from the
> net-next tree.
>
> I fixed it up (see below) and can carry the fix as necessary (no action
> is required).
>

Acked-by: Or Geritz <ogerlitz@mellanox.com>

^ permalink raw reply

* Re: performance regression on HiperSockets depending on MTU size
From: Cong Wang @ 2012-11-27  6:21 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1353946351.30446.1779.camel@edumazet-glaptop>

On Mon, 26 Nov 2012 at 16:12 GMT, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Hi Frank, thanks for this report.
>
> You could tweak tcp_limit_output_bytes, but IMO the root of the problem
> is in the driver itself.
>
> For example, I had to change mlx4 driver for the same problem : Make
> sure a TX packet can be "TX completed" in a short amount of time.
>
> In the case of mlx4, the wait time was 128 us, but I suspect on your
> case its more like an infinite time or several ms.
>  
> The driver is delaying the free of TX skb by a fixed amount of time,
> or relies on following transmits to perform the TX completion
>

Eric,

Do you have a full list of such commits? I am trying to backport TSQ
to 2.6.32, and of course I don't want to miss these commits either.

Thanks!

^ permalink raw reply

* Re: Fwd: Re: [PATCH] net: ipv6: change %8s to %s for rt->dst.dev->name in seq_printf of rt6_info_route
From: Chen Gang @ 2012-11-27  5:52 UTC (permalink / raw)
  To: Shan Wei; +Cc: Eric Dumazet, David Miller, netdev
In-Reply-To: <50B45265.9070303@asianux.com>

于 2012年11月27日 13:40, Chen Gang 写道:
> 
> and now, I think:
>   A) both input and output through /proc/* are for os api level.
>   B) but both %8s and %s do not change the output interface format (including contents, topology, separator mark, space redundancy).
>   C) so it is belong to 'User Experience', not belong to os api.
> 
>   welcome any another members to giving suggestions and completions.
> 
>   thanks.
> 
>   :-)
> 

  completion: 8 right alignment is not belong to interface format.
    if it was belong to interface format,
    it would cause correctness issue (the name len may be larger than 8).
    so if "8 right alignment" is belong to os api, it means the api is not correct, need change.

  :-)

-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* Re: [PATCH] vhost: fix length for cross region descriptor
From: Jason Wang @ 2012-11-27  5:47 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, David Miller, linux-kernel
In-Reply-To: <20121126155727.GA21716@redhat.com>

On 11/26/2012 11:57 PM, Michael S. Tsirkin wrote:
> If a single descriptor crosses a region, the
> second chunk length should be decremented
> by size translated so far, instead it includes
> the full descriptor length.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   drivers/vhost/vhost.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index ef8f598..5a3d0f1 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1049,7 +1049,7 @@ static int translate_desc(struct vhost_dev *dev, u64 addr, u32 len,
>   		}
>   		_iov = iov + ret;
>   		size = reg->memory_size - addr + reg->guest_phys_addr;
> -		_iov->iov_len = min((u64)len, size);
> +		_iov->iov_len = min((u64)len - s, size);
>   		_iov->iov_base = (void __user *)(unsigned long)
>   			(reg->userspace_addr + addr - reg->guest_phys_addr);
>   		s += size;

Acked-by: Jason Wang <jasowang@redhat.com>

^ permalink raw reply

* linux-next: manual merge of the drop-experimental tree with the net-next tree
From: Stephen Rothwell @ 2012-11-27  5:45 UTC (permalink / raw)
  To: Kees Cook; +Cc: linux-next, linux-kernel, Ben Hutchings, David Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 561 bytes --]

Hi Kees,

Today's linux-next merge of the drop-experimental tree got a conflict in
net/dsa/Kconfig between commit b3422a314c27 ("dsa: Hide core config
options; make drivers select what they need") from the net-next tree and
commit b8e8d99e4ee8 ("net/dsa: remove depends on CONFIG_EXPERIMENTAL")
from the drop-experimental tree.

I fixed it up (using the net-next version as that removed
CONFIG_EXPERIMENTAL as well) and can carry the fix as necessary (no
action is required).

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: Fwd: Re: [PATCH] net: ipv6: change %8s to %s for rt->dst.dev->name in seq_printf of rt6_info_route
From: Chen Gang @ 2012-11-27  5:40 UTC (permalink / raw)
  To: Shan Wei; +Cc: Eric Dumazet, David Miller, netdev
In-Reply-To: <50B44563.8060905@asianux.com>

>   I think (only for my thought, maybe not correct):
>     for user input through /proc/* are all for os api level, not for "User Experience".
>     for most of outputs to user through /proc/*, are "User Experience".
> 

  and now I think what I said above is incorrect.

and now, I think:
  A) both input and output through /proc/* are for os api level.
  B) but both %8s and %s do not change the output interface format (including contents, topology, separator mark, space redundancy).
  C) so it is belong to 'User Experience', not belong to os api.

  welcome any another members to giving suggestions and completions.

  thanks.

  :-)

-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* Re: Fwd: Re: [PATCH] net: ipv6: change %8s to %s for rt->dst.dev->name in seq_printf of rt6_info_route
From: Chen Gang @ 2012-11-27  5:35 UTC (permalink / raw)
  To: Shan Wei; +Cc: Eric Dumazet, David Miller, netdev
In-Reply-To: <50B447F3.2090806@gmail.com>

于 2012年11月27日 12:56, Shan Wei 写道:
> Chen Gang said, at 2012/11/27 12:18:
>>>
>>>>
>>>>  for the format of information which seq_printf output:
>>>>    it is not belong to OS API level for outside (at least, for current case, it is true). 
>>>>    so we need not keep 'compatible' of it, so %16s is not necessary.
>>>
>>> Can you explain If we don't change to %s, what will happen?
>>>
>>
>>   for outside, nothing will happen.
>>
>>   so it is not for correctness, it is only for "keep source code simple and clear".
> 
> So, it's a clean-up type patch which is just for developer,
> but with the change of /proc interface which is for user.
> user is first, so let us end this thread unless you have necessary reasons to do it. 
> 

1)  it is not change the /proc interface.
    a) both %8s and %s do not change the output interface format (including contents, topology, separator mark, space redundancy).
    b) it is belong to 'User Experience', not belong to os api.
    c) do you agree with what I say above ?

2)  I think:
    one of the differences between system service and user interactive program are:
      for system service (including kernel): "clean-up" is more important than "UE".
      for user interactive program:          "UE" is more important than "Clean-up".
    maybe, it is for ideal world (or maybe it is only in theory).
    (also maybe what I said above is incorrect)


3)  so all together:
    I can understand if it is not integrated into main branch.
    it will be better to continue discussing it in ideal world (or in theory), I think it is valuable for learning with each other.

  :-)

  thanks.

gchen.

> Thanks  
> Shan Wei
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* Re: Fwd: Re: [PATCH] net: ipv6: change %8s to %s for rt->dst.dev->name in seq_printf of rt6_info_route
From: Shan Wei @ 2012-11-27  4:56 UTC (permalink / raw)
  To: Chen Gang; +Cc: Eric Dumazet, David Miller, netdev
In-Reply-To: <50B43F12.1090709@asianux.com>

Chen Gang said, at 2012/11/27 12:18:
>>
>>>
>>>  for the format of information which seq_printf output:
>>>    it is not belong to OS API level for outside (at least, for current case, it is true). 
>>>    so we need not keep 'compatible' of it, so %16s is not necessary.
>>
>> Can you explain If we don't change to %s, what will happen?
>>
> 
>   for outside, nothing will happen.
> 
>   so it is not for correctness, it is only for "keep source code simple and clear".

So, it's a clean-up type patch which is just for developer,
but with the change of /proc interface which is for user.
user is first, so let us end this thread unless you have necessary reasons to do it. 

Thanks  
Shan Wei

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox