Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 2/3] qlcnic: reduce rx ring size
From: Amit Kumar Salecha @ 2010-10-27  3:53 UTC (permalink / raw)
  To: davem; +Cc: netdev, ameen.rahman, anirban.chakraborty, Sony Chacko
In-Reply-To: <1288151589-32431-1-git-send-email-amit.salecha@qlogic.com>

From: Sony Chacko <sony.chacko@qlogic.com>

If eswitch is enabled, rcv ring size can be reduce, as
physical port is partition-ed.

Signed-off-by: Sony Chacko <sony.chacko@qlogic.com>
Signed-off-by: Amit Kumar Salecha <amit.salecha@qlogic.com>
---
 drivers/net/qlcnic/qlcnic.h         |    4 ++++
 drivers/net/qlcnic/qlcnic_ethtool.c |   23 +++++------------------
 drivers/net/qlcnic/qlcnic_main.c    |   14 ++++++++++++--
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index a60ff17..6400e6a 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -146,11 +146,13 @@
 #define MAX_CMD_DESCRIPTORS		1024
 #define MAX_RCV_DESCRIPTORS_1G		4096
 #define MAX_RCV_DESCRIPTORS_10G 	8192
+#define MAX_RCV_DESCRIPTORS_VF		2048
 #define MAX_JUMBO_RCV_DESCRIPTORS_1G	512
 #define MAX_JUMBO_RCV_DESCRIPTORS_10G	1024
 
 #define DEFAULT_RCV_DESCRIPTORS_1G	2048
 #define DEFAULT_RCV_DESCRIPTORS_10G	4096
+#define DEFAULT_RCV_DESCRIPTORS_VF	1024
 #define MAX_RDS_RINGS                   2
 
 #define get_next_index(index, length)	\
@@ -971,6 +973,8 @@ struct qlcnic_adapter {
 	u16 num_txd;
 	u16 num_rxd;
 	u16 num_jumbo_rxd;
+	u16 max_rxd;
+	u16 max_jumbo_rxd;
 
 	u8 max_rds_rings;
 	u8 max_sds_rings;
diff --git a/drivers/net/qlcnic/qlcnic_ethtool.c b/drivers/net/qlcnic/qlcnic_ethtool.c
index 25e93a5..ec21d24 100644
--- a/drivers/net/qlcnic/qlcnic_ethtool.c
+++ b/drivers/net/qlcnic/qlcnic_ethtool.c
@@ -437,14 +437,8 @@ qlcnic_get_ringparam(struct net_device *dev,
 	ring->rx_jumbo_pending = adapter->num_jumbo_rxd;
 	ring->tx_pending = adapter->num_txd;
 
-	if (adapter->ahw.port_type == QLCNIC_GBE) {
-		ring->rx_max_pending = MAX_RCV_DESCRIPTORS_1G;
-		ring->rx_jumbo_max_pending = MAX_JUMBO_RCV_DESCRIPTORS_1G;
-	} else {
-		ring->rx_max_pending = MAX_RCV_DESCRIPTORS_10G;
-		ring->rx_jumbo_max_pending = MAX_JUMBO_RCV_DESCRIPTORS_10G;
-	}
-
+	ring->rx_max_pending = adapter->max_rxd;
+	ring->rx_jumbo_max_pending = adapter->max_jumbo_rxd;
 	ring->tx_max_pending = MAX_CMD_DESCRIPTORS;
 
 	ring->rx_mini_max_pending = 0;
@@ -472,24 +466,17 @@ qlcnic_set_ringparam(struct net_device *dev,
 		struct ethtool_ringparam *ring)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(dev);
-	u16 max_rcv_desc = MAX_RCV_DESCRIPTORS_10G;
-	u16 max_jumbo_desc = MAX_JUMBO_RCV_DESCRIPTORS_10G;
 	u16 num_rxd, num_jumbo_rxd, num_txd;
 
-
 	if (ring->rx_mini_pending)
 		return -EOPNOTSUPP;
 
-	if (adapter->ahw.port_type == QLCNIC_GBE) {
-		max_rcv_desc = MAX_RCV_DESCRIPTORS_1G;
-		max_jumbo_desc = MAX_JUMBO_RCV_DESCRIPTORS_10G;
-	}
-
 	num_rxd = qlcnic_validate_ringparam(ring->rx_pending,
-			MIN_RCV_DESCRIPTORS, max_rcv_desc, "rx");
+			MIN_RCV_DESCRIPTORS, adapter->max_rxd, "rx");
 
 	num_jumbo_rxd = qlcnic_validate_ringparam(ring->rx_jumbo_pending,
-			MIN_JUMBO_DESCRIPTORS, max_jumbo_desc, "rx jumbo");
+			MIN_JUMBO_DESCRIPTORS, adapter->max_jumbo_rxd,
+						"rx jumbo");
 
 	num_txd = qlcnic_validate_ringparam(ring->tx_pending,
 			MIN_CMD_DESCRIPTORS, MAX_CMD_DESCRIPTORS, "tx");
diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index 5a3ce08..7a298cd 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -656,13 +656,23 @@ qlcnic_check_options(struct qlcnic_adapter *adapter)
 
 	dev_info(&pdev->dev, "firmware v%d.%d.%d\n",
 			fw_major, fw_minor, fw_build);
-
 	if (adapter->ahw.port_type == QLCNIC_XGBE) {
-		adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_10G;
+		if (adapter->flags & QLCNIC_ESWITCH_ENABLED) {
+			adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_VF;
+			adapter->max_rxd = MAX_RCV_DESCRIPTORS_VF;
+		} else {
+			adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_10G;
+			adapter->max_rxd = MAX_RCV_DESCRIPTORS_10G;
+		}
+
 		adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G;
+		adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_10G;
+
 	} else if (adapter->ahw.port_type == QLCNIC_GBE) {
 		adapter->num_rxd = DEFAULT_RCV_DESCRIPTORS_1G;
 		adapter->num_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G;
+		adapter->max_jumbo_rxd = MAX_JUMBO_RCV_DESCRIPTORS_1G;
+		adapter->max_rxd = MAX_RCV_DESCRIPTORS_1G;
 	}
 
 	adapter->msix_supported = !!use_msi_x;
-- 
1.6.0.2


^ permalink raw reply related

* [PATCH 3/3] qlcnic: define valid vlan id range
From: Amit Kumar Salecha @ 2010-10-27  3:53 UTC (permalink / raw)
  To: davem; +Cc: netdev, ameen.rahman, anirban.chakraborty, Sony Chacko
In-Reply-To: <1288151589-32431-1-git-send-email-amit.salecha@qlogic.com>

From: Sony Chacko <sony.chacko@qlogic.com>

4095 vlan id is reserved and should not be use.

Signed-off-by: Sony Chacko <sony.chacko@qlogic.com>
Signed-off-by: Amit Kumar Salecha <amit.salecha@qlogic.com>
---
 drivers/net/qlcnic/qlcnic.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index 6400e6a..8ecc170 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -1134,7 +1134,7 @@ struct qlcnic_eswitch {
 #define MAX_RX_QUEUES		4
 #define DEFAULT_MAC_LEARN	1
 
-#define IS_VALID_VLAN(vlan)	(vlan >= MIN_VLAN_ID && vlan <= MAX_VLAN_ID)
+#define IS_VALID_VLAN(vlan)	(vlan >= MIN_VLAN_ID && vlan < MAX_VLAN_ID)
 #define IS_VALID_BW(bw)		(bw >= MIN_BW && bw <= MAX_BW)
 #define IS_VALID_TX_QUEUES(que)	(que > 0 && que <= MAX_TX_QUEUES)
 #define IS_VALID_RX_QUEUES(que)	(que > 0 && que <= MAX_RX_QUEUES)
-- 
1.6.0.2


^ permalink raw reply related

* [PATCH 1/3] qlcnic: fix mac learning
From: Amit Kumar Salecha @ 2010-10-27  3:53 UTC (permalink / raw)
  To: davem; +Cc: netdev, ameen.rahman, anirban.chakraborty
In-Reply-To: <1288151589-32431-1-git-send-email-amit.salecha@qlogic.com>

In failover bonding case, same mac address can be programmed on other slave function.
Fw will delete old entry (original func) associated with that mac address.
Need to reporgram mac address, if failover again happen to original function.

Signed-off-by: Amit Kumar Salecha <amit.salecha@qlogic.com>
---
 drivers/net/qlcnic/qlcnic.h      |    1 +
 drivers/net/qlcnic/qlcnic_main.c |    5 +++++
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index 26c37d3..a60ff17 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -942,6 +942,7 @@ struct qlcnic_ipaddr {
 #define QLCNIC_LOOPBACK_TEST		2
 
 #define QLCNIC_FILTER_AGE	80
+#define QLCNIC_READD_AGE	20
 #define QLCNIC_LB_MAX_FILTERS	64
 
 struct qlcnic_filter {
diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index f047c7c..5a3ce08 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -1860,6 +1860,11 @@ qlcnic_send_filter(struct qlcnic_adapter *adapter,
 	hlist_for_each_entry_safe(tmp_fil, tmp_hnode, n, head, fnode) {
 		if (!memcmp(tmp_fil->faddr, &src_addr, ETH_ALEN) &&
 			    tmp_fil->vlan_id == vlan_id) {
+
+			if (jiffies >
+			    (QLCNIC_READD_AGE * HZ + tmp_fil->ftime))
+				qlcnic_change_filter(adapter, src_addr, vlan_id,
+								tx_ring);
 			tmp_fil->ftime = jiffies;
 			return;
 		}
-- 
1.6.0.2


^ permalink raw reply related

* [PATCHv2 0/3]qlcnic: bug fixes
From: Amit Kumar Salecha @ 2010-10-27  3:53 UTC (permalink / raw)
  To: davem; +Cc: netdev, ameen.rahman, anirban.chakraborty

Hi
  Series v2 of 3 patches for bug fixes. Patches are numbered.
  Dropping "dma address align check" patch as pci_alloc_consistent is gaurantee to give
  page align dma address.

-Amit 

^ permalink raw reply

* [PATCH 2/2 v4] xps: Transmit Packet Steering
From: Tom Herbert @ 2010-10-27  3:38 UTC (permalink / raw)
  To: davem, netdev; +Cc: eric.dumazet

This patch implements transmit packet steering (XPS) for multiqueue
devices.  XPS selects a transmit queue during packet transmission based
on configuration.  This is done by mapping the CPU transmitting the
packet to a queue.  This is the transmit side analogue to RPS-- where
RPS is selecting a CPU based on receive queue, XPS selects a queue
based on the CPU (previously there was an XPS patch from Eric
Dumazet, but that might more appropriately be called transmit completion
steering).

Each transmit queue can be associated with a number of CPUs which will
use the queue to send packets.  This is configured as a CPU mask on a
per queue basis in:

/sys/class/net/eth<n>/queues/tx-<n>/xps_cpus

The mappings are stored per device in an inverted data structure that
maps CPUs to queues.  In the netdevice structure this is an array of
num_possible_cpu structures where each structure holds and array of
queue_indexes for queues which that CPU can use.

The benefits of XPS are improved locality in the per queue data
structures.  Also, transmit completions are more likely to be done
nearer to the sending thread, so this should promote locality back
to the socket on free (e.g. UDP).  The benefits of XPS are dependent on
cache hierarchy, application load, and other factors.  XPS would
nominally be configured so that a queue would only be shared by CPUs
which are sharing a cache, the degenerative configuration woud be that
each CPU has it's own queue.

Below are some benchmark results which show the potential benfit of
this patch.  The netperf test has 500 instances of netperf TCP_RR test
with 1 byte req. and resp.

bnx2x on 16 core AMD
   XPS (16 queues, 1 TX queue per CPU)  1234K at 100% CPU
   No XPS (16 queues)                   996K at 100% CPU

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/netdevice.h |   27 ++++
 net/core/dev.c            |   55 +++++++-
 net/core/net-sysfs.c      |  367 ++++++++++++++++++++++++++++++++++++++++++++-
 net/core/net-sysfs.h      |    3 +
 4 files changed, 446 insertions(+), 6 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fcd3dda..f19b78b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -503,6 +503,13 @@ struct netdev_queue {
 	struct Qdisc		*qdisc;
 	unsigned long		state;
 	struct Qdisc		*qdisc_sleeping;
+#ifdef CONFIG_RPS
+	struct netdev_queue	*first;
+	atomic_t		count;
+	struct xps_dev_maps	*xps_maps;
+	struct kobject		kobj;
+#endif
+
 /*
  * write mostly part
  */
@@ -530,6 +537,26 @@ struct rps_map {
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
 
 /*
+ * This structure holds an XPS map which can be of variable length.  The
+ * map is an array of queues.
+ */
+struct xps_map {
+	unsigned int len;
+	unsigned int alloc_len;
+	struct rcu_head rcu;
+	u16 queues[0];
+};
+
+/*
+ * This structure holds all XPS maps for device.  Maps are indexed by CPU.
+ */
+struct xps_dev_maps {
+	struct rcu_head rcu;
+	struct xps_map *cpu_map[0];
+};
+#define netdev_get_xps_maps(dev) ((dev)->_tx[0].xps_maps)
+
+/*
  * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
  * tail pointer for that CPU's input queue at the time of last enqueue.
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index 4df783c..12426a6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2119,6 +2119,44 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 	return queue_index;
 }
 
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_RPS
+	struct xps_dev_maps *dev_maps;
+	struct xps_map *map;
+	int queue_index = -1;
+
+	preempt_disable();
+	rcu_read_lock();
+	dev_maps = rcu_dereference(netdev_get_xps_maps(dev));
+	if (dev_maps) {
+		map = rcu_dereference(dev_maps->cpu_map[smp_processor_id()]);
+		if (map) {
+			if (map->len == 1)
+				queue_index = map->queues[0];
+			else {
+				u32 hash;
+				if (skb->sk && skb->sk->sk_hash)
+					hash = skb->sk->sk_hash;
+				else
+					hash = (__force u16) skb->protocol ^
+					    skb->rxhash;
+				hash = jhash_1word(hash, hashrnd);
+				queue_index = map->queues[
+				    ((u64)hash * map->len) >> 32];
+			}
+			if (unlikely(queue_index >= dev->real_num_tx_queues))
+				queue_index = -1;
+		}
+	}
+	rcu_read_unlock();
+	preempt_enable();
+
+	return queue_index;
+#endif
+	return -1;
+}
+
 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 					struct sk_buff *skb)
 {
@@ -2138,7 +2176,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 		    queue_index >= dev->real_num_tx_queues) {
 			int old_index = queue_index;
 
-			queue_index = skb_tx_hash(dev, skb);
+			queue_index = get_xps_queue(dev, skb);
+			if (queue_index < 0)
+				queue_index = skb_tx_hash(dev, skb);
 
 			if (queue_index != old_index && sk) {
 				struct dst_entry *dst =
@@ -5052,6 +5092,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 		return -ENOMEM;
 	}
 	dev->_tx = tx;
+#ifdef CONFIG_RPS
+	/*
+	 * Set a pointer to first element in the array which holds the
+	 * reference count.
+	 */
+	{
+		int i;
+		for (i = 0; i < count; i++)
+			tx[i].first = tx;
+	}
+#endif
 	return 0;
 }
 
@@ -5616,7 +5667,9 @@ void free_netdev(struct net_device *dev)
 
 	release_net(dev_net(dev));
 
+#ifndef CONFIG_RPS
 	kfree(dev->_tx);
+#endif
 
 	kfree(rcu_dereference_raw(dev->ingress_queue));
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b143173..e193cf2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -764,18 +764,375 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 	return error;
 }
 
-static int rx_queue_register_kobjects(struct net_device *net)
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_queue *queue,
+	    struct netdev_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_queue *queue,
+	    struct netdev_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_netdev_queue_attr(_attr) container_of(_attr,		\
+    struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+				      struct attribute *attr, char *buf)
+{
+	struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+	struct netdev_queue *queue = to_netdev_queue(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+				       struct attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+	struct netdev_queue *queue = to_netdev_queue(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(queue, attribute, buf, count);
+}
+
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+	.show = netdev_queue_attr_show,
+	.store = netdev_queue_attr_store,
+};
+
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+	struct net_device *dev = queue->dev;
+	int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++)
+		if (queue == &dev->_tx[i])
+			break;
+
+	BUG_ON(i >= dev->num_tx_queues);
+
+	return i;
+}
+
+
+static ssize_t show_xps_map(struct netdev_queue *queue,
+			    struct netdev_queue_attribute *attribute, char *buf)
+{
+	struct netdev_queue *first = queue->first;
+	struct xps_dev_maps *dev_maps;
+	cpumask_var_t mask;
+	unsigned long index;
+	size_t len = 0;
+	int i;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	index = get_netdev_queue_index(queue);
+
+	rcu_read_lock();
+	dev_maps = rcu_dereference(first->xps_maps);
+	if (dev_maps) {
+		for (i = 0; i < num_possible_cpus(); i++) {
+			struct xps_map *map =
+			    rcu_dereference(dev_maps->cpu_map[i]);
+			if (map) {
+				int j;
+				for (j = 0; j < map->len; j++) {
+					if (map->queues[j] == index) {
+						cpumask_set_cpu(i, mask);
+						break;
+					}
+				}
+			}
+		}
+	}
+	len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+	if (PAGE_SIZE - len < 3) {
+		rcu_read_unlock();
+		free_cpumask_var(mask);
+		return -EINVAL;
+	}
+	rcu_read_unlock();
+
+	free_cpumask_var(mask);
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+static void xps_map_release(struct rcu_head *rcu)
+{
+	struct xps_map *map = container_of(rcu, struct xps_map, rcu);
+
+	kfree(map);
+}
+
+static void xps_dev_maps_release(struct rcu_head *rcu)
 {
+	struct xps_dev_maps *dev_maps =
+	    container_of(rcu, struct xps_dev_maps, rcu);
+
+	kfree(dev_maps);
+}
+
+static DEFINE_MUTEX(xps_map_mutex);
+
+static ssize_t store_xps_map(struct netdev_queue *queue,
+		      struct netdev_queue_attribute *attribute,
+		      const char *buf, size_t len)
+{
+	struct netdev_queue *first = queue->first;
+	cpumask_var_t mask;
+	int err, i, cpu, pos, map_len, alloc_len, need_set;
+	unsigned long index;
+	struct xps_map *map, *new_map;
+	struct xps_dev_maps *dev_maps, *new_dev_maps;
+	int nonempty = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	index = get_netdev_queue_index(queue);
+
+	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+	if (err) {
+		free_cpumask_var(mask);
+		return err;
+	}
+
+	new_dev_maps = kzalloc(sizeof(struct xps_dev_maps) +
+	    (num_possible_cpus() * sizeof(struct xps_map *)), GFP_KERNEL);
+	if (!new_dev_maps) {
+		free_cpumask_var(mask);
+		return err;
+	}
+
+	mutex_lock(&xps_map_mutex);
+
+	dev_maps = first->xps_maps;
+
+	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+		new_map = map = dev_maps ? dev_maps->cpu_map[cpu] : NULL;
+
+		if (map) {
+			for (pos = 0; pos < map->len; pos++)
+				if (map->queues[pos] == index)
+					break;
+			map_len = map->len;
+			alloc_len = map->alloc_len;
+		} else
+			pos = map_len = alloc_len = 0;
+
+		need_set = cpu_isset(cpu, *mask) && cpu_online(cpu);
+
+		if (need_set && pos >= map_len) {
+			/* Need to add queue to this CPU's map */
+			if (map_len >= alloc_len) {
+				alloc_len = alloc_len ?  2 * alloc_len : 1;
+				new_map = kzalloc(sizeof(struct xps_map) +
+				    (alloc_len * sizeof(u16)), GFP_KERNEL);
+				if (!new_map)
+					goto error;
+				new_map->alloc_len = alloc_len;
+				for (i = 0; i < map_len; i++)
+					new_map->queues[i] = map->queues[i];
+				new_map->len = map_len;
+			}
+			new_map->queues[new_map->len++] = index;
+		} else if (!need_set && pos < map_len) {
+			/* Need to remove queue from this CPU's map */
+			if (map_len > 1)
+				new_map->queues[pos] =
+				    new_map->queues[--new_map->len];
+			else
+				new_map = NULL;
+		}
+		new_dev_maps->cpu_map[cpu] = new_map;
+	}
+
+	/* Cleanup old maps */
+	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+		map = dev_maps ? dev_maps->cpu_map[cpu] : NULL;
+		if (map && new_dev_maps->cpu_map[cpu] != map)
+			call_rcu(&map->rcu, xps_map_release);
+		if (new_dev_maps->cpu_map[cpu])
+			nonempty = 1;
+	}
+
+	if (nonempty)
+		rcu_assign_pointer(first->xps_maps, new_dev_maps);
+	else {
+		kfree(new_dev_maps);
+		rcu_assign_pointer(first->xps_maps, NULL);
+	}
+
+	if (dev_maps)
+		call_rcu(&dev_maps->rcu, xps_dev_maps_release);
+
+	mutex_unlock(&xps_map_mutex);
+
+	free_cpumask_var(mask);
+	return len;
+
+error:
+	mutex_unlock(&xps_map_mutex);
+
+	if (new_dev_maps)
+		for (i = 0; i < num_possible_cpus(); i++)
+			kfree(new_dev_maps->cpu_map[i]);
+
+	kfree(new_dev_maps);
+	free_cpumask_var(mask);
+	return -ENOMEM;
+}
+
+static struct netdev_queue_attribute xps_cpus_attribute =
+    __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+
+static struct attribute *netdev_queue_default_attrs[] = {
+	&xps_cpus_attribute.attr,
+	NULL
+};
+
+static void netdev_queue_release(struct kobject *kobj)
+{
+	struct netdev_queue *queue = to_netdev_queue(kobj);
+	struct netdev_queue *first = queue->first;
+	struct xps_dev_maps *dev_maps;
+	struct xps_map *map;
+	unsigned long index;
+	int i, pos, nonempty = 0;
+
+	index = get_netdev_queue_index(queue);
+
+	mutex_lock(&xps_map_mutex);
+	dev_maps = first->xps_maps;
+
+	for (i = 0; i < num_possible_cpus(); i++) {
+		map  = dev_maps ? dev_maps->cpu_map[i] : NULL;
+		if (!map)
+			continue;
+
+		for (pos = 0; pos < map->len; pos++)
+			if (map->queues[pos] == index)
+				break;
+
+		if (pos < map->len) {
+			if (map->len > 1)
+				map->queues[pos] = map->queues[--map->len];
+			else {
+				rcu_assign_pointer(dev_maps->cpu_map[i],
+				    NULL);
+				call_rcu(&map->rcu, xps_map_release);
+				map = NULL;
+			}
+		}
+
+		if (map)
+			nonempty = 1;
+	}
+
+	if (!nonempty) {
+		rcu_assign_pointer(first->xps_maps, NULL);
+		call_rcu(&dev_maps->rcu, xps_dev_maps_release);
+	}
+	mutex_unlock(&xps_map_mutex);
+
+	if (atomic_dec_and_test(&first->count))
+		kfree(first);
+}
+
+static struct kobj_type netdev_queue_ktype = {
+	.sysfs_ops = &netdev_queue_sysfs_ops,
+	.release = netdev_queue_release,
+	.default_attrs = netdev_queue_default_attrs,
+};
+
+static int netdev_queue_add_kobject(struct net_device *net, int index)
+{
+	struct netdev_queue *queue = net->_tx + index;
+	struct netdev_queue *first = queue->first;
+	struct kobject *kobj = &queue->kobj;
+	int error = 0;
+
+	kobj->kset = net->queues_kset;
+	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+	    "tx-%u", index);
+	if (error) {
+		kobject_put(kobj);
+		return error;
+	}
+
+	kobject_uevent(kobj, KOBJ_ADD);
+	atomic_inc(&first->count);
+
+	return error;
+}
+
+int
+netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+{
+	int i;
+	int error = 0;
+
+	for (i = old_num; i < new_num; i++) {
+		error = netdev_queue_add_kobject(net, i);
+		if (error) {
+			new_num = old_num;
+			break;
+		}
+	}
+
+	while (--i >= new_num)
+		kobject_put(&net->_rx[i].kobj);
+
+	return error;
+}
+
+static int register_queue_kobjects(struct net_device *net)
+{
+	int error = 0, txq = 0, rxq = 0;
+
 	net->queues_kset = kset_create_and_add("queues",
 	    NULL, &net->dev.kobj);
 	if (!net->queues_kset)
 		return -ENOMEM;
-	return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+
+	error = net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+	if (error)
+		goto error;
+	rxq = net->real_num_rx_queues;
+
+	error = netdev_queue_update_kobjects(net, 0,
+					     net->real_num_tx_queues);
+	if (error)
+		goto error;
+	txq = net->real_num_tx_queues;
+
+	return 0;
+
+error:
+	netdev_queue_update_kobjects(net, txq, 0);
+	net_rx_queue_update_kobjects(net, rxq, 0);
+	return error;
 }
 
-static void rx_queue_remove_kobjects(struct net_device *net)
+static void remove_queue_kobjects(struct net_device *net)
 {
 	net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0);
+	netdev_queue_update_kobjects(net, net->real_num_tx_queues, 0);
 	kset_unregister(net->queues_kset);
 }
 #endif /* CONFIG_RPS */
@@ -878,7 +1235,7 @@ void netdev_unregister_kobject(struct net_device * net)
 	kobject_get(&dev->kobj);
 
 #ifdef CONFIG_RPS
-	rx_queue_remove_kobjects(net);
+	remove_queue_kobjects(net);
 #endif
 
 	device_del(dev);
@@ -919,7 +1276,7 @@ int netdev_register_kobject(struct net_device *net)
 		return error;
 
 #ifdef CONFIG_RPS
-	error = rx_queue_register_kobjects(net);
+	error = register_queue_kobjects(net);
 	if (error) {
 		device_del(dev);
 		return error;
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 778e157..25ec2ee 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -6,6 +6,9 @@ int netdev_register_kobject(struct net_device *);
 void netdev_unregister_kobject(struct net_device *);
 #ifdef CONFIG_RPS
 int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+				 int old_num, int new_num);
+
 #endif
 
 #endif
-- 
1.7.1


^ permalink raw reply related

* [PATCH 1/2 v4] xps: Improvements in TX queue selection
From: Tom Herbert @ 2010-10-27  3:38 UTC (permalink / raw)
  To: davem, netdev; +Cc: eric.dumazet

In dev_pick_tx, don't do work in calculating queue index or setting
the index in the sock unless the device has more than one queue.  This
allows the sock to be set only with a queue index of a multi-queue
device which is desirable if device are stacked like in a tunnel.

We also allow the mapping of a socket to queue to be changed.  To
maintain in order packet transmission a flag (ooo_okay) has been
added to the sk_buff structure.  If a transport layer sets this flag
on a packet, the transmit queue can be changed for the socket.
Presumably, the transport would set this if there was no possbility
of creating OOO packets (for instance, there are no packets in flight
for the socket).  This patch includes the modification in TCP output
for setting this flag.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/skbuff.h |    3 ++-
 net/core/dev.c         |   18 +++++++++++-------
 net/ipv4/tcp_output.c  |    5 ++++-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e6ba898..19f37a6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -386,9 +386,10 @@ struct sk_buff {
 #else
 	__u8			deliver_no_wcard:1;
 #endif
+	__u8			ooo_okay:1;
 	kmemcheck_bitfield_end(flags2);
 
-	/* 0/14 bit hole */
+	/* 0/13 bit hole */
 
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
diff --git a/net/core/dev.c b/net/core/dev.c
index b2269ac..4df783c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2125,20 +2125,24 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 	int queue_index;
 	const struct net_device_ops *ops = dev->netdev_ops;
 
-	if (ops->ndo_select_queue) {
+	if (dev->real_num_tx_queues == 1)
+		queue_index = 0;
+	else if (ops->ndo_select_queue) {
 		queue_index = ops->ndo_select_queue(dev, skb);
 		queue_index = dev_cap_txqueue(dev, queue_index);
 	} else {
 		struct sock *sk = skb->sk;
 		queue_index = sk_tx_queue_get(sk);
-		if (queue_index < 0) {
 
-			queue_index = 0;
-			if (dev->real_num_tx_queues > 1)
-				queue_index = skb_tx_hash(dev, skb);
+		if (queue_index < 0 || skb->ooo_okay ||
+		    queue_index >= dev->real_num_tx_queues) {
+			int old_index = queue_index;
 
-			if (sk) {
-				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
+			queue_index = skb_tx_hash(dev, skb);
+
+			if (queue_index != old_index && sk) {
+				struct dst_entry *dst =
+				    rcu_dereference_check(sk->sk_dst_cache, 1);
 
 				if (dst && skb_dst(skb) == dst)
 					sk_tx_queue_set(sk, queue_index);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 05b1ecf..2b6eb36 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -822,8 +822,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
-	if (tcp_packets_in_flight(tp) == 0)
+	if (tcp_packets_in_flight(tp) == 0) {
 		tcp_ca_event(sk, CA_EVENT_TX_START);
+		skb->ooo_okay = 1;
+	} else
+		skb->ooo_okay = 0;
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: Brian Haley @ 2010-10-27  2:31 UTC (permalink / raw)
  To: Lorenzo Colitti; +Cc: netdev
In-Reply-To: <AANLkTikC4pv8aOODM2pOg2bKQGL69wivcUU3f9ZziPhe@mail.gmail.com>

Hi Lorenzo,

On 10/25/2010 10:08 PM, Lorenzo Colitti wrote:
> When roaming between different networks (e.g., changing wireless
> SSIDs, or plugging in to different wired networks), IPv6 addresses and
> routes are not cleared. If the two networks have different IPv6
> subnets assigned, the host maintains both the old and new IPv6
> addresses and gateways, but only the new ones works. If the host
> chooses the wrong source address or gateway, or if the new network
> does not have IPv6 but the old one did, IPv6 connections time out,
> leading to long delays when trying to connect to IPv6 hosts.
> 
> Fix this by ensuring that autoconfigured IPv6 addresses and routes are
> purged when link is lost, not only when the interface goes down.
> 
> Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
> 
> --- a/net/ipv6/addrconf.c	2010-10-20 13:30:22.000000000 -0700
> +++ b/net/ipv6/addrconf.c	2010-10-25 13:55:15.000000000 -0700
> @@ -2524,6 +2524,14 @@
>  		} else {
>  			if (!addrconf_qdisc_ok(dev)) {
>  				/* device is still not ready. */
> +				if (idev && (idev->if_flags & IF_READY)) {
> +					/* Link lost. Clear addresses and
> +					   routes, the device might come back
> +					   on a link where they are no longer
> +					   valid. */
> +					addrconf_ifdown(dev, 0);
> +					idev->if_flags &= ~IF_READY;
> +				}

Just taking another look at this, you don't need that ~IF_READY line,
addrconf_ifdown() is already doing that when how==0.

Could you give my previous patch a try?  I believe it will work the same
way as yours, but also fixes a case where DAD is started twice for some
addresses.

Thanks,

-Brian

^ permalink raw reply

* Re: [RFC][net-next-2.6 PATCH 4/4] net: remove check for headroom in vlan_dev_create
From: Jesse Gross @ 2010-10-27  2:07 UTC (permalink / raw)
  To: John Fastabend; +Cc: netdev@vger.kernel.org
In-Reply-To: <4CC750B8.7060607@intel.com>

On Tue, Oct 26, 2010 at 3:05 PM, John Fastabend
<john.r.fastabend@intel.com> wrote:
> On 10/25/2010 3:45 PM, Jesse Gross wrote:
>> On Thu, Oct 21, 2010 at 3:10 PM, John Fastabend
>> <john.r.fastabend@intel.com> wrote:
>>> It is possible for the headroom to be smaller then the
>>> hard_header_len for a short period of time after toggling
>>> the vlan offload setting.
>>>
>>> This is not a hard error and skb_cow_head is called in
>>> __vlan_put_tag() to resolve this.
>>>
>>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>>
>> How is it possible that the hard_header_len changes on the vlan
>> device?  It looks like the header length never gets changed after it
>> is initialized.  There's no set_flags method in the vlan device to
>> toggle whether it is using offloading or not, it just rides on top of
>> the underlying device.
>
> Your right and I think this is why my previous patch was broken. If we
> can toggle the underlying offloads we should set the header length as
> well. With the updated patch I just sent this should be true now.

OK, it makes sense it that context.
Acked-by: Jesse Gross <jesse@nicira.com>

Thanks.

^ permalink raw reply

* Re: [RFC][net-next-2.6 PATCH v2] 8021q: set hard_header_len when VLAN offload features are toggled
From: Jesse Gross @ 2010-10-27  2:05 UTC (permalink / raw)
  To: John Fastabend; +Cc: netdev, bhutchings
In-Reply-To: <20101026215933.2339.45454.stgit@jf-dev1-dcblab>

On Tue, Oct 26, 2010 at 2:59 PM, John Fastabend
<john.r.fastabend@intel.com> wrote:
> Toggling the vlan tx|rx hw offloads needs to set the hard_header_len
> as well otherwise we end up using LL_RESERVED_SPACE incorrectly.
> This results in pskb_expand_head() being used unnecessarily.
>
> This add a check in vlan_transfer_features  to catch the ETH_FLAG_TXVLAN
> flag and set the header length. This requires drivers to add the
> ETH_FLAG_TXVLAN to vlan_features.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

I think this addresses all of the original problems.  However, I don't
think that we want to have drivers claim to support vlan offloading as
a feature for vlan packets.  That implies some type of QinQ
functionality to me.  In addition, if the vlan device claims to
support offloading and a second vlan device is stacked on top of it,
then the two will clobber skb->vlan_tci.  It's probably simpler to
just keep track of whether vlan offloading is currently enabled so we
can find out whether it changed.

> ---
>
>  net/8021q/vlan.c |   10 ++++++++++
>  1 files changed, 10 insertions(+), 0 deletions(-)
>
> diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
> index 05b867e..825011b 100644
> --- a/net/8021q/vlan.c
> +++ b/net/8021q/vlan.c
> @@ -334,6 +334,16 @@ static void vlan_transfer_features(struct net_device *dev,
>        vlandev->features &= ~dev->vlan_features;
>        vlandev->features |= dev->features & dev->vlan_features;
>        vlandev->gso_max_size = dev->gso_max_size;
> +
> +       /* is ETH_FLAGS_TXVLAN being toggled */
> +       if ((vlandev->features & ETH_FLAG_TXVLAN) ^
> +           (old_features & ETH_FLAG_TXVLAN)) {
> +               if (vlandev->features & ETH_FLAG_TXVLAN)
> +                       vlandev->hard_header_len -= VLAN_HLEN;
> +               else
> +                       vlandev->hard_header_len += VLAN_HLEN;
> +       }

The correct flag for dev->features is NETIF_F_HW_VLAN_TX.
ETH_FLAGS_TXVLAN is an ethtool construct (that happens to have the
same value).

Thanks.

^ permalink raw reply

* business Proposal
From: jpoon @ 2010-10-27  1:33 UTC (permalink / raw)


I am still waiting to hear from you. I want to know if you got my  
business proposal.
Joseph Poon


^ permalink raw reply

* RE: [PATCH v13 10/16] Add a hook to intercept external buffers from NIC driver.
From: Xin, Xiaohui @ 2010-10-27  1:33 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mst@redhat.com, mingo@elte.hu,
	herbert@gondor.apana.org.au, jdike@linux.intel.com
In-Reply-To: <20101019.082401.28817543.davem@davemloft.net>

>-----Original Message-----
>From: David Miller [mailto:davem@davemloft.net]
>Sent: Tuesday, October 19, 2010 11:24 PM
>To: Xin, Xiaohui
>Cc: netdev@vger.kernel.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
>mst@redhat.com; mingo@elte.hu; herbert@gondor.apana.org.au; jdike@linux.intel.com
>Subject: Re: [PATCH v13 10/16] Add a hook to intercept external buffers from NIC driver.
>
>From: xiaohui.xin@intel.com
>Date: Fri, 15 Oct 2010 17:12:11 +0800
>
>> @@ -2891,6 +2922,11 @@ static int __netif_receive_skb(struct sk_buff *skb)
>>  ncls:
>>  #endif
>>
>> +	/* To intercept mediate passthru(zero-copy) packets here */
>> +	skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
>> +	if (!skb)
>> +		goto out;
>> +
>>  	/* Handle special case of bridge or macvlan */
>>  	rx_handler = rcu_dereference(skb->dev->rx_handler);
>>  	if (rx_handler) {
>
>If you consume the packet here, devices in passthru mode cannot
>be use with bonding.
>
>But there is nothing that prevents a bond being created with such
>a device.
>
>So we have to either prevent such configurations (bad) or make
>it work somehow (good) :-)

The big picture may like this:
To prevent such configurations, we should add code to check in both
mp and bonding driver. If a nic is in zero-copy mode , bonding can't
be made with it, and if nic is in bonding mode, we can't bind the device
to do zero-copy.

If we want to support such configurations, it also has some constraints.
If bonding is created first, we need code to check if all the slaves support
zero-copy mode, and if yes, all the slaves should be assigned a same 
page_ctor(), all the packets received should be intercepted with master nic. 
If not, fails.
If zero-copy is enabled first, bonding created with it should fail.

Somehow, it seems not a trivial work to support it now. Can we support it
later and as a todo with our current work?

Thanks
Xiaohui

^ permalink raw reply

* Re: [net-next PATCH 1/3] qlge: Restoring the vlan setting.
From: Ron Mercer @ 2010-10-27  1:13 UTC (permalink / raw)
  To: David Miller
  Cc: jesse@nicira.com, netdev@vger.kernel.org, Jitendra Kalsaria,
	Ying Ping Lok
In-Reply-To: <20101026.142152.71128173.davem@davemloft.net>

On Tue, Oct 26, 2010 at 02:21:52PM -0700, David Miller wrote:
> From: Ron Mercer <ron.mercer@qlogic.com>
> Date: Tue, 26 Oct 2010 13:54:22 -0700
> 
> > On Mon, Oct 25, 2010 at 05:56:57PM -0700, Jesse Gross wrote:
> >> 
> >> Using vlan groups within a driver is now deprecated.  I realize that
> >> this is just a bug fix but it would nice if we can avoid introducing
> >> more code around vlan groups.  Of course, fully switching the driver
> >> over to use the new vlan model would be even nicer.
> > 
> > I would like this bug fix to be applied though we will schedule switching
> > to the new vlan model ASAP.
> 
> Then why did you put "net-next-2.6" in the subject lines?

It should have been by itself against net-2.6.  I will re-spin these
patches.


^ permalink raw reply

* Re: [PATCH 1/2] r6040: fix multicast operations
From: Shawn Lin @ 2010-10-27  1:19 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Florian Fainelli, netdev, Marc Leclerc, Albert Chen, David Miller
In-Reply-To: <20101026140209.GY13095@decadent.org.uk>

On Tue, 2010-10-26 at 15:02 +0100, Ben Hutchings wrote:
> On Thu, Oct 21, 2010 at 04:27:34PM +0800, Shawn Lin wrote:
> [...]
> > > > +	/* Use internal multicast address registers
> > > > +	 * if the number of multicast addresses is not greater than MCAST_MAX.
> > > > +	 */
> > > > +	else if (netdev_mc_empty(dev)) {
> > > > +		for (i = 0; i < MCAST_MAX ; i++) {
> > > > +			iowrite16(0, ioaddr + MID_1L + 8 * i);
> > > > +			iowrite16(0, ioaddr + MID_1M + 8 * i);
> > > > +			iowrite16(0, ioaddr + MID_1H + 8 * i);
> > > > +		}
> > > > +	} else if (netdev_mc_count(dev) <= MCAST_MAX) {
> > > > +		i = 0;
> > > > +		netdev_for_each_mc_addr(ha, dev) {
> > > > +			adrp = (u16 *) ha->addr;
> > > > +			iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
> > > > +			iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
> > > > +			iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
> > > > +			i++;
> > > > +		}
> > > 
> > > What about the unused exact match entries?  And why is the empty case
> > > special?
> > 
> > Unused exact match entries? I am not sure which entries are you
> > mentioned.
> 
> If there are 1 or 2 addresses in the multicast list then some of the
> exact match entries will be used and some will not.  But the loop
> above does not clear the unused entries.
> 

Yes, you are right.

> [...]
> > 2) if (netdev_mc_count(dev) <= 3)
> > There are two hardware features could be used to filter multicast
> > frames:
> [...]
> > 3) if (netdev_mc_empty(dev))
> > because we masked the multicast hash table flag before examine all
> > conditions, we only need to clear the addresses in the three
> > MAC/Multicast registers.
> [...]
> 
> But why is this so different from the case of 1-3 addresses?  I would
> write these two cases as:
> 
> else if (netdev_mc_count(dev) <= MCAST_MAX) {
> 	i = 0;
> 	netdev_for_each_mc_addr(ha, dev) {
> 		adrp = (u16 *) ha->addr;
> 		iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
> 		iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
> 		iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
> 		i++;
> 	}
> 	while (i < MCAST_MAX) {
> 		iowrite16(0, ioaddr + MID_1L + 8 * i);
> 		iowrite16(0, ioaddr + MID_1M + 8 * i);
> 		iowrite16(0, ioaddr + MID_1H + 8 * i);
> 		i++;
> 	}
> } 
> 
> Ben.
> 

Thanks.

I will fix the issues you mentioned and resubmit the code.

--
Regards,

Shawn Lin


===========================================================================================
The privileged confidential information contained in this email is intended for use only by the addressees as indicated by the original sender of this email. 
If you are not the addressee indicated in this email or are not responsible for delivery of the email to such a person, please kindly reply to the sender indicating this fact and delete all copies of it from your computer and network server immediately. 
Your cooperation is highly appreciated. It is advised that any unauthorized use of confidential information of DM&P Group is strictly prohibited; and any information in this email irrelevant to the official business of DM&P Group shall be deemed as neither given nor endorsed by DM&P Group.

===========================================================================================

^ permalink raw reply

* Re: tap0 device stopped working in 2.6.36 (ok in 2.6.35)
From: Nolan Leake @ 2010-10-27  1:18 UTC (permalink / raw)
  To: Jim; +Cc: netdev
In-Reply-To: <4CC40398.6070105@xs4all.nl>

On Sun, 2010-10-24 at 11:59 +0200, Jim wrote:
> Thanks for explaining the purpose of the patch.
> But it appears something is missing and I think it breaks current
> userspace. I use this tap0 device together with VirtualBox, I have a
> virtual machine setup as bridged to tap0, not a very odd or strange
> setup (this used to be the only method).
> On the host side I run dhcpd to hand out IP address to the virtual
> machine, but despite the dhcpd running on the tap0 device it never got
> 'ready' in the sense that no IP packets made it out from the host to the
> guest.

To make sure I understand the situation, is this correct (ignoring the
exact names of the interfaces):
br0 bridges between eth0 and tap0, and you run dhcpd on tap0?

Since tap0 is part of the bridge, I think dhcpd should be running on
br0.  Does that work?

^ permalink raw reply

* Re: linux-next: build warning from Linus' tree
From: Jesse Gross @ 2010-10-27  0:58 UTC (permalink / raw)
  To: Stephen Rothwell; +Cc: David Miller, netdev, linux-next, linux-kernel
In-Reply-To: <20101027112904.461a0143.sfr@canb.auug.org.au>

On Tue, Oct 26, 2010 at 5:29 PM, Stephen Rothwell <sfr@canb.auug.org.au> wrote:
> Hi all,
>
> While building Linus' tree, today's linux-next build (x86_64 allmodconfig)
> produced this warning:
>
> drivers/net/igb/igb_main.c: In function 'igb_xmit_frame_ring_adv':
> drivers/net/igb/igb_main.c:4110: warning: unused variable 'adapter'
>
> Introduced by commit eab6d18d20fc5b5ba04a7e7fcd6f357197870e51 ("vlan:
> Don't check for vlan group before vlan_tx_tag_present").

Patch sent out to resolve this.

^ permalink raw reply

* [PATCH] igb: Fix unused variable warning.
From: Jesse Gross @ 2010-10-27  0:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Commit eab6d18d "vlan: Don't check for vlan group before
vlan_tx_tag_present" removed the need for the adapter variable
in igb_xmit_frame_ring_adv().  This removes the variable as well
to avoid the compiler warning.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 drivers/net/igb/igb_main.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 14db09e..892d196 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -4107,7 +4107,6 @@ static inline int igb_maybe_stop_tx(struct igb_ring *tx_ring, int size)
 netdev_tx_t igb_xmit_frame_ring_adv(struct sk_buff *skb,
 				    struct igb_ring *tx_ring)
 {
-	struct igb_adapter *adapter = netdev_priv(tx_ring->netdev);
 	int tso = 0, count;
 	u32 tx_flags = 0;
 	u16 first;
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH net-next-2.6 v2] can: Topcliff: PCH_CAN driver: Fix build warnings
From: Tomoya MORINAGA @ 2010-10-27  0:50 UTC (permalink / raw)
  To: Wolfgang Grandegger, David Miller
  Cc: andrew.chih.howe.khor-ral2JQCrhuEAvxtiuMwx3w,
	sameo-VuQAYsv1563Yd54FQh9/CA,
	margie.foster-ral2JQCrhuEAvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	kok.howg.ewe-ral2JQCrhuEAvxtiuMwx3w,
	joel.clark-ral2JQCrhuEAvxtiuMwx3w,
	socketcan-core-request-0fE9KPoRgkgATYTw5x5z8w,
	yong.y.wang-ral2JQCrhuEAvxtiuMwx3w, chripell-VaTbYqLCNhc,
	qi.wang-ral2JQCrhuEAvxtiuMwx3w
In-Reply-To: <4CC71DA4.3020600@grandegger.com>

Hi Wolfgang and Marc,

On Wednesday, October 27, 2010 3:27 AM, Wolfgang Grandegger wrote:
> for the inconvenience, but the original author should have received it.
> Tomoya, could you (or somebody else) please also fix the remaining
> issues quickly?

"masa-korg-ECg8zkTtlr0C6LszWs/t0g@public.gmane.org" left this project. and this mail-address will be deleted soon.
I (tomoya-linux-ECg8zkTtlr0C6LszWs/t0g@public.gmane.org) have taken over "pch can drive" from him.

I will modify your comments ASAP.

---
Hi David,

On Wednesday, October 27, 2010 2:52 AM, David Miller wrote:
> There are still problems.
> 
> This is beyond frustrating.
> 
> > index 55ec324..2889e11 100755
> 
> Why does the pch_can.c file have execute permission in your tree?
> It doesn't in net-next-2.6 and that is what you should be generating
> patches against.
Sorry for the inconvenience.
After updating maintainer's comments and yours, I will post again.

---
Thanks, Tomoya(OKI SEMICONDUCTOR CO., LTD.)

^ permalink raw reply

* Yours Faithfully
From: Dr. Robert William @ 2010-10-27  0:34 UTC (permalink / raw)
  To: pascale.vial

Good News My Dear Friend,

I'm happy to inform you about my success in getting those funds transferred under the co-operation of a new partner from Paraguay. Presently i'm in Paraguay for investment projects with my own share of the total sum. meanwhile,i didn't forget your past efforts and attempts to assist me in transferring those funds despite that it failed us some how.
Now contact my secretary in Benin Republic

His name is Mr. Timothy Adams
Phone Number: +229 97373889
Email:  timothyadams90@gmail.com

Please do let me know immediately you receive it so that we can share the joy after all the sufferings at that time. in the moment, I'm very busy here because of the investment projects which me and the new partner are having at hand, finally, remember that I had forwarded instruction to the secretary on your behalf to receive that money, so feel free to get in touch with Mr. Timothy Adams on timothyadams90@gmail.com, he will send the amount to you without any delay.

Yours Faithfully
Dr Robert William

^ permalink raw reply

* linux-next: build warning from Linus' tree
From: Stephen Rothwell @ 2010-10-27  0:29 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: linux-next, linux-kernel, Jesse Gross

[-- Attachment #1: Type: text/plain, Size: 524 bytes --]

Hi all,

While building Linus' tree, today's linux-next build (x86_64 allmodconfig)
produced this warning:

drivers/net/igb/igb_main.c: In function 'igb_xmit_frame_ring_adv':
drivers/net/igb/igb_main.c:4110: warning: unused variable 'adapter'

Introduced by commit eab6d18d20fc5b5ba04a7e7fcd6f357197870e51 ("vlan:
Don't check for vlan group before vlan_tx_tag_present").

Sorry I didn't catch it earlier.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* Re: netns patches WAS( Re: [PATCH 8/8] net: Implement socketat.
From: Eric W. Biederman @ 2010-10-27  0:27 UTC (permalink / raw)
  To: hadi
  Cc: Daniel Lezcano, Pavel Emelyanov, linux-kernel, Linux Containers,
	netdev, netfilter-devel, linux-fsdevel, Linus Torvalds,
	Michael Kerrisk, Ulrich Drepper, Al Viro, David Miller,
	Serge E. Hallyn, Pavel Emelyanov, Ben Greear, Matt Helsley,
	Jonathan Corbet, Sukadev Bhattiprolu, Jan Engelhardt,
	Patrick McHardy
In-Reply-To: <1288126326.2094.131.camel@mojatatu>

jamal <hadi@cyberus.ca> writes:

> Eric,
>
> Ping?
> If you are too busy to push these in maybe have
> someone clueful like Daniel help out submitting? I think it
> should probably be reasonable to leave out the sockeat
> patch initially if it is deemed controversial..

This merge cycle I am too busy, and my patches did not make it into
linux-next before the merge window.

Everything except socketat at seems non-controversial.  socketat makes
sense to post-pone a little bit until we start converting applications,
and there is a little real world experience about what is needed.

I anticipate some time freeing up in the next couple of weeks so I
should be ready for the next merge window.

Eric

^ permalink raw reply

* Re: [PATCH net-next-2.6 1/2] be2net: Adding an option to use INTx instead of MSI-X
From: Michael Ellerman @ 2010-10-26 23:20 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, somnath.kotur, netdev, linux-pci
In-Reply-To: <20101026133233.GH15074@solarflare.com>

[-- Attachment #1: Type: text/plain, Size: 1394 bytes --]

On Tue, 2010-10-26 at 14:32 +0100, Ben Hutchings wrote:
> Michael Ellerman wrote:
> > On Mon, 2010-10-25 at 16:25 -0700, David Miller wrote:
> > > From: Ben Hutchings <bhutchings@solarflare.com>
> > > Date: Mon, 25 Oct 2010 23:38:53 +0100

> > Ethtool would be nice, but only for network drivers. Is there a generic
> > solution, quirks are obviously not keeping people happy.
>  
> Since this is (normally) a property of the system, pci=nomsi is the
> generic solution.

Sort of, it's a big hammer. Did all these driver writers not know about
pci=nomsi or did they prefer to add a parameter to their driver for some
reason?

> [...]
> > Misc:
> >   sound/pci/hda/hda_intel.c:MODULE_PARM_DESC(enable_msi, "Enable Message Signaled Interrupt (MSI)");
> >   drivers/message/fusion/mptbase.c:MODULE_PARM_DESC(mpt_msi_enable_spi, " Enable MSI Support for SPI \
> >   drivers/message/fusion/mptbase.c:MODULE_PARM_DESC(mpt_msi_enable_fc, " Enable MSI Support for FC \
> >   drivers/message/fusion/mptbase.c:MODULE_PARM_DESC(mpt_msi_enable_sas, " Enable MSI Support for SAS \
> >   drivers/net/myri10ge/myri10ge.c:MODULE_PARM_DESC(myri10ge_msi, "Enable Message Signalled Interrupts");
> [...]
> 
>   drivers/net/sfc/efx.c:MODULE_PARM_DESC(interrupt_mode,
>   drivers/net/sfc/efx.c-                 "Interrupt mode (0=>MSIX 1=>MSI 2=>legacy)");

Well spotted.

cheers



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: Lorenzo Colitti @ 2010-10-26 22:53 UTC (permalink / raw)
  To: David Miller; +Cc: brian.haley, shemminger, netdev
In-Reply-To: <20101026.112321.258116003.davem@davemloft.net>

On Tue, Oct 26, 2010 at 11:23 AM, David Miller <davem@davemloft.net> wrote:
>> This works pretty well. Why not do the same for IPv6? The only
>> difference in IPv6 is that the kernel is the one doing
>> autoconfiguration, so it's the kernel that should be doing the asking.
>
> That should be fine, but if I have static IPv6 addresses assigned to
> an interface I really don't want them changing on a simple link flap.

As the patch stands, they don't. Only autoconfigured addresses will be
cleared, because addrconf_ifdown() does not remove any addresses that
are permanent (unless they are link-local, in which case they are
recreated as soon as link comes back).

^ permalink raw reply

* Re: [rfc v2.1 03/10] ipvs network name space aware: conn
From: Simon Horman @ 2010-10-26 22:36 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Daniel Lezcano, Wensong Zhang

This patch just contains ip_vs_conn.c
and does the normal
 - moving to vars to struct ipvs
 - adding per netns init and exit

proc_fs required some extra work with adding/chaning private data to get the net ptr.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>

--- 

* v2
  Rebase against current nf-next-2.6 (Simon Horman)

* v2.1
  Fix patch-brokenness in v2 (Simon Horman)

Index: lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c
===================================================================
--- lvs-test-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c	2010-10-27 06:05:11.000000000 +0900
+++ lvs-test-2.6/net/netfilter/ipvs/ip_vs_conn.c	2010-10-27 06:15:01.000000000 +0900
@@ -56,23 +56,12 @@ MODULE_PARM_DESC(conn_tab_bits, "Set con
 int ip_vs_conn_tab_size;
 int ip_vs_conn_tab_mask;
 
-/*
- *  Connection hash table: for input and output packets lookups of IPVS
- */
-static struct list_head *ip_vs_conn_tab;
-
-/*  SLAB cache for IPVS connections */
-static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
-
-/*  counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
-/*  counter for no client port connections */
-static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
-
 /* random value for IPVS connection hash */
 static unsigned int ip_vs_conn_rnd;
 
+/* cache name cnt */
+static atomic_t conn_cache_nr = ATOMIC_INIT(0);
+
 /*
  *  Fine locking granularity for big connection hash table
  */
@@ -173,8 +162,8 @@ static unsigned int ip_vs_conn_hashkey_c
 {
 	struct ip_vs_conn_param p;
 
-	ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
-			      NULL, 0, &p);
+	ip_vs_conn_fill_param(NULL, cp->af, cp->protocol, &cp->caddr,
+			      cp->cport, NULL, 0, &p);
 
 	if (cp->dest && cp->dest->svc->pe) {
 		p.pe = cp->dest->svc->pe;
@@ -189,7 +178,7 @@ static unsigned int ip_vs_conn_hashkey_c
  *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
  *	returns bool success.
  */
-static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+static inline int ip_vs_conn_hash(struct net *net, struct ip_vs_conn *cp)
 {
 	unsigned hash;
 	int ret;
@@ -204,7 +193,7 @@ static inline int ip_vs_conn_hash(struct
 	spin_lock(&cp->lock);
 
 	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
-		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+		list_add(&cp->c_list, &net->ipvs->conn_tab[hash]);
 		cp->flags |= IP_VS_CONN_F_HASHED;
 		atomic_inc(&cp->refcnt);
 		ret = 1;
@@ -262,12 +251,13 @@ __ip_vs_conn_in_get(const struct ip_vs_c
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = p->net->ipvs;
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
-	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
@@ -286,12 +276,13 @@ __ip_vs_conn_in_get(const struct ip_vs_c
 	return NULL;
 }
 
-struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+struct ip_vs_conn *
+ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	struct ip_vs_conn *cp;
 
 	cp = __ip_vs_conn_in_get(p);
-	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+	if (!cp && atomic_read(&p->net->ipvs->conn_no_cport_cnt)) {
 		struct ip_vs_conn_param cport_zero_p = *p;
 		cport_zero_p.cport = 0;
 		cp = __ip_vs_conn_in_get(&cport_zero_p);
@@ -313,16 +304,19 @@ ip_vs_conn_fill_param_proto(int af, cons
 			    struct ip_vs_conn_param *p)
 {
 	__be16 _ports[2], *pptr;
+	struct net *net = dev_net(skb->dev);
 
 	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return 1;
 
 	if (likely(!inverse))
-		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+		ip_vs_conn_fill_param(net, af, iph->protocol,
+				      &iph->saddr, pptr[0],
 				      &iph->daddr, pptr[1], p);
 	else
-		ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
+		ip_vs_conn_fill_param(net, af, iph->protocol,
+				      &iph->daddr, pptr[1],
 				      &iph->saddr, pptr[0], p);
 	return 0;
 }
@@ -347,12 +341,13 @@ struct ip_vs_conn *ip_vs_ct_in_get(const
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = p->net->ipvs;
 
 	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
-	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
 		if (p->pe_data && p->pe->ct_match) {
 			if (p->pe->ct_match(p, cp))
 				goto out;
@@ -394,6 +389,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(co
 {
 	unsigned hash;
 	struct ip_vs_conn *cp, *ret=NULL;
+	struct netns_ipvs *ipvs = p->net->ipvs;
 
 	/*
 	 *	Check for "full" addressed entries
@@ -402,7 +398,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(co
 
 	ct_read_lock(hash);
 
-	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+	list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
 		if (cp->af == p->af &&
 		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
 		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
@@ -457,19 +453,19 @@ void ip_vs_conn_put(struct ip_vs_conn *c
 /*
  *	Fill a no_client_port connection with a client port number
  */
-void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
+void ip_vs_conn_fill_cport(struct net *net, struct ip_vs_conn *cp, __be16 cport)
 {
 	if (ip_vs_conn_unhash(cp)) {
 		spin_lock(&cp->lock);
 		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
-			atomic_dec(&ip_vs_conn_no_cport_cnt);
+			atomic_dec(&net->ipvs->conn_no_cport_cnt);
 			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
 			cp->cport = cport;
 		}
 		spin_unlock(&cp->lock);
 
 		/* hash on new dport */
-		ip_vs_conn_hash(cp);
+		ip_vs_conn_hash(net, cp);
 	}
 }
 
@@ -606,12 +602,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, s
  * Check if there is a destination for the connection, if so
  * bind the connection to the destination.
  */
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+struct ip_vs_dest *ip_vs_try_bind_dest(struct net *net, struct ip_vs_conn *cp)
 {
 	struct ip_vs_dest *dest;
 
 	if ((cp) && (!cp->dest)) {
-		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
+		dest = ip_vs_find_dest(net, cp->af, &cp->daddr, cp->dport,
 				       &cp->vaddr, cp->vport,
 				       cp->protocol);
 		ip_vs_bind_dest(cp, dest);
@@ -683,7 +679,7 @@ static inline void ip_vs_unbind_dest(str
  *	If available, return 1, otherwise invalidate this connection
  *	template and return 0.
  */
-int ip_vs_check_template(struct ip_vs_conn *ct)
+int ip_vs_check_template(struct net *net, struct ip_vs_conn *ct)
 {
 	struct ip_vs_dest *dest = ct->dest;
 
@@ -692,7 +688,7 @@ int ip_vs_check_template(struct ip_vs_co
 	 */
 	if ((dest == NULL) ||
 	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
-	    (sysctl_ip_vs_expire_quiescent_template &&
+	    (net->ipvs->sysctl_expire_quiescent_template &&
 	     (atomic_read(&dest->weight) == 0))) {
 		IP_VS_DBG_BUF(9, "check_template: dest not available for "
 			      "protocol %s s:%s:%d v:%s:%d "
@@ -713,7 +709,7 @@ int ip_vs_check_template(struct ip_vs_co
 				ct->dport = htons(0xffff);
 				ct->vport = htons(0xffff);
 				ct->cport = 0;
-				ip_vs_conn_hash(ct);
+				ip_vs_conn_hash(net, ct);
 			}
 		}
 
@@ -770,15 +766,15 @@ static void ip_vs_conn_expire(unsigned l
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
 		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
-			atomic_dec(&ip_vs_conn_no_cport_cnt);
-		atomic_dec(&ip_vs_conn_count);
+			atomic_dec(&cp->net->ipvs->conn_no_cport_cnt);
+		atomic_dec(&cp->net->ipvs->conn_count);
 
-		kmem_cache_free(ip_vs_conn_cachep, cp);
+		kmem_cache_free(cp->net->ipvs->conn_cachep, cp);
 		return;
 	}
 
 	/* hash it back to the table */
-	ip_vs_conn_hash(cp);
+	ip_vs_conn_hash(cp->net, cp);
 
   expire_later:
 	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
@@ -795,9 +791,9 @@ void ip_vs_conn_expire_now(struct ip_vs_
 		mod_timer(&cp->timer, jiffies);
 }
 
-
 /*
- *	Create a new connection entry and hash it into the ip_vs_conn_tab
+ *	Create a new connection entry and hash it into the ip_vs_conn_tab,
+ *	netns ptr will be stored in ip_vs_con here.
  */
 struct ip_vs_conn *
 ip_vs_conn_new(const struct ip_vs_conn_param *p,
@@ -805,9 +801,12 @@ ip_vs_conn_new(const struct ip_vs_conn_p
 	       struct ip_vs_dest *dest)
 {
 	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+							   p->protocol);
+	struct ip_vs_protocol *pp;
+	struct netns_ipvs *ipvs = p->net->ipvs;
 
-	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+	cp = kmem_cache_zalloc(ipvs->conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
 		IP_VS_ERR_RL("%s(): no memory\n", __func__);
 		return NULL;
@@ -842,9 +841,9 @@ ip_vs_conn_new(const struct ip_vs_conn_p
 	atomic_set(&cp->n_control, 0);
 	atomic_set(&cp->in_pkts, 0);
 
-	atomic_inc(&ip_vs_conn_count);
+	atomic_inc(&ipvs->conn_count);
 	if (flags & IP_VS_CONN_F_NO_CPORT)
-		atomic_inc(&ip_vs_conn_no_cport_cnt);
+		atomic_inc(&ipvs->conn_no_cport_cnt);
 
 	/* Bind the connection with a destination server */
 	ip_vs_bind_dest(cp, dest);
@@ -861,8 +860,12 @@ ip_vs_conn_new(const struct ip_vs_conn_p
 #endif
 		ip_vs_bind_xmit(cp);
 
-	if (unlikely(pp && atomic_read(&pp->appcnt)))
-		ip_vs_bind_app(cp, pp);
+	cp->net = p->net;	/* netns ptr  needed in timer */
+	if (pd) {
+		pp = pd->pp;
+		if (unlikely(pp && atomic_read(&pd->appcnt)))
+			ip_vs_bind_app(p->net, cp, pp);
+	}
 
 	/*
 	 * Allow conntrack to be preserved. By default, conntrack
@@ -871,15 +874,31 @@ ip_vs_conn_new(const struct ip_vs_conn_p
 	 * IP_VS_CONN_F_ONE_PACKET too.
 	 */
 
-	if (ip_vs_conntrack_enabled())
+	if (ip_vs_conntrack_enabled(p->net))
 		cp->flags |= IP_VS_CONN_F_NFCT;
 
 	/* Hash it in the ip_vs_conn_tab finally */
-	ip_vs_conn_hash(cp);
+	ip_vs_conn_hash(p->net, cp);
 
 	return cp;
 }
 
+struct ipvs_private {
+	struct seq_net_private p;
+	void *private;
+};
+
+static inline void ipvs_seq_priv_set(struct seq_file *seq, void *data)
+{
+	struct ipvs_private *ipriv=(struct ipvs_private *)seq->private;
+	ipriv->private = data;
+}
+
+static inline void *ipvs_seq_priv_get(struct seq_file *seq)
+{
+	return ((struct ipvs_private *)seq->private)->private;
+}
+
 /*
  *	/proc/net/ip_vs_conn entries
  */
@@ -889,13 +908,15 @@ static void *ip_vs_conn_array(struct seq
 {
 	int idx;
 	struct ip_vs_conn *cp;
+	struct net *net = seq_file_net(seq);
+	struct netns_ipvs *ipvs = net->ipvs;
 
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
 		ct_read_lock_bh(idx);
-		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
 			if (pos-- == 0) {
-				seq->private = &ip_vs_conn_tab[idx];
-			return cp;
+				ipvs_seq_priv_set(seq, &ipvs->conn_tab[idx]);
+				return cp;
 			}
 		}
 		ct_read_unlock_bh(idx);
@@ -906,15 +927,17 @@ static void *ip_vs_conn_array(struct seq
 
 static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	seq->private = NULL;
+	ipvs_seq_priv_set(seq, NULL);
 	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
 }
-
+ /* netns: conn_tab OK */
 static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ip_vs_conn *cp = v;
-	struct list_head *e, *l = seq->private;
+	struct list_head *e, *l = ipvs_seq_priv_get(seq);
 	int idx;
+	struct net *net = seq_file_net(seq);
+	struct netns_ipvs *ipvs = net->ipvs;
 
 	++*pos;
 	if (v == SEQ_START_TOKEN)
@@ -924,27 +947,28 @@ static void *ip_vs_conn_seq_next(struct
 	if ((e = cp->c_list.next) != l)
 		return list_entry(e, struct ip_vs_conn, c_list);
 
-	idx = l - ip_vs_conn_tab;
+	idx = l - ipvs->conn_tab;
 	ct_read_unlock_bh(idx);
 
 	while (++idx < ip_vs_conn_tab_size) {
 		ct_read_lock_bh(idx);
-		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-			seq->private = &ip_vs_conn_tab[idx];
+		list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
+			ipvs_seq_priv_set(seq, &ipvs->conn_tab[idx]);
 			return cp;
 		}
 		ct_read_unlock_bh(idx);
 	}
-	seq->private = NULL;
+	ipvs_seq_priv_set(seq, NULL);
 	return NULL;
 }
-
+/* netns: conn_tab OK */
 static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
 {
-	struct list_head *l = seq->private;
+	struct list_head *l = ipvs_seq_priv_get(seq);
+	struct net *net = seq_file_net(seq);
 
 	if (l)
-		ct_read_unlock_bh(l - ip_vs_conn_tab);
+		ct_read_unlock_bh(l - net->ipvs->conn_tab);
 }
 
 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1004,7 +1028,16 @@ static const struct seq_operations ip_vs
 
 static int ip_vs_conn_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &ip_vs_conn_seq_ops);
+	int ret;
+	struct ipvs_private *priv;
+
+	ret = seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+			   sizeof(struct ipvs_private));
+	if (!ret) {
+		priv = ((struct seq_file *)file->private_data)->private;
+		priv->private = NULL;
+	}
+	return ret;
 }
 
 static const struct file_operations ip_vs_conn_fops = {
@@ -1012,7 +1045,8 @@ static const struct file_operations ip_v
 	.open    = ip_vs_conn_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = seq_release,
+	.release = seq_release_private,
+
 };
 
 static const char *ip_vs_origin_name(unsigned flags)
@@ -1067,7 +1101,17 @@ static const struct seq_operations ip_vs
 
 static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &ip_vs_conn_sync_seq_ops);
+	int ret;
+	struct ipvs_private *ipriv;
+
+	ret = seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+			   sizeof(struct ipvs_private));
+	if (!ret) {
+		ipriv = ((struct seq_file *)file->private_data)->private;
+		ipriv->private = NULL;
+	}
+	return ret;
+//	return seq_open(file, &ip_vs_conn_sync_seq_ops);
 }
 
 static const struct file_operations ip_vs_conn_sync_fops = {
@@ -1075,7 +1119,7 @@ static const struct file_operations ip_v
 	.open    = ip_vs_conn_sync_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = seq_release,
+	.release = seq_release_private,
 };
 
 #endif
@@ -1112,11 +1156,14 @@ static inline int todrop_entry(struct ip
 	return 1;
 }
 
-/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
+/* Called from keventd and must protect itself from softirqs
+ * netns: conn_tab OK
+ */
+void ip_vs_random_dropentry(struct net *net)
 {
 	int idx;
 	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net->ipvs;
 
 	/*
 	 * Randomly scan 1/32 of the whole table every second
@@ -1129,7 +1176,7 @@ void ip_vs_random_dropentry(void)
 		 */
 		ct_write_lock_bh(hash);
 
-		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
 			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
 				/* connection template */
 				continue;
@@ -1167,11 +1214,13 @@ void ip_vs_random_dropentry(void)
 
 /*
  *      Flush all the connection entries in the ip_vs_conn_tab
+ * netns: conn_tab OK
  */
-static void ip_vs_conn_flush(void)
+static void ip_vs_conn_flush(struct net *net)
 {
 	int idx;
 	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net->ipvs;
 
   flush_again:
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
@@ -1180,7 +1229,7 @@ static void ip_vs_conn_flush(void)
 		 */
 		ct_write_lock_bh(idx);
 
-		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+		list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
 
 			IP_VS_DBG(4, "del connection\n");
 			ip_vs_conn_expire_now(cp);
@@ -1194,16 +1243,17 @@ static void ip_vs_conn_flush(void)
 
 	/* the counter may be not NULL, because maybe some conn entries
 	   are run by slow timer handler or unhashed but still referred */
-	if (atomic_read(&ip_vs_conn_count) != 0) {
+	if (atomic_read(&ipvs->conn_count) != 0) {
 		schedule();
 		goto flush_again;
 	}
 }
 
 
-int __init ip_vs_conn_init(void)
+int __net_init __ip_vs_conn_init(struct net *net)
 {
 	int idx;
+	struct netns_ipvs *ipvs = net->ipvs;
 
 	/* Compute size and mask */
 	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
@@ -1212,19 +1262,26 @@ int __init ip_vs_conn_init(void)
 	/*
 	 * Allocate the connection hash table and initialize its list heads
 	 */
-	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
+	ipvs->conn_tab = vmalloc(ip_vs_conn_tab_size *
 				 sizeof(struct list_head));
-	if (!ip_vs_conn_tab)
+	if (!ipvs->conn_tab)
 		return -ENOMEM;
 
 	/* Allocate ip_vs_conn slab cache */
-	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+	/* Todo: find a better way to name the cache */
+	snprintf(ipvs->conn_cname, sizeof(ipvs->conn_cname)-1,
+			"ipvs_conn_%d", atomic_read(&conn_cache_nr) );
+	atomic_inc(&conn_cache_nr);
+
+	ipvs->conn_cachep = kmem_cache_create(ipvs->conn_cname,
 					      sizeof(struct ip_vs_conn), 0,
 					      SLAB_HWCACHE_ALIGN, NULL);
-	if (!ip_vs_conn_cachep) {
-		vfree(ip_vs_conn_tab);
+	if (!ipvs->conn_cachep) {
+		vfree(ipvs->conn_tab);
 		return -ENOMEM;
 	}
+	atomic_set(&ipvs->conn_count, 0);
+	atomic_set(&ipvs->conn_no_cport_cnt, 0);
 
 	pr_info("Connection hash table configured "
 		"(size=%d, memory=%ldKbytes)\n",
@@ -1234,31 +1291,46 @@ int __init ip_vs_conn_init(void)
 		  sizeof(struct ip_vs_conn));
 
 	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
-		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
+		INIT_LIST_HEAD(&ipvs->conn_tab[idx]);
 	}
 
 	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
 		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}
 
-	proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
-	proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
-
-	/* calculate the random value for connection hash */
-	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
 
 	return 0;
 }
+/* Cleanup and release all netns related ... */
+static void __net_exit __ip_vs_conn_cleanup(struct net *net) {
 
+	/* flush all the connection entries first */
+	ip_vs_conn_flush(net);
+	/* Release the empty cache */
+	kmem_cache_destroy(net->ipvs->conn_cachep);
+	proc_net_remove(net, "ip_vs_conn");
+	proc_net_remove(net, "ip_vs_conn_sync");
+	vfree(net->ipvs->conn_tab);
+}
+static struct pernet_operations ipvs_conn_ops = {
+	.init = __ip_vs_conn_init,
+	.exit = __ip_vs_conn_cleanup,
+};
 
-void ip_vs_conn_cleanup(void)
+int __init ip_vs_conn_init(void)
 {
-	/* flush all the connection entries first */
-	ip_vs_conn_flush();
+	int rv;
 
-	/* Release the empty cache */
-	kmem_cache_destroy(ip_vs_conn_cachep);
-	proc_net_remove(&init_net, "ip_vs_conn");
-	proc_net_remove(&init_net, "ip_vs_conn_sync");
-	vfree(ip_vs_conn_tab);
+	rv = register_pernet_subsys(&ipvs_conn_ops);
+
+	/* calculate the random value for connection hash */
+	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+	return rv;
+}
+
+void ip_vs_conn_cleanup(void)
+{
+	unregister_pernet_subsys(&ipvs_conn_ops);
 }
Index: lvs-test-2.6/include/net/ip_vs.h
===================================================================
--- lvs-test-2.6.orig/include/net/ip_vs.h	2010-10-27 06:05:30.000000000 +0900
+++ lvs-test-2.6/include/net/ip_vs.h	2010-10-27 07:30:20.000000000 +0900
@@ -1066,9 +1066,9 @@ static inline void ip_vs_notrack(struct
  *      Netfilter connection tracking
  *      (from ip_vs_nfct.c)
  */
-static inline int ip_vs_conntrack_enabled(void)
+static inline int ip_vs_conntrack_enabled(struct net *net)
 {
-	return sysctl_ip_vs_conntrack;
+	return net->ipvs->sysctl_conntrack;
 }
 
 extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp,
@@ -1081,7 +1081,7 @@ extern void ip_vs_conn_drop_conntrack(st
 
 #else
 
-static inline int ip_vs_conntrack_enabled(void)
+static inline int ip_vs_conntrack_enabled(struct net *net)
 {
 	return 0;
 }

^ permalink raw reply

* Re: [rfc v2 03/10] ipvs network name space aware: conn
From: Simon Horman @ 2010-10-26 22:35 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Daniel Lezcano, Wensong Zhang
In-Reply-To: <20101022202357.344354364@joe.akashicho.tokyo.vergenet.net>

On Fri, Oct 22, 2010 at 10:09:37PM +0200, Simon Horman wrote:
> 
> This patch just contains ip_vs_conn.c
> and does the normal
>  - moving to vars to struct ipvs
>  - adding per netns init and exit
> 
> proc_fs required some extra work with adding/chaning private data to get the net ptr.
> 
> Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>

Sorry, I messed this patch up a bit and will repost.

* I still have not addressed any of the problems beyond the
  original scope of my post, which was to rebase Hans's changes.
  In particular I have not addressed any of the issues that
  Julian raised in response to my patches. Hans, are you planning
  to look into that or should I take another stab at things?

^ permalink raw reply

* [PATCH iproute2] Add passthru mode and support 'mode' parameter with macvtap devices
From: Sridhar Samudrala @ 2010-10-26 22:19 UTC (permalink / raw)
  To: kaber, Arnd Bergmann; +Cc: netdev, kvm@vger.kernel.org

Support a new 'passthru' mode with macvlan and 'mode' parameter
with macvtap devices.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index f5bb2dc..23de79e 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -230,6 +230,7 @@ enum macvlan_mode {
 	MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */
 	MACVLAN_MODE_VEPA    = 2, /* talk to other ports through ext bridge */
 	MACVLAN_MODE_BRIDGE  = 4, /* talk to bridge ports directly */
+	MACVLAN_MODE_PASSTHRU  = 8, /* take over the underlying device */
 };
 
 /* SR-IOV virtual function management section */
diff --git a/ip/Makefile b/ip/Makefile
index 2f223ca..6054e8a 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -3,7 +3,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o \
     ipmaddr.o ipmonitor.o ipmroute.o ipprefix.o iptuntap.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
     iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
-    iplink_macvlan.o
+    iplink_macvlan.o iplink_macvtap.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/iplink_macvlan.c b/ip/iplink_macvlan.c
index a3c78bd..97787f9 100644
--- a/ip/iplink_macvlan.c
+++ b/ip/iplink_macvlan.c
@@ -48,6 +48,8 @@ static int macvlan_parse_opt(struct link_util *lu, int argc, char **argv,
 				mode = MACVLAN_MODE_VEPA;
 			else if (strcmp(*argv, "bridge") == 0)
 				mode = MACVLAN_MODE_BRIDGE;
+			else if (strcmp(*argv, "passthru") == 0)
+				mode = MACVLAN_MODE_PASSTHRU;
 			else
 				return mode_arg();
 
@@ -82,6 +84,7 @@ static void macvlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]
 		  mode == MACVLAN_MODE_PRIVATE ? "private"
 		: mode == MACVLAN_MODE_VEPA    ? "vepa"
 		: mode == MACVLAN_MODE_BRIDGE  ? "bridge"
+		: mode == MACVLAN_MODE_PASSTHRU  ? "passthru"
 		:				 "unknown");
 }
 
diff --git a/ip/iplink_macvtap.c b/ip/iplink_macvtap.c
new file mode 100644
index 0000000..040cc68
--- /dev/null
+++ b/ip/iplink_macvtap.c
@@ -0,0 +1,93 @@
+/*
+ * iplink_macvtap.c	macvtap device support
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <linux/if_link.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+
+static void explain(void)
+{
+	fprintf(stderr,
+		"Usage: ... macvtap mode { private | vepa | bridge | passthru }\n"
+	);
+}
+
+static int mode_arg(void)
+{
+        fprintf(stderr, "Error: argument of \"mode\" must be \"private\", "
+		"\"vepa\" or \"bridge\" \"passthru\"\n");
+        return -1;
+}
+
+static int macvtap_parse_opt(struct link_util *lu, int argc, char **argv,
+			  struct nlmsghdr *n)
+{
+	while (argc > 0) {
+		if (matches(*argv, "mode") == 0) {
+			__u32 mode = 0;
+			NEXT_ARG();
+
+			if (strcmp(*argv, "private") == 0)
+				mode = MACVLAN_MODE_PRIVATE;
+			else if (strcmp(*argv, "vepa") == 0)
+				mode = MACVLAN_MODE_VEPA;
+			else if (strcmp(*argv, "bridge") == 0)
+				mode = MACVLAN_MODE_BRIDGE;
+			else if (strcmp(*argv, "passthru") == 0)
+				mode = MACVLAN_MODE_PASSTHRU;
+			else
+				return mode_arg();
+
+			addattr32(n, 1024, IFLA_MACVLAN_MODE, mode);
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "macvtap: what is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--, argv++;
+	}
+
+	return 0;
+}
+
+static void macvtap_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
+{
+	__u32 mode;
+
+	if (!tb)
+		return;
+
+	if (!tb[IFLA_MACVLAN_MODE] ||
+	    RTA_PAYLOAD(tb[IFLA_MACVLAN_MODE]) < sizeof(__u32))
+		return;
+
+	mode = *(__u32 *)RTA_DATA(tb[IFLA_VLAN_ID]);
+	fprintf(f, " mode %s ",
+		  mode == MACVLAN_MODE_PRIVATE ? "private"
+		: mode == MACVLAN_MODE_VEPA    ? "vepa"
+		: mode == MACVLAN_MODE_BRIDGE  ? "bridge"
+		: mode == MACVLAN_MODE_PASSTHRU  ? "passthru"
+		:				 "unknown");
+}
+
+struct link_util macvtap_link_util = {
+	.id		= "macvtap",
+	.maxattr	= IFLA_MACVLAN_MAX,
+	.parse_opt	= macvtap_parse_opt,
+	.print_opt	= macvtap_print_opt,
+};



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox