Netdev List
 help / color / mirror / Atom feed
* [PATCH] rps: add flow director support
From: Changli Gao @ 2010-04-11 21:42 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Changli Gao

add rps flow director support

with rps flow director, users can do weighted packet dispatching among CPUs.
For example, CPU0:CPU1 is 1:3 for eth0's rx-0:

 localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows  
 localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
 localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 net/core/net-sysfs.c |  176 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 172 insertions(+), 4 deletions(-)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1e7fdd6..d904610 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = {
 	.store = rx_queue_attr_store,
 };
 
+static DEFINE_MUTEX(rps_map_lock);
+
+static ssize_t show_rps_flow(struct netdev_rx_queue *queue,
+			     struct rx_queue_attribute *attribute, char *buf)
+{
+	unsigned long flowid;
+	struct rps_map *map;
+	u16 cpu;
+
+	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
+	rcu_read_lock();
+	map = rcu_dereference(queue->rps_map);
+	if (map && flowid < map->len)
+		cpu = map->cpus[flowid];
+	else
+		cpu = 0;
+	rcu_read_unlock();
+	return sprintf(buf, "%hu\n", cpu);
+}
+
+static ssize_t store_rps_flow(struct netdev_rx_queue *queue,
+			      struct rx_queue_attribute *attribute,
+			      const char *buf, size_t len)
+{
+	unsigned long flowid, cpu;
+	struct rps_map *map;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (strict_strtoul(buf, 0, &cpu))
+		return -EINVAL;
+	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
+
+	mutex_lock(&rps_map_lock);
+	map = queue->rps_map;
+	if (map && flowid < map->len)
+		map->cpus[flowid] = cpu;
+	mutex_unlock(&rps_map_lock);
+
+	return len;
+}
+
+static struct rx_queue_attribute **rps_flow_attribute;
+static int rps_flow_attribute_size;
+
+/* must be called with rps_map_lock locked */
+static int update_rps_flow_files(struct kobject *kobj,
+				 struct rps_map *old_map, struct rps_map *map)
+{
+	int i;
+	int old_map_len = old_map ? old_map->len : 0;
+	int map_len = map ? map->len : 0;
+
+	if (old_map_len >= map_len) {
+		for (i = map_len; i < old_map_len; i++)
+			sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr);
+		return 0;
+	}
+
+	if (map_len > rps_flow_attribute_size) {
+		struct rx_queue_attribute **attrs;
+		char name[sizeof("rps_flow_4294967295")];
+		char *pname;
+
+		attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *),
+				 GFP_KERNEL);
+		if (attrs == NULL)
+			return -ENOMEM;
+		rps_flow_attribute = attrs;
+		for (i = rps_flow_attribute_size; i < map_len; i++) {
+			rps_flow_attribute[i] = kmalloc(sizeof(**attrs),
+							GFP_KERNEL);
+			if (rps_flow_attribute[i] == NULL)
+				break;
+			sprintf(name, "rps_flow_%d", i);
+			pname = kstrdup(name, GFP_KERNEL);
+			if (pname == NULL) {
+				kfree(rps_flow_attribute[i]);
+				break;
+			}
+			rps_flow_attribute[i]->attr.name = pname;
+			rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR;
+			rps_flow_attribute[i]->show = show_rps_flow;
+			rps_flow_attribute[i]->store = store_rps_flow;
+		}
+		rps_flow_attribute_size = i;
+		if (i != map_len)
+			return -ENOMEM;
+	}
+
+	for (i = old_map_len; i < map_len; i++) {
+		if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) {
+			while (--i >= old_map_len)
+				sysfs_remove_file(kobj,
+						  &rps_flow_attribute[i]->attr);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 static ssize_t show_rps_map(struct netdev_rx_queue *queue,
 			    struct rx_queue_attribute *attribute, char *buf)
 {
@@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	struct rps_map *old_map, *map;
 	cpumask_var_t mask;
 	int err, cpu, i;
-	static DEFINE_SPINLOCK(rps_map_lock);
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
 		map = NULL;
 	}
 
-	spin_lock(&rps_map_lock);
+	mutex_lock(&rps_map_lock);
 	old_map = queue->rps_map;
-	rcu_assign_pointer(queue->rps_map, map);
-	spin_unlock(&rps_map_lock);
+	err = update_rps_flow_files(&queue->kobj, old_map, map);
+	if (!err)
+		rcu_assign_pointer(queue->rps_map, map);
+	mutex_unlock(&rps_map_lock);
+
+	if (err)
+		return err;
 
 	if (old_map)
 		call_rcu(&old_map->rcu, rps_map_release);
@@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
 static struct rx_queue_attribute rps_cpus_attribute =
 	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
 
+static ssize_t show_rps_flows(struct netdev_rx_queue *queue,
+		struct rx_queue_attribute *attribute, char *buf)
+{
+	struct rps_map *map;
+	unsigned int len;
+
+	rcu_read_lock();
+	map = rcu_dereference(queue->rps_map);
+	len = map ? map->len : 0;
+	rcu_read_unlock();
+	return sprintf(buf, "%u\n", len);
+}
+
+static ssize_t store_rps_flows(struct netdev_rx_queue *queue,
+			       struct rx_queue_attribute *attribute,
+			       const char *buf, size_t len)
+{
+	struct rps_map *old_map, *map;
+	unsigned long flows;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (strict_strtoul(buf, 0, &flows))
+		return -EINVAL;
+	if (flows != 0) {
+		map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows),
+				    L1_CACHE_BYTES), GFP_KERNEL);
+		if (map == NULL)
+			return -ENOMEM;
+		map->len = flows;
+	} else {
+		map = NULL;
+	}
+
+	mutex_lock(&rps_map_lock);
+	old_map = queue->rps_map;
+	err = update_rps_flow_files(&queue->kobj, old_map, map);
+	if (!err) {
+		if (old_map && map)
+			memcpy(map->cpus, old_map->cpus,
+			       sizeof(map->cpus[0]) *
+			       min_t(unsigned int, flows, old_map->len));
+		rcu_assign_pointer(queue->rps_map, map);
+	}
+	mutex_unlock(&rps_map_lock);
+
+	if (err)
+		return err;
+
+	if (old_map)
+		call_rcu(&old_map->rcu, rps_map_release);
+
+	return len;
+}
+
+static struct rx_queue_attribute rps_flows_attribute =
+	__ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows);
+
 static struct attribute *rx_queue_default_attrs[] = {
 	&rps_cpus_attribute.attr,
+	&rps_flows_attribute.attr,
 	NULL
 };
 

^ permalink raw reply related

* Re: [PATCH v2] can: Add esd board support to plx_pci CAN driver
From: Wolfgang Grandegger @ 2010-04-11 14:50 UTC (permalink / raw)
  To: Matthias Fuchs
  Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <201004071309.56518.matthias.fuchs-iOnpLzIbIdM@public.gmane.org>

Matthias Fuchs wrote:
> This patch adds support for SJA1000 based PCI CAN interface cards
> from electronic system design gmbh.
> 
> Some changes have been done on the common code:
>  - esd boards must not have the 2nd local interupt enabled (PLX9030/9050)
>  - a new path for PLX9056/PEX8311 chips has been added
>  - new plx9056 reset function has been implemented
>  - struct plx_card_info got a reset function entry
> 
> In detail the following additional boards are now supported:
> 
>         CAN-PCI/200 (PCI)
>         CAN-PCI/266 (PCI)
>         CAN-PMC266 (PMC module)
>         CAN-PCIe/2000 (PCI Express)
>         CAN-CPCI/200 (Compact PCI, 3U)
>         CAN-PCI104 (PCI104)
> 
> Signed-off-by: Matthias Fuchs <matthias.fuchs-iOnpLzIbIdM@public.gmane.org>
> ---
> v2:
>  - update Kconfig
>  - add proper plx9056 reset function
>  - add reset function pointer to plx_card_info structure
>  - use card's reset function in plx_pci_del_card()

Acked-by: Wolfgang Grandegger <wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>

Thanks.

Wolfgang.

^ permalink raw reply

* Re: [PATCH] vhost: Make it more scalable by creating a vhost thread per device.
From: Michael S. Tsirkin @ 2010-04-11 15:47 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Tom Lendacky, netdev, kvm@vger.kernel.org
In-Reply-To: <1270771542.31186.397.camel@w-sridhar.beaverton.ibm.com>

On Thu, Apr 08, 2010 at 05:05:42PM -0700, Sridhar Samudrala wrote:
> On Mon, 2010-04-05 at 10:35 -0700, Sridhar Samudrala wrote:
> > On Sun, 2010-04-04 at 14:14 +0300, Michael S. Tsirkin wrote:
> > > On Fri, Apr 02, 2010 at 10:31:20AM -0700, Sridhar Samudrala wrote:
> > > > Make vhost scalable by creating a separate vhost thread per vhost
> > > > device. This provides better scaling across multiple guests and with
> > > > multiple interfaces in a guest.
> > > 
> > > Thanks for looking into this. An alternative approach is
> > > to simply replace create_singlethread_workqueue with
> > > create_workqueue which would get us a thread per host CPU.
> > > 
> > > It seems that in theory this should be the optimal approach
> > > wrt CPU locality, however, in practice a single thread
> > > seems to get better numbers. I have a TODO to investigate this.
> > > Could you try looking into this?
> > 
> > Yes. I tried using create_workqueue(), but the results were not good
> > atleast when the number of guest interfaces is less than the number
> > of CPUs. I didn't try more than 8 guests.
> > Creating a separate thread per guest interface seems to be more
> > scalable based on the testing i have done so far.
> > 
> > I will try some more tests and get some numbers to compare the following
> > 3 options.
> > - single vhost thread
> > - vhost thread per cpu
> > - vhost thread per guest virtio interface
> 
> Here are the results with netperf TCP_STREAM 64K guest to host on a
> 8-cpu Nehalem system. It shows cumulative bandwidth in Mbps and host 
> CPU utilization.
> 
> Current default single vhost thread
> -----------------------------------
> 1 guest:  12500  37%    
> 2 guests: 12800  46%
> 3 guests: 12600  47%
> 4 guests: 12200  47%
> 5 guests: 12000  47%
> 6 guests: 11700  47%
> 7 guests: 11340  47%
> 8 guests: 11200  48%
> 
> vhost thread per cpu
> --------------------
> 1 guest:   4900 25%
> 2 guests: 10800 49%
> 3 guests: 17100 67%
> 4 guests: 20400 84%
> 5 guests: 21000 90%
> 6 guests: 22500 92%
> 7 guests: 23500 96%
> 8 guests: 24500 99%
> 
> vhost thread per guest interface
> --------------------------------
> 1 guest:  12500 37%
> 2 guests: 21000 72%
> 3 guests: 21600 79%
> 4 guests: 21600 85%
> 5 guests: 22500 89%
> 6 guests: 22800 94%
> 7 guests: 24500 98%
> 8 guests: 26400 99%
> 
> Thanks
> Sridhar


Consider using Ingo's perf tool to get error bars, but looks good
overall. One thing I note though is that we seem to be able to
consume up to 99% CPU now. So I think with this approach
we can no longer claim that we are just like some other parts of
networking stack, doing work outside any cgroup, and we should
make the vhost thread inherit the cgroup and cpu mask
from the process calling SET_OWNER.

-- 
MST

^ permalink raw reply

* Re: [PATCH] rps: add flow director support
From: Eric Dumazet @ 2010-04-11 16:05 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev, Tom Herbert
In-Reply-To: <1271022140-3917-1-git-send-email-xiaosuo@gmail.com>

Le lundi 12 avril 2010 à 05:42 +0800, Changli Gao a écrit :
> add rps flow director support
> 
> with rps flow director, users can do weighted packet dispatching among CPUs.
> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
> 
>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows  
>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----

Changli

I am a bit disappointed to find so many bugs in your patch.

I believe this is over engineering at this stage, we yet have to get
some benches or real world results.

Plus it conflicts with the much more interesting upcoming stuff (RFS).
You name this patch 'flow director', to get our attention, but it's an
old idea of you, to get different weights on cpus, that RPS is not yet
able to perform.

Maybe this is the reason you forgot to CC Tom Herbert (and me) ?

Consider now :

1) echo 65000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0
   possible crash, dereferencing a smaller cpumap.

2) echo 3000000000 >/sys/class/net/eth0/queues/rx-0/rps_flow_0
   probable crash, because of overflow in RPS_MAP_SIZE(flows)

3) How can rps_flow_attribute & rps_flow_attribute_size be static (one
instance for whole kernel), if your intent is to have a per rxqueue
attributes ? (/sys/class/net/eth0/queues/rx-0/ ...). Or the first lines
of update_rps_flow_files() are completely wrong...

echo 10 > /sys/class/net/eth0/queues/rx-0/rps_flows
echo 2 > /sys/class/net/eth1/queues/rx-0/rps_flows
cat /sys/class/net/eth0/queues/rx-0/rps_flow_9 

4) Lack of atomic changes of the RPS flows -> many packet reordering can
occur.

5) Many possible memory leaks in update_rps_flow_files(), you obviously
were very lazy. We try to build a bug-free kernel, not only a 'cool
kernel', and if you are lazy, your patches wont be accepted.



>  net/core/net-sysfs.c |  176 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 172 insertions(+), 4 deletions(-)
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 1e7fdd6..d904610 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = {
>  	.store = rx_queue_attr_store,
>  };
>  
> +static DEFINE_MUTEX(rps_map_lock);
> +
> +static ssize_t show_rps_flow(struct netdev_rx_queue *queue,
> +			     struct rx_queue_attribute *attribute, char *buf)
> +{
> +	unsigned long flowid;
> +	struct rps_map *map;
> +	u16 cpu;
> +
> +	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +	rcu_read_lock();
> +	map = rcu_dereference(queue->rps_map);
> +	if (map && flowid < map->len)
> +		cpu = map->cpus[flowid];
> +	else
> +		cpu = 0;
> +	rcu_read_unlock();
> +	return sprintf(buf, "%hu\n", cpu);
> +}
> +
> +static ssize_t store_rps_flow(struct netdev_rx_queue *queue,
> +			      struct rx_queue_attribute *attribute,
> +			      const char *buf, size_t len)
> +{
> +	unsigned long flowid, cpu;
> +	struct rps_map *map;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	if (strict_strtoul(buf, 0, &cpu))
> +		return -EINVAL;
> +	strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +
> +	mutex_lock(&rps_map_lock);
> +	map = queue->rps_map;
> +	if (map && flowid < map->len)
> +		map->cpus[flowid] = cpu;

what can happen is cpu=65000, and NR_CPUS=32 ?

> +	mutex_unlock(&rps_map_lock);
> +
> +	return len;
> +}
> +
> +static struct rx_queue_attribute **rps_flow_attribute;
> +static int rps_flow_attribute_size;
> +
> +/* must be called with rps_map_lock locked */
> +static int update_rps_flow_files(struct kobject *kobj,
> +				 struct rps_map *old_map, struct rps_map *map)
> +{
> +	int i;
> +	int old_map_len = old_map ? old_map->len : 0;
> +	int map_len = map ? map->len : 0;
> +
> +	if (old_map_len >= map_len) {
> +		for (i = map_len; i < old_map_len; i++)
> +			sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr);
	Removing attributes for this rxqueue, while anothe might need them ?

> +		return 0;
> +	}
> +
> +	if (map_len > rps_flow_attribute_size) {
> +		struct rx_queue_attribute **attrs;
> +		char name[sizeof("rps_flow_4294967295")];
> +		char *pname;
> +
> +		attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *),
> +				 GFP_KERNEL);
> +		if (attrs == NULL)
> +			return -ENOMEM;
> +		rps_flow_attribute = attrs;
> +		for (i = rps_flow_attribute_size; i < map_len; i++) {
> +			rps_flow_attribute[i] = kmalloc(sizeof(**attrs),
> +							GFP_KERNEL);
> +			if (rps_flow_attribute[i] == NULL)
> +				break;
> +			sprintf(name, "rps_flow_%d", i);
> +			pname = kstrdup(name, GFP_KERNEL);
> +			if (pname == NULL) {
> +				kfree(rps_flow_attribute[i]);
> +				break;
> +			}
> +			rps_flow_attribute[i]->attr.name = pname;
> +			rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR;
> +			rps_flow_attribute[i]->show = show_rps_flow;
> +			rps_flow_attribute[i]->store = store_rps_flow;
> +		}
> +		rps_flow_attribute_size = i;
> +		if (i != map_len)
> +			return -ENOMEM;
> +	}
> +
> +	for (i = old_map_len; i < map_len; i++) {
> +		if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) {
> +			while (--i >= old_map_len)
> +				sysfs_remove_file(kobj,
> +						  &rps_flow_attribute[i]->attr);

			No changes to rps_flow_atribute_size ?

> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static ssize_t show_rps_map(struct netdev_rx_queue *queue,
>  			    struct rx_queue_attribute *attribute, char *buf)
>  {
> @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  	struct rps_map *old_map, *map;
>  	cpumask_var_t mask;
>  	int err, cpu, i;
> -	static DEFINE_SPINLOCK(rps_map_lock);
>  
>  	if (!capable(CAP_NET_ADMIN))
>  		return -EPERM;
> @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  		map = NULL;
>  	}
>  
> -	spin_lock(&rps_map_lock);
> +	mutex_lock(&rps_map_lock);
>  	old_map = queue->rps_map;
> -	rcu_assign_pointer(queue->rps_map, map);
> -	spin_unlock(&rps_map_lock);
> +	err = update_rps_flow_files(&queue->kobj, old_map, map);
> +	if (!err)
> +		rcu_assign_pointer(queue->rps_map, map);
> +	mutex_unlock(&rps_map_lock);
> +
> +	if (err)
> +		return err;
>  
>  	if (old_map)
>  		call_rcu(&old_map->rcu, rps_map_release);
> @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  static struct rx_queue_attribute rps_cpus_attribute =
>  	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
>  
> +static ssize_t show_rps_flows(struct netdev_rx_queue *queue,
> +		struct rx_queue_attribute *attribute, char *buf)
> +{
> +	struct rps_map *map;
> +	unsigned int len;
> +
> +	rcu_read_lock();
> +	map = rcu_dereference(queue->rps_map);
> +	len = map ? map->len : 0;
> +	rcu_read_unlock();
> +	return sprintf(buf, "%u\n", len);
> +}
> +
> +static ssize_t store_rps_flows(struct netdev_rx_queue *queue,
> +			       struct rx_queue_attribute *attribute,
> +			       const char *buf, size_t len)
> +{
> +	struct rps_map *old_map, *map;
> +	unsigned long flows;
> +	int err;
> +
> +	if (!capable(CAP_NET_ADMIN))
> +		return -EPERM;
> +
> +	if (strict_strtoul(buf, 0, &flows))
> +		return -EINVAL;

Are you aware RPS_MAP_SIZE(0x80000000) can overflow ?

> +	if (flows != 0) {
> +		map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows),
> +				    L1_CACHE_BYTES), GFP_KERNEL);
> +		if (map == NULL)
> +			return -ENOMEM;
> +		map->len = flows;
> +	} else {
> +		map = NULL;
> +	}
> +
> +	mutex_lock(&rps_map_lock);
> +	old_map = queue->rps_map;
> +	err = update_rps_flow_files(&queue->kobj, old_map, map);
> +	if (!err) {
> +		if (old_map && map)
> +			memcpy(map->cpus, old_map->cpus,
> +			       sizeof(map->cpus[0]) *
> +			       min_t(unsigned int, flows, old_map->len));
> +		rcu_assign_pointer(queue->rps_map, map);
> +	}
> +	mutex_unlock(&rps_map_lock);
> +
> +	if (err)
> +		return err;
> +
> +	if (old_map)
> +		call_rcu(&old_map->rcu, rps_map_release);
> +
> +	return len;
> +}
> +
> +static struct rx_queue_attribute rps_flows_attribute =
> +	__ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows);
> +
>  static struct attribute *rx_queue_default_attrs[] = {
>  	&rps_cpus_attribute.attr,
> +	&rps_flows_attribute.attr,
>  	NULL
>  };
>  
> --




^ permalink raw reply

* [PATCH net-next-2.6] net: uninline skb_bond_should_drop()
From: Eric Dumazet @ 2010-04-11 16:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

skb_bond_should_drop() is too big to be inlined.

This patch reduces kernel text size, and its compilation time as well
(shrinking include/linux/netdevice.h)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |   48 ++---------------------------------
 net/core/dev.c            |   49 ++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d1a21b5..470f7c9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2089,54 +2089,14 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
 	dev->gso_max_size = size;
 }
 
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
-					      struct net_device *master)
-{
-	if (skb->pkt_type == PACKET_HOST) {
-		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
-
-		memcpy(dest, master->dev_addr, ETH_ALEN);
-	}
-}
+extern int __skb_bond_should_drop(struct sk_buff *skb,
+				  struct net_device *master);
 
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
 static inline int skb_bond_should_drop(struct sk_buff *skb,
 				       struct net_device *master)
 {
-	if (master) {
-		struct net_device *dev = skb->dev;
-
-		if (master->priv_flags & IFF_MASTER_ARPMON)
-			dev->last_rx = jiffies;
-
-		if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
-			/* Do address unmangle. The local destination address
-			 * will be always the one master has. Provides the right
-			 * functionality in a bridge.
-			 */
-			skb_bond_set_mac_by_master(skb, master);
-		}
-
-		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
-			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-				return 0;
-
-			if (master->priv_flags & IFF_MASTER_ALB) {
-				if (skb->pkt_type != PACKET_BROADCAST &&
-				    skb->pkt_type != PACKET_MULTICAST)
-					return 0;
-			}
-			if (master->priv_flags & IFF_MASTER_8023AD &&
-			    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
-				return 0;
-
-			return 1;
-		}
-	}
+	if (master)
+		return __skb_bond_should_drop(skb, master);
 	return 0;
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index b98ddc6..c5ca39d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2645,6 +2645,55 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+					      struct net_device *master)
+{
+	if (skb->pkt_type == PACKET_HOST) {
+		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+		memcpy(dest, master->dev_addr, ETH_ALEN);
+	}
+}
+
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
+{
+	struct net_device *dev = skb->dev;
+
+	if (master->priv_flags & IFF_MASTER_ARPMON)
+		dev->last_rx = jiffies;
+
+	if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+		/* Do address unmangle. The local destination address
+		 * will be always the one master has. Provides the right
+		 * functionality in a bridge.
+		 */
+		skb_bond_set_mac_by_master(skb, master);
+	}
+
+	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
+			return 0;
+
+		if (master->priv_flags & IFF_MASTER_ALB) {
+			if (skb->pkt_type != PACKET_BROADCAST &&
+			    skb->pkt_type != PACKET_MULTICAST)
+				return 0;
+		}
+		if (master->priv_flags & IFF_MASTER_8023AD &&
+		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+			return 0;
+
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__skb_bond_should_drop);
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;



^ permalink raw reply related

* [PATCH] netstat 1.42 patch adding ROSE support
From: Bernard Pidoux @ 2010-04-11 17:22 UTC (permalink / raw)
  To: dc6iq, net-tools, Linux Netdev List, philb

[-- Attachment #1: Type: text/plain, Size: 2054 bytes --]

Hi,

In netstat README file one can read :

"route/netstat -r do not yet support different address families
cleanly.  IPX/DDP/AX25 people, please feel free to add the code."

This is why I am sending this patch against netstat v 1.42 included 
in net-tools 1.60-23. 
The patch adds AMPR ROSE address family support as may be seen here :

[root@f6bvp-11 net-tools-1.60-23]# netstat -V
net-tools 1.60
netstat 1.42 (2001-04-15)
Fred Baumgarten, Alan Cox, Bernd Eckenfels, Phil Blundell, Tuan Hoang and others
+NEW_ADDRT +RTF_IRTT +RTF_REJECT -FW_MASQUERADE -I18N
AF: (inet) +UNIX +INET -INET6 -IPX +AX25 +NETROM -X25 -ATALK -ECONET +ROSE 
HW:  +ETHER -ARC -SLIP -PPP -TUNNEL -TR +AX25 +NETROM -X25 -FR +ROSE -ASH -SIT -FDDI -HIPPI -HDLC/LAPB 

[root@f6bvp-11 net-tools-1.60-23]# netstat -h
usage: netstat [-veenNcCF] [<Af>] -r         netstat {-V|--version|-h|--help}
------
------
 <Socket>={-t|--tcp} {-u|--udp} {-w|--raw} {-x|--unix} --ax25 --ipx --netrom --rose
  <AF>=Use '-A <af>' or '--<af>'; default: inet
  List of possible address families (which support routing):
    inet (DARPA Internet) ax25 (AMPR AX.25) netrom (AMPR NET/ROM) 
    rose (AMPR ROSE) 

[root@f6bvp-11 net-tools-1.60-23]# netstat --rose
Active ROSE sockets
dest_addr   dest_call  src_addr    src_call  dev   lci neigh   state
2080175520  WP-0       2080175502  WP-0      rose0  32    16   ESTABLISHED
*           *          2080175502  FE6BVP-1  rose0   0     0   LISTENING
2080175520  F6BVP-1    2080175502  F6BVP-12  rose0   0     0   LISTENING
2080175502  FPAD-0     2080175502  WP-0      rose0  32     1   ESTABLISHED
2080428501  F4BWT-10   2080175502  F6BVP-0   rose0   0     0   LISTENING
*           *          2080175502  ROUTE-0   rose0   0     0   LISTENING
*           *          2080175502  F6BVP-15  rose0   0     0   LISTENING
*           *          2080175502  WP-0      rose0   0     0   LISTENING
2080175502  WP-0       2080175502  FPAD-0    rose0   1     1   ESTABLISHED
*           *          2080175502  ??????-?  rose0   0     0   LISTENING

Bernard Pidoux


[-- Attachment #2: net-tools-1.60-23.netstat.rose.patch --]
[-- Type: text/plain, Size: 11913 bytes --]

diff -ruN net-tools-1.60-23/config.in net-tools-1.60-23/config.in
--- net-tools-1.60-23/config.in	2000-05-21 16:32:12.000000000 +0200
+++ net-tools-1.60-23/config.in	2009-11-21 17:19:02.937147071 +0100
@@ -54,7 +54,7 @@
 bool 'Appletalk DDP protocol family' HAVE_AFATALK y
 bool 'AX25 (packet radio) protocol family' HAVE_AFAX25 y
 bool 'NET/ROM (packet radio) protocol family' HAVE_AFNETROM y
-bool 'Rose (packet radio) protocol family' HAVE_AFROSE n
+bool 'Rose (packet radio) protocol family' HAVE_AFROSE y
 bool 'X.25 (CCITT) protocol family' HAVE_AFX25 y
 bool 'Econet protocol family' HAVE_AFECONET n
 bool 'DECnet protocol family' HAVE_AFDECnet n
@@ -71,7 +71,7 @@
 bool 'STRIP (Metricom radio) support' HAVE_HWSTRIP y
 bool 'Token ring (generic) support' HAVE_HWTR y
 bool 'AX25 (packet radio) support' HAVE_HWAX25 y
-bool 'Rose (packet radio) support' HAVE_HWROSE n
+bool 'Rose (packet radio) support' HAVE_HWROSE y
 bool 'NET/ROM (packet radio) support' HAVE_HWNETROM y
 bool 'X.25 (generic) support' HAVE_HWX25 y
 bool 'DLCI/FRAD (frame relay) support' HAVE_HWFR y
diff -ruN net-tools-1.60-23/lib/af.c net-tools-1.60-23/lib/af.c
--- net-tools-1.60-23/lib/af.c	2000-05-20 20:27:23.000000000 +0200
+++ net-tools-1.60-23/lib/af.c	2009-11-21 17:19:02.937147071 +0100
@@ -32,6 +32,7 @@
 int flag_ax25;
 int flag_ddp;
 int flag_netrom;
+int flag_rose;
 int flag_inet;
 int flag_inet6;
 int flag_econet;
@@ -64,6 +65,9 @@
 	"netrom", "netrom", &flag_netrom
     },
     {
+	"rose", "rose", &flag_rose
+    },
+    {
 	"inet", "inet", &flag_inet
     },
     {
@@ -100,6 +104,7 @@
 extern struct aftype inet6_aftype;
 extern struct aftype ax25_aftype;
 extern struct aftype netrom_aftype;
+extern struct aftype rose_aftype;
 extern struct aftype ipx_aftype;
 extern struct aftype ddp_aftype;
 extern struct aftype ec_aftype;
diff -ruN net-tools-1.60-23/lib/getroute.c net-tools-1.60-23/lib/getroute.c
--- net-tools-1.60-23/lib/getroute.c	2000-05-20 20:27:23.000000000 +0200
+++ net-tools-1.60-23/lib/getroute.c	2009-11-21 17:19:02.937147071 +0100
@@ -44,6 +44,7 @@
 extern struct aftype inet6_aftype;
 extern struct aftype ax25_aftype;
 extern struct aftype netrom_aftype;
+extern struct aftype rose_aftype;
 extern struct aftype ipx_aftype;
 extern struct aftype ddp_aftype;
 extern struct aftype x25_aftype;
@@ -59,6 +60,9 @@
 #if HAVE_AFNETROM
     netrom_aftype.rprint = NETROM_rprint;
 #endif
+#if HAVE_AFROSE
+    rose_aftype.rprint = ROSE_rprint;
+#endif
 #if HAVE_AFAX25
     ax25_aftype.rprint = AX25_rprint;
 #endif
diff -ruN net-tools-1.60-23/lib/net-support.h net-tools-1.60-23/lib/net-support.h
--- net-tools-1.60-23/lib/net-support.h	2000-10-28 12:59:42.000000000 +0200
+++ net-tools-1.60-23/lib/net-support.h	2009-11-21 17:19:02.937147071 +0100
@@ -109,6 +109,7 @@
 extern int DDP_rprint(int options);
 extern int IPX_rprint(int options);
 extern int NETROM_rprint(int options);
+extern int ROSE_rprint(int options);
 extern int AX25_rprint(int options);
 extern int X25_rprint(int options);
 
@@ -117,6 +118,7 @@
 extern int DDP_rinput(int action, int flags, char **argv);
 extern int IPX_rinput(int action, int flags, char **argv);
 extern int NETROM_rinput(int action, int flags, char **argv);
+extern int ROSE_rinput(int action, int flags, char **argv);
 extern int AX25_rinput(int action, int flags, char **argv);
 extern int X25_rinput(int action, int flags, char **argv);
 
@@ -130,6 +132,7 @@
 extern int flag_ax25;
 extern int flag_ddp;
 extern int flag_netrom;
+extern int flag_rose;
 extern int flag_x25;
 extern int flag_inet;
 extern int flag_inet6;
@@ -143,6 +146,7 @@
 	{"ipx",         0,	0,	1}, \
 	{"appletalk",	0,	0,	1}, \
 	{"netrom",	0,	0,	1}, \
+       {"rose",        0,      0,      1}, \
 	{"inet",	0,	0,	1}, \
 	{"inet6",	0,	0,	1}, \
 	{"ddp",		0,	0,	1}, \
diff -ruN net-tools-1.60-23/lib/pathnames.h net-tools-1.60-23/lib/pathnames.h
--- net-tools-1.60-23/lib/pathnames.h	2000-05-20 20:27:26.000000000 +0200
+++ net-tools-1.60-23/lib/pathnames.h	2009-11-21 17:19:02.937147071 +0100
@@ -41,6 +41,9 @@
 #define _PATH_PROCNET_DEV		"/proc/net/dev"
 #define _PATH_PROCNET_RARP		"/proc/net/rarp"
 #define _PATH_ETHERS			"/etc/ethers"
+#define _PATH_PROCNET_ROSE		"/proc/net/rose"
+#define _PATH_PROCNET_ROSE_NEIGH	"/proc/net/rose_neigh"
+#define _PATH_PROCNET_ROSE_NODES	"/proc/net/rose_nodes"
 #define _PATH_PROCNET_ROSE_ROUTE	"/proc/net/rose_routes"
 #define _PATH_PROCNET_X25              "/proc/net/x25"
 #define _PATH_PROCNET_X25_ROUTE                "/proc/net/x25_routes"
diff -ruN net-tools-1.60-23/lib/rose_gr.c net-tools-1.60-23/lib/rose_gr.c
--- net-tools-1.60-23/lib/rose_gr.c	1999-01-09 16:55:24.000000000 +0100
+++ net-tools-1.60-23/lib/rose_gr.c	2009-11-21 17:19:02.937147071 +0100
@@ -11,6 +11,9 @@
  *              Copyright 1999 Bernd Eckenfels, Germany
  *              base on Code from Jonathan Naylor <jsn@Cs.Nott.AC.UK>
  *
+ *              Largely rewritten by Bernard Pidoux f6bvp@amsat.org
+ *              November 2009.              
+ *
  *              This program is free software; you can redistribute it
  *              and/or  modify it under  the terms of  the GNU General
  *              Public  License as  published  by  the  Free  Software
@@ -41,27 +44,48 @@
 
 int ROSE_rprint(int options)
 {
-    FILE *f = NULL;
-    char buffer[256];
-    int use;
+    FILE *f1 = NULL;
+    FILE *f2 = NULL;
+    char buffer1[256], buffer2[256];
+    int i, neigh, use;
 
-    f=fopen(_PATH_PROCNET_ROSE_ROUTE, "r");
-    if (f == NULL) {
-	perror(_PATH_PROCNET_ROSE_ROUTE);
+    f2=fopen(_PATH_PROCNET_ROSE_NEIGH, "r");
+    f1=fopen(_PATH_PROCNET_ROSE_NODES, "r");
+    if (f1 == NULL) {
+	perror(_PATH_PROCNET_ROSE_NODES);
 	printf(_("ROSE not configured in this system.\n"));	/* xxx */
 	return 1;
     }
     printf(_("Kernel ROSE routing table\n"));
-    printf(_("Destination  Iface    Use\n"));
-    fgets(buffer, 256, f);
-    while (fgets(buffer, 256, f)) {
-	buffer[9] = 0;
-	buffer[14] = 0;
-	use = atoi(buffer + 15);
-	printf("%-9s    %-5s  %5d\n",
-	       buffer, buffer + 10, use);
+    printf(_("Destination  neigh1 callsign  device  neigh2 callsign  device  neigh3 callsign  device\n"));
+    fgets(buffer1, 256, f1);
+    while (fgets(buffer1, 256, f1)) {
+	buffer1[10] = 0; /* address */
+	buffer1[15] = 0; /* mask */
+	buffer1[17] = 0; /* use */
+	buffer1[23] = 0; /* neigh 1 */
+	buffer1[29] = 0; /* neigh 2 */
+	buffer1[35] = 0; /* neigh 3 */
+/*	mask = atoi(buffer1 + 11); */
+	use = atoi(buffer1 + 16);
+	neigh = atoi(buffer1 + 18);
+	printf("%-10s   ", buffer1);
+	for (i=0; i < use; i++) {
+		neigh = atoi(buffer1 + 6*(i+3));
+		printf("%05d  ", neigh);
+		rewind(f2);
+		fgets(buffer2, 256, f2);
+		while (fgets(buffer2, 256, f2)) {
+			buffer2[15] = 0;
+			buffer2[21] = 0;
+			if (atoi(buffer2) == neigh)
+				printf("%-10s   %-4s", buffer2 + 6, buffer2 + 16);
+    		}
+	}
+	printf("\n");
     }
-    fclose(f);
+    fclose(f1);
+    fclose(f2);
     return 0;
 }
 
diff -ruN net-tools-1.60-23/netstat.c net-tools-1.60-23/netstat.c
--- net-tools-1.60-23/netstat.c	2001-04-15 16:41:17.000000000 +0200
+++ net-tools-1.60-23/netstat.c	2009-11-21 17:19:02.937147071 +0100
@@ -58,6 +58,7 @@
  *
  *990420 {1.38} Tuan Hoang              removed a useless assignment from igmp_do_one()
  *20010404 {1.39} Arnaldo Carvalho de Melo - use setlocale
+ *20091121      Bernard Pidoux          completed ampr ROSE support
  *
  *              This program is free software; you can redistribute it
  *              and/or  modify it under  the terms of  the GNU General
@@ -488,6 +489,49 @@
 }
 #endif
 
+#if HAVE_AFROSE
+static const char *rose_state[] =
+{
+    N_("LISTENING"),
+    N_("CONN SENT"),
+    N_("DISC SENT"),
+    N_("ESTABLISHED")
+};
+
+static int rose_info(void)
+{
+    FILE *f;
+    char buffer[256], dev[6];
+    int ret, st, lci, neigh;
+    char src_addr[10], src_call[9], dest_addr[10], dest_call[9];
+
+    f = fopen(_PATH_PROCNET_ROSE, "r");
+    if (f == NULL) {
+	if (errno != ENOENT) {
+	    perror(_PATH_PROCNET_ROSE);
+	    return (-1);
+	}
+	if (flag_arg || flag_ver)
+	    ESYSNOT("netstat", "AF ROSE");
+	if (flag_arg)
+	    return (1);
+	else
+	    return (0);
+    }
+    printf(_("Active ROSE sockets\n"));
+    printf(_("dest_addr   dest_call  src_addr    src_call  dev   lci neigh  state\n")); 
+    fgets(buffer, 256, f);
+
+    while (fgets(buffer, 256, f)) {
+	ret = sscanf(buffer, "%s %s %s %s %s %d %d %d", dest_addr, dest_call, src_addr, src_call, dev, &lci, &neigh, &st);
+	printf("%-10s  %-9s  %-10s  %-9s %-5s %3d %5d   %s\n", dest_addr, dest_call, src_addr, src_call, dev, lci, neigh, _(rose_state[st]));
+	
+    }
+    fclose(f);
+    return 0;
+}
+#endif
+
 /* These enums are used by IPX too. :-( */
 enum {
     TCP_ESTABLISHED = 1,
@@ -1502,7 +1546,7 @@
     fprintf(stderr, _("        -F, --fib                display Forwarding Information Base (default)\n"));
     fprintf(stderr, _("        -C, --cache              display routing cache instead of FIB\n\n"));
 
-    fprintf(stderr, _("  <Socket>={-t|--tcp} {-u|--udp} {-w|--raw} {-x|--unix} --ax25 --ipx --netrom\n"));
+    fprintf(stderr, _("  <Socket>={-t|--tcp} {-u|--udp} {-w|--raw} {-x|--unix} --ax25 --ipx --netrom --rose\n"));
     fprintf(stderr, _("  <AF>=Use '-A <af>' or '--<af>'; default: %s\n"), DFLT_AF);
     fprintf(stderr, _("  List of possible address families (which support routing):\n"));
     print_aflist(1); /* 1 = routeable */
@@ -1665,7 +1709,7 @@
         flag_inet = flag_inet6 = 1;
 
     flag_arg = flag_tcp + flag_udp + flag_raw + flag_unx + flag_ipx
-	+ flag_ax25 + flag_netrom + flag_igmp + flag_x25;
+	+ flag_ax25 + flag_netrom + flag_rose + flag_igmp + flag_x25;
 
     if (flag_mas) {
 #if HAVE_FW_MASQUERADE && HAVE_AFINET
@@ -1845,6 +1889,18 @@
 	    }
 #endif
 	}
+	if (!flag_arg || flag_rose) {
+#if HAVE_AFROSE
+	    i = rose_info();
+	    if (i)
+		return (i);
+#else
+	    if (flag_arg) {
+		i = 1;
+		ENOSUPP("netstat", "AF ROSE");
+	    }
+#endif
+	}
 	if (!flag_cnt || i)
 	    break;
 	sleep(1);
@@ -1852,3 +1908,4 @@
     }
     return (i);
 }
+
diff -ruN net-tools-1.60-23/po/fr.po net-tools-1.60-23/po/fr.po
--- net-tools-1.60-23/po/fr.po	2000-02-20 22:47:00.000000000 +0100
+++ net-tools-1.60-23/po/fr.po	2009-11-21 17:19:02.937147071 +0100
@@ -797,7 +797,7 @@
 
 #: ../netstat.c:394 ../netstat.c:1089 ../netstat.c:1166
 msgid "LISTENING"
-msgstr "LISTENING"
+msgstr "ECOUTE"
 
 #: ../netstat.c:395
 msgid "CONN SENT"
@@ -809,11 +809,19 @@
 
 #: ../netstat.c:397 ../netstat.c:464 ../netstat.c:809 ../netstat.c:1169
 msgid "ESTABLISHED"
-msgstr "ESTABLISHED"
+msgstr "ETABLIE"
 
-#: ../netstat.c:419
+#: ../netstat.c:467
 msgid "Active NET/ROM sockets\n"
-msgstr "sockets NET/ROM actives\n"
+msgstr "Prises (sockets) NET/ROM actives\n"
+
+#: ../netstat.c:521
+msgid "Active ROSE sockets\n"
+msgstr "Prises (sockets) ROSE actives\n"
+
+#: ../netstat.c:522
+msgid "dest_addr   dest_call  src_addr    src_call  dev   lci neigh  state\n" 
+msgstr "Destinat    call_dest  Source      call_src  Periph lci Voisin Etat \n" 
 
 #: ../netstat.c:420
 msgid ""
@@ -826,7 +834,7 @@
 #: ../netstat.c:430 ../netstat.c:1208
 #, c-format
 msgid "Problem reading data from %s\n"
-msgstr ""
+msgstr "Ne peut lire les données de %s\n"
 
 #: ../netstat.c:465
 msgid "SYN_SENT"
@@ -1008,7 +1016,7 @@
 
 #: ../netstat.c:1184
 msgid "Active AX.25 sockets\n"
-msgstr "Sockets AX.25 actives\n"
+msgstr "Prises (sockets) AX.25 actives\n"
 
 #: ../netstat.c:1185
 msgid "Dest       Source     Device  State        Vr/Vs    Send-Q  Recv-Q\n"

^ permalink raw reply

* [RFC PATCH 2/9] net: fib_rules: set family in fib_rule_hdr centrally
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

All fib_rules implementations need to set the family in their ->fill()
functions. Since the value is available to the generic fib_nl_fill_rule()
function, set it there.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/core/fib_rules.c  |    1 +
 net/decnet/dn_rules.c |    1 -
 net/ipv4/fib_rules.c  |    1 -
 net/ipv6/fib6_rules.c |    1 -
 4 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ca8215a..9ac60ec 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -534,6 +534,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 		return -EMSGSIZE;
 
 	frh = nlmsg_data(nlh);
+	frh->family = ops->family;
 	frh->table = rule->table;
 	NLA_PUT_U32(skb, FRA_TABLE, rule->table);
 	frh->res1 = 0;
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 2d14093..1c8cc6d 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -196,7 +196,6 @@ static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 {
 	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
 
-	frh->family = AF_DECnet;
 	frh->dst_len = r->dst_len;
 	frh->src_len = r->src_len;
 	frh->tos = 0;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 73b6784..a18355e 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,7 +213,6 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 {
 	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-	frh->family = AF_INET;
 	frh->dst_len = rule4->dst_len;
 	frh->src_len = rule4->src_len;
 	frh->tos = rule4->tos;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 5e463c4..92b2b7f 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -208,7 +208,6 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 {
 	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
 
-	frh->family = AF_INET6;
 	frh->dst_len = rule6->dst.plen;
 	frh->src_len = rule6->src.plen;
 	frh->tos = rule6->tclass;
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 1/9] net: fib_rules: consolidate IPv4 and DECnet ->default_pref() functions.
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Both functions are equivalent, consolidate them since a following patch
needs a third implementation for multicast routing.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/fib_rules.h |    1 +
 net/core/fib_rules.c    |   18 ++++++++++++++++++
 net/decnet/dn_rules.c   |   19 +------------------
 net/ipv4/fib_rules.c    |   19 +------------------
 4 files changed, 21 insertions(+), 36 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index c07ac96..665b9fe 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -113,4 +113,5 @@ extern int			fib_rules_lookup(struct fib_rules_ops *,
 extern int			fib_default_rule_add(struct fib_rules_ops *,
 						     u32 pref, u32 table,
 						     u32 flags);
+extern u32			fib_default_rule_pref(struct fib_rules_ops *ops);
 #endif
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 2ff3489..ca8215a 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -38,6 +38,24 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 }
 EXPORT_SYMBOL(fib_default_rule_add);
 
+u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+{
+	struct list_head *pos;
+	struct fib_rule *rule;
+
+	if (!list_empty(&ops->rules_list)) {
+		pos = ops->rules_list.next;
+		if (pos->next != &ops->rules_list) {
+			rule = list_entry(pos->next, struct fib_rule, list);
+			if (rule->pref)
+				return rule->pref - 1;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_pref);
+
 static void notify_rule_change(int event, struct fib_rule *rule,
 			       struct fib_rules_ops *ops, struct nlmsghdr *nlh,
 			       u32 pid);
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 7466c54..2d14093 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -212,23 +212,6 @@ nla_put_failure:
 	return -ENOBUFS;
 }
 
-static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops)
-{
-	struct list_head *pos;
-	struct fib_rule *rule;
-
-	if (!list_empty(&dn_fib_rules_ops->rules_list)) {
-		pos = dn_fib_rules_ops->rules_list.next;
-		if (pos->next != &dn_fib_rules_ops->rules_list) {
-			rule = list_entry(pos->next, struct fib_rule, list);
-			if (rule->pref)
-				return rule->pref - 1;
-		}
-	}
-
-	return 0;
-}
-
 static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
 {
 	dn_rt_cache_flush(-1);
@@ -243,7 +226,7 @@ static struct fib_rules_ops dn_fib_rules_ops_template = {
 	.configure	= dn_fib_rule_configure,
 	.compare	= dn_fib_rule_compare,
 	.fill		= dn_fib_rule_fill,
-	.default_pref	= dn_fib_rule_default_pref,
+	.default_pref	= fib_default_rule_pref,
 	.flush_cache	= dn_fib_rule_flush_cache,
 	.nlgroup	= RTNLGRP_DECnet_RULE,
 	.policy		= dn_fib_rule_policy,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index ca2d07b..73b6784 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -234,23 +234,6 @@ nla_put_failure:
 	return -ENOBUFS;
 }
 
-static u32 fib4_rule_default_pref(struct fib_rules_ops *ops)
-{
-	struct list_head *pos;
-	struct fib_rule *rule;
-
-	if (!list_empty(&ops->rules_list)) {
-		pos = ops->rules_list.next;
-		if (pos->next != &ops->rules_list) {
-			rule = list_entry(pos->next, struct fib_rule, list);
-			if (rule->pref)
-				return rule->pref - 1;
-		}
-	}
-
-	return 0;
-}
-
 static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 {
 	return nla_total_size(4) /* dst */
@@ -272,7 +255,7 @@ static struct fib_rules_ops fib4_rules_ops_template = {
 	.configure	= fib4_rule_configure,
 	.compare	= fib4_rule_compare,
 	.fill		= fib4_rule_fill,
-	.default_pref	= fib4_rule_default_pref,
+	.default_pref	= fib_default_rule_pref,
 	.nlmsg_payload	= fib4_rule_nlmsg_payload,
 	.flush_cache	= fib4_rule_flush_cache,
 	.nlgroup	= RTNLGRP_IPV4_RULE,
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 0/9] net: support multiple independant multicast routing instances
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev

The following patches add support for multiple independant multicast
routing instances. This can be useful to seperate traffic when building
a multicast router that is serving multiple independant networks. Patch
09 contains a more detailed description of this feature in the changelog.

The patchset consists of the following parts:

- Patch 01 and 02 consolidate a small amount of code in the different
  fib_rules users.

- Patch 03 decouples fib_rules family values from real address families
  to allow using them in code that is not a seperate address family without
  increasing AF_MAX/NPROTO.

- Patch 04 moves the raw_sock/raw_sk() definitions from icmp.h to raw.h
  since they will also be used by ipmr in a following patch.

- Patch 05-08 contain some preparatory work and cleanup for supporting
  multiple multicast routing instances.

- Patch 09 contains the actual changes to support multiple multicast
  routing instances.

These patches have been tested using pimd by myself and using xorp by
Ben Greear.

Comments welcome.


^ permalink raw reply

* [RFC PATCH 4/9] ipv4: raw: move struct raw_sock and raw_sk() to include/net/raw.h
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

A following patch will use struct raw_sock to store state for ipmr,
so having the definitions in icmp.h doesn't fit very well anymore.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/icmp.h |   11 -----------
 include/net/raw.h  |   12 ++++++++++++
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/net/icmp.h b/include/net/icmp.h
index 15b3dfe..6e991e0 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -48,15 +48,4 @@ extern void	icmp_out_count(struct net *net, unsigned char type);
 /* Move into dst.h ? */
 extern int 	xrlim_allow(struct dst_entry *dst, int timeout);
 
-struct raw_sock {
-	/* inet_sock has to be the first member */
-	struct inet_sock   inet;
-	struct icmp_filter filter;
-};
-
-static inline struct raw_sock *raw_sk(const struct sock *sk)
-{
-	return (struct raw_sock *)sk;
-}
-
 #endif	/* _ICMP_H */
diff --git a/include/net/raw.h b/include/net/raw.h
index 6c14a65..67cc643 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -19,6 +19,7 @@
 
 
 #include <net/protocol.h>
+#include <linux/icmp.h>
 
 extern struct proto raw_prot;
 
@@ -56,4 +57,15 @@ int raw_seq_open(struct inode *ino, struct file *file,
 void raw_hash_sk(struct sock *sk);
 void raw_unhash_sk(struct sock *sk);
 
+struct raw_sock {
+	/* inet_sock has to be the first member */
+	struct inet_sock   inet;
+	struct icmp_filter filter;
+};
+
+static inline struct raw_sock *raw_sk(const struct sock *sk)
+{
+	return (struct raw_sock *)sk;
+}
+
 #endif	/* _RAW_H */
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 3/9] net: fib_rules: decouple address families from real address families
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Decouple the address family values used for fib_rules from the real
address families in socket.h. This allows to use fib_rules for
code that is not a real address family without increasing AF_MAX/NPROTO.

Values up to 127 are reserved for real address families and map directly
to the corresponding AF value, values starting from 128 are for other
uses. rtnetlink is changed to invoke the AF_UNSPEC dumpit/doit handlers
for these families.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/fib_rules.h |    7 +++++++
 net/core/rtnetlink.c      |   15 ++++++++++-----
 net/decnet/dn_rules.c     |    2 +-
 net/ipv4/fib_rules.c      |    2 +-
 net/ipv6/fib6_rules.c     |    2 +-
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index 51da65b..405e411 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -15,6 +15,13 @@
 /* try to find source address in routing lookups */
 #define FIB_RULE_FIND_SADDR	0x00010000
 
+/* fib_rules families. values up to 127 are reserved for real address
+ * families, values above 128 may be used arbitrarily.
+ */
+#define FIB_RULES_IPV4		AF_INET
+#define FIB_RULES_IPV6		AF_INET6
+#define FIB_RULES_DECNET	AF_DECnet
+
 struct fib_rule_hdr {
 	__u8		family;
 	__u8		dst_len;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bf919b6..78c8598 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -118,7 +118,11 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
 {
 	struct rtnl_link *tab;
 
-	tab = rtnl_msg_handlers[protocol];
+	if (protocol < NPROTO)
+		tab = rtnl_msg_handlers[protocol];
+	else
+		tab = NULL;
+
 	if (tab == NULL || tab[msgindex].doit == NULL)
 		tab = rtnl_msg_handlers[PF_UNSPEC];
 
@@ -129,7 +133,11 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
 {
 	struct rtnl_link *tab;
 
-	tab = rtnl_msg_handlers[protocol];
+	if (protocol < NPROTO)
+		tab = rtnl_msg_handlers[protocol];
+	else
+		tab = NULL;
+
 	if (tab == NULL || tab[msgindex].dumpit == NULL)
 		tab = rtnl_msg_handlers[PF_UNSPEC];
 
@@ -1444,9 +1452,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return 0;
 
 	family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family;
-	if (family >= NPROTO)
-		return -EAFNOSUPPORT;
-
 	sz_idx = type>>2;
 	kind = type&3;
 
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 1c8cc6d..af28dcc 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -217,7 +217,7 @@ static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
 }
 
 static struct fib_rules_ops dn_fib_rules_ops_template = {
-	.family		= AF_DECnet,
+	.family		= FIB_RULES_DECNET,
 	.rule_size	= sizeof(struct dn_fib_rule),
 	.addr_size	= sizeof(u16),
 	.action		= dn_fib_rule_action,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index a18355e..3ec84fe 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -246,7 +246,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
 }
 
 static struct fib_rules_ops fib4_rules_ops_template = {
-	.family		= AF_INET,
+	.family		= FIB_RULES_IPV4,
 	.rule_size	= sizeof(struct fib4_rule),
 	.addr_size	= sizeof(u32),
 	.action		= fib4_rule_action,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 92b2b7f..8124f16 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -238,7 +238,7 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
 }
 
 static struct fib_rules_ops fib6_rules_ops_template = {
-	.family			= AF_INET6,
+	.family			= FIB_RULES_IPV6,
 	.rule_size		= sizeof(struct fib6_rule),
 	.addr_size		= sizeof(struct in6_addr),
 	.action			= fib6_rule_action,
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 5/9] ipv4: ipmr: move unres_queue and timer to per-namespace data
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

The unres_queue is currently shared between all namespaces. Following patches
will additionally allow to create multiple multicast routing tables in each
namespace. Having a single shared queue for all these users seems to excessive,
move the queue and the cleanup timer to the per-namespace data to unshare it.

As a side-effect, this fixes a bug in the seq file iteration functions: the
first entry returned is always from the current namespace, entries returned
after that may belong to any namespace.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netns/ipv4.h |    2 +
 net/ipv4/ipmr.c          |   70 +++++++++++++++++++---------------------------
 2 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2764994..b15e518 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -60,6 +60,8 @@ struct netns_ipv4 {
 
 #ifdef CONFIG_IP_MROUTE
 	struct sock		*mroute_sk;
+	struct timer_list	ipmr_expire_timer;
+	struct mfc_cache	*mfc_unres_queue;
 	struct mfc_cache	**mfc_cache_array;
 	struct vif_device	*vif_table;
 	int			maxvif;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d0a6092..4f30bd9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -79,8 +79,6 @@ static DEFINE_RWLOCK(mrt_lock);
 
 #define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
 
-static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
-
 /* Special spinlock for queue of unresolved entries */
 static DEFINE_SPINLOCK(mfc_unres_lock);
 
@@ -99,8 +97,6 @@ static int ipmr_cache_report(struct net *net,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
 
-static struct timer_list ipmr_expire_timer;
-
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
@@ -363,25 +359,26 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
 }
 
 
-/* Single timer process for all the unresolved queue. */
+/* Timer process for the unresolved queue. */
 
-static void ipmr_expire_process(unsigned long dummy)
+static void ipmr_expire_process(unsigned long arg)
 {
+	struct net *net = (struct net *)arg;
 	unsigned long now;
 	unsigned long expires;
 	struct mfc_cache *c, **cp;
 
 	if (!spin_trylock(&mfc_unres_lock)) {
-		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
+		mod_timer(&net->ipv4.ipmr_expire_timer, jiffies+HZ/10);
 		return;
 	}
 
-	if (mfc_unres_queue == NULL)
+	if (net->ipv4.mfc_unres_queue == NULL)
 		goto out;
 
 	now = jiffies;
 	expires = 10*HZ;
-	cp = &mfc_unres_queue;
+	cp = &net->ipv4.mfc_unres_queue;
 
 	while ((c=*cp) != NULL) {
 		if (time_after(c->mfc_un.unres.expires, now)) {
@@ -397,8 +394,8 @@ static void ipmr_expire_process(unsigned long dummy)
 		ipmr_destroy_unres(c);
 	}
 
-	if (mfc_unres_queue != NULL)
-		mod_timer(&ipmr_expire_timer, jiffies + expires);
+	if (net->ipv4.mfc_unres_queue != NULL)
+		mod_timer(&net->ipv4.ipmr_expire_timer, jiffies + expires);
 
 out:
 	spin_unlock(&mfc_unres_lock);
@@ -707,9 +704,8 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 
 	spin_lock_bh(&mfc_unres_lock);
-	for (c=mfc_unres_queue; c; c=c->next) {
-		if (net_eq(mfc_net(c), net) &&
-		    c->mfc_mcastgrp == iph->daddr &&
+	for (c=net->ipv4.mfc_unres_queue; c; c=c->next) {
+		if (c->mfc_mcastgrp == iph->daddr &&
 		    c->mfc_origin == iph->saddr)
 			break;
 	}
@@ -750,10 +746,10 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 		}
 
 		atomic_inc(&net->ipv4.cache_resolve_queue_len);
-		c->next = mfc_unres_queue;
-		mfc_unres_queue = c;
+		c->next = net->ipv4.mfc_unres_queue;
+		net->ipv4.mfc_unres_queue = c;
 
-		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
+		mod_timer(&net->ipv4.ipmr_expire_timer, c->mfc_un.unres.expires);
 	}
 
 	/*
@@ -848,18 +844,17 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	 *	need to send on the frames and tidy up.
 	 */
 	spin_lock_bh(&mfc_unres_lock);
-	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
+	for (cp = &net->ipv4.mfc_unres_queue; (uc=*cp) != NULL;
 	     cp = &uc->next) {
-		if (net_eq(mfc_net(uc), net) &&
-		    uc->mfc_origin == c->mfc_origin &&
+		if (uc->mfc_origin == c->mfc_origin &&
 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
 			*cp = uc->next;
 			atomic_dec(&net->ipv4.cache_resolve_queue_len);
 			break;
 		}
 	}
-	if (mfc_unres_queue == NULL)
-		del_timer(&ipmr_expire_timer);
+	if (net->ipv4.mfc_unres_queue == NULL)
+		del_timer(&net->ipv4.ipmr_expire_timer);
 	spin_unlock_bh(&mfc_unres_lock);
 
 	if (uc) {
@@ -911,14 +906,9 @@ static void mroute_clean_tables(struct net *net)
 		struct mfc_cache *c, **cp;
 
 		spin_lock_bh(&mfc_unres_lock);
-		cp = &mfc_unres_queue;
+		cp = &net->ipv4.mfc_unres_queue;
 		while ((c = *cp) != NULL) {
-			if (!net_eq(mfc_net(c), net)) {
-				cp = &c->next;
-				continue;
-			}
 			*cp = c->next;
-
 			ipmr_destroy_unres(c);
 		}
 		spin_unlock_bh(&mfc_unres_lock);
@@ -1818,11 +1808,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 				return mfc;
 	read_unlock(&mrt_lock);
 
-	it->cache = &mfc_unres_queue;
+	it->cache = &net->ipv4.mfc_unres_queue;
 	spin_lock_bh(&mfc_unres_lock);
-	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
-		if (net_eq(mfc_net(mfc), net) &&
-		    pos-- == 0)
+	for (mfc = net->ipv4.mfc_unres_queue; mfc; mfc = mfc->next)
+		if (pos-- == 0)
 			return mfc;
 	spin_unlock_bh(&mfc_unres_lock);
 
@@ -1856,7 +1845,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (mfc->next)
 		return mfc->next;
 
-	if (it->cache == &mfc_unres_queue)
+	if (it->cache == &net->ipv4.mfc_unres_queue)
 		goto end_of_list;
 
 	BUG_ON(it->cache != net->ipv4.mfc_cache_array);
@@ -1869,13 +1858,11 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	/* exhausted cache_array, show unresolved */
 	read_unlock(&mrt_lock);
-	it->cache = &mfc_unres_queue;
+	it->cache = &net->ipv4.mfc_unres_queue;
 	it->ct = 0;
 
 	spin_lock_bh(&mfc_unres_lock);
-	mfc = mfc_unres_queue;
-	while (mfc && !net_eq(mfc_net(mfc), net))
-		mfc = mfc->next;
+	mfc = net->ipv4.mfc_unres_queue;
 	if (mfc)
 		return mfc;
 
@@ -1891,7 +1878,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
 
-	if (it->cache == &mfc_unres_queue)
+	if (it->cache == &net->ipv4.mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
 	else if (it->cache == net->ipv4.mfc_cache_array)
 		read_unlock(&mrt_lock);
@@ -1914,7 +1901,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 			   (unsigned long) mfc->mfc_origin,
 			   mfc->mfc_parent);
 
-		if (it->cache != &mfc_unres_queue) {
+		if (it->cache != &net->ipv4.mfc_unres_queue) {
 			seq_printf(seq, " %8lu %8lu %8lu",
 				   mfc->mfc_un.res.pkt,
 				   mfc->mfc_un.res.bytes,
@@ -1991,6 +1978,9 @@ static int __net_init ipmr_net_init(struct net *net)
 		goto fail_mfc_cache;
 	}
 
+	setup_timer(&net->ipv4.ipmr_expire_timer, ipmr_expire_process,
+		    (unsigned long)net);
+
 #ifdef CONFIG_IP_PIMSM
 	net->ipv4.mroute_reg_vif_num = -1;
 #endif
@@ -2046,7 +2036,6 @@ int __init ip_mr_init(void)
 	if (err)
 		goto reg_pernet_fail;
 
-	setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
 	err = register_netdevice_notifier(&ip_mr_notifier);
 	if (err)
 		goto reg_notif_fail;
@@ -2064,7 +2053,6 @@ add_proto_fail:
 	unregister_netdevice_notifier(&ip_mr_notifier);
 #endif
 reg_notif_fail:
-	del_timer(&ipmr_expire_timer);
 	unregister_pernet_subsys(&ipmr_net_ops);
 reg_pernet_fail:
 	kmem_cache_destroy(mrt_cachep);
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 6/9] ipv4: ipmr: remove net pointer from struct mfc_cache
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Now that cache entries in unres_queue don't need to be distinguished by their
network namespace pointer anymore, we can remove it from struct mfc_cache
add pass the namespace as function argument to the functions that need it.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/mroute.h |   15 -----------
 net/ipv4/ipmr.c        |   65 +++++++++++++++++++++++------------------------
 2 files changed, 32 insertions(+), 48 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index c5f3d53..de7780a 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -192,9 +192,6 @@ struct vif_device {
 
 struct mfc_cache {
 	struct mfc_cache *next;			/* Next entry on cache line 	*/
-#ifdef CONFIG_NET_NS
-	struct net *mfc_net;
-#endif
 	__be32 mfc_mcastgrp;			/* Group the entry belongs to 	*/
 	__be32 mfc_origin;			/* Source of packet 		*/
 	vifi_t mfc_parent;			/* Source interface		*/
@@ -217,18 +214,6 @@ struct mfc_cache {
 	} mfc_un;
 };
 
-static inline
-struct net *mfc_net(const struct mfc_cache *mfc)
-{
-	return read_pnet(&mfc->mfc_net);
-}
-
-static inline
-void mfc_net_set(struct mfc_cache *mfc, struct net *net)
-{
-	write_pnet(&mfc->mfc_net, hold_net(net));
-}
-
 #define MFC_STATIC		1
 #define MFC_NOTIFY		2
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 4f30bd9..d7250e2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -92,10 +92,12 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 
 static struct kmem_cache *mrt_cachep __read_mostly;
 
-static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
+static int ip_mr_forward(struct net *net, struct sk_buff *skb,
+			 struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct net *net,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
-static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
+static int ipmr_fill_mroute(struct net *net, struct sk_buff *skb,
+			    struct mfc_cache *c, struct rtmsg *rtm);
 
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
@@ -324,7 +326,6 @@ static int vif_delete(struct net *net, int vifi, int notify,
 
 static inline void ipmr_cache_free(struct mfc_cache *c)
 {
-	release_net(mfc_net(c));
 	kmem_cache_free(mrt_cachep, c);
 }
 
@@ -332,11 +333,10 @@ static inline void ipmr_cache_free(struct mfc_cache *c)
    and reporting error to netlink readers.
  */
 
-static void ipmr_destroy_unres(struct mfc_cache *c)
+static void ipmr_destroy_unres(struct net *net, struct mfc_cache *c)
 {
 	struct sk_buff *skb;
 	struct nlmsgerr *e;
-	struct net *net = mfc_net(c);
 
 	atomic_dec(&net->ipv4.cache_resolve_queue_len);
 
@@ -391,7 +391,7 @@ static void ipmr_expire_process(unsigned long arg)
 
 		*cp = c->next;
 
-		ipmr_destroy_unres(c);
+		ipmr_destroy_unres(net, c);
 	}
 
 	if (net->ipv4.mfc_unres_queue != NULL)
@@ -403,10 +403,10 @@ out:
 
 /* Fill oifs list. It is called under write locked mrt_lock. */
 
-static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
+static void ipmr_update_thresholds(struct net *net, struct mfc_cache *cache,
+				   unsigned char *ttls)
 {
 	int vifi;
-	struct net *net = mfc_net(cache);
 
 	cache->mfc_un.res.minvif = MAXVIFS;
 	cache->mfc_un.res.maxvif = 0;
@@ -546,24 +546,22 @@ static struct mfc_cache *ipmr_cache_find(struct net *net,
 /*
  *	Allocate a multicast cache entry
  */
-static struct mfc_cache *ipmr_cache_alloc(struct net *net)
+static struct mfc_cache *ipmr_cache_alloc(void)
 {
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
 	if (c == NULL)
 		return NULL;
 	c->mfc_un.res.minvif = MAXVIFS;
-	mfc_net_set(c, net);
 	return c;
 }
 
-static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
 {
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
 	if (c == NULL)
 		return NULL;
 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
 	c->mfc_un.unres.expires = jiffies + 10*HZ;
-	mfc_net_set(c, net);
 	return c;
 }
 
@@ -571,7 +569,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
  *	A cache entry has gone into a resolved state from queued
  */
 
-static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
+static void ipmr_cache_resolve(struct net *net, struct mfc_cache *uc,
+			       struct mfc_cache *c)
 {
 	struct sk_buff *skb;
 	struct nlmsgerr *e;
@@ -584,7 +583,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
 		if (ip_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
 
-			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
+			if (ipmr_fill_mroute(net, skb, c, NLMSG_DATA(nlh)) > 0) {
 				nlh->nlmsg_len = (skb_tail_pointer(skb) -
 						  (u8 *)nlh);
 			} else {
@@ -596,9 +595,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
 				memset(&e->msg, 0, sizeof(e->msg));
 			}
 
-			rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid);
+			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
 		} else
-			ip_mr_forward(skb, c, 0);
+			ip_mr_forward(net, skb, c, 0);
 	}
 }
 
@@ -716,7 +715,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 		 */
 
 		if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 ||
-		    (c = ipmr_cache_alloc_unres(net)) == NULL) {
+		    (c = ipmr_cache_alloc_unres()) == NULL) {
 			spin_unlock_bh(&mfc_unres_lock);
 
 			kfree_skb(skb);
@@ -813,7 +812,7 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	if (c != NULL) {
 		write_lock_bh(&mrt_lock);
 		c->mfc_parent = mfc->mfcc_parent;
-		ipmr_update_thresholds(c, mfc->mfcc_ttls);
+		ipmr_update_thresholds(net, c, mfc->mfcc_ttls);
 		if (!mrtsock)
 			c->mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
@@ -823,14 +822,14 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
 		return -EINVAL;
 
-	c = ipmr_cache_alloc(net);
+	c = ipmr_cache_alloc();
 	if (c == NULL)
 		return -ENOMEM;
 
 	c->mfc_origin = mfc->mfcc_origin.s_addr;
 	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
 	c->mfc_parent = mfc->mfcc_parent;
-	ipmr_update_thresholds(c, mfc->mfcc_ttls);
+	ipmr_update_thresholds(net, c, mfc->mfcc_ttls);
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
@@ -858,7 +857,7 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	spin_unlock_bh(&mfc_unres_lock);
 
 	if (uc) {
-		ipmr_cache_resolve(uc, c);
+		ipmr_cache_resolve(net, uc, c);
 		ipmr_cache_free(uc);
 	}
 	return 0;
@@ -909,7 +908,7 @@ static void mroute_clean_tables(struct net *net)
 		cp = &net->ipv4.mfc_unres_queue;
 		while ((c = *cp) != NULL) {
 			*cp = c->next;
-			ipmr_destroy_unres(c);
+			ipmr_destroy_unres(net, c);
 		}
 		spin_unlock_bh(&mfc_unres_lock);
 	}
@@ -1220,9 +1219,9 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
  *	Processing handlers for ipmr_forward
  */
 
-static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
+static void ipmr_queue_xmit(struct net *net, struct sk_buff *skb,
+			    struct mfc_cache *c, int vifi)
 {
-	struct net *net = mfc_net(c);
 	const struct iphdr *iph = ip_hdr(skb);
 	struct vif_device *vif = &net->ipv4.vif_table[vifi];
 	struct net_device *dev;
@@ -1334,11 +1333,11 @@ static int ipmr_find_vif(struct net_device *dev)
 
 /* "local" means that we should preserve one skb (for local delivery) */
 
-static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
+static int ip_mr_forward(struct net *net, struct sk_buff *skb,
+			 struct mfc_cache *cache, int local)
 {
 	int psend = -1;
 	int vif, ct;
-	struct net *net = mfc_net(cache);
 
 	vif = cache->mfc_parent;
 	cache->mfc_un.res.pkt++;
@@ -1395,7 +1394,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
 			if (psend != -1) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 				if (skb2)
-					ipmr_queue_xmit(skb2, cache, psend);
+					ipmr_queue_xmit(net, skb2, cache, psend);
 			}
 			psend = ct;
 		}
@@ -1404,9 +1403,9 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
 		if (local) {
 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 			if (skb2)
-				ipmr_queue_xmit(skb2, cache, psend);
+				ipmr_queue_xmit(net, skb2, cache, psend);
 		} else {
-			ipmr_queue_xmit(skb, cache, psend);
+			ipmr_queue_xmit(net, skb, cache, psend);
 			return 0;
 		}
 	}
@@ -1487,7 +1486,7 @@ int ip_mr_input(struct sk_buff *skb)
 		return -ENODEV;
 	}
 
-	ip_mr_forward(skb, cache, local);
+	ip_mr_forward(net, skb, cache, local);
 
 	read_unlock(&mrt_lock);
 
@@ -1601,11 +1600,11 @@ drop:
 #endif
 
 static int
-ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
+ipmr_fill_mroute(struct net *net, struct sk_buff *skb, struct mfc_cache *c,
+		 struct rtmsg *rtm)
 {
 	int ct;
 	struct rtnexthop *nhp;
-	struct net *net = mfc_net(c);
 	u8 *b = skb_tail_pointer(skb);
 	struct rtattr *mp_head;
 
@@ -1685,7 +1684,7 @@ int ipmr_get_route(struct net *net,
 
 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
 		cache->mfc_flags |= MFC_NOTIFY;
-	err = ipmr_fill_mroute(skb, cache, rtm);
+	err = ipmr_fill_mroute(net, skb, cache, rtm);
 	read_unlock(&mrt_lock);
 	return err;
 }
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 7/9] ipv4: ipmr: convert struct mfc_cache to struct list_head
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/mroute.h   |    2 +-
 include/net/netns/ipv4.h |    4 +-
 net/ipv4/ipmr.c          |  125 ++++++++++++++++++++++-----------------------
 3 files changed, 64 insertions(+), 67 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index de7780a..7ff6c77 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -191,7 +191,7 @@ struct vif_device {
 #define VIFF_STATIC 0x8000
 
 struct mfc_cache {
-	struct mfc_cache *next;			/* Next entry on cache line 	*/
+	struct list_head list;
 	__be32 mfc_mcastgrp;			/* Group the entry belongs to 	*/
 	__be32 mfc_origin;			/* Source of packet 		*/
 	vifi_t mfc_parent;			/* Source interface		*/
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index b15e518..5d06429 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -61,8 +61,8 @@ struct netns_ipv4 {
 #ifdef CONFIG_IP_MROUTE
 	struct sock		*mroute_sk;
 	struct timer_list	ipmr_expire_timer;
-	struct mfc_cache	*mfc_unres_queue;
-	struct mfc_cache	**mfc_cache_array;
+	struct list_head	mfc_unres_queue;
+	struct list_head	*mfc_cache_array;
 	struct vif_device	*vif_table;
 	int			maxvif;
 	atomic_t		cache_resolve_queue_len;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d7250e2..6107790 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -366,35 +366,32 @@ static void ipmr_expire_process(unsigned long arg)
 	struct net *net = (struct net *)arg;
 	unsigned long now;
 	unsigned long expires;
-	struct mfc_cache *c, **cp;
+	struct mfc_cache *c, *next;
 
 	if (!spin_trylock(&mfc_unres_lock)) {
 		mod_timer(&net->ipv4.ipmr_expire_timer, jiffies+HZ/10);
 		return;
 	}
 
-	if (net->ipv4.mfc_unres_queue == NULL)
+	if (list_empty(&net->ipv4.mfc_unres_queue))
 		goto out;
 
 	now = jiffies;
 	expires = 10*HZ;
-	cp = &net->ipv4.mfc_unres_queue;
 
-	while ((c=*cp) != NULL) {
+	list_for_each_entry_safe(c, next, &net->ipv4.mfc_unres_queue, list) {
 		if (time_after(c->mfc_un.unres.expires, now)) {
 			unsigned long interval = c->mfc_un.unres.expires - now;
 			if (interval < expires)
 				expires = interval;
-			cp = &c->next;
 			continue;
 		}
 
-		*cp = c->next;
-
+		list_del(&c->list);
 		ipmr_destroy_unres(net, c);
 	}
 
-	if (net->ipv4.mfc_unres_queue != NULL)
+	if (!list_empty(&net->ipv4.mfc_unres_queue))
 		mod_timer(&net->ipv4.ipmr_expire_timer, jiffies + expires);
 
 out:
@@ -536,11 +533,11 @@ static struct mfc_cache *ipmr_cache_find(struct net *net,
 	int line = MFC_HASH(mcastgrp, origin);
 	struct mfc_cache *c;
 
-	for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) {
-		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
-			break;
+	list_for_each_entry(c, &net->ipv4.mfc_cache_array[line], list) {
+		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
+			return c;
 	}
-	return c;
+	return NULL;
 }
 
 /*
@@ -698,18 +695,21 @@ static int ipmr_cache_report(struct net *net,
 static int
 ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 {
+	bool found = false;
 	int err;
 	struct mfc_cache *c;
 	const struct iphdr *iph = ip_hdr(skb);
 
 	spin_lock_bh(&mfc_unres_lock);
-	for (c=net->ipv4.mfc_unres_queue; c; c=c->next) {
+	list_for_each_entry(c, &net->ipv4.mfc_unres_queue, list) {
 		if (c->mfc_mcastgrp == iph->daddr &&
-		    c->mfc_origin == iph->saddr)
+		    c->mfc_origin == iph->saddr) {
+			found = true;
 			break;
+		}
 	}
 
-	if (c == NULL) {
+	if (!found) {
 		/*
 		 *	Create a new entry if allowable
 		 */
@@ -745,8 +745,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 		}
 
 		atomic_inc(&net->ipv4.cache_resolve_queue_len);
-		c->next = net->ipv4.mfc_unres_queue;
-		net->ipv4.mfc_unres_queue = c;
+		list_add_tail(&c->list, &net->ipv4.mfc_unres_queue);
 
 		mod_timer(&net->ipv4.ipmr_expire_timer, c->mfc_un.unres.expires);
 	}
@@ -773,16 +772,15 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
 {
 	int line;
-	struct mfc_cache *c, **cp;
+	struct mfc_cache *c, *next;
 
 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
 
-	for (cp = &net->ipv4.mfc_cache_array[line];
-	     (c = *cp) != NULL; cp = &c->next) {
+	list_for_each_entry_safe(c, next, &net->ipv4.mfc_cache_array[line], list) {
 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
 			write_lock_bh(&mrt_lock);
-			*cp = c->next;
+			list_del(&c->list);
 			write_unlock_bh(&mrt_lock);
 
 			ipmr_cache_free(c);
@@ -794,22 +792,24 @@ static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
 
 static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 {
+	bool found = false;
 	int line;
-	struct mfc_cache *uc, *c, **cp;
+	struct mfc_cache *uc, *c;
 
 	if (mfc->mfcc_parent >= MAXVIFS)
 		return -ENFILE;
 
 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
 
-	for (cp = &net->ipv4.mfc_cache_array[line];
-	     (c = *cp) != NULL; cp = &c->next) {
+	list_for_each_entry(c, &net->ipv4.mfc_cache_array[line], list) {
 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
-		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
+		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+			found = true;
 			break;
+		}
 	}
 
-	if (c != NULL) {
+	if (found) {
 		write_lock_bh(&mrt_lock);
 		c->mfc_parent = mfc->mfcc_parent;
 		ipmr_update_thresholds(net, c, mfc->mfcc_ttls);
@@ -834,8 +834,7 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
 	write_lock_bh(&mrt_lock);
-	c->next = net->ipv4.mfc_cache_array[line];
-	net->ipv4.mfc_cache_array[line] = c;
+	list_add_tail(&c->list, &net->ipv4.mfc_cache_array[line]);
 	write_unlock_bh(&mrt_lock);
 
 	/*
@@ -843,16 +842,15 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	 *	need to send on the frames and tidy up.
 	 */
 	spin_lock_bh(&mfc_unres_lock);
-	for (cp = &net->ipv4.mfc_unres_queue; (uc=*cp) != NULL;
-	     cp = &uc->next) {
+	list_for_each_entry(uc, &net->ipv4.mfc_unres_queue, list) {
 		if (uc->mfc_origin == c->mfc_origin &&
 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
-			*cp = uc->next;
+			list_del(&uc->list);
 			atomic_dec(&net->ipv4.cache_resolve_queue_len);
 			break;
 		}
 	}
-	if (net->ipv4.mfc_unres_queue == NULL)
+	if (list_empty(&net->ipv4.mfc_unres_queue))
 		del_timer(&net->ipv4.ipmr_expire_timer);
 	spin_unlock_bh(&mfc_unres_lock);
 
@@ -871,6 +869,7 @@ static void mroute_clean_tables(struct net *net)
 {
 	int i;
 	LIST_HEAD(list);
+	struct mfc_cache *c, *next;
 
 	/*
 	 *	Shut down all active vif entries
@@ -884,17 +883,12 @@ static void mroute_clean_tables(struct net *net)
 	/*
 	 *	Wipe the cache
 	 */
-	for (i=0; i<MFC_LINES; i++) {
-		struct mfc_cache *c, **cp;
-
-		cp = &net->ipv4.mfc_cache_array[i];
-		while ((c = *cp) != NULL) {
-			if (c->mfc_flags&MFC_STATIC) {
-				cp = &c->next;
+	for (i = 0; i < MFC_LINES; i++) {
+		list_for_each_entry_safe(c, next, &net->ipv4.mfc_cache_array[i], list) {
+			if (c->mfc_flags&MFC_STATIC)
 				continue;
-			}
 			write_lock_bh(&mrt_lock);
-			*cp = c->next;
+			list_del(&c->list);
 			write_unlock_bh(&mrt_lock);
 
 			ipmr_cache_free(c);
@@ -902,12 +896,9 @@ static void mroute_clean_tables(struct net *net)
 	}
 
 	if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) {
-		struct mfc_cache *c, **cp;
-
 		spin_lock_bh(&mfc_unres_lock);
-		cp = &net->ipv4.mfc_unres_queue;
-		while ((c = *cp) != NULL) {
-			*cp = c->next;
+		list_for_each_entry_safe(c, next, &net->ipv4.mfc_unres_queue, list) {
+			list_del(&c->list);
 			ipmr_destroy_unres(net, c);
 		}
 		spin_unlock_bh(&mfc_unres_lock);
@@ -1788,7 +1779,7 @@ static const struct file_operations ipmr_vif_fops = {
 
 struct ipmr_mfc_iter {
 	struct seq_net_private p;
-	struct mfc_cache **cache;
+	struct list_head *cache;
 	int ct;
 };
 
@@ -1798,18 +1789,18 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 {
 	struct mfc_cache *mfc;
 
-	it->cache = net->ipv4.mfc_cache_array;
 	read_lock(&mrt_lock);
-	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
-		for (mfc = net->ipv4.mfc_cache_array[it->ct];
-		     mfc; mfc = mfc->next)
+	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
+		it->cache = &net->ipv4.mfc_cache_array[it->ct];
+		list_for_each_entry(mfc, it->cache, list)
 			if (pos-- == 0)
 				return mfc;
+	}
 	read_unlock(&mrt_lock);
 
-	it->cache = &net->ipv4.mfc_unres_queue;
 	spin_lock_bh(&mfc_unres_lock);
-	for (mfc = net->ipv4.mfc_unres_queue; mfc; mfc = mfc->next)
+	it->cache = &net->ipv4.mfc_unres_queue;
+	list_for_each_entry(mfc, it->cache, list)
 		if (pos-- == 0)
 			return mfc;
 	spin_unlock_bh(&mfc_unres_lock);
@@ -1841,18 +1832,19 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (v == SEQ_START_TOKEN)
 		return ipmr_mfc_seq_idx(net, seq->private, 0);
 
-	if (mfc->next)
-		return mfc->next;
+	if (mfc->list.next != it->cache)
+		return list_entry(mfc->list.next, struct mfc_cache, list);
 
 	if (it->cache == &net->ipv4.mfc_unres_queue)
 		goto end_of_list;
 
-	BUG_ON(it->cache != net->ipv4.mfc_cache_array);
+	BUG_ON(it->cache != &net->ipv4.mfc_cache_array[it->ct]);
 
 	while (++it->ct < MFC_LINES) {
-		mfc = net->ipv4.mfc_cache_array[it->ct];
-		if (mfc)
-			return mfc;
+		it->cache = &net->ipv4.mfc_cache_array[it->ct];
+		if (list_empty(it->cache))
+			continue;
+		return list_first_entry(it->cache, struct mfc_cache, list);
 	}
 
 	/* exhausted cache_array, show unresolved */
@@ -1861,9 +1853,8 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	it->ct = 0;
 
 	spin_lock_bh(&mfc_unres_lock);
-	mfc = net->ipv4.mfc_unres_queue;
-	if (mfc)
-		return mfc;
+	if (!list_empty(it->cache))
+		return list_first_entry(it->cache, struct mfc_cache, list);
 
  end_of_list:
 	spin_unlock_bh(&mfc_unres_lock);
@@ -1879,7 +1870,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
 	if (it->cache == &net->ipv4.mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == net->ipv4.mfc_cache_array)
+	else if (it->cache == &net->ipv4.mfc_cache_array[it->ct])
 		read_unlock(&mrt_lock);
 }
 
@@ -1959,6 +1950,7 @@ static const struct net_protocol pim_protocol = {
  */
 static int __net_init ipmr_net_init(struct net *net)
 {
+	unsigned int i;
 	int err = 0;
 
 	net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
@@ -1970,13 +1962,18 @@ static int __net_init ipmr_net_init(struct net *net)
 
 	/* Forwarding cache */
 	net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
-					    sizeof(struct mfc_cache *),
+					    sizeof(struct list_head),
 					    GFP_KERNEL);
 	if (!net->ipv4.mfc_cache_array) {
 		err = -ENOMEM;
 		goto fail_mfc_cache;
 	}
 
+	for (i = 0; i < MFC_LINES; i++)
+		INIT_LIST_HEAD(&net->ipv4.mfc_cache_array[i]);
+
+	INIT_LIST_HEAD(&net->ipv4.mfc_unres_queue);
+
 	setup_timer(&net->ipv4.ipmr_expire_timer, ipmr_expire_process,
 		    (unsigned long)net);
 
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 9/9] net: ipmr: support multiple tables
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

This patch adds support for multiple independant multicast routing instances,
named "tables".

Userspace multicast routing daemons can bind to a specific table instance by
issuing a setsockopt call using a new option MRT_TABLE. The table number is
stored in the raw socket data and affects all following ipmr setsockopt(),
getsockopt() and ioctl() calls. By default, a single table (RT_TABLE_DEFAULT)
is created with a default routing rule pointing to it. Newly created pimreg
devices have the table number appended ("pimregX"), with the exception of
devices created in the default table, which are named just "pimreg" for
compatibility reasons.

Packets are directed to a specific table instance using routing rules,
similar to how regular routing rules work. Currently iif, oif and mark
are supported as keys, source and destination addresses could be supported
additionally.

Example usage:

- bind pimd/xorp/... to a specific table:

uint32_t table = 123;
setsockopt(fd, IPPROTO_IP, MRT_TABLE, &table, sizeof(table));

- create routing rules directing packets to the new table:

# ip mrule add iif eth0 lookup 123
# ip mrule add oif eth0 lookup 123

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/fib_rules.h |    1 +
 include/linux/mroute.h    |    3 +-
 include/net/netns/ipv4.h  |    5 +
 include/net/raw.h         |    1 +
 net/ipv4/Kconfig          |   14 ++
 net/ipv4/ipmr.c           |  399 ++++++++++++++++++++++++++++++++++++++-------
 6 files changed, 361 insertions(+), 62 deletions(-)

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index 405e411..04a3976 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -21,6 +21,7 @@
 #define FIB_RULES_IPV4		AF_INET
 #define FIB_RULES_IPV6		AF_INET6
 #define FIB_RULES_DECNET	AF_DECnet
+#define FIB_RULES_IPMR		128
 
 struct fib_rule_hdr {
 	__u8		family;
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 7ff6c77..fa04b24 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -27,7 +27,8 @@
 #define MRT_DEL_MFC	(MRT_BASE+5)	/* Delete a multicast forwarding entry	*/
 #define MRT_VERSION	(MRT_BASE+6)	/* Get the kernel multicast version	*/
 #define MRT_ASSERT	(MRT_BASE+7)	/* Activate PIM assert mode		*/
-#define MRT_PIM		(MRT_BASE+8)	/* enable PIM code	*/
+#define MRT_PIM		(MRT_BASE+8)	/* enable PIM code			*/
+#define MRT_TABLE	(MRT_BASE+9)	/* Specify mroute table ID		*/
 
 #define SIOCGETVIFCNT	SIOCPROTOPRIVATE	/* IP protocol privates */
 #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 72e762a..ae07fee 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -59,7 +59,12 @@ struct netns_ipv4 {
 	atomic_t rt_genid;
 
 #ifdef CONFIG_IP_MROUTE
+#ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES
 	struct mr_table		*mrt;
+#else
+	struct list_head	mr_tables;
+	struct fib_rules_ops	*mr_rules_ops;
+#endif
 #endif
 };
 #endif
diff --git a/include/net/raw.h b/include/net/raw.h
index 67cc643..43c5750 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -61,6 +61,7 @@ struct raw_sock {
 	/* inet_sock has to be the first member */
 	struct inet_sock   inet;
 	struct icmp_filter filter;
+	u32		   ipmr_table;
 };
 
 static inline struct raw_sock *raw_sk(const struct sock *sk)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index c9a1c68..be59774 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -250,6 +250,20 @@ config IP_MROUTE
 	  <file:Documentation/networking/multicast.txt>. If you haven't heard
 	  about it, you don't need it.
 
+config IP_MROUTE_MULTIPLE_TABLES
+	bool "IP: multicast policy routing"
+	depends on IP_ADVANCED_ROUTER
+	select FIB_RULES
+	help
+	  Normally, a multicast router runs a userspace daemon and decides
+	  what to do with a multicast packet based on the source and
+	  destination addresses. If you say Y here, the multicast router
+	  will also be able to take interfaces and packet marks into
+	  account and run multiple instances of userspace daemons
+	  simultaneously, each one handling a single table.
+
+	  If unsure, say N.
+
 config IP_PIMSM_V1
 	bool "IP: PIM-SM version 1 support"
 	depends on IP_MROUTE
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index b733a12..2046413 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -62,12 +62,15 @@
 #include <net/ipip.h>
 #include <net/checksum.h>
 #include <net/netlink.h>
+#include <net/fib_rules.h>
 
 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
 #define CONFIG_IP_PIMSM	1
 #endif
 
 struct mr_table {
+	struct list_head	list;
+	u32			id;
 	struct sock		*mroute_sk;
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc_unres_queue;
@@ -82,6 +85,14 @@ struct mr_table {
 #endif
 };
 
+struct ipmr_rule {
+	struct fib_rule		common;
+};
+
+struct ipmr_result {
+	struct mr_table		*mrt;
+};
+
 /* Big lock, protecting vif table, mrt cache and mroute socket state.
    Note that the changes are semaphored via rtnl_lock.
  */
@@ -107,6 +118,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 
 static struct kmem_cache *mrt_cachep __read_mostly;
 
+static struct mr_table *ipmr_new_table(struct net *net, u32 id);
 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 			 struct sk_buff *skb, struct mfc_cache *cache,
 			 int local);
@@ -114,6 +126,206 @@ static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			    struct mfc_cache *c, struct rtmsg *rtm);
+static void ipmr_expire_process(unsigned long arg);
+
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+#define ipmr_for_each_table(mrt, net) \
+	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	struct mr_table *mrt;
+
+	ipmr_for_each_table(mrt, net) {
+		if (mrt->id == id)
+			return mrt;
+	}
+	return NULL;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+			   struct mr_table **mrt)
+{
+	struct ipmr_result res;
+	struct fib_lookup_arg arg = { .result = &res, };
+	int err;
+
+	err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
+	if (err < 0)
+		return err;
+	*mrt = res.mrt;
+	return 0;
+}
+
+static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	struct ipmr_result *res = arg->result;
+	struct mr_table *mrt;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	mrt = ipmr_get_table(rule->fr_net, rule->table);
+	if (mrt == NULL)
+		return -EAGAIN;
+	res->mrt = mrt;
+	return 0;
+}
+
+static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	return 1;
+}
+
+static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
+	FRA_GENERIC_POLICY,
+};
+
+static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct fib_rule_hdr *frh, struct nlattr **tb)
+{
+	return 0;
+}
+
+static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	return 1;
+}
+
+static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct fib_rule_hdr *frh)
+{
+	frh->dst_len = 0;
+	frh->src_len = 0;
+	frh->tos     = 0;
+	return 0;
+}
+
+static struct fib_rules_ops ipmr_rules_ops_template = {
+	.family		= FIB_RULES_IPMR,
+	.rule_size	= sizeof(struct ipmr_rule),
+	.addr_size	= sizeof(u32),
+	.action		= ipmr_rule_action,
+	.match		= ipmr_rule_match,
+	.configure	= ipmr_rule_configure,
+	.compare	= ipmr_rule_compare,
+	.default_pref	= fib_default_rule_pref,
+	.fill		= ipmr_rule_fill,
+	.nlgroup	= RTNLGRP_IPV4_RULE,
+	.policy		= ipmr_rule_policy,
+	.owner		= THIS_MODULE,
+};
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+	struct fib_rules_ops *ops;
+	struct mr_table *mrt;
+	int err;
+
+	ops = fib_rules_register(&ipmr_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	INIT_LIST_HEAD(&net->ipv4.mr_tables);
+
+	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
+	if (err < 0)
+		goto err2;
+
+	net->ipv4.mr_rules_ops = ops;
+	return 0;
+
+err2:
+	kfree(mrt);
+err1:
+	fib_rules_unregister(ops);
+	return err;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+	struct mr_table *mrt, *next;
+
+	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list)
+		kfree(mrt);
+	fib_rules_unregister(net->ipv4.mr_rules_ops);
+}
+#else
+#define ipmr_for_each_table(mrt, net) \
+	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	return net->ipv4.mrt;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+			   struct mr_table **mrt)
+{
+	*mrt = net->ipv4.mrt;
+	return 0;
+}
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+	net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+	return net->ipv4.mrt ? 0 : -ENOMEM;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+	kfree(net->ipv4.mrt);
+}
+#endif
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id)
+{
+	struct mr_table *mrt;
+	unsigned int i;
+
+	mrt = ipmr_get_table(net, id);
+	if (mrt != NULL)
+		return mrt;
+
+	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+	if (mrt == NULL)
+		return NULL;
+	mrt->id = id;
+
+	/* Forwarding cache */
+	for (i = 0; i < MFC_LINES; i++)
+		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
+
+	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
+
+	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
+		    (unsigned long)mrt);
+
+#ifdef CONFIG_IP_PIMSM
+	mrt->mroute_reg_vif_num = -1;
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
+#endif
+	return mrt;
+}
 
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
@@ -214,7 +426,17 @@ failure:
 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
+	struct flowi fl = {
+		.oif		= dev->ifindex,
+		.iif		= skb->skb_iif,
+		.mark		= skb->mark,
+	};
+	int err;
+
+	err = ipmr_fib_lookup(net, &fl, &mrt);
+	if (err < 0)
+		return err;
 
 	read_lock(&mrt_lock);
 	dev->stats.tx_bytes += skb->len;
@@ -239,12 +461,18 @@ static void reg_vif_setup(struct net_device *dev)
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 }
 
-static struct net_device *ipmr_reg_vif(struct net *net)
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 {
 	struct net_device *dev;
 	struct in_device *in_dev;
+	char name[IFNAMSIZ];
+
+	if (mrt->id == RT_TABLE_DEFAULT)
+		sprintf(name, "pimreg");
+	else
+		sprintf(name, "pimreg%u", mrt->id);
 
-	dev = alloc_netdev(0, "pimreg", reg_vif_setup);
+	dev = alloc_netdev(0, name, reg_vif_setup);
 
 	if (dev == NULL)
 		return NULL;
@@ -460,7 +688,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 		 */
 		if (mrt->mroute_reg_vif_num >= 0)
 			return -EADDRINUSE;
-		dev = ipmr_reg_vif(net);
+		dev = ipmr_reg_vif(net, mrt);
 		if (!dev)
 			return -ENOBUFS;
 		err = dev_set_allmulti(dev, 1);
@@ -927,17 +1155,19 @@ static void mroute_clean_tables(struct mr_table *mrt)
 static void mrtsock_destruct(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
 
 	rtnl_lock();
-	if (sk == mrt->mroute_sk) {
-		IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+	ipmr_for_each_table(mrt, net) {
+		if (sk == mrt->mroute_sk) {
+			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
 
-		write_lock_bh(&mrt_lock);
-		mrt->mroute_sk = NULL;
-		write_unlock_bh(&mrt_lock);
+			write_lock_bh(&mrt_lock);
+			mrt->mroute_sk = NULL;
+			write_unlock_bh(&mrt_lock);
 
-		mroute_clean_tables(mrt);
+			mroute_clean_tables(mrt);
+		}
 	}
 	rtnl_unlock();
 }
@@ -955,7 +1185,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	struct vifctl vif;
 	struct mfcctl mfc;
 	struct net *net = sock_net(sk);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return -ENOENT;
 
 	if (optname != MRT_INIT) {
 		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
@@ -1054,6 +1288,27 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		return ret;
 	}
 #endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+	case MRT_TABLE:
+	{
+		u32 v;
+
+		if (optlen != sizeof(u32))
+			return -EINVAL;
+		if (get_user(v, (u32 __user *)optval))
+			return -EFAULT;
+		if (sk == mrt->mroute_sk)
+			return -EBUSY;
+
+		rtnl_lock();
+		ret = 0;
+		if (!ipmr_new_table(net, v))
+			ret = -ENOMEM;
+		raw_sk(sk)->ipmr_table = v;
+		rtnl_unlock();
+		return ret;
+	}
+#endif
 	/*
 	 *	Spurious command, or MRT_VERSION which you cannot
 	 *	set.
@@ -1072,7 +1327,11 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
 	int olr;
 	int val;
 	struct net *net = sock_net(sk);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return -ENOENT;
 
 	if (optname != MRT_VERSION &&
 #ifdef CONFIG_IP_PIMSM
@@ -1114,7 +1373,11 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
 	struct vif_device *vif;
 	struct mfc_cache *c;
 	struct net *net = sock_net(sk);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return -ENOENT;
 
 	switch (cmd) {
 	case SIOCGETVIFCNT:
@@ -1165,17 +1428,20 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
 {
 	struct net_device *dev = ptr;
 	struct net *net = dev_net(dev);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
 	struct vif_device *v;
 	int ct;
 	LIST_HEAD(list);
 
 	if (event != NETDEV_UNREGISTER)
 		return NOTIFY_DONE;
-	v = &mrt->vif_table[0];
-	for (ct = 0; ct < mrt->maxvif; ct++, v++) {
-		if (v->dev == dev)
-			vif_delete(mrt, ct, 1, &list);
+
+	ipmr_for_each_table(mrt, net) {
+		v = &mrt->vif_table[0];
+		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
+			if (v->dev == dev)
+				vif_delete(mrt, ct, 1, &list);
+		}
 	}
 	unregister_netdevice_many(&list);
 	return NOTIFY_DONE;
@@ -1442,8 +1708,9 @@ int ip_mr_input(struct sk_buff *skb)
 {
 	struct mfc_cache *cache;
 	struct net *net = dev_net(skb->dev);
-	struct mr_table *mrt = net->ipv4.mrt;
 	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+	struct mr_table *mrt;
+	int err;
 
 	/* Packet is looped back after forward, it should not be
 	   forwarded second time, but still can be delivered locally.
@@ -1451,6 +1718,10 @@ int ip_mr_input(struct sk_buff *skb)
 	if (IPCB(skb)->flags&IPSKB_FORWARDED)
 		goto dont_forward;
 
+	err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
+	if (err < 0)
+		return err;
+
 	if (!local) {
 		    if (IPCB(skb)->opt.router_alert) {
 			    if (ip_call_ra_chain(skb))
@@ -1521,12 +1792,11 @@ dont_forward:
 }
 
 #ifdef CONFIG_IP_PIMSM
-static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+		     unsigned int pimlen)
 {
 	struct net_device *reg_dev = NULL;
 	struct iphdr *encap;
-	struct net *net = dev_net(skb->dev);
-	struct mr_table *mrt = net->ipv4.mrt;
 
 	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
 	/*
@@ -1577,18 +1847,21 @@ int pim_rcv_v1(struct sk_buff * skb)
 {
 	struct igmphdr *pim;
 	struct net *net = dev_net(skb->dev);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
 
 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
 		goto drop;
 
 	pim = igmp_hdr(skb);
 
+	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+		goto drop;
+
 	if (!mrt->mroute_do_pim ||
 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
 		goto drop;
 
-	if (__pim_rcv(skb, sizeof(*pim))) {
+	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
 drop:
 		kfree_skb(skb);
 	}
@@ -1600,6 +1873,8 @@ drop:
 static int pim_rcv(struct sk_buff * skb)
 {
 	struct pimreghdr *pim;
+	struct net *net = dev_net(skb->dev);
+	struct mr_table *mrt;
 
 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
 		goto drop;
@@ -1611,7 +1886,10 @@ static int pim_rcv(struct sk_buff * skb)
 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
 		goto drop;
 
-	if (__pim_rcv(skb, sizeof(*pim))) {
+	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+		goto drop;
+
+	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
 drop:
 		kfree_skb(skb);
 	}
@@ -1662,10 +1940,14 @@ int ipmr_get_route(struct net *net,
 		   struct sk_buff *skb, struct rtmsg *rtm, int nowait)
 {
 	int err;
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt;
 	struct mfc_cache *cache;
 	struct rtable *rt = skb_rtable(skb);
 
+	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return -ENOENT;
+
 	read_lock(&mrt_lock);
 	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
 
@@ -1716,6 +1998,7 @@ int ipmr_get_route(struct net *net,
  */
 struct ipmr_vif_iter {
 	struct seq_net_private p;
+	struct mr_table *mrt;
 	int ct;
 };
 
@@ -1723,7 +2006,7 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
 					   struct ipmr_vif_iter *iter,
 					   loff_t pos)
 {
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt = iter->mrt;
 
 	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
 		if (!VIF_EXISTS(mrt, iter->ct))
@@ -1737,7 +2020,15 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(mrt_lock)
 {
+	struct ipmr_vif_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return ERR_PTR(-ENOENT);
+
+	iter->mrt = mrt;
 
 	read_lock(&mrt_lock);
 	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
@@ -1748,7 +2039,7 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ipmr_vif_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt = iter->mrt;
 
 	++*pos;
 	if (v == SEQ_START_TOKEN)
@@ -1770,8 +2061,8 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
 
 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 {
-	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct ipmr_vif_iter *iter = seq->private;
+	struct mr_table *mrt = iter->mrt;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -1813,6 +2104,7 @@ static const struct file_operations ipmr_vif_fops = {
 
 struct ipmr_mfc_iter {
 	struct seq_net_private p;
+	struct mr_table *mrt;
 	struct list_head *cache;
 	int ct;
 };
@@ -1821,7 +2113,7 @@ struct ipmr_mfc_iter {
 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 					  struct ipmr_mfc_iter *it, loff_t pos)
 {
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt = it->mrt;
 	struct mfc_cache *mfc;
 
 	read_lock(&mrt_lock);
@@ -1849,7 +2141,13 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return ERR_PTR(-ENOENT);
 
+	it->mrt = mrt;
 	it->cache = NULL;
 	it->ct = 0;
 	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
@@ -1861,7 +2159,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	struct mfc_cache *mfc = v;
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt = it->mrt;
 
 	++*pos;
 
@@ -1902,8 +2200,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 {
 	struct ipmr_mfc_iter *it = seq->private;
-	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = net->ipv4.mrt;
+	struct mr_table *mrt = it->mrt;
 
 	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
@@ -1914,8 +2211,6 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 {
 	int n;
-	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = net->ipv4.mrt;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -1923,6 +2218,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 	} else {
 		const struct mfc_cache *mfc = v;
 		const struct ipmr_mfc_iter *it = seq->private;
+		const struct mr_table *mrt = it->mrt;
 
 		seq_printf(seq, "%08lX %08lX %-3hd",
 			   (unsigned long) mfc->mfc_mcastgrp,
@@ -1988,28 +2284,11 @@ static const struct net_protocol pim_protocol = {
  */
 static int __net_init ipmr_net_init(struct net *net)
 {
-	struct mr_table *mrt;
-	unsigned int i;
-	int err = 0;
+	int err;
 
-	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
-	if (mrt == NULL) {
-		err = -ENOMEM;
+	err = ipmr_rules_init(net);
+	if (err < 0)
 		goto fail;
-	}
-
-	/* Forwarding cache */
-	for (i = 0; i < MFC_LINES; i++)
-		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
-
-	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
-
-	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
-		    (unsigned long)net);
-
-#ifdef CONFIG_IP_PIMSM
-	mrt->mroute_reg_vif_num = -1;
-#endif
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
@@ -2018,15 +2297,13 @@ static int __net_init ipmr_net_init(struct net *net)
 	if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
 		goto proc_cache_fail;
 #endif
-
-	net->ipv4.mrt = mrt;
 	return 0;
 
 #ifdef CONFIG_PROC_FS
 proc_cache_fail:
 	proc_net_remove(net, "ip_mr_vif");
 proc_vif_fail:
-	kfree(mrt);
+	ipmr_rules_exit(net);
 #endif
 fail:
 	return err;
@@ -2038,7 +2315,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
 	proc_net_remove(net, "ip_mr_cache");
 	proc_net_remove(net, "ip_mr_vif");
 #endif
-	kfree(net->ipv4.mrt);
+	ipmr_rules_exit(net);
 }
 
 static struct pernet_operations ipmr_net_ops = {
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH 8/9] net: ipmr: move mroute data into seperate structure
From: kaber @ 2010-04-11 17:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1271007435-20035-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netns/ipv4.h |   13 +--
 net/ipv4/ipmr.c          |  369 +++++++++++++++++++++++++---------------------
 2 files changed, 200 insertions(+), 182 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 5d06429..72e762a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -59,18 +59,7 @@ struct netns_ipv4 {
 	atomic_t rt_genid;
 
 #ifdef CONFIG_IP_MROUTE
-	struct sock		*mroute_sk;
-	struct timer_list	ipmr_expire_timer;
-	struct list_head	mfc_unres_queue;
-	struct list_head	*mfc_cache_array;
-	struct vif_device	*vif_table;
-	int			maxvif;
-	atomic_t		cache_resolve_queue_len;
-	int			mroute_do_assert;
-	int			mroute_do_pim;
-#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
-	int			mroute_reg_vif_num;
-#endif
+	struct mr_table		*mrt;
 #endif
 };
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 6107790..b733a12 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -67,6 +67,21 @@
 #define CONFIG_IP_PIMSM	1
 #endif
 
+struct mr_table {
+	struct sock		*mroute_sk;
+	struct timer_list	ipmr_expire_timer;
+	struct list_head	mfc_unres_queue;
+	struct list_head	mfc_cache_array[MFC_LINES];
+	struct vif_device	vif_table[MAXVIFS];
+	int			maxvif;
+	atomic_t		cache_resolve_queue_len;
+	int			mroute_do_assert;
+	int			mroute_do_pim;
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+	int			mroute_reg_vif_num;
+#endif
+};
+
 /* Big lock, protecting vif table, mrt cache and mroute socket state.
    Note that the changes are semaphored via rtnl_lock.
  */
@@ -77,7 +92,7 @@ static DEFINE_RWLOCK(mrt_lock);
  *	Multicast router control variables
  */
 
-#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
+#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
 
 /* Special spinlock for queue of unresolved entries */
 static DEFINE_SPINLOCK(mfc_unres_lock);
@@ -92,11 +107,12 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 
 static struct kmem_cache *mrt_cachep __read_mostly;
 
-static int ip_mr_forward(struct net *net, struct sk_buff *skb,
-			 struct mfc_cache *cache, int local);
-static int ipmr_cache_report(struct net *net,
+static int ip_mr_forward(struct net *net, struct mr_table *mrt,
+			 struct sk_buff *skb, struct mfc_cache *cache,
+			 int local);
+static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
-static int ipmr_fill_mroute(struct net *net, struct sk_buff *skb,
+static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			    struct mfc_cache *c, struct rtmsg *rtm);
 
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
@@ -198,12 +214,12 @@ failure:
 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	read_lock(&mrt_lock);
 	dev->stats.tx_bytes += skb->len;
 	dev->stats.tx_packets++;
-	ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num,
-			  IGMPMSG_WHOLEPKT);
+	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
 	read_unlock(&mrt_lock);
 	kfree_skb(skb);
 	return NETDEV_TX_OK;
@@ -273,17 +289,17 @@ failure:
  *	@notify: Set to 1, if the caller is a notifier_call
  */
 
-static int vif_delete(struct net *net, int vifi, int notify,
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 		      struct list_head *head)
 {
 	struct vif_device *v;
 	struct net_device *dev;
 	struct in_device *in_dev;
 
-	if (vifi < 0 || vifi >= net->ipv4.maxvif)
+	if (vifi < 0 || vifi >= mrt->maxvif)
 		return -EADDRNOTAVAIL;
 
-	v = &net->ipv4.vif_table[vifi];
+	v = &mrt->vif_table[vifi];
 
 	write_lock_bh(&mrt_lock);
 	dev = v->dev;
@@ -295,17 +311,17 @@ static int vif_delete(struct net *net, int vifi, int notify,
 	}
 
 #ifdef CONFIG_IP_PIMSM
-	if (vifi == net->ipv4.mroute_reg_vif_num)
-		net->ipv4.mroute_reg_vif_num = -1;
+	if (vifi == mrt->mroute_reg_vif_num)
+		mrt->mroute_reg_vif_num = -1;
 #endif
 
-	if (vifi+1 == net->ipv4.maxvif) {
+	if (vifi+1 == mrt->maxvif) {
 		int tmp;
 		for (tmp=vifi-1; tmp>=0; tmp--) {
-			if (VIF_EXISTS(net, tmp))
+			if (VIF_EXISTS(mrt, tmp))
 				break;
 		}
-		net->ipv4.maxvif = tmp+1;
+		mrt->maxvif = tmp+1;
 	}
 
 	write_unlock_bh(&mrt_lock);
@@ -333,12 +349,13 @@ static inline void ipmr_cache_free(struct mfc_cache *c)
    and reporting error to netlink readers.
  */
 
-static void ipmr_destroy_unres(struct net *net, struct mfc_cache *c)
+static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 {
+	struct net *net = NULL; //mrt->net;
 	struct sk_buff *skb;
 	struct nlmsgerr *e;
 
-	atomic_dec(&net->ipv4.cache_resolve_queue_len);
+	atomic_dec(&mrt->cache_resolve_queue_len);
 
 	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
 		if (ip_hdr(skb)->version == 0) {
@@ -363,23 +380,23 @@ static void ipmr_destroy_unres(struct net *net, struct mfc_cache *c)
 
 static void ipmr_expire_process(unsigned long arg)
 {
-	struct net *net = (struct net *)arg;
+	struct mr_table *mrt = (struct mr_table *)arg;
 	unsigned long now;
 	unsigned long expires;
 	struct mfc_cache *c, *next;
 
 	if (!spin_trylock(&mfc_unres_lock)) {
-		mod_timer(&net->ipv4.ipmr_expire_timer, jiffies+HZ/10);
+		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
 		return;
 	}
 
-	if (list_empty(&net->ipv4.mfc_unres_queue))
+	if (list_empty(&mrt->mfc_unres_queue))
 		goto out;
 
 	now = jiffies;
 	expires = 10*HZ;
 
-	list_for_each_entry_safe(c, next, &net->ipv4.mfc_unres_queue, list) {
+	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
 		if (time_after(c->mfc_un.unres.expires, now)) {
 			unsigned long interval = c->mfc_un.unres.expires - now;
 			if (interval < expires)
@@ -388,11 +405,11 @@ static void ipmr_expire_process(unsigned long arg)
 		}
 
 		list_del(&c->list);
-		ipmr_destroy_unres(net, c);
+		ipmr_destroy_unres(mrt, c);
 	}
 
-	if (!list_empty(&net->ipv4.mfc_unres_queue))
-		mod_timer(&net->ipv4.ipmr_expire_timer, jiffies + expires);
+	if (!list_empty(&mrt->mfc_unres_queue))
+		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
 
 out:
 	spin_unlock(&mfc_unres_lock);
@@ -400,7 +417,7 @@ out:
 
 /* Fill oifs list. It is called under write locked mrt_lock. */
 
-static void ipmr_update_thresholds(struct net *net, struct mfc_cache *cache,
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
 				   unsigned char *ttls)
 {
 	int vifi;
@@ -409,8 +426,8 @@ static void ipmr_update_thresholds(struct net *net, struct mfc_cache *cache,
 	cache->mfc_un.res.maxvif = 0;
 	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
 
-	for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) {
-		if (VIF_EXISTS(net, vifi) &&
+	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+		if (VIF_EXISTS(mrt, vifi) &&
 		    ttls[vifi] && ttls[vifi] < 255) {
 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
 			if (cache->mfc_un.res.minvif > vifi)
@@ -421,16 +438,17 @@ static void ipmr_update_thresholds(struct net *net, struct mfc_cache *cache,
 	}
 }
 
-static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
+static int vif_add(struct net *net, struct mr_table *mrt,
+		   struct vifctl *vifc, int mrtsock)
 {
 	int vifi = vifc->vifc_vifi;
-	struct vif_device *v = &net->ipv4.vif_table[vifi];
+	struct vif_device *v = &mrt->vif_table[vifi];
 	struct net_device *dev;
 	struct in_device *in_dev;
 	int err;
 
 	/* Is vif busy ? */
-	if (VIF_EXISTS(net, vifi))
+	if (VIF_EXISTS(mrt, vifi))
 		return -EADDRINUSE;
 
 	switch (vifc->vifc_flags) {
@@ -440,7 +458,7 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
 		 * Special Purpose VIF in PIM
 		 * All the packets will be sent to the daemon
 		 */
-		if (net->ipv4.mroute_reg_vif_num >= 0)
+		if (mrt->mroute_reg_vif_num >= 0)
 			return -EADDRINUSE;
 		dev = ipmr_reg_vif(net);
 		if (!dev)
@@ -518,22 +536,22 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
 	v->dev = dev;
 #ifdef CONFIG_IP_PIMSM
 	if (v->flags&VIFF_REGISTER)
-		net->ipv4.mroute_reg_vif_num = vifi;
+		mrt->mroute_reg_vif_num = vifi;
 #endif
-	if (vifi+1 > net->ipv4.maxvif)
-		net->ipv4.maxvif = vifi+1;
+	if (vifi+1 > mrt->maxvif)
+		mrt->maxvif = vifi+1;
 	write_unlock_bh(&mrt_lock);
 	return 0;
 }
 
-static struct mfc_cache *ipmr_cache_find(struct net *net,
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 					 __be32 origin,
 					 __be32 mcastgrp)
 {
 	int line = MFC_HASH(mcastgrp, origin);
 	struct mfc_cache *c;
 
-	list_for_each_entry(c, &net->ipv4.mfc_cache_array[line], list) {
+	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
 		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
 			return c;
 	}
@@ -566,8 +584,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
  *	A cache entry has gone into a resolved state from queued
  */
 
-static void ipmr_cache_resolve(struct net *net, struct mfc_cache *uc,
-			       struct mfc_cache *c)
+static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
+			       struct mfc_cache *uc, struct mfc_cache *c)
 {
 	struct sk_buff *skb;
 	struct nlmsgerr *e;
@@ -580,7 +598,7 @@ static void ipmr_cache_resolve(struct net *net, struct mfc_cache *uc,
 		if (ip_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
 
-			if (ipmr_fill_mroute(net, skb, c, NLMSG_DATA(nlh)) > 0) {
+			if (ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
 				nlh->nlmsg_len = (skb_tail_pointer(skb) -
 						  (u8 *)nlh);
 			} else {
@@ -594,7 +612,7 @@ static void ipmr_cache_resolve(struct net *net, struct mfc_cache *uc,
 
 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
 		} else
-			ip_mr_forward(net, skb, c, 0);
+			ip_mr_forward(net, mrt, skb, c, 0);
 	}
 }
 
@@ -605,7 +623,7 @@ static void ipmr_cache_resolve(struct net *net, struct mfc_cache *uc,
  *	Called under mrt_lock.
  */
 
-static int ipmr_cache_report(struct net *net,
+static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert)
 {
 	struct sk_buff *skb;
@@ -638,7 +656,7 @@ static int ipmr_cache_report(struct net *net,
 		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
 		msg->im_msgtype = IGMPMSG_WHOLEPKT;
 		msg->im_mbz = 0;
-		msg->im_vif = net->ipv4.mroute_reg_vif_num;
+		msg->im_vif = mrt->mroute_reg_vif_num;
 		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
 		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
 					     sizeof(struct iphdr));
@@ -670,7 +688,7 @@ static int ipmr_cache_report(struct net *net,
 	skb->transport_header = skb->network_header;
 	}
 
-	if (net->ipv4.mroute_sk == NULL) {
+	if (mrt->mroute_sk == NULL) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -678,7 +696,7 @@ static int ipmr_cache_report(struct net *net,
 	/*
 	 *	Deliver to mrouted
 	 */
-	ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb);
+	ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
 	if (ret < 0) {
 		if (net_ratelimit())
 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -693,7 +711,7 @@ static int ipmr_cache_report(struct net *net,
  */
 
 static int
-ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
+ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 {
 	bool found = false;
 	int err;
@@ -701,7 +719,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(c, &net->ipv4.mfc_unres_queue, list) {
+	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
 		if (c->mfc_mcastgrp == iph->daddr &&
 		    c->mfc_origin == iph->saddr) {
 			found = true;
@@ -714,7 +732,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 		 *	Create a new entry if allowable
 		 */
 
-		if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 ||
+		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
 		    (c = ipmr_cache_alloc_unres()) == NULL) {
 			spin_unlock_bh(&mfc_unres_lock);
 
@@ -732,7 +750,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 		/*
 		 *	Reflect first query at mrouted.
 		 */
-		err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE);
+		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
 		if (err < 0) {
 			/* If the report failed throw the cache entry
 			   out - Brad Parker
@@ -744,10 +762,10 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
 			return err;
 		}
 
-		atomic_inc(&net->ipv4.cache_resolve_queue_len);
-		list_add_tail(&c->list, &net->ipv4.mfc_unres_queue);
+		atomic_inc(&mrt->cache_resolve_queue_len);
+		list_add_tail(&c->list, &mrt->mfc_unres_queue);
 
-		mod_timer(&net->ipv4.ipmr_expire_timer, c->mfc_un.unres.expires);
+		mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
 	}
 
 	/*
@@ -769,14 +787,14 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
  *	MFC cache manipulation by user space mroute daemon
  */
 
-static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
 {
 	int line;
 	struct mfc_cache *c, *next;
 
 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
 
-	list_for_each_entry_safe(c, next, &net->ipv4.mfc_cache_array[line], list) {
+	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
 			write_lock_bh(&mrt_lock);
@@ -790,7 +808,8 @@ static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
 	return -ENOENT;
 }
 
-static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
+static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
+			struct mfcctl *mfc, int mrtsock)
 {
 	bool found = false;
 	int line;
@@ -801,7 +820,7 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 
 	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
 
-	list_for_each_entry(c, &net->ipv4.mfc_cache_array[line], list) {
+	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
 			found = true;
@@ -812,7 +831,7 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	if (found) {
 		write_lock_bh(&mrt_lock);
 		c->mfc_parent = mfc->mfcc_parent;
-		ipmr_update_thresholds(net, c, mfc->mfcc_ttls);
+		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
 		if (!mrtsock)
 			c->mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
@@ -829,12 +848,12 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	c->mfc_origin = mfc->mfcc_origin.s_addr;
 	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
 	c->mfc_parent = mfc->mfcc_parent;
-	ipmr_update_thresholds(net, c, mfc->mfcc_ttls);
+	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
 	write_lock_bh(&mrt_lock);
-	list_add_tail(&c->list, &net->ipv4.mfc_cache_array[line]);
+	list_add_tail(&c->list, &mrt->mfc_cache_array[line]);
 	write_unlock_bh(&mrt_lock);
 
 	/*
@@ -842,20 +861,20 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 	 *	need to send on the frames and tidy up.
 	 */
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(uc, &net->ipv4.mfc_unres_queue, list) {
+	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
 		if (uc->mfc_origin == c->mfc_origin &&
 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
 			list_del(&uc->list);
-			atomic_dec(&net->ipv4.cache_resolve_queue_len);
+			atomic_dec(&mrt->cache_resolve_queue_len);
 			break;
 		}
 	}
-	if (list_empty(&net->ipv4.mfc_unres_queue))
-		del_timer(&net->ipv4.ipmr_expire_timer);
+	if (list_empty(&mrt->mfc_unres_queue))
+		del_timer(&mrt->ipmr_expire_timer);
 	spin_unlock_bh(&mfc_unres_lock);
 
 	if (uc) {
-		ipmr_cache_resolve(net, uc, c);
+		ipmr_cache_resolve(net, mrt, uc, c);
 		ipmr_cache_free(uc);
 	}
 	return 0;
@@ -865,7 +884,7 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
  *	Close the multicast socket, and clear the vif tables etc
  */
 
-static void mroute_clean_tables(struct net *net)
+static void mroute_clean_tables(struct mr_table *mrt)
 {
 	int i;
 	LIST_HEAD(list);
@@ -874,9 +893,9 @@ static void mroute_clean_tables(struct net *net)
 	/*
 	 *	Shut down all active vif entries
 	 */
-	for (i = 0; i < net->ipv4.maxvif; i++) {
-		if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
-			vif_delete(net, i, 0, &list);
+	for (i = 0; i < mrt->maxvif; i++) {
+		if (!(mrt->vif_table[i].flags&VIFF_STATIC))
+			vif_delete(mrt, i, 0, &list);
 	}
 	unregister_netdevice_many(&list);
 
@@ -884,7 +903,7 @@ static void mroute_clean_tables(struct net *net)
 	 *	Wipe the cache
 	 */
 	for (i = 0; i < MFC_LINES; i++) {
-		list_for_each_entry_safe(c, next, &net->ipv4.mfc_cache_array[i], list) {
+		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
 			if (c->mfc_flags&MFC_STATIC)
 				continue;
 			write_lock_bh(&mrt_lock);
@@ -895,11 +914,11 @@ static void mroute_clean_tables(struct net *net)
 		}
 	}
 
-	if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) {
+	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, next, &net->ipv4.mfc_unres_queue, list) {
+		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
-			ipmr_destroy_unres(net, c);
+			ipmr_destroy_unres(mrt, c);
 		}
 		spin_unlock_bh(&mfc_unres_lock);
 	}
@@ -908,16 +927,17 @@ static void mroute_clean_tables(struct net *net)
 static void mrtsock_destruct(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	rtnl_lock();
-	if (sk == net->ipv4.mroute_sk) {
+	if (sk == mrt->mroute_sk) {
 		IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
 
 		write_lock_bh(&mrt_lock);
-		net->ipv4.mroute_sk = NULL;
+		mrt->mroute_sk = NULL;
 		write_unlock_bh(&mrt_lock);
 
-		mroute_clean_tables(net);
+		mroute_clean_tables(mrt);
 	}
 	rtnl_unlock();
 }
@@ -935,9 +955,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	struct vifctl vif;
 	struct mfcctl mfc;
 	struct net *net = sock_net(sk);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	if (optname != MRT_INIT) {
-		if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
+		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
@@ -950,7 +971,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -ENOPROTOOPT;
 
 		rtnl_lock();
-		if (net->ipv4.mroute_sk) {
+		if (mrt->mroute_sk) {
 			rtnl_unlock();
 			return -EADDRINUSE;
 		}
@@ -958,7 +979,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
 		if (ret == 0) {
 			write_lock_bh(&mrt_lock);
-			net->ipv4.mroute_sk = sk;
+			mrt->mroute_sk = sk;
 			write_unlock_bh(&mrt_lock);
 
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
@@ -966,7 +987,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		rtnl_unlock();
 		return ret;
 	case MRT_DONE:
-		if (sk != net->ipv4.mroute_sk)
+		if (sk != mrt->mroute_sk)
 			return -EACCES;
 		return ip_ra_control(sk, 0, NULL);
 	case MRT_ADD_VIF:
@@ -979,9 +1000,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -ENFILE;
 		rtnl_lock();
 		if (optname == MRT_ADD_VIF) {
-			ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
+			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
 		} else {
-			ret = vif_delete(net, vif.vifc_vifi, 0, NULL);
+			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
 		}
 		rtnl_unlock();
 		return ret;
@@ -998,9 +1019,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -EFAULT;
 		rtnl_lock();
 		if (optname == MRT_DEL_MFC)
-			ret = ipmr_mfc_delete(net, &mfc);
+			ret = ipmr_mfc_delete(mrt, &mfc);
 		else
-			ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk);
+			ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
 		rtnl_unlock();
 		return ret;
 		/*
@@ -1011,7 +1032,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		int v;
 		if (get_user(v,(int __user *)optval))
 			return -EFAULT;
-		net->ipv4.mroute_do_assert = (v) ? 1 : 0;
+		mrt->mroute_do_assert = (v) ? 1 : 0;
 		return 0;
 	}
 #ifdef CONFIG_IP_PIMSM
@@ -1025,9 +1046,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 
 		rtnl_lock();
 		ret = 0;
-		if (v != net->ipv4.mroute_do_pim) {
-			net->ipv4.mroute_do_pim = v;
-			net->ipv4.mroute_do_assert = v;
+		if (v != mrt->mroute_do_pim) {
+			mrt->mroute_do_pim = v;
+			mrt->mroute_do_assert = v;
 		}
 		rtnl_unlock();
 		return ret;
@@ -1051,6 +1072,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
 	int olr;
 	int val;
 	struct net *net = sock_net(sk);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	if (optname != MRT_VERSION &&
 #ifdef CONFIG_IP_PIMSM
@@ -1072,10 +1094,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
 		val = 0x0305;
 #ifdef CONFIG_IP_PIMSM
 	else if (optname == MRT_PIM)
-		val = net->ipv4.mroute_do_pim;
+		val = mrt->mroute_do_pim;
 #endif
 	else
-		val = net->ipv4.mroute_do_assert;
+		val = mrt->mroute_do_assert;
 	if (copy_to_user(optval, &val, olr))
 		return -EFAULT;
 	return 0;
@@ -1092,16 +1114,17 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
 	struct vif_device *vif;
 	struct mfc_cache *c;
 	struct net *net = sock_net(sk);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	switch (cmd) {
 	case SIOCGETVIFCNT:
 		if (copy_from_user(&vr, arg, sizeof(vr)))
 			return -EFAULT;
-		if (vr.vifi >= net->ipv4.maxvif)
+		if (vr.vifi >= mrt->maxvif)
 			return -EINVAL;
 		read_lock(&mrt_lock);
-		vif = &net->ipv4.vif_table[vr.vifi];
-		if (VIF_EXISTS(net, vr.vifi)) {
+		vif = &mrt->vif_table[vr.vifi];
+		if (VIF_EXISTS(mrt, vr.vifi)) {
 			vr.icount = vif->pkt_in;
 			vr.ocount = vif->pkt_out;
 			vr.ibytes = vif->bytes_in;
@@ -1119,7 +1142,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
 			return -EFAULT;
 
 		read_lock(&mrt_lock);
-		c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr);
+		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
 		if (c) {
 			sr.pktcnt = c->mfc_un.res.pkt;
 			sr.bytecnt = c->mfc_un.res.bytes;
@@ -1142,16 +1165,17 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
 {
 	struct net_device *dev = ptr;
 	struct net *net = dev_net(dev);
+	struct mr_table *mrt = net->ipv4.mrt;
 	struct vif_device *v;
 	int ct;
 	LIST_HEAD(list);
 
 	if (event != NETDEV_UNREGISTER)
 		return NOTIFY_DONE;
-	v = &net->ipv4.vif_table[0];
-	for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
+	v = &mrt->vif_table[0];
+	for (ct = 0; ct < mrt->maxvif; ct++, v++) {
 		if (v->dev == dev)
-			vif_delete(net, ct, 1, &list);
+			vif_delete(mrt, ct, 1, &list);
 	}
 	unregister_netdevice_many(&list);
 	return NOTIFY_DONE;
@@ -1210,11 +1234,11 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
  *	Processing handlers for ipmr_forward
  */
 
-static void ipmr_queue_xmit(struct net *net, struct sk_buff *skb,
-			    struct mfc_cache *c, int vifi)
+static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
+			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	struct vif_device *vif = &net->ipv4.vif_table[vifi];
+	struct vif_device *vif = &mrt->vif_table[vifi];
 	struct net_device *dev;
 	struct rtable *rt;
 	int    encap = 0;
@@ -1228,7 +1252,7 @@ static void ipmr_queue_xmit(struct net *net, struct sk_buff *skb,
 		vif->bytes_out += skb->len;
 		vif->dev->stats.tx_bytes += skb->len;
 		vif->dev->stats.tx_packets++;
-		ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT);
+		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
 		goto out_free;
 	}
 #endif
@@ -1311,12 +1335,12 @@ out_free:
 	return;
 }
 
-static int ipmr_find_vif(struct net_device *dev)
+static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
 {
-	struct net *net = dev_net(dev);
 	int ct;
-	for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) {
-		if (net->ipv4.vif_table[ct].dev == dev)
+
+	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
+		if (mrt->vif_table[ct].dev == dev)
 			break;
 	}
 	return ct;
@@ -1324,8 +1348,9 @@ static int ipmr_find_vif(struct net_device *dev)
 
 /* "local" means that we should preserve one skb (for local delivery) */
 
-static int ip_mr_forward(struct net *net, struct sk_buff *skb,
-			 struct mfc_cache *cache, int local)
+static int ip_mr_forward(struct net *net, struct mr_table *mrt,
+			 struct sk_buff *skb, struct mfc_cache *cache,
+			 int local)
 {
 	int psend = -1;
 	int vif, ct;
@@ -1337,7 +1362,7 @@ static int ip_mr_forward(struct net *net, struct sk_buff *skb,
 	/*
 	 * Wrong interface: drop packet and (maybe) send PIM assert.
 	 */
-	if (net->ipv4.vif_table[vif].dev != skb->dev) {
+	if (mrt->vif_table[vif].dev != skb->dev) {
 		int true_vifi;
 
 		if (skb_rtable(skb)->fl.iif == 0) {
@@ -1356,26 +1381,26 @@ static int ip_mr_forward(struct net *net, struct sk_buff *skb,
 		}
 
 		cache->mfc_un.res.wrong_if++;
-		true_vifi = ipmr_find_vif(skb->dev);
+		true_vifi = ipmr_find_vif(mrt, skb->dev);
 
-		if (true_vifi >= 0 && net->ipv4.mroute_do_assert &&
+		if (true_vifi >= 0 && mrt->mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
 		       so that we cannot check that packet arrived on an oif.
 		       It is bad, but otherwise we would need to move pretty
 		       large chunk of pimd to kernel. Ough... --ANK
 		     */
-		    (net->ipv4.mroute_do_pim ||
+		    (mrt->mroute_do_pim ||
 		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
 		    time_after(jiffies,
 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
 			cache->mfc_un.res.last_assert = jiffies;
-			ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF);
+			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
 		}
 		goto dont_forward;
 	}
 
-	net->ipv4.vif_table[vif].pkt_in++;
-	net->ipv4.vif_table[vif].bytes_in += skb->len;
+	mrt->vif_table[vif].pkt_in++;
+	mrt->vif_table[vif].bytes_in += skb->len;
 
 	/*
 	 *	Forward the frame
@@ -1385,7 +1410,8 @@ static int ip_mr_forward(struct net *net, struct sk_buff *skb,
 			if (psend != -1) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 				if (skb2)
-					ipmr_queue_xmit(net, skb2, cache, psend);
+					ipmr_queue_xmit(net, mrt, skb2, cache,
+							psend);
 			}
 			psend = ct;
 		}
@@ -1394,9 +1420,9 @@ static int ip_mr_forward(struct net *net, struct sk_buff *skb,
 		if (local) {
 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 			if (skb2)
-				ipmr_queue_xmit(net, skb2, cache, psend);
+				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
 		} else {
-			ipmr_queue_xmit(net, skb, cache, psend);
+			ipmr_queue_xmit(net, mrt, skb, cache, psend);
 			return 0;
 		}
 	}
@@ -1416,6 +1442,7 @@ int ip_mr_input(struct sk_buff *skb)
 {
 	struct mfc_cache *cache;
 	struct net *net = dev_net(skb->dev);
+	struct mr_table *mrt = net->ipv4.mrt;
 	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
 
 	/* Packet is looped back after forward, it should not be
@@ -1436,9 +1463,9 @@ int ip_mr_input(struct sk_buff *skb)
 			       that we can forward NO IGMP messages.
 			     */
 			    read_lock(&mrt_lock);
-			    if (net->ipv4.mroute_sk) {
+			    if (mrt->mroute_sk) {
 				    nf_reset(skb);
-				    raw_rcv(net->ipv4.mroute_sk, skb);
+				    raw_rcv(mrt->mroute_sk, skb);
 				    read_unlock(&mrt_lock);
 				    return 0;
 			    }
@@ -1447,7 +1474,7 @@ int ip_mr_input(struct sk_buff *skb)
 	}
 
 	read_lock(&mrt_lock);
-	cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
 
 	/*
 	 *	No usable cache entry
@@ -1465,9 +1492,9 @@ int ip_mr_input(struct sk_buff *skb)
 			skb = skb2;
 		}
 
-		vif = ipmr_find_vif(skb->dev);
+		vif = ipmr_find_vif(mrt, skb->dev);
 		if (vif >= 0) {
-			int err = ipmr_cache_unresolved(net, vif, skb);
+			int err = ipmr_cache_unresolved(mrt, vif, skb);
 			read_unlock(&mrt_lock);
 
 			return err;
@@ -1477,7 +1504,7 @@ int ip_mr_input(struct sk_buff *skb)
 		return -ENODEV;
 	}
 
-	ip_mr_forward(net, skb, cache, local);
+	ip_mr_forward(net, mrt, skb, cache, local);
 
 	read_unlock(&mrt_lock);
 
@@ -1499,6 +1526,7 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
 	struct net_device *reg_dev = NULL;
 	struct iphdr *encap;
 	struct net *net = dev_net(skb->dev);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
 	/*
@@ -1513,8 +1541,8 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
 		return 1;
 
 	read_lock(&mrt_lock);
-	if (net->ipv4.mroute_reg_vif_num >= 0)
-		reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev;
+	if (mrt->mroute_reg_vif_num >= 0)
+		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
 	if (reg_dev)
 		dev_hold(reg_dev);
 	read_unlock(&mrt_lock);
@@ -1549,13 +1577,14 @@ int pim_rcv_v1(struct sk_buff * skb)
 {
 	struct igmphdr *pim;
 	struct net *net = dev_net(skb->dev);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
 		goto drop;
 
 	pim = igmp_hdr(skb);
 
-	if (!net->ipv4.mroute_do_pim ||
+	if (!mrt->mroute_do_pim ||
 	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
 		goto drop;
 
@@ -1591,7 +1620,7 @@ drop:
 #endif
 
 static int
-ipmr_fill_mroute(struct net *net, struct sk_buff *skb, struct mfc_cache *c,
+ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c,
 		 struct rtmsg *rtm)
 {
 	int ct;
@@ -1603,19 +1632,19 @@ ipmr_fill_mroute(struct net *net, struct sk_buff *skb, struct mfc_cache *c,
 	if (c->mfc_parent > MAXVIFS)
 		return -ENOENT;
 
-	if (VIF_EXISTS(net, c->mfc_parent))
-		RTA_PUT(skb, RTA_IIF, 4, &net->ipv4.vif_table[c->mfc_parent].dev->ifindex);
+	if (VIF_EXISTS(mrt, c->mfc_parent))
+		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
 
 	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
 
 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
-		if (VIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
 				goto rtattr_failure;
 			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
 			nhp->rtnh_flags = 0;
 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
-			nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex;
+			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
 			nhp->rtnh_len = sizeof(*nhp);
 		}
 	}
@@ -1633,11 +1662,12 @@ int ipmr_get_route(struct net *net,
 		   struct sk_buff *skb, struct rtmsg *rtm, int nowait)
 {
 	int err;
+	struct mr_table *mrt = net->ipv4.mrt;
 	struct mfc_cache *cache;
 	struct rtable *rt = skb_rtable(skb);
 
 	read_lock(&mrt_lock);
-	cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst);
+	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
 
 	if (cache == NULL) {
 		struct sk_buff *skb2;
@@ -1651,7 +1681,7 @@ int ipmr_get_route(struct net *net,
 		}
 
 		dev = skb->dev;
-		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
+		if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
 			read_unlock(&mrt_lock);
 			return -ENODEV;
 		}
@@ -1668,14 +1698,14 @@ int ipmr_get_route(struct net *net,
 		iph->saddr = rt->rt_src;
 		iph->daddr = rt->rt_dst;
 		iph->version = 0;
-		err = ipmr_cache_unresolved(net, vif, skb2);
+		err = ipmr_cache_unresolved(mrt, vif, skb2);
 		read_unlock(&mrt_lock);
 		return err;
 	}
 
 	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
 		cache->mfc_flags |= MFC_NOTIFY;
-	err = ipmr_fill_mroute(net, skb, cache, rtm);
+	err = ipmr_fill_mroute(mrt, skb, cache, rtm);
 	read_unlock(&mrt_lock);
 	return err;
 }
@@ -1693,11 +1723,13 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
 					   struct ipmr_vif_iter *iter,
 					   loff_t pos)
 {
-	for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) {
-		if (!VIF_EXISTS(net, iter->ct))
+	struct mr_table *mrt = net->ipv4.mrt;
+
+	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+		if (!VIF_EXISTS(mrt, iter->ct))
 			continue;
 		if (pos-- == 0)
-			return &net->ipv4.vif_table[iter->ct];
+			return &mrt->vif_table[iter->ct];
 	}
 	return NULL;
 }
@@ -1716,15 +1748,16 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ipmr_vif_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	++*pos;
 	if (v == SEQ_START_TOKEN)
 		return ipmr_vif_seq_idx(net, iter, 0);
 
-	while (++iter->ct < net->ipv4.maxvif) {
-		if (!VIF_EXISTS(net, iter->ct))
+	while (++iter->ct < mrt->maxvif) {
+		if (!VIF_EXISTS(mrt, iter->ct))
 			continue;
-		return &net->ipv4.vif_table[iter->ct];
+		return &mrt->vif_table[iter->ct];
 	}
 	return NULL;
 }
@@ -1738,6 +1771,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -1748,7 +1782,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 
 		seq_printf(seq,
 			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
-			   vif - net->ipv4.vif_table,
+			   vif - mrt->vif_table,
 			   name, vif->bytes_in, vif->pkt_in,
 			   vif->bytes_out, vif->pkt_out,
 			   vif->flags, vif->local, vif->remote);
@@ -1787,11 +1821,12 @@ struct ipmr_mfc_iter {
 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 					  struct ipmr_mfc_iter *it, loff_t pos)
 {
+	struct mr_table *mrt = net->ipv4.mrt;
 	struct mfc_cache *mfc;
 
 	read_lock(&mrt_lock);
 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
-		it->cache = &net->ipv4.mfc_cache_array[it->ct];
+		it->cache = &mrt->mfc_cache_array[it->ct];
 		list_for_each_entry(mfc, it->cache, list)
 			if (pos-- == 0)
 				return mfc;
@@ -1799,7 +1834,7 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 	read_unlock(&mrt_lock);
 
 	spin_lock_bh(&mfc_unres_lock);
-	it->cache = &net->ipv4.mfc_unres_queue;
+	it->cache = &mrt->mfc_unres_queue;
 	list_for_each_entry(mfc, it->cache, list)
 		if (pos-- == 0)
 			return mfc;
@@ -1826,6 +1861,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	struct mfc_cache *mfc = v;
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	++*pos;
 
@@ -1835,13 +1871,13 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (mfc->list.next != it->cache)
 		return list_entry(mfc->list.next, struct mfc_cache, list);
 
-	if (it->cache == &net->ipv4.mfc_unres_queue)
+	if (it->cache == &mrt->mfc_unres_queue)
 		goto end_of_list;
 
-	BUG_ON(it->cache != &net->ipv4.mfc_cache_array[it->ct]);
+	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
 
 	while (++it->ct < MFC_LINES) {
-		it->cache = &net->ipv4.mfc_cache_array[it->ct];
+		it->cache = &mrt->mfc_cache_array[it->ct];
 		if (list_empty(it->cache))
 			continue;
 		return list_first_entry(it->cache, struct mfc_cache, list);
@@ -1849,7 +1885,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	/* exhausted cache_array, show unresolved */
 	read_unlock(&mrt_lock);
-	it->cache = &net->ipv4.mfc_unres_queue;
+	it->cache = &mrt->mfc_unres_queue;
 	it->ct = 0;
 
 	spin_lock_bh(&mfc_unres_lock);
@@ -1867,10 +1903,11 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 {
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = net->ipv4.mrt;
 
-	if (it->cache == &net->ipv4.mfc_unres_queue)
+	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &net->ipv4.mfc_cache_array[it->ct])
+	else if (it->cache == &mrt->mfc_cache_array[it->ct])
 		read_unlock(&mrt_lock);
 }
 
@@ -1878,6 +1915,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 {
 	int n;
 	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = net->ipv4.mrt;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -1891,14 +1929,14 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 			   (unsigned long) mfc->mfc_origin,
 			   mfc->mfc_parent);
 
-		if (it->cache != &net->ipv4.mfc_unres_queue) {
+		if (it->cache != &mrt->mfc_unres_queue) {
 			seq_printf(seq, " %8lu %8lu %8lu",
 				   mfc->mfc_un.res.pkt,
 				   mfc->mfc_un.res.bytes,
 				   mfc->mfc_un.res.wrong_if);
 			for (n = mfc->mfc_un.res.minvif;
 			     n < mfc->mfc_un.res.maxvif; n++ ) {
-				if (VIF_EXISTS(net, n) &&
+				if (VIF_EXISTS(mrt, n) &&
 				    mfc->mfc_un.res.ttls[n] < 255)
 					seq_printf(seq,
 					   " %2d:%-3d",
@@ -1950,35 +1988,27 @@ static const struct net_protocol pim_protocol = {
  */
 static int __net_init ipmr_net_init(struct net *net)
 {
+	struct mr_table *mrt;
 	unsigned int i;
 	int err = 0;
 
-	net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
-				      GFP_KERNEL);
-	if (!net->ipv4.vif_table) {
+	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+	if (mrt == NULL) {
 		err = -ENOMEM;
 		goto fail;
 	}
 
 	/* Forwarding cache */
-	net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
-					    sizeof(struct list_head),
-					    GFP_KERNEL);
-	if (!net->ipv4.mfc_cache_array) {
-		err = -ENOMEM;
-		goto fail_mfc_cache;
-	}
-
 	for (i = 0; i < MFC_LINES; i++)
-		INIT_LIST_HEAD(&net->ipv4.mfc_cache_array[i]);
+		INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
 
-	INIT_LIST_HEAD(&net->ipv4.mfc_unres_queue);
+	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
-	setup_timer(&net->ipv4.ipmr_expire_timer, ipmr_expire_process,
+	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
 		    (unsigned long)net);
 
 #ifdef CONFIG_IP_PIMSM
-	net->ipv4.mroute_reg_vif_num = -1;
+	mrt->mroute_reg_vif_num = -1;
 #endif
 
 #ifdef CONFIG_PROC_FS
@@ -1988,16 +2018,16 @@ static int __net_init ipmr_net_init(struct net *net)
 	if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
 		goto proc_cache_fail;
 #endif
+
+	net->ipv4.mrt = mrt;
 	return 0;
 
 #ifdef CONFIG_PROC_FS
 proc_cache_fail:
 	proc_net_remove(net, "ip_mr_vif");
 proc_vif_fail:
-	kfree(net->ipv4.mfc_cache_array);
+	kfree(mrt);
 #endif
-fail_mfc_cache:
-	kfree(net->ipv4.vif_table);
 fail:
 	return err;
 }
@@ -2008,8 +2038,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
 	proc_net_remove(net, "ip_mr_cache");
 	proc_net_remove(net, "ip_mr_vif");
 #endif
-	kfree(net->ipv4.mfc_cache_array);
-	kfree(net->ipv4.vif_table);
+	kfree(net->ipv4.mrt);
 }
 
 static struct pernet_operations ipmr_net_ops = {
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH] net_sched: make traffic control network namespace aware
From: Patrick McHardy @ 2010-04-11 17:48 UTC (permalink / raw)
  To: Tom Goff; +Cc: netdev
In-Reply-To: <4BA7B13C.7020304@trash.net>

Patrick McHardy wrote:
> Patrick McHardy wrote:
>> Tom Goff wrote:
>>> Mostly minor changes to add a net argument to various functions and
>>> remove initial network namespace checks.
>>>
>>> Make /proc/net/psched per network namespace.
>> Looks fine from a qdisc POV. One thing that appears to be missing
>> though is teql master netdev registration in other than the initial
>> namespace.
> 
> Actually we could take this opportunity and add rtnl_link support
> for teql device registration. I can look into this in a couple of
> days.

I tried to do that, but adding proper netns support is more
complicated than I expected. sch_teql registers a qdisc for
each master device using the name of the master. Currently
qdisc registrations are global, so this doesn't work with
network namespaces.

We could of course make them per netns, but that would require
duplicating all global registrations for each namespace. I'm
not convinced that its worth doing this since its only teql
that needs it and it doesn't seem to be very useful to use
teql in a virtual environment.


^ permalink raw reply

* Re: [PATCH] tcp: add setsockopt to disable slow start after idle
From: Cristian KLEIN @ 2010-04-11 18:45 UTC (permalink / raw)
  To: David Miller; +Cc: linux-kernel, netdev
In-Reply-To: <20100410.154709.201110995.davem@davemloft.net>

On 11/04/2010 00:47, David Miller wrote:
> From: Cristian KLEIN<cristiklein@gmail.com>
> Date: Sat, 10 Apr 2010 14:09:03 +0200
>
>> Could you please explain me why it is dangerous? To me it seems that
>> it's just like allowing applications to disable NAGLE or to choose a
>> congestion control algorithm.
>
> Because you can cause undue congestion to other people on the network
> because you are believing path information that has been outdated and
> has not been validated by sending data for a certain amount of time.

I consider your argument an important concern, but I'm not quite 
convinced this patch is so bad.

An application which does not need this behaviour will continue to slow 
start after idle by default.

Without this patch, an application which needs this behaviour (i.e. not 
to slow start after idle) is forced to implement its own UDP-based 
protocol with all the congestion control, retransmission etc. Undue 
congestion might still occur.


If you don't agree with the above two points, would you consider 
accepting a patch with an allow_user_fast_start_after_idle sysctl?

Cristi.

^ permalink raw reply

* Re: Linux 2.6.34-rc3 + CAN build problem
From: Oliver Hartkopp @ 2010-04-11 18:53 UTC (permalink / raw)
  To: David Miller
  Cc: nm127-Y8qEzhMunLyT9ig0jae3mg, eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w,
	urs.thuermann-l29pVbxQd1IUtdQbppsyvg,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	oliver.hartkopp-l29pVbxQd1IUtdQbppsyvg
In-Reply-To: <20100410.155009.66184583.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

David Miller wrote:
> From: Oliver Hartkopp <socketcan-fJ+pQTUTwRTk1uMJSBkQmQ@public.gmane.org>
> Date: Sat, 10 Apr 2010 14:36:48 +0200
> 
>> So i wonder why Nemeth trapped into this problem ... probably an include file
>> mix-up?
> 
> Do you have CONFIG_DEBUG_STRICT_USER_COPY_CHECKS enabled in your
> kernel config?
> 
> That's the only way you get an actual failure of a build when
> the user copy size can't be proven to be in range by the
> compiler, otherwise it just warns.

No, indeed i do not have it set.

# CONFIG_DEBUG_STRICT_USER_COPY_CHECKS is not set

What would be the best approach to fix this build failure then?

By applying the patch thankfully provided by Eric?

Regards,
Oliver

^ permalink raw reply

* Read Attached Mail From Miss. Gladys Duke
From: gla11 @ 2010-04-11 19:45 UTC (permalink / raw)
  To: Gladys Duke

[-- Attachment #1: Mail message body --]
[-- Type: text/plain, Size: 0 bytes --]



[-- Attachment #2: From Miss.Gladys Duke.doc --]
[-- Type: application/msword, Size: 20992 bytes --]

^ permalink raw reply

* Re: [PATCH] tcp: add setsockopt to disable slow start after idle
From: David Miller @ 2010-04-11 20:18 UTC (permalink / raw)
  To: cristiklein; +Cc: linux-kernel, netdev
In-Reply-To: <4BC218B6.8050703@gmail.com>

From: Cristian KLEIN <cristiklein@gmail.com>
Date: Sun, 11 Apr 2010 20:45:10 +0200

> Without this patch, an application which needs this behaviour
> (i.e. not to slow start after idle) is forced to implement its own
> UDP-based protocol with all the congestion control, retransmission
> etc. Undue congestion might still occur.

Ask your system administrator to set the existing sysctl, because it
is a physical network attribute whether this behavior is safe or not.

And if it is safe, it is safe for all applications, there is no reason
for one application to ask for it and others to not.  If it's legal,
it helps all applications without exception.

Your attempts to tie this to NAGLE is complete nonsense.

NAGLE changes acking behavior, whereas this feature controls in what
way we trust congestion control information we've probed for in the
past.

^ permalink raw reply

* Re: Linux 2.6.34-rc3 + CAN build problem
From: David Miller @ 2010-04-11 20:19 UTC (permalink / raw)
  To: socketcan
  Cc: nm127, eric.dumazet, oliver.hartkopp, urs.thuermann,
	socketcan-core, netdev, linux-kernel
In-Reply-To: <4BC21AC4.8060403@hartkopp.net>

From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Sun, 11 Apr 2010 20:53:56 +0200

> By applying the patch thankfully provided by Eric?

I'll probably do that, yes.

^ permalink raw reply

* NULL pointer dereference panic in stable (2.6.33.2), amd64
From: Denys Fedorysychenko @ 2010-04-11 20:38 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: Text/Plain, Size: 386 bytes --]

Hi

Got recently NULL pointer dereference
Note - it seems in 32-bit i didn't experience this issue.

Router with NAT, HFSC shapers terminated with bfifo qdiscs (some attached to 
802.1Q vlan's), userspace application using tun.
Hardware - network cards e1000e, Core 2 based architecture (two Quad Xeon), 

full message received over netconsole attached
More info can provide on request

[-- Attachment #2: BUG --]
[-- Type: text/plain, Size: 7186 bytes --]

Apr 11 23:28:52 ROUTER kernel: [21095.453138] BUG: unable to handle kernel NULL pointer dereference at (null)
Apr 11 23:28:52 ROUTER kernel: [21095.453334] IP: [<ffffffff811e5877>] dev_queue_xmit+0x284/0x465
Apr 11 23:28:52 ROUTER kernel: [21095.453522] PGD 20b247067 PUD 20b24c067 PMD 0 
Apr 11 23:28:52 ROUTER kernel: [21095.453704] Oops: 0000 [#1] SMP 
Apr 11 23:28:52 ROUTER kernel: [21095.453880] last sysfs file: /sys/devices/virtual/misc/tun/dev
Apr 11 23:28:52 ROUTER kernel: [21095.454001] CPU 0 
Apr 11 23:28:52 ROUTER kernel: [21095.454001] Pid: 2864, comm: globax Not tainted 2.6.33.2-build-0051-64 #4         /        
Apr 11 23:28:52 ROUTER kernel: [21095.454001] RIP: 0010:[<ffffffff811e5877>]  [<ffffffff811e5877>] dev_queue_xmit+0x284/0x465
Apr 11 23:28:52 ROUTER kernel: [21095.454001] RSP: 0000:ffff880028203db0  EFLAGS: 00010202
Apr 11 23:28:52 ROUTER kernel: [21095.454001] RAX: 0000000000002000 RBX: 0000000000000000 RCX: ffff8802087d8d00
Apr 11 23:28:52 ROUTER kernel: [21095.454001] RDX: ffff88021d818000 RSI: 0000000000000000 RDI: ffff8802135850e8
Apr 11 23:28:52 ROUTER kernel: [21095.454001] RBP: ffff880028203de0 R08: ffff88021c01d89c R09: ffff88021c01dc00
Apr 11 23:28:52 ROUTER kernel: [21095.454001] R10: dead000000200200 R11: dead000000100100 R12: ffff88021e212b80
Apr 11 23:28:52 ROUTER kernel: [21095.454001] R13: ffff88021da51e80 R14: ffff8802135850e8 R15: ffff88021be62000
Apr 11 23:28:52 ROUTER kernel: [21095.454001] FS:  0000000000000000(0000) GS:ffff880028200000(0063) knlGS:00000000f7e51b10
Apr 11 23:28:52 ROUTER kernel: [21095.454001] CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
Apr 11 23:28:52 ROUTER kernel: [21095.454001] CR2: 0000000000000000 CR3: 000000020b248000 CR4: 00000000000006f0
Apr 11 23:28:52 ROUTER kernel: [21095.454001] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Apr 11 23:28:52 ROUTER kernel: [21095.454001] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Apr 11 23:28:52 ROUTER kernel: [21095.454001] Process globax (pid: 2864, threadinfo ffff8802118c0000, task ffff8802182c2c20)
Apr 11 23:28:52 ROUTER kernel: [21095.454001] Stack:
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  ffff88021d818000 ffff88021da51e80 0000000000000042 ffff88021da51e80
Apr 11 23:28:52 ROUTER kernel: [21095.454001] <0> ffff88021be62000 ffff88021be62000 ffff880028203e00 ffffffffa01c12a9
Apr 11 23:28:52 ROUTER kernel: [21095.454001] <0> 0000000000000000 ffff8802135850e8 ffff880028203e50 ffffffff811e540e
Apr 11 23:28:52 ROUTER kernel: [21095.454001] Call Trace:
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  <IRQ> 
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffffa01c12a9>] vlan_dev_hwaccel_hard_start_xmit+0x68/0x86 [8021q]
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff811e540e>] dev_hard_start_xmit+0x232/0x304
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff811f6482>] sch_direct_xmit+0x5d/0x16b
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff811f664c>] __qdisc_run+0xbc/0xdc
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff811e2c89>] net_tx_action+0xc2/0x120
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff81039670>] __do_softirq+0x96/0x11a
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff810037cc>] call_softirq+0x1c/0x28
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff81005543>] do_softirq+0x33/0x68
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff81039407>] irq_exit+0x36/0x75
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff81016f63>] smp_apic_timer_interrupt+0x88/0x96
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  [<ffffffff81003293>] apic_timer_interrupt+0x13/0x20
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  <EOI>
Apr 11 23:28:52 ROUTER kernel: [21095.454001] Code: e2 48 8b 55 d0 49 c1 e4 07 66 41 8b 86 a6 00 00 00 4c 03 a2 00 03 00 00 80 e4 cf 80 cc 20 49 8b 5c 24 08 66 41 89 86 a6 00 00 00 <48> 83 3b 00 0f 84 bb 00 00 00 4c 8d ab 9c 00 00 00 4c 89 ef e8
Apr 11 23:28:52 ROUTER kernel: [21095.454001] RIP  [<ffffffff811e5877>] dev_queue_xmit+0x284/0x465
Apr 11 23:28:52 ROUTER kernel: [21095.454001]  RSP <ffff880028203db0>
Apr 11 23:28:52 ROUTER kernel: [21095.454001] CR2: 0000000000000000
Apr 11 23:28:52 ROUTER kernel: [21095.462975] ---[ end trace 2421d995c1afd7c3 ]---
Apr 11 23:28:52 ROUTER kernel: [21095.463199] Kernel panic - not syncing: Fatal exception in interrupt
Apr 11 23:28:52 ROUTER kernel: [21095.463428] Pid: 2864, comm: globax Tainted: G      D    2.6.33.2-build-0051-64 #4
Apr 11 23:28:52 ROUTER kernel: [21095.463819] Call Trace:
Apr 11 23:28:52 ROUTER kernel: [21095.464036]  <IRQ>  [<ffffffff81259743>] panic+0xa0/0x161
Apr 11 23:28:52 ROUTER kernel: [21095.464307]  [<ffffffff81003293>] ? apic_timer_interrupt+0x13/0x20
Apr 11 23:28:52 ROUTER kernel: [21095.464533]  [<ffffffff81035673>] ? kmsg_dump+0x112/0x12c
Apr 11 23:28:52 ROUTER kernel: [21095.464757]  [<ffffffff81006651>] oops_end+0xaa/0xba
Apr 11 23:28:52 ROUTER kernel: [21095.464980]  [<ffffffff8101e653>] no_context+0x1f3/0x202
Apr 11 23:28:52 ROUTER kernel: [21095.465211]  [<ffffffff810523a4>] ? tick_program_event+0x25/0x27
Apr 11 23:28:52 ROUTER kernel: [21095.465437]  [<ffffffff8101e81c>] __bad_area_nosemaphore+0x1ba/0x1e0
Apr 11 23:28:52 ROUTER kernel: [21095.465668]  [<ffffffff8113f8b3>] ? swiotlb_map_page+0x0/0xd5
Apr 11 23:28:52 ROUTER kernel: [21095.465907]  [<ffffffffa015c55a>] ? pci_map_single+0x8a/0x99 [e1000e]
Apr 11 23:28:52 ROUTER kernel: [21095.466139]  [<ffffffff8113f0c0>] ? swiotlb_dma_mapping_error+0x18/0x25
Apr 11 23:28:52 ROUTER kernel: [21095.466372]  [<ffffffffa015a2e0>] ? pci_dma_mapping_error+0x31/0x3d [e1000e]
Apr 11 23:28:52 ROUTER kernel: [21095.466605]  [<ffffffffa015cc37>] ? e1000_xmit_frame+0x6ce/0xa43 [e1000e]
Apr 11 23:28:52 ROUTER kernel: [21095.466833]  [<ffffffff8101e850>] bad_area_nosemaphore+0xe/0x10
Apr 11 23:28:52 ROUTER kernel: [21095.467064]  [<ffffffff8101eb32>] do_page_fault+0x114/0x24a
Apr 11 23:28:52 ROUTER kernel: [21095.467293]  [<ffffffff8125bc9f>] page_fault+0x1f/0x30
Apr 11 23:28:52 ROUTER kernel: [21095.467516]  [<ffffffff811e5877>] ? dev_queue_xmit+0x284/0x465
Apr 11 23:28:52 ROUTER kernel: [21095.467743]  [<ffffffffa01c12a9>] vlan_dev_hwaccel_hard_start_xmit+0x68/0x86 [8021q]
Apr 11 23:28:52 ROUTER kernel: [21095.468139]  [<ffffffff811e540e>] dev_hard_start_xmit+0x232/0x304
Apr 11 23:28:52 ROUTER kernel: [21095.468366]  [<ffffffff811f6482>] sch_direct_xmit+0x5d/0x16b
Apr 11 23:28:52 ROUTER kernel: [21095.468591]  [<ffffffff811f664c>] __qdisc_run+0xbc/0xdc
Apr 11 23:28:52 ROUTER kernel: [21095.468814]  [<ffffffff811e2c89>] net_tx_action+0xc2/0x120
Apr 11 23:28:52 ROUTER kernel: [21095.469042]  [<ffffffff81039670>] __do_softirq+0x96/0x11a
Apr 11 23:28:52 ROUTER kernel: [21095.469267]  [<ffffffff810037cc>] call_softirq+0x1c/0x28
Apr 11 23:28:52 ROUTER kernel: [21095.469491]  [<ffffffff81005543>] do_softirq+0x33/0x68
Apr 11 23:28:52 ROUTER kernel: [21095.469714]  [<ffffffff81039407>] irq_exit+0x36/0x75
Apr 11 23:28:52 ROUTER kernel: [21095.469936]  [<ffffffff81016f63>] smp_apic_timer_interrupt+0x88/0x96
Apr 11 23:28:52 ROUTER kernel: [21095.470167]  [<ffffffff81003293>] apic_timer_interrupt+0x13/0x20

^ permalink raw reply

* Re: [PATCH 1/3] tcp: Handle CHECKSUM_PARTIAL for SYNACK packets for IPv4
From: Yinghai @ 2010-04-11 21:07 UTC (permalink / raw)
  To: Herbert Xu, davem; +Cc: linux-kernel, netdev, torvalds
In-Reply-To: <E1O0w4v-0005UZ-GF@gondolin.me.apana.org.au>

On 04/11/2010 05:15 AM, Herbert Xu wrote:
> tcp: Handle CHECKSUM_PARTIAL for SYNACK packets for IPv4
> 
> This patch moves the common code between tcp_v4_send_check and
> tcp_v4_gso_send_check into a new function __tcp_v4_send_check.
> 
> It then uses the new function in tcp_v4_send_synack so that it
> handles CHECKSUM_PARTIAL properly.
> 
Good, three patches fix the problem.

Thanks

Yinghai

^ permalink raw reply

* Re: IGB handling of zero length checksumming?
From: David Miller @ 2010-04-11 21:42 UTC (permalink / raw)
  To: netdev
  Cc: jeffrey.t.kirsher, jesse.brandeburg, bruce.w.allan,
	alexander.h.duyck, peter.p.waskiewicz.jr, john.ronciak
In-Reply-To: <20100411.024027.120459168.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Sun, 11 Apr 2010 02:40:27 -0700 (PDT)

> 
> If the IGB is given a "skb->ip_summed === CHECKSUM_PARTIAL" packet and
> the data area past the TCP header is of zero length, will it do the
> right thing?

Hey guys you don't have to worry about this.

The problem turned out to be somewhere else. :-)


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox