[PATCH] netns: Delete virtual interfaces during namespace cleanup

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] netns: Delete virtual interfaces during namespace cleanup
@ 2008-10-03  0:39 Eric W. Biederman
       [not found] ` <m18wt6v7eb.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2008-10-03  0:39 UTC (permalink / raw)
  To: David Miller
  Cc: Linux Containers, Denis V. Lunev, dlezcano-NmTC/0ZBporQT0dZR+AlfA

When physical devices are inside of a network namespace and that
network namespace terminates we can not make them go away.  We
have to keep them, and moving them to the initial network namespace
is the best we can do.

For virtual devices left in a network namespace that is exiting
we have no need to preserve them and we now have the infrastructure
that allows us to delete them.  So delete virtual devices when we
exit a network namespace.  This keeps the necessary user space clean up
after a network namespace exits much more tractable.

This patch removes much of the need for user space clean up code to
run after a network namespace exits.

Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---
 net/core/dev.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 7091040..f3476d4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4844,6 +4844,12 @@ static void __net_exit default_device_exit(struct net *net)
 		if (dev->features & NETIF_F_NETNS_LOCAL)
 			continue;

+		/* Delete virtual devices */
+		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
+			dev->rtnl_link_ops->dellink(dev);
+			continue;
+		}
+
 		/* Push remaing network devices to init_net */
 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 		err = dev_change_net_namespace(dev, &init_net, fb_name);
-- 
1.5.3.rc6.17.g1911

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH] net: Support specifying the network namespace upon device creation.
       [not found] ` <m18wt6v7eb.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
@ 2008-10-03  0:46   ` Eric W. Biederman
       [not found]     ` <m1vdwatshs.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  2008-10-07 10:16   ` [PATCH] netns: Delete virtual interfaces during namespace cleanup Daniel Lezcano
  1 sibling, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2008-10-03  0:46 UTC (permalink / raw)
  To: David Miller
  Cc: Linux Containers, Denis V. Lunev, dlezcano-NmTC/0ZBporQT0dZR+AlfA,
	Benjamin Thery


There is no good reason to not support userspace specifying the
network namespace during device creation and it seems a handy
thing to do.

We have to be a little extra careful in this case to ensure that
the network namespace exists through the point where we call
register_netdevice.

In addition we need to pass the network namespace to the
rtnl_link_ops.newlink method so we can properly create
the new device in another namespace and have it be a vlan
device of a device in our current network namespace.

In summary this patch makes ip link add somename netns NNN type sometype
do the obvious thing instead of ignoring the network namespace parameter.

Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---
 drivers/net/macvlan.c    |    4 ++--
 drivers/net/veth.c       |    5 +++--
 include/net/rtnetlink.h  |    3 ++-
 net/8021q/vlan_netlink.c |    4 ++--
 net/core/rtnetlink.c     |   17 ++++++++++++++++-
 5 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 4239450..fc5933b 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -416,7 +416,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
-static int macvlan_newlink(struct net_device *dev,
+static int macvlan_newlink(struct net *net, struct net_device *dev,
 			   struct nlattr *tb[], struct nlattr *data[])
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
@@ -427,7 +427,7 @@ static int macvlan_newlink(struct net_device *dev,
 	if (!tb[IFLA_LINK])
 		return -EINVAL;
 
-	lowerdev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
+	lowerdev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
 	if (lowerdev == NULL)
 		return -ENODEV;
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 31cd817..3a2d818 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -335,7 +335,7 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[])
 
 static struct rtnl_link_ops veth_link_ops;
 
-static int veth_newlink(struct net_device *dev,
+static int veth_newlink(struct net *net, struct net_device *dev,
 			 struct nlattr *tb[], struct nlattr *data[])
 {
 	int err;
@@ -375,7 +375,7 @@ static int veth_newlink(struct net_device *dev,
 	else
 		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
 
-	peer = rtnl_create_link(dev_net(dev), ifname, &veth_link_ops, tbp);
+	peer = rtnl_create_link(net, ifname, &veth_link_ops, tbp);
 	if (IS_ERR(peer))
 		return PTR_ERR(peer);
 
@@ -383,6 +383,7 @@ static int veth_newlink(struct net_device *dev,
 		random_ether_addr(peer->dev_addr);
 
 	err = register_netdevice(peer);
+	put_net(peer->nd_net);
 	if (err < 0)
 		goto err_register_peer;
 
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 3c1895e..dbf546f 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -55,7 +55,8 @@ struct rtnl_link_ops {
 	int			(*validate)(struct nlattr *tb[],
 					    struct nlattr *data[]);
 
-	int			(*newlink)(struct net_device *dev,
+	int			(*newlink)(struct net *net,
+					   struct net_device *dev,
 					   struct nlattr *tb[],
 					   struct nlattr *data[]);
 	int			(*changelink)(struct net_device *dev,
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index e9c91dc..e6190f7 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -100,7 +100,7 @@ static int vlan_changelink(struct net_device *dev,
 	return 0;
 }
 
-static int vlan_newlink(struct net_device *dev,
+static int vlan_newlink(struct net *net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[])
 {
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
@@ -112,7 +112,7 @@ static int vlan_newlink(struct net_device *dev,
 
 	if (!tb[IFLA_LINK])
 		return -EINVAL;
-	real_dev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
+	real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
 	if (!real_dev)
 		return -ENODEV;
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8862498..069b176 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1002,6 +1002,19 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname,
 			goto err_free;
 	}
 
+	/* To support userspace specifying a network namespace during
+	 * device creation we grab the network namespace here and hold
+	 * it until just after register_netdevice to prevent races.
+	 */
+	if (!tb[IFLA_NET_NS_PID])
+		get_net(net);
+	else {
+		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+		if (IS_ERR(net)) {
+			err = PTR_ERR(net);
+			goto err_free;
+		}
+	}
 	dev_net_set(dev, net);
 	dev->rtnl_link_ops = ops;
 
@@ -1150,10 +1163,12 @@ replay:
 		if (IS_ERR(dev))
 			err = PTR_ERR(dev);
 		else if (ops->newlink)
-			err = ops->newlink(dev, tb, data);
+			err = ops->newlink(net, dev, tb, data);
 		else
 			err = register_netdevice(dev);
 
+		if (!IS_ERR(dev))
+			put_net(dev->nd_net);
 		if (err < 0 && !IS_ERR(dev))
 			free_netdev(dev);
 		return err;
-- 
1.5.3.rc6.17.g1911

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found] ` <m18wt6v7eb.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  2008-10-03  0:46   ` [PATCH] net: Support specifying the network namespace upon device creation Eric W. Biederman
@ 2008-10-07 10:16   ` Daniel Lezcano
       [not found]     ` <48EB36FC.4000008-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 28+ messages in thread
From: Daniel Lezcano @ 2008-10-07 10:16 UTC (permalink / raw)
  To: Eric W. Biederman, Pavel Emelianov
  Cc: Linux Containers, Denis V. Lunev, David Miller

Eric W. Biederman wrote:
> When physical devices are inside of network namespace and that
> network namespace terminates we can not make them go away.  We
> have to keep them and moving them to the initial network namespace
> is the best we can do.
> 
> For virtual devices left in a network namespace that is exiting
> we have no need to preserve them and we now have the infrastructure
> that allows us to delete them.  So delete virtual devices when we
> exit a network namespace.  Keeping the necessary user space clean up
> after a network namespace exits much more tractable.
> 
> This patch removes much of the need for user space clean up code to
> run after a network namespace exits.

I agree that will make the life easier for user space developer :)

I have a few questions about this new behaviour.

After discussing with Benjamin, this patch means an user can no longer 
manage a pool of virtual devices because they will be automatically 
destroyed when the namespace exits. I don't think it is a big concern, 
but just in case I am asking :)

Another point, at present, the virtual devices go back to the initial 
network namespace when the namespace dies, and this behaviour is used to 
track the namespace life cycle. With this patch we have no way to know 
when the network namespace has exited. So we can have the last process 
exiting the network namespace, but the network namespace can stay alive 
(eg. some sockets still have buffer to send) ?

AFAIR, Pavel told us about a patch extending the "wait" semantics to 
pass namespace options to wait for. Is that right Pavel ? Shouldn't we 
apply this patch before deleting the virtual network devices ?

Thanks.
   -- Daniel

> Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> ---
>  net/core/dev.c |    6 ++++++
>  1 files changed, 6 insertions(+), 0 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 7091040..f3476d4 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4844,6 +4844,12 @@ static void __net_exit default_device_exit(struct net *net)
>  		if (dev->features & NETIF_F_NETNS_LOCAL)
>  			continue;
> 
> +		/* Delete virtual devices */
> +		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
> +			dev->rtnl_link_ops->dellink(dev);
> +			continue;
> +		}
> +
>  		/* Push remaing network devices to init_net */
>  		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
>  		err = dev_change_net_namespace(dev, &init_net, fb_name);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]     ` <48EB36FC.4000008-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
@ 2008-10-07 10:41       ` Eric W. Biederman
       [not found]         ` <m1ej2s7kmj.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  2008-10-07 10:52       ` Pavel Emelyanov
  1 sibling, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2008-10-07 10:41 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Linux Containers, Denis V. Lunev, David Miller, Pavel Emelianov

Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:

> I agree that will make the life easier for user space developer :)
>
> I have a few questions about this new behaviour.
>
> After discussing with Benjamin, this patch means an user can no longer manage a
> pool of virtual devices because they will be automatically destroyed when the
> namespace exits. I don't think it is a big concern, but just in case I am asking
> :)
>
> Another point, at present, the virtual devices go back to the initial network
> namespace when the namespace dies, and this behaviour is used to track the
> namespace life cycle. With this patch we have no way to know when the network
> namespace has exited. So we can have the last process exiting the network
> namespace, but the network namespace can stay alive (eg. some sockets still have
> buffer to send) ?

Depending on the network interfaces going back to the initial network namespace
is problematic in the long term because it breaks recursive containers.  I can't
see any behavior that does that as anything other than a bug.  Especially
as I already have production uses for recursive containers.

> AFAIR, Pavel told us about a patch extending the "wait" semantic and pass
> namespace options to wait for. Is that right Pavel ? Shouldn't we apply this
> path before deleting the virtual network devices ?

Tell you what.  I will post in a bit my patchset that makes /proc/net
its own filesystem, that magically mounts on /proc/self/net.  It is a
long stupid story why I haven't posted it publicly.  At which point we have
complete visibility into when a network namespace exits and if we need to
be able to wait for it we can just implement a poll method on the root
directory.

That is simple and will work quite nicely in the long term, and gives
us a lot more visibility than a simple this namespace has exited bit.

Eric

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]     ` <48EB36FC.4000008-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
  2008-10-07 10:41       ` Eric W. Biederman
@ 2008-10-07 10:52       ` Pavel Emelyanov
       [not found]         ` <48EB3F72.5090201-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
  1 sibling, 1 reply; 28+ messages in thread
From: Pavel Emelyanov @ 2008-10-07 10:52 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Linux Containers, Denis V. Lunev, Eric W. Biederman, David Miller

Daniel Lezcano wrote:
> Eric W. Biederman wrote:
>> When physical devices are inside of network namespace and that
>> network namespace terminates we can not make them go away.  We
>> have to keep them and moving them to the initial network namespace
>> is the best we can do.
>>
>> For virtual devices left in a network namespace that is exiting
>> we have no need to preserve them and we now have the infrastructure
>> that allows us to delete them.  So delete virtual devices when we
>> exit a network namespace.  Keeping the necessary user space clean up
>> after a network namespace exits much more tractable.
>>
>> This patch removes much of the need for user space clean up code to
>> run after a network namespace exits.
> 
> I agree that will make the life easier for user space developer :)
> 
> I have a few questions about this new behaviour.
> 
> After discussing with Benjamin, this patch means an user can no longer 
> manage a pool of virtual devices because they will be automatically 
> destroyed when the namespace exits. I don't think it is a big concern, 
> but just in case I am asking :)
> 
> Another point, at present, the virtual devices go back to the initial 
> network namespace when the namespace dies, and this behaviour is used to 
> track the namespace life cycle. With this patch we have no way to know 
> when the network namespace has exited. So we can have the last process 
> exiting the network namespace, but the network namespace can stay alive 
> (eg. some sockets still have buffer to send) ?
> 
> AFAIR, Pavel told us about a patch extending the "wait" semantic and 
> pass namespace options to wait for. Is that right Pavel ? Shouldn't we 
> apply this path before deleting the virtual network devices ?

I remember that I promised to prepare the wait-extending patch. But I
haven't managed to find time for this, sorry :( In a month or two I will
finish one time-hungry task and hopefully be able to do it.

As far as this particular patch is concerned.

All the virtual devices we have now in namespaces (vlan and tunnels)
kill themselves *before* this code is called. But even if we try to
move this destruction from modules to here, we'll be in a tricky
situation, when the e.g. ipip module has already kfree-d the net_ipip
structure, while none of ipip devices are released yet.

If we try to look in the future - if we ever have a virtual device
driver, that will be able to create its devices in namespace, we'll
have to destroy all these devices *before* (or inside) this driver's
net->exit callback is called, but this patch does the ->dellink call
at the very end, i.e. *after* any potential ->exit callback.

Eric - did you see any device, that was ->dellink-ed by this patch?

> Thanks.
>    -- Daniel
> 
>> Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
>> ---
>>  net/core/dev.c |    6 ++++++
>>  1 files changed, 6 insertions(+), 0 deletions(-)
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 7091040..f3476d4 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -4844,6 +4844,12 @@ static void __net_exit default_device_exit(struct net *net)
>>  		if (dev->features & NETIF_F_NETNS_LOCAL)
>>  			continue;
>>
>> +		/* Delete virtual devices */
>> +		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
>> +			dev->rtnl_link_ops->dellink(dev);
>> +			continue;
>> +		}
>> +
>>  		/* Push remaing network devices to init_net */
>>  		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
>>  		err = dev_change_net_namespace(dev, &init_net, fb_name);
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]         ` <m1ej2s7kmj.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
@ 2008-10-07 11:22           ` Daniel Lezcano
       [not found]             ` <48EB4679.1040602-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Lezcano @ 2008-10-07 11:22 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux Containers, Denis V. Lunev, David Miller, Pavel Emelianov

Eric W. Biederman wrote:
> Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:
> 
>> I agree that will make the life easier for user space developer :)
>>
>> I have a few questions about this new behaviour.
>>
>> After discussing with Benjamin, this patch means an user can no longer manage a
>> pool of virtual devices because they will be automatically destroyed when the
>> namespace exits. I don't think it is a big concern, but just in case I am asking
>> :)
>>
>> Another point, at present, the virtual devices go back to the initial network
>> namespace when the namespace dies, and this behaviour is used to track the
>> namespace life cycle. With this patch we have no way to know when the network
>> namespace has exited. So we can have the last process exiting the network
>> namespace, but the network namespace can stay alive (eg. some sockets still have
>> buffer to send) ?
> 
> Depending on the network interfaces going back to the initial network namespace
> is problematic in the long term because is breaks recursive containers.  I can't
> see any behavior that does that as anything other than a bug.  

Perhaps, I am misunderstanding your sentence :) But just in case, let me 
clarify my idea.

If you have a TCP connection with a send queue not empty (the kernel has 
buffered the data the application has sent), when your process exits 
because the last socket write was done in userspace, the TCP socket will 
be orphan but still there taking a ref count on your network namespace. 
Hence the process and the network namespace life cycles are not 
correlated. This is a correct behavior, it is not a bug.

> Especially
> as I already have production uses for recursive containers.
> 
>> AFAIR, Pavel told us about a patch extending the "wait" semantic and pass
>> namespace options to wait for. Is that right Pavel ? Shouldn't we apply this
>> path before deleting the virtual network devices ?
> 
> Tell you what.  I will post in a bit my patchset that makes /proc/net
> it's own filesystem, that magically mounts on /proc/self/net.  It is a
> long stupid story why I haven't posted it publicly.  At which point we have
> complete visibility into when a network namespace exits and if we need to
> be able to wait for it we can just implement a poll method on the root
> directory.

Can a process outside of the network namespace look at this directory 
even if there are no more processes running inside the namespace ?

> That is simple and will work quite nicely in the long term, and gives
> us a lot more visibility than a simple this namespace has exited bit.

This is nice :)

Shouldn't you send /proc/net filesystem patch before sending the virtual 
network devices autodestroy patch ?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]         ` <48EB3F72.5090201-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
@ 2008-10-07 11:28           ` Eric W. Biederman
       [not found]             ` <m1d4ic4pbr.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2008-10-07 11:28 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Linux Containers, Denis V. Lunev, Daniel Lezcano, David Miller

Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> writes:

> I remember that I promised to prepare the wait-extending patch. But I
> haven't manage to find time for this, sorry :( In a month or two I will
> finish one time-hungry task and hopefully be able to do it.
>
> As far as this particular patch is concerned.
>
> All the virtual devices we have now in namespaces (vlan and tunnels)
> kill themselves *before* this code is called. But even if we try to
> move this destruction from modules to here, we'll be in a tricky
> situation, when the e.g. ipip module has already kfree-d the net_ipip
> structure, while none of ipip devices are released yet.
>
> If we try to look in the future - if we ever have a virtual device
> driver, that will be able to create its devices in namespace, we'll
> have to destroy all these devices *before* (or inside) this driver's
> net->exit callback is called, but this patch dies the ->dellink call
> at the very end, i.e. *after* any potential ->exit callback.
>
> Eric - did you see any device, that was ->dellink-ed by this patch?

macvlan, veth, and dummy interfaces.  Basically, for
everything I use, this deletes the virtual interfaces instead
of sending them back to the initial network namespace.

From the looks of the code vlans are also handled.

Things like tun/tap and ipip are handled in the exit methods and I
have no problem with that, although I do wonder if we are handling
moving or refusing to move them between namespaces properly.

For devices that don't need a dedicated virtual interface in
every network namespace this certainly looks like the easiest
way to handle them, as the driver doesn't need to know a thing
about network namespaces and the right thing just happens.

Eric

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]             ` <48EB4679.1040602-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
@ 2008-10-07 11:45               ` Eric W. Biederman
       [not found]                 ` <m1fxn839y3.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2008-10-07 11:45 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Linux Containers, Denis V. Lunev, David Miller, Pavel Emelianov

Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:

> Perhaps, I am misunderstanding your sentence :) But just in case, let me clarify
> my idea.
>
> If you have a TCP connection with a send queue not empty (the kernel has
> buffered the data the application has sent), when your process exits because the
> last socket write was done in userspace, the TCP socket will be orphan but still
> there taking a ref count on your network namespace. Hence the process and the
> network namespace life cycles is not correlated. This is a correct behavior, it
> is not a bug.

Agreed, letting the network stack finish sending the last bits of data from
sockets is not a bug.

The problem I see is having a veth device that exists in a child container,
and the application waiting for the child device to return to the current network
namespace.  Since we always return network devices to the initial network namespace
waiting for a device in a child container does not work in a recursive container.
And thus is a bug.

> Can a process outside of the network namespace look at this directory even if
> there are no more processes running inside the namespace ?

If you mount it somewhere besides /proc/self/net yes.

Say: mount --bind /proc/self/net /tmp/net

>> That is simple and will work quite nicely in the long term, and gives
>> us a lot more visibility than a simple this namespace has exited bit.
>
> This is nice :)
>
> Shouldn't you send /proc/net filesystem patch before sending the virtual network
> devices autodestroy patch ?

Well there is the cheap trick with this patch of waiting until the local end
of veth dies.

Eric

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]                 ` <m1fxn839y3.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
@ 2008-10-07 12:07                   ` Daniel Lezcano
       [not found]                     ` <48EB50E4.3060303-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Lezcano @ 2008-10-07 12:07 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux Containers, Denis V. Lunev, David Miller, Pavel Emelianov

Eric W. Biederman wrote:
> Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:
> 
>> Perhaps, I am misunderstanding your sentence :) But just in case, let me clarify
>> my idea.
>>
>> If you have a TCP connection with a send queue not empty (the kernel has
>> buffered the data the application has sent), when your process exits because the
>> last socket write was done in userspace, the TCP socket will be orphan but still
>> there taking a ref count on your network namespace. Hence the process and the
>> network namespace life cycles is not correlated. This is a correct behavior, it
>> is not a bug.
> 
> Agreed, letting the network stack finish sending the last bits of data from
> sockets is not a bug.
> 
> The problem I see is having a veth device that exists in a child container,
> and the application waiting for the child device to return to the current network
> namespace.  Since we always return network devices to the initial network namespace
> waiting for a device in a child container does not work in a recursive container.
> And thus is a bug.

I agree.

>> Can a process outside of the network namespace look at this directory even if
>> there are no more processes running inside the namespace ?
> 
> If you mount it somewhere besides /proc/self/net yes.
> 
> Say: mount --bind /proc/self/net /tmp/net

Ok, thanks.

>>> That is simple and will work quite nicely in the long term, and gives
>>> us a lot more visibility than a simple this namespace has exited bit.
>> This is nice :)
>>
>> Shouldn't you send /proc/net filesystem patch before sending the virtual network
>> devices autodestroy patch ?
> 
> Well there is the cheap trick with this patch of waiting until the local end
> of veth dies.

I actually use veth, macvlan, empty netns and physical. But if you are 
planning to send the netns fs soon, I guess I can live with that for a time.

Thanks.
   -- Daniel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] net: Support specifying the network namespace upon device creation.
       [not found]     ` <m1vdwatshs.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
@ 2008-10-07 12:20       ` Daniel Lezcano
       [not found]         ` <48EB541A.5070306-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Lezcano @ 2008-10-07 12:20 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux Containers, Denis V. Lunev, David Miller, Benjamin Thery

Eric W. Biederman wrote:
> There is no good reason to not support userspace specifying the
> network namespace during device creation and it seems a handy
> thing to do.
> 
> We have to be a little extra careful in this case to ensure that
> the network namespace exists through the point where we call
> register_netdevice.
> 
> In addition we need to pass the network namespace to the
> rtnl_link_ops.newlink method so we can properly create
> the new device in another namespace and have it be a vlan
> device of a device in our current network namespace.
> 
> In summary this patch makes ip link add somename netns NNN type sometype
> do the obvious thing instead of ignoring the network namespace parameter.
> 
> Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> ---
>  drivers/net/macvlan.c    |    4 ++--
>  drivers/net/veth.c       |    5 +++--
>  include/net/rtnetlink.h  |    3 ++-
>  net/8021q/vlan_netlink.c |    4 ++--
>  net/core/rtnetlink.c     |   17 ++++++++++++++++-
>  5 files changed, 25 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
> index 4239450..fc5933b 100644
> --- a/drivers/net/macvlan.c
> +++ b/drivers/net/macvlan.c
> @@ -416,7 +416,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
>  	return 0;
>  }
> 
> -static int macvlan_newlink(struct net_device *dev,
> +static int macvlan_newlink(struct net *net, struct net_device *dev,
>  			   struct nlattr *tb[], struct nlattr *data[])
>  {
>  	struct macvlan_dev *vlan = netdev_priv(dev);
> @@ -427,7 +427,7 @@ static int macvlan_newlink(struct net_device *dev,
>  	if (!tb[IFLA_LINK])
>  		return -EINVAL;
> 
> -	lowerdev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
> +	lowerdev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
>  	if (lowerdev == NULL)
>  		return -ENODEV;
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 31cd817..3a2d818 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -335,7 +335,7 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[])
> 
>  static struct rtnl_link_ops veth_link_ops;
> 
> -static int veth_newlink(struct net_device *dev,
> +static int veth_newlink(struct net *net, struct net_device *dev,
>  			 struct nlattr *tb[], struct nlattr *data[])
>  {
>  	int err;
> @@ -375,7 +375,7 @@ static int veth_newlink(struct net_device *dev,
>  	else
>  		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
> 
> -	peer = rtnl_create_link(dev_net(dev), ifname, &veth_link_ops, tbp);
> +	peer = rtnl_create_link(net, ifname, &veth_link_ops, tbp);
>  	if (IS_ERR(peer))
>  		return PTR_ERR(peer);
> 
> @@ -383,6 +383,7 @@ static int veth_newlink(struct net_device *dev,
>  		random_ether_addr(peer->dev_addr);
> 
>  	err = register_netdevice(peer);
> +	put_net(peer->nd_net);
>  	if (err < 0)
>  		goto err_register_peer;
> 
> diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
> index 3c1895e..dbf546f 100644
> --- a/include/net/rtnetlink.h
> +++ b/include/net/rtnetlink.h
> @@ -55,7 +55,8 @@ struct rtnl_link_ops {
>  	int			(*validate)(struct nlattr *tb[],
>  					    struct nlattr *data[]);
> 
> -	int			(*newlink)(struct net_device *dev,
> +	int			(*newlink)(struct net *net,
> +					   struct net_device *dev,
>  					   struct nlattr *tb[],
>  					   struct nlattr *data[]);
>  	int			(*changelink)(struct net_device *dev,
> diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
> index e9c91dc..e6190f7 100644
> --- a/net/8021q/vlan_netlink.c
> +++ b/net/8021q/vlan_netlink.c
> @@ -100,7 +100,7 @@ static int vlan_changelink(struct net_device *dev,
>  	return 0;
>  }
> 
> -static int vlan_newlink(struct net_device *dev,
> +static int vlan_newlink(struct net *net, struct net_device *dev,
>  			struct nlattr *tb[], struct nlattr *data[])
>  {
>  	struct vlan_dev_info *vlan = vlan_dev_info(dev);
> @@ -112,7 +112,7 @@ static int vlan_newlink(struct net_device *dev,
> 
>  	if (!tb[IFLA_LINK])
>  		return -EINVAL;
> -	real_dev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
> +	real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));

Hmm, if the macvlan is created inside a namespace, the network namespace 
specified in the function parameter will not be the namespace to which 
IFLA_LINK belongs, and the __dev_get_by_index will fail, no ?

>  	if (!real_dev)
>  		return -ENODEV;
> 
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index 8862498..069b176 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -1002,6 +1002,19 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname,
>  			goto err_free;
>  	}
> 
> +	/* To support userspace specifying a network namespace during
> +	 * device creation we grab the network namespace here and hold
> +	 * it until just after register_netdevice to prevent races.
> +	 */
> +	if (!tb[IFLA_NET_NS_PID])
> +		get_net(net);
> +	else {
> +		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
> +		if (IS_ERR(net)) {
> +			err = PTR_ERR(net);
> +			goto err_free;
> +		}
> +	}
>  	dev_net_set(dev, net);
>  	dev->rtnl_link_ops = ops;
> 
> @@ -1150,10 +1163,12 @@ replay:
>  		if (IS_ERR(dev))
>  			err = PTR_ERR(dev);
>  		else if (ops->newlink)
> -			err = ops->newlink(dev, tb, data);
> +			err = ops->newlink(net, dev, tb, data);
>  		else
>  			err = register_netdevice(dev);
> 
> +		if (!IS_ERR(dev))
> +			put_net(dev->nd_net);

If there is an error in ops->newlink or register_netdevice, we will exit 
without releasing the net refcount.

>  		if (err < 0 && !IS_ERR(dev))
>  			free_netdev(dev);
>  		return err;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]                     ` <48EB50E4.3060303-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
@ 2008-10-07 23:08                       ` David Miller
       [not found]                         ` <20081007.160807.32968959.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: David Miller @ 2008-10-07 23:08 UTC (permalink / raw)
  To: dlezcano-NmTC/0ZBporQT0dZR+AlfA
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, den-GEFAQzZX7r8dnm+yROfE0A,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w, xemul-GEFAQzZX7r8dnm+yROfE0A

From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
Date: Tue, 07 Oct 2008 14:07:00 +0200

> Eric W. Biederman wrote:
> > Well there is the cheap trick with this patch of waiting until the local end
> > of veth dies.
>
> I actually use veth, macvlan, empty netns and physical. But if you
> are planning the send netns fs soon, I guess I can live with that a
> time.

Are we anywhere near consensus with this patch?

Please resubmit once there is some agreement.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] net: Support specifying the network namespace upon device creation.
       [not found]         ` <48EB541A.5070306-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
@ 2008-10-07 23:38           ` Eric W. Biederman
  0 siblings, 0 replies; 28+ messages in thread
From: Eric W. Biederman @ 2008-10-07 23:38 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Linux Containers, Denis V. Lunev, David Miller, Benjamin Thery

Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:

> Eric W. Biederman wrote:
>> There is no good reason to not support userspace specifying the
>> network namespace during device creation and it seems a handy
>> thing to do.
>>
>> We have to be a little extra careful in this case to ensure that
>> the network namespace exists through the point where we call
>> register_netdevice.
>>
>> In addition we need to pass the network namespace to the
>> rtnl_link_ops.newlink method so we can properly create
>> the new device in another namespace and have it be a vlan
>> device of a device in our current network namespace.
>>
>> In summary this patch makes ip link add somename netns NNN type sometype
>> do the obvious thing instead of ignoring the network namespace parameter.
>>
>> Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
>> ---
>>  drivers/net/macvlan.c    |    4 ++--
>>  drivers/net/veth.c       |    5 +++--
>>  include/net/rtnetlink.h  |    3 ++-
>>  net/8021q/vlan_netlink.c |    4 ++--
>>  net/core/rtnetlink.c     |   17 ++++++++++++++++-
>>  5 files changed, 25 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
>> index 4239450..fc5933b 100644
>> --- a/drivers/net/macvlan.c
>> +++ b/drivers/net/macvlan.c
>> @@ -416,7 +416,7 @@ static int macvlan_validate(struct nlattr *tb[], struct
> nlattr *data[])
>>  	return 0;
>>  }
>>
>> -static int macvlan_newlink(struct net_device *dev,
>> +static int macvlan_newlink(struct net *net, struct net_device *dev,
>>  			   struct nlattr *tb[], struct nlattr *data[])
>>  {
>>  	struct macvlan_dev *vlan = netdev_priv(dev);
>> @@ -427,7 +427,7 @@ static int macvlan_newlink(struct net_device *dev,
>>  	if (!tb[IFLA_LINK])
>>  		return -EINVAL;
>>
>> - lowerdev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
>> +	lowerdev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
>>  	if (lowerdev == NULL)
>>  		return -ENODEV;
>>
>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
>> index 31cd817..3a2d818 100644
>> --- a/drivers/net/veth.c
>> +++ b/drivers/net/veth.c
>> @@ -335,7 +335,7 @@ static int veth_validate(struct nlattr *tb[], struct
> nlattr *data[])
>>
>>  static struct rtnl_link_ops veth_link_ops;
>>
>> -static int veth_newlink(struct net_device *dev,
>> +static int veth_newlink(struct net *net, struct net_device *dev,
>>  			 struct nlattr *tb[], struct nlattr *data[])
>>  {
>>  	int err;
>> @@ -375,7 +375,7 @@ static int veth_newlink(struct net_device *dev,
>>  	else
>>  		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
>>
>> -	peer = rtnl_create_link(dev_net(dev), ifname, &veth_link_ops, tbp);
>> +	peer = rtnl_create_link(net, ifname, &veth_link_ops, tbp);
>>  	if (IS_ERR(peer))
>>  		return PTR_ERR(peer);
>>
>> @@ -383,6 +383,7 @@ static int veth_newlink(struct net_device *dev,
>>  		random_ether_addr(peer->dev_addr);
>>
>>  	err = register_netdevice(peer);
>> +	put_net(peer->nd_net);
>>  	if (err < 0)
>>  		goto err_register_peer;
>>
>> diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
>> index 3c1895e..dbf546f 100644
>> --- a/include/net/rtnetlink.h
>> +++ b/include/net/rtnetlink.h
>> @@ -55,7 +55,8 @@ struct rtnl_link_ops {
>>  	int			(*validate)(struct nlattr *tb[],
>>  					    struct nlattr *data[]);
>>
>> -	int			(*newlink)(struct net_device *dev,
>> +	int			(*newlink)(struct net *net,
>> +					   struct net_device *dev,
>>  					   struct nlattr *tb[],
>>  					   struct nlattr *data[]);
>>  	int			(*changelink)(struct net_device *dev,
>> diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
>> index e9c91dc..e6190f7 100644
>> --- a/net/8021q/vlan_netlink.c
>> +++ b/net/8021q/vlan_netlink.c
>> @@ -100,7 +100,7 @@ static int vlan_changelink(struct net_device *dev,
>>  	return 0;
>>  }
>>
>> -static int vlan_newlink(struct net_device *dev,
>> +static int vlan_newlink(struct net *net, struct net_device *dev,
>>  			struct nlattr *tb[], struct nlattr *data[])
>>  {
>>  	struct vlan_dev_info *vlan = vlan_dev_info(dev);
>> @@ -112,7 +112,7 @@ static int vlan_newlink(struct net_device *dev,
>>
>>  	if (!tb[IFLA_LINK])
>>  		return -EINVAL;
>> - real_dev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
>> +	real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
>
> Hmm, if the macvlan is created inside a namespace, the network namespace
> specified in the parameter function will not be the namespace where belongs
> IFLA_LINK and the __dev_get_by_index will fail, no ?

The actual operation is creation in the current network namespace, followed
by an immediate move to another namespace.  Anything else gets into some
semantic problems.

The typical case would be to create a macvlan from eth0 in the initial
network namespace, and then move it to the namespace where you want to
use it.


>>  	if (!real_dev)
>>  		return -ENODEV;
>>
>> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
>> index 8862498..069b176 100644
>> --- a/net/core/rtnetlink.c
>> +++ b/net/core/rtnetlink.c
>> @@ -1002,6 +1002,19 @@ struct net_device *rtnl_create_link(struct net *net,
> char *ifname,
>>  			goto err_free;
>>  	}
>>
>> +	/* To support userspace specifying a network namespace during
>> +	 * device creation we grab the network namespace here and hold
>> +	 * it until just after register_netdevice to prevent races.
>> +	 */
>> +	if (!tb[IFLA_NET_NS_PID])
>> +		get_net(net);
>> +	else {
>> +		net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
>> +		if (IS_ERR(net)) {
>> +			err = PTR_ERR(net);
>> +			goto err_free;
>> +		}
>> +	}
>>  	dev_net_set(dev, net);
>>  	dev->rtnl_link_ops = ops;
>>
>> @@ -1150,10 +1163,12 @@ replay:
>>  		if (IS_ERR(dev))
>>  			err = PTR_ERR(dev);
>>  		else if (ops->newlink)
>> -			err = ops->newlink(dev, tb, data);
>> +			err = ops->newlink(net, dev, tb, data);
>>  		else
>>  			err = register_netdevice(dev);
>>
>> +		if (!IS_ERR(dev))
>> +			put_net(dev->nd_net);
>
> If there is an error in ops->newlink or register_netdevice, we will exit without
> releasing the net refcount.

Nope.  That is IS_ERR(dev), which is only true if rtnl_create_link fails.
newlink and register_netdevice set err but don't change dev.

Eric

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]                         ` <20081007.160807.32968959.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
@ 2008-10-08  8:19                           ` Daniel Lezcano
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Lezcano @ 2008-10-08  8:19 UTC (permalink / raw)
  To: David Miller
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, den-GEFAQzZX7r8dnm+yROfE0A,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w, xemul-GEFAQzZX7r8dnm+yROfE0A

David Miller wrote:
> From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
> Date: Tue, 07 Oct 2008 14:07:00 +0200
> 
>> Eric W. Biederman wrote:
>>> Well there is the cheap trick with this patch of waiting until the local end
>>> of veth dies.
>> I actually use veth, macvlan, empty netns and physical. But if you
>> are planning the send netns fs soon, I guess I can live with that a
>> time.
> 
> Are we anywhere near consensus with this patch?
> 
> Please resubmit once there is some agreement.

I am fine with this patch.

Acked-by: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] netns: Delete virtual interfaces during namespace cleanup
       [not found]             ` <m1d4ic4pbr.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
@ 2008-10-08 12:34               ` Pavel Emelyanov
       [not found]                 ` <48ECA8D2.4090406-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
  0 siblings, 1 reply; 28+ messages in thread
From: Pavel Emelyanov @ 2008-10-08 12:34 UTC (permalink / raw)
  To: Eric W. Biederman, David Miller
  Cc: Linux Containers, Denis V. Lunev, Daniel Lezcano

Eric W. Biederman wrote:
> Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> writes:
> 
>> I remember that I promised to prepare the wait-extending patch. But I
>> haven't manage to find time for this, sorry :( In a month or two I will
>> finish one time-hungry task and hopefully be able to do it.
>>
>> As far as this particular patch is concerned.
>>
>> All the virtual devices we have now in namespaces (vlan and tunnels)
>> kill themselves *before* this code is called. But even if we try to
>> move this destruction from modules to here, we'll be in a tricky
>> situation, when the e.g. ipip module has already kfree-d the net_ipip
>> structure, while none of ipip devices are released yet.
>>
>> If we try to look in the future - if we ever have a virtual device
>> driver, that will be able to create its devices in namespace, we'll
>> have to destroy all these devices *before* (or inside) this driver's
>> net->exit callback is called, but this patch dies the ->dellink call
>> at the very end, i.e. *after* any potential ->exit callback.
>>
>> Eric - did you see any device, that was ->dellink-ed by this patch?
> 
> macvlan, veth, and dummy interfaces.  Basically
> everything I use this deletes the virtual interfaces, instead
> of sending them back to the initial network namespace.

OK, then.

Acked-by: Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>

>>From the looks of the code vlans are also be handled.
> 
> Things like tun/tap and ipip are handled in the exit methods and I
> have no problem with that, although I do wonder if we are handling
> moving or refusing to move them between namespaces properly.
> 
> For devices that don't need a dedicated virtual interface in
> every network namespace this certainly looks like the easiest
> way to handle them, as the driver doesn't need to a thing
> about network namespaces and the right thing just happens.
> 
> Eric
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/3] netns: Delete virtual interfaces during namespace cleanup
       [not found]                 ` <48ECA8D2.4090406-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
@ 2008-11-05 23:22                   ` Eric W. Biederman
  2008-11-05 23:25                     ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device Eric W. Biederman
       [not found]                     ` <m14p2l4v2l.fsf_-_-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  0 siblings, 2 replies; 28+ messages in thread
From: Eric W. Biederman @ 2008-11-05 23:22 UTC (permalink / raw)
  To: David Miller
  Cc: Linux Containers, Denis V. Lunev, Daniel Lezcano, Pavel Emelyanov


When physical devices are inside of a network namespace and that
network namespace terminates we can not make them go away.  We
have to keep them, and moving them to the initial network namespace
is the best we can do.

For virtual devices left in a network namespace that is exiting
we have no need to preserve them and we now have the infrastructure
that allows us to delete them.  So delete virtual devices when we
exit a network namespace.  This makes the necessary user space clean up
after a network namespace exits much more tractable.

Acked-by: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
Acked-by: Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---
 net/core/dev.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 8f9d3b3..9475f3e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4852,6 +4852,12 @@ static void __net_exit default_device_exit(struct net *net)
 		if (dev->features & NETIF_F_NETNS_LOCAL)
 			continue;
 
+		/* Delete virtual devices */
+		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
+			dev->rtnl_link_ops->dellink(dev);
+			continue;
+		}
+
 		/* Push remaing network devices to init_net */
 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 		err = dev_change_net_namespace(dev, &init_net, fb_name);
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device.
  2008-11-05 23:22                   ` [PATCH 1/3] " Eric W. Biederman
@ 2008-11-05 23:25                     ` Eric W. Biederman
  2008-11-05 23:27                       ` [PATCH 3/3] net: Don't leak packets when a netns is going down Eric W. Biederman
  2008-11-06  0:00                       ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device David Miller
       [not found]                     ` <m14p2l4v2l.fsf_-_-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
  1 sibling, 2 replies; 28+ messages in thread
From: Eric W. Biederman @ 2008-11-05 23:25 UTC (permalink / raw)
  To: David Miller
  Cc: Daniel Lezcano, Linux Containers, Denis V. Lunev, Pavel Emelyanov,
	netdev


I was recently hunting a bug that occurred in network namespace
cleanup.  In looking at the code it became apparent that we have
and will continue to have cases where if we have anything going
on in a network namespace there will be assumptions that the
loopback device is present.   Things like sending igmp unsubscribe
messages when we bring down network devices invoke the routing
code which assumes that at least the loopback driver is present.

Therefore to avoid magic initcall ordering hackery that is hard
to follow and hard to get right, insert a call to register the
loopback device directly from net_dev_init().    This guarantees
that the loopback device is the first device registered and
the last network device to go away.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/loopback.c    |   13 ++-----------
 include/linux/netdevice.h |    1 +
 net/core/dev.c            |   12 ++++++++++++
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 91d0858..c4516b5 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -204,17 +204,8 @@ static __net_exit void loopback_net_exit(struct net *net)
 	unregister_netdev(dev);
 }
 
-static struct pernet_operations __net_initdata loopback_net_ops = {
+/* Registered in net/core/dev.c */
+struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
        .exit = loopback_net_exit,
 };
-
-static int __init loopback_init(void)
-{
-	return register_pernet_device(&loopback_net_ops);
-}
-
-/* Loopback is special. It should be initialized before any other network
- * device and network subsystem.
- */
-fs_initcall(loopback_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f1b0dbe..12d7f44 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1766,6 +1766,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	return 0;
 }
 
+extern struct pernet_operations __net_initdata loopback_net_ops;
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 9475f3e..811507c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4904,6 +4904,18 @@ static int __init net_dev_init(void)
 	if (register_pernet_subsys(&netdev_net_ops))
 		goto out;
 
+	/* The loopback device is special if any other network devices
+	 * is present in a network namespace the loopback device must
+	 * be present. Since we now dynamically allocate and free the
+	 * loopback device ensure this invariant is maintained by
+	 * keeping the loopback device as the first device on the
+	 * list of network devices.  Ensuring the loopback devices
+	 * is the first device that appears and the last network device
+	 * that disappears.
+	 */
+	if (register_pernet_device(&loopback_net_ops))
+		goto out;
+
 	if (register_pernet_device(&default_device_ops))
 		goto out;
 
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/3] net: Don't leak packets when a netns is going down
  2008-11-05 23:25                     ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device Eric W. Biederman
@ 2008-11-05 23:27                       ` Eric W. Biederman
  2008-11-06  0:00                         ` David Miller
  2008-11-06  0:00                       ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device David Miller
  1 sibling, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2008-11-05 23:27 UTC (permalink / raw)
  To: David Miller
  Cc: Daniel Lezcano, Linux Containers, Denis V. Lunev, Pavel Emelyanov,
	netdev

I have been tracking for a while a case where when the
network namespace exits the cleanup gets stuck in an
endless process of:

unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3

It turns out that if you listen on a multicast address an unsubscribe
packet is sent when the network device goes down.   If you shutdown
the network namespace without carefully cleaning up this can trigger
the unsubscribe packet to be sent over the loopback interface while
the network namespace is going down.

All of which is fine except when we drop the packet and forget to
free it, leaking the skb and the dst entry attached to it.  As it
turns out the dst entry holds a reference to the idev which holds
the dev and keeps everything from being cleaned up.  Yuck!

Fix my earlier thinko by adding the needed kfree_skb, and everything
cleans up beautifully.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/core/dev.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 811507c..a0c6060 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2253,8 +2253,10 @@ int netif_receive_skb(struct sk_buff *skb)
 	rcu_read_lock();

 	/* Don't receive packets in an exiting network namespace */
-	if (!net_alive(dev_net(skb->dev)))
+	if (!net_alive(dev_net(skb->dev))) {
+		kfree_skb(skb);
 		goto out;
+	}

 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] netns: Delete virtual interfaces during namespace cleanup
       [not found]                     ` <m14p2l4v2l.fsf_-_-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
@ 2008-11-06  0:00                       ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2008-11-06  0:00 UTC (permalink / raw)
  To: ebiederm-aS9lmoZGLiVWk0Htik3J/w
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, den-GEFAQzZX7r8dnm+yROfE0A,
	dlezcano-NmTC/0ZBporQT0dZR+AlfA, xemul-GEFAQzZX7r8dnm+yROfE0A

From: ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org (Eric W. Biederman)
Date: Wed, 05 Nov 2008 15:22:26 -0800

> 
> When physical devices are inside of network namespace and that
> network namespace terminates we can not make them go away.  We
> have to keep them and moving them to the initial network namespace
> is the best we can do.
> 
> For virtual devices left in a network namespace that is exiting
> we have no need to preserve them and we now have the infrastructure
> that allows us to delete them.  So delete virtual devices when we
> exit a network namespace.  Keeping the necessary user space clean up
> after a network namespace exits much more tractable.
> 
> Acked-by: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
> Acked-by: Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> Signed-off-by: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>

Applied.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device.
  2008-11-05 23:25                     ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device Eric W. Biederman
  2008-11-05 23:27                       ` [PATCH 3/3] net: Don't leak packets when a netns is going down Eric W. Biederman
@ 2008-11-06  0:00                       ` David Miller
  2008-11-06 13:02                         ` Eric W. Biederman
  1 sibling, 1 reply; 28+ messages in thread
From: David Miller @ 2008-11-06  0:00 UTC (permalink / raw)
  To: ebiederm; +Cc: dlezcano, containers, den, xemul, netdev

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Wed, 05 Nov 2008 15:25:39 -0800

> 
> I was recently hunting a bug that occurred in network namespace
> cleanup.  In looking at the code it became apparrent that we have
> and will continue to have cases where if we have anything going
> on in a network namespace there will be assumptions that the
> loopback device is present.   Things like sending igmp unsubscribe
> messages when we bring down network devices invokes the routing
> code which assumes that at least the loopback driver is present.
> 
> Therefore to avoid magic initcall ordering hackery that is hard
> to follow and hard to get right insert a call to register the
> loopback device directly from net_dev_init().    This guarantes
> that the loopback device is the first device registered and
> the last network device to go away.
> 
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

Applied.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] net: Don't leak packets when a netns is going down
  2008-11-05 23:27                       ` [PATCH 3/3] net: Don't leak packets when a netns is going down Eric W. Biederman
@ 2008-11-06  0:00                         ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2008-11-06  0:00 UTC (permalink / raw)
  To: ebiederm; +Cc: dlezcano, containers, den, xemul, netdev

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Wed, 05 Nov 2008 15:27:34 -0800

> 
> I have been tracking for a while a case where when the
> network namespace exits the cleanup gets stck in an
> endless precessess of:
> 
> unregister_netdevice: waiting for lo to become free. Usage count = 3
> unregister_netdevice: waiting for lo to become free. Usage count = 3
> unregister_netdevice: waiting for lo to become free. Usage count = 3
> unregister_netdevice: waiting for lo to become free. Usage count = 3
> unregister_netdevice: waiting for lo to become free. Usage count = 3
> unregister_netdevice: waiting for lo to become free. Usage count = 3
> unregister_netdevice: waiting for lo to become free. Usage count = 3
> 
> It turns out that if you listen on a multicast address an unsubscribe
> packet is sent when the network device goes down.   If you shutdown
> the network namespace without carefully cleaning up this can trigger
> the unsubscribe packet to be sent over the loopback interface while
> the network namespace is going down.
> 
> All of which is fine except when we drop the packet and forget to
> free it leaking the skb and the dst entry attached to.  As it
> turns out the dst entry hold a reference to the idev which holds
> the dev and keeps everything from being cleaned up.  Yuck!
> 
> By fixing my earlier thinko and add the needed kfree_skb and everything
> cleans up beautifully. 
> 
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

Applied.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device.
  2008-11-06  0:00                       ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device David Miller
@ 2008-11-06 13:02                         ` Eric W. Biederman
  2008-11-06 15:34                           ` [PATCH 1/2] net: fib_rules ordering fixes Eric W. Biederman
                                             ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Eric W. Biederman @ 2008-11-06 13:02 UTC (permalink / raw)
  To: David Miller; +Cc: dlezcano, containers, den, xemul, netdev

Dave can you please drop this one for the moment.

I cleaned up my patch after the basic testing was over and the
result is a kernel that won't boot.  So if we can prevent this
patch from spreading and breaking a git-bisect that would be great.

I will follow up in a moment with a properly tested version.
My apologies.

Eric

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/2] net: fib_rules ordering fixes.
  2008-11-06 13:02                         ` Eric W. Biederman
@ 2008-11-06 15:34                           ` Eric W. Biederman
  2008-11-06 15:36                             ` [PATCH 2/2] net: Guaranetee the proper ordering of the loopback device. v2 Eric W. Biederman
  2008-11-08  6:54                             ` [PATCH 1/2] net: fib_rules ordering fixes David Miller
  2008-11-06 21:20                           ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device David Miller
  2008-11-08  6:53                           ` David Miller
  2 siblings, 2 replies; 28+ messages in thread
From: Eric W. Biederman @ 2008-11-06 15:34 UTC (permalink / raw)
  To: David Miller; +Cc: dlezcano, containers, den, xemul, netdev


We need to set up the network namespace state before we register
the notifier.  Otherwise if a network device is already registered
we get a nasty NULL pointer dereference.

Signed-off-by: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>
---
 net/core/fib_rules.c |    7 ++++---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 79de3b1..32b3a01 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -664,17 +664,18 @@ static int __init fib_rules_init(void)
 	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL);
 	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule);
 
-	err = register_netdevice_notifier(&fib_rules_notifier);
+	err = register_pernet_subsys(&fib_rules_net_ops);
 	if (err < 0)
 		goto fail;
 
-	err = register_pernet_subsys(&fib_rules_net_ops);
+	err = register_netdevice_notifier(&fib_rules_notifier);
 	if (err < 0)
 		goto fail_unregister;
+
 	return 0;
 
 fail_unregister:
-	unregister_netdevice_notifier(&fib_rules_notifier);
+	unregister_pernet_subsys(&fib_rules_net_ops);
 fail:
 	rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
 	rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/2] net: Guaranetee the proper ordering of the loopback device. v2
  2008-11-06 15:34                           ` [PATCH 1/2] net: fib_rules ordering fixes Eric W. Biederman
@ 2008-11-06 15:36                             ` Eric W. Biederman
  2008-11-08  6:55                               ` David Miller
  2008-11-08  6:54                             ` [PATCH 1/2] net: fib_rules ordering fixes David Miller
  1 sibling, 1 reply; 28+ messages in thread
From: Eric W. Biederman @ 2008-11-06 15:36 UTC (permalink / raw)
  To: David Miller; +Cc: dlezcano, containers, den, xemul, netdev


I was recently hunting a bug that occurred in network namespace
cleanup.  In looking at the code it became apparent that we have
and will continue to have cases where if we have anything going
on in a network namespace there will be assumptions that the
loopback device is present.   Things like sending igmp unsubscribe
messages when we bring down network devices invoke the routing
code which assumes that at least the loopback driver is present.

Therefore to avoid magic initcall ordering hackery that is hard
to follow and hard to get right, insert a call to register the
loopback device directly from net_dev_init().    This guarantees
that the loopback device is the first device registered and
the last network device to go away.

But do it carefully so we register the loopback device after
we clear dev_boot_phase.

Signed-off-by: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>
---
 drivers/net/loopback.c    |   13 ++-----------
 include/linux/netdevice.h |    1 +
 net/core/dev.c            |   22 +++++++++++++++++-----
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 3b43bfd..bcc9945 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -215,17 +215,8 @@ static __net_exit void loopback_net_exit(struct net *net)
 	unregister_netdev(dev);
 }
 
-static struct pernet_operations __net_initdata loopback_net_ops = {
+/* Registered in net/core/dev.c */
+struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
        .exit = loopback_net_exit,
 };
-
-static int __init loopback_init(void)
-{
-	return register_pernet_device(&loopback_net_ops);
-}
-
-/* Loopback is special. It should be initialized before any other network
- * device and network subsystem.
- */
-fs_initcall(loopback_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 488c56e..c7004a5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1726,6 +1726,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	return 0;
 }
 
+extern struct pernet_operations __net_initdata loopback_net_ops;
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 3785c4b..cf54670 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4824,9 +4824,6 @@ static int __init net_dev_init(void)
 	if (register_pernet_subsys(&netdev_net_ops))
 		goto out;
 
-	if (register_pernet_device(&default_device_ops))
-		goto out;
-
 	/*
 	 *	Initialise the packet receive queues.
 	 */
@@ -4843,10 +4840,25 @@ static int __init net_dev_init(void)
 		queue->backlog.weight = weight_p;
 	}
 
-	netdev_dma_register();
-
 	dev_boot_phase = 0;
 
+	/* The loopback device is special: if any other network device
+	 * is present in a network namespace, the loopback device must
+	 * be present. Since we now dynamically allocate and free the
+	 * loopback device, ensure this invariant is maintained by
+	 * keeping the loopback device as the first device on the
+	 * list of network devices.  This ensures the loopback device
+	 * is the first device that appears and the last network device
+	 * that disappears.
+	 */
+	if (register_pernet_device(&loopback_net_ops))
+		goto out;
+
+	if (register_pernet_device(&default_device_ops))
+		goto out;
+
+	netdev_dma_register();
+
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device.
  2008-11-06 13:02                         ` Eric W. Biederman
  2008-11-06 15:34                           ` [PATCH 1/2] net: fib_rules ordering fixes Eric W. Biederman
@ 2008-11-06 21:20                           ` David Miller
  2008-11-08  6:53                           ` David Miller
  2 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2008-11-06 21:20 UTC (permalink / raw)
  To: ebiederm; +Cc: dlezcano, containers, den, xemul, netdev

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Thu, 06 Nov 2008 05:02:33 -0800

> 
> Dave can you please drop this one for the moment.
> 
> I cleaned up my patch after the basic testing was over and the
> result is a kernel that won't boot.  So if we can prevent this
> patch from spreading and breaking a git-bisect that would be great.
> 
> I will follow up in a moment with a properly tested version.
> My apologies.

It's already in my tree, so I need a relative fixup patch not
an entire new one.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device.
  2008-11-06 13:02                         ` Eric W. Biederman
  2008-11-06 15:34                           ` [PATCH 1/2] net: fib_rules ordering fixes Eric W. Biederman
  2008-11-06 21:20                           ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device David Miller
@ 2008-11-08  6:53                           ` David Miller
  2008-11-08  7:13                             ` Eric W. Biederman
  2 siblings, 1 reply; 28+ messages in thread
From: David Miller @ 2008-11-08  6:53 UTC (permalink / raw)
  To: ebiederm; +Cc: dlezcano, containers, den, xemul, netdev

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Thu, 06 Nov 2008 05:02:33 -0800

> Dave can you please drop this one for the moment.
> 
> I cleaned up my patch after the basic testing was over and the
> result is a kernel that won't boot.  So if we can prevent this
> patch from spreading and breaking a git-bisect that would be great.
> 
> I will follow up in a moment with a properly tested version.
> My apologies.

I'm putting a revert changeset in there, then your two new patches on
top.

I'm not screwing up my GIT tree for everyone who pulls from me just
because you can't be bothered to test the actual changes you send me.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] net: fib_rules ordering fixes.
  2008-11-06 15:34                           ` [PATCH 1/2] net: fib_rules ordering fixes Eric W. Biederman
  2008-11-06 15:36                             ` [PATCH 2/2] net: Guaranetee the proper ordering of the loopback device. v2 Eric W. Biederman
@ 2008-11-08  6:54                             ` David Miller
  1 sibling, 0 replies; 28+ messages in thread
From: David Miller @ 2008-11-08  6:54 UTC (permalink / raw)
  To: ebiederm; +Cc: dlezcano, containers, den, xemul, netdev

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Thu, 06 Nov 2008 07:34:28 -0800

> 
> We need to setup the network namespace state before we register
> the notifier.  Otherwise if a network device is already registered
> we get a nasty NULL pointer dereference.
> 
> Signed-off-by: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>

Applied.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] net: Guaranetee the proper ordering of the loopback device. v2
  2008-11-06 15:36                             ` [PATCH 2/2] net: Guaranetee the proper ordering of the loopback device. v2 Eric W. Biederman
@ 2008-11-08  6:55                               ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2008-11-08  6:55 UTC (permalink / raw)
  To: ebiederm; +Cc: dlezcano, containers, den, xemul, netdev

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Thu, 06 Nov 2008 07:36:00 -0800

> I was recently hunting a bug that occurred in network namespace
> cleanup.  In looking at the code it became apparent that we have
> and will continue to have cases where if we have anything going
> on in a network namespace there will be assumptions that the
> loopback device is present.   Things like sending igmp unsubscribe
> messages when we bring down network devices invokes the routing
> code which assumes that at least the loopback driver is present.
> 
> Therefore to avoid magic initcall ordering hackery that is hard
> to follow and hard to get right insert a call to register the
> loopback device directly from net_dev_init().    This guarantees
> that the loopback device is the first device registered and
> the last network device to go away.
> 
> But do it carefully so we register the loopback device after
> we clear dev_boot_phase.
> 
> Signed-off-by: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>

Applied.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device.
  2008-11-08  6:53                           ` David Miller
@ 2008-11-08  7:13                             ` Eric W. Biederman
  0 siblings, 0 replies; 28+ messages in thread
From: Eric W. Biederman @ 2008-11-08  7:13 UTC (permalink / raw)
  To: David Miller; +Cc: dlezcano, containers, den, xemul, netdev

David Miller <davem@davemloft.net> writes:

> I'm putting a revert changeset in there, then your two new patches on
> top.
>
> I'm not screwing up my GIT tree for everyone who pulls from me just
> because you can't be bothered to test the actual changes you send me.

Understood.  When I sent the email it looked like it might not have landed
in your git tree yet.  I badly messed up on that one and I apologize.

Eric


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2008-11-08  7:13 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-10-03  0:39 [PATCH] netns: Delete virtual interfaces during namespace cleanup Eric W. Biederman
     [not found] ` <m18wt6v7eb.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
2008-10-03  0:46   ` [PATCH] net: Support specifying the network namespace upon device creation Eric W. Biederman
     [not found]     ` <m1vdwatshs.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
2008-10-07 12:20       ` Daniel Lezcano
     [not found]         ` <48EB541A.5070306-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
2008-10-07 23:38           ` Eric W. Biederman
2008-10-07 10:16   ` [PATCH] netns: Delete virtual interfaces during namespace cleanup Daniel Lezcano
     [not found]     ` <48EB36FC.4000008-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
2008-10-07 10:41       ` Eric W. Biederman
     [not found]         ` <m1ej2s7kmj.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
2008-10-07 11:22           ` Daniel Lezcano
     [not found]             ` <48EB4679.1040602-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
2008-10-07 11:45               ` Eric W. Biederman
     [not found]                 ` <m1fxn839y3.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
2008-10-07 12:07                   ` Daniel Lezcano
     [not found]                     ` <48EB50E4.3060303-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
2008-10-07 23:08                       ` David Miller
     [not found]                         ` <20081007.160807.32968959.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2008-10-08  8:19                           ` Daniel Lezcano
2008-10-07 10:52       ` Pavel Emelyanov
     [not found]         ` <48EB3F72.5090201-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-10-07 11:28           ` Eric W. Biederman
     [not found]             ` <m1d4ic4pbr.fsf-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
2008-10-08 12:34               ` Pavel Emelyanov
     [not found]                 ` <48ECA8D2.4090406-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-11-05 23:22                   ` [PATCH 1/3] " Eric W. Biederman
2008-11-05 23:25                     ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device Eric W. Biederman
2008-11-05 23:27                       ` [PATCH 3/3] net: Don't leak packets when a netns is going down Eric W. Biederman
2008-11-06  0:00                         ` David Miller
2008-11-06  0:00                       ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device David Miller
2008-11-06 13:02                         ` Eric W. Biederman
2008-11-06 15:34                           ` [PATCH 1/2] net: fib_rules ordering fixes Eric W. Biederman
2008-11-06 15:36                             ` [PATCH 2/2] net: Guaranetee the proper ordering of the loopback device. v2 Eric W. Biederman
2008-11-08  6:55                               ` David Miller
2008-11-08  6:54                             ` [PATCH 1/2] net: fib_rules ordering fixes David Miller
2008-11-06 21:20                           ` [PATCH 2/3] net: Guaranetee the proper ordering of the loopback device David Miller
2008-11-08  6:53                           ` David Miller
2008-11-08  7:13                             ` Eric W. Biederman
     [not found]                     ` <m14p2l4v2l.fsf_-_-B27657KtZYmhTnVgQlOflh2eb7JE58TQ@public.gmane.org>
2008-11-06  0:00                       ` [PATCH 1/3] netns: Delete virtual interfaces during namespace cleanup David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.