Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 01/19] netfilter: move nf_conntrack initialize out of pernet operations
From: canqun zhang @ 2012-12-28  3:52 UTC (permalink / raw)
  To: Gao feng
  Cc: netfilter-devel, netdev@vger.kernel.org, Patrick McHardy, pablo,
	ebiederm
In-Reply-To: <1356662206-2260-1-git-send-email-gaofeng@cn.fujitsu.com>

Hi all
As discussed above, if the host machine creates several Linux
containers, there will be several net namespaces. The "nf conntrack"
resources are registered and unregistered on the first net namespace
(init_net), but init_net is not unregistered last, so cleaning up the
other net namespaces will trigger a panic.
If net namespaces are created in the order 1,2,...,n, they should be
cleaned up in the order n,...,2,1, so that init_net is unregistered
last.
I fixed it up (see below). I have tested it a lot!

diff -r 6a1a258923f5 -r 2667e89e6f50 net/core/net_namespace.c
--- a/net/core/net_namespace.c  Fri Dec 28 11:01:17 2012 +0800
+++ b/net/core/net_namespace.c  Fri Dec 28 11:05:12 2012 +0800
@@ -450,7 +450,7 @@

        list_del(&ops->list);
        for_each_net(net)
-               list_add_tail(&net->exit_list, &net_exit_list);
+              list_add(&net->exit_list, &net_exit_list);
        ops_exit_list(ops, &net_exit_list);
        ops_free_list(ops, &net_exit_lis

2012/12/28 Gao feng <gaofeng@cn.fujitsu.com>:
> canqun zhang reported a panic BUG,kernel may panic when
> unloading nf_conntrack module.
>
> It's because we reset nf_ct_destroy to NULL when we deal
> with init_net,it's too early.Some packets belongs to other
> netns still refers to the conntrack.when these packets need
> to be freed, kfree_skb will call nf_ct_destroy which is
> NULL.
>
> fix this bug by moving the nf_conntrack initialize and cleanup
> codes out of the pernet operations,this job should be done
> in module_init/exit.We can't use init_net to identify if
> it's the right time.
>
> Reported-by: canqun zhang <canqunzhang@gmail.com>
> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
> ---
>  include/net/netfilter/nf_conntrack_core.h | 10 +++-
>  net/netfilter/nf_conntrack_core.c         | 99 ++++++++++++-------------------
>  net/netfilter/nf_conntrack_standalone.c   | 29 ++++++---
>  3 files changed, 67 insertions(+), 71 deletions(-)
>
> diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
> index d8f5b9f..ec51a3c 100644
> --- a/include/net/netfilter/nf_conntrack_core.h
> +++ b/include/net/netfilter/nf_conntrack_core.h
> @@ -25,8 +25,14 @@ extern unsigned int nf_conntrack_in(struct net *net,
>                                     unsigned int hooknum,
>                                     struct sk_buff *skb);
>
> -extern int nf_conntrack_init(struct net *net);
> -extern void nf_conntrack_cleanup(struct net *net);
> +extern int nf_conntrack_init_net(struct net *net);
> +extern void nf_conntrack_cleanup_net(struct net *net);
> +
> +extern int nf_conntrack_init_start(void);
> +extern void nf_conntrack_cleanup_start(void);
> +
> +extern void nf_conntrack_init_end(void);
> +extern void nf_conntrack_cleanup_end(void);
>
>  extern int nf_conntrack_proto_init(struct net *net);
>  extern void nf_conntrack_proto_fini(struct net *net);
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 08cdc71..ffb2463 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -1331,18 +1331,23 @@ static int untrack_refs(void)
>         return cnt;
>  }
>
> -static void nf_conntrack_cleanup_init_net(void)
> +void nf_conntrack_cleanup_start(void)
>  {
> -       while (untrack_refs() > 0)
> -               schedule();
> -
> -#ifdef CONFIG_NF_CONNTRACK_ZONES
> -       nf_ct_extend_unregister(&nf_ct_zone_extend);
> -#endif
> +       RCU_INIT_POINTER(ip_ct_attach, NULL);
>  }
>
> -static void nf_conntrack_cleanup_net(struct net *net)
> +/*
> + * Mishearing the voices in his head, our hero wonders how he's
> + * supposed to kill the mall.
> + */
> +void nf_conntrack_cleanup_net(struct net *net)
>  {
> +       /*
> +        * This makes sure all current packets have passed through
> +        * netfilter framework.  Roll on, two-stage module
> +        * delete...
> +        */
> +       synchronize_net();
>   i_see_dead_people:
>         nf_ct_iterate_cleanup(net, kill_all, NULL);
>         nf_ct_release_dying_list(net);
> @@ -1352,6 +1357,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
>         }
>
>         nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
> +       nf_conntrack_proto_fini(net);
>         nf_conntrack_helper_fini(net);
>         nf_conntrack_timeout_fini(net);
>         nf_conntrack_ecache_fini(net);
> @@ -1363,24 +1369,15 @@ static void nf_conntrack_cleanup_net(struct net *net)
>         free_percpu(net->ct.stat);
>  }
>
> -/* Mishearing the voices in his head, our hero wonders how he's
> -   supposed to kill the mall. */
> -void nf_conntrack_cleanup(struct net *net)
> +void nf_conntrack_cleanup_end(void)
>  {
> -       if (net_eq(net, &init_net))
> -               RCU_INIT_POINTER(ip_ct_attach, NULL);
> -
> -       /* This makes sure all current packets have passed through
> -          netfilter framework.  Roll on, two-stage module
> -          delete... */
> -       synchronize_net();
> -       nf_conntrack_proto_fini(net);
> -       nf_conntrack_cleanup_net(net);
> +       RCU_INIT_POINTER(nf_ct_destroy, NULL);
> +       while (untrack_refs() > 0)
> +               schedule();
>
> -       if (net_eq(net, &init_net)) {
> -               RCU_INIT_POINTER(nf_ct_destroy, NULL);
> -               nf_conntrack_cleanup_init_net();
> -       }
> +#ifdef CONFIG_NF_CONNTRACK_ZONES
> +       nf_ct_extend_unregister(&nf_ct_zone_extend);
> +#endif
>  }
>
>  void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
> @@ -1473,7 +1470,7 @@ void nf_ct_untracked_status_or(unsigned long bits)
>  }
>  EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
>
> -static int nf_conntrack_init_init_net(void)
> +int nf_conntrack_init_start(void)
>  {
>         int max_factor = 8;
>         int ret, cpu;
> @@ -1527,7 +1524,7 @@ err_extend:
>  #define UNCONFIRMED_NULLS_VAL  ((1<<30)+0)
>  #define DYING_NULLS_VAL                ((1<<30)+1)
>
> -static int nf_conntrack_init_net(struct net *net)
> +int nf_conntrack_init_net(struct net *net)
>  {
>         int ret;
>
> @@ -1580,7 +1577,12 @@ static int nf_conntrack_init_net(struct net *net)
>         ret = nf_conntrack_helper_init(net);
>         if (ret < 0)
>                 goto err_helper;
> +       ret = nf_conntrack_proto_init(net);
> +       if (ret < 0)
> +               goto out_proto;
>         return 0;
> +out_proto:
> +       nf_conntrack_helper_fini(net);
>  err_helper:
>         nf_conntrack_timeout_fini(net);
>  err_timeout:
> @@ -1603,42 +1605,17 @@ err_stat:
>         return ret;
>  }
>
> +void nf_conntrack_init_end(void)
> +{
> +       /* For use by REJECT target */
> +       RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
> +       RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
> +
> +       /* Howto get NAT offsets */
> +       RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
> +}
> +
>  s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
>                         enum ip_conntrack_dir dir,
>                         u32 seq);
>  EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
> -
> -int nf_conntrack_init(struct net *net)
> -{
> -       int ret;
> -
> -       if (net_eq(net, &init_net)) {
> -               ret = nf_conntrack_init_init_net();
> -               if (ret < 0)
> -                       goto out_init_net;
> -       }
> -       ret = nf_conntrack_proto_init(net);
> -       if (ret < 0)
> -               goto out_proto;
> -       ret = nf_conntrack_init_net(net);
> -       if (ret < 0)
> -               goto out_net;
> -
> -       if (net_eq(net, &init_net)) {
> -               /* For use by REJECT target */
> -               RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
> -               RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
> -
> -               /* Howto get NAT offsets */
> -               RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
> -       }
> -       return 0;
> -
> -out_net:
> -       nf_conntrack_proto_fini(net);
> -out_proto:
> -       if (net_eq(net, &init_net))
> -               nf_conntrack_cleanup_init_net();
> -out_init_net:
> -       return ret;
> -}
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 363285d..00bf93c 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -530,11 +530,11 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)
>  }
>  #endif /* CONFIG_SYSCTL */
>
> -static int nf_conntrack_net_init(struct net *net)
> +static int nf_conntrack_pernet_init(struct net *net)
>  {
>         int ret;
>
> -       ret = nf_conntrack_init(net);
> +       ret = nf_conntrack_init_net(net);
>         if (ret < 0)
>                 goto out_init;
>         ret = nf_conntrack_standalone_init_proc(net);
> @@ -550,31 +550,44 @@ static int nf_conntrack_net_init(struct net *net)
>  out_sysctl:
>         nf_conntrack_standalone_fini_proc(net);
>  out_proc:
> -       nf_conntrack_cleanup(net);
> +       nf_conntrack_cleanup_net(net);
>  out_init:
>         return ret;
>  }
>
> -static void nf_conntrack_net_exit(struct net *net)
> +static void nf_conntrack_pernet_exit(struct net *net)
>  {
>         nf_conntrack_standalone_fini_sysctl(net);
>         nf_conntrack_standalone_fini_proc(net);
> -       nf_conntrack_cleanup(net);
> +       nf_conntrack_cleanup_net(net);
>  }
>
>  static struct pernet_operations nf_conntrack_net_ops = {
> -       .init = nf_conntrack_net_init,
> -       .exit = nf_conntrack_net_exit,
> +       .init = nf_conntrack_pernet_init,
> +       .exit = nf_conntrack_pernet_exit,
>  };
>
>  static int __init nf_conntrack_standalone_init(void)
>  {
> -       return register_pernet_subsys(&nf_conntrack_net_ops);
> +       int ret = nf_conntrack_init_start();
> +       if (ret < 0)
> +               goto out_start;
> +       ret = register_pernet_subsys(&nf_conntrack_net_ops);
> +       if (ret < 0)
> +               goto out_pernet;
> +       nf_conntrack_init_end();
> +       return 0;
> +out_pernet:
> +       nf_conntrack_cleanup_end();
> +out_start:
> +       return ret;
>  }
>
>  static void __exit nf_conntrack_standalone_fini(void)
>  {
> +       nf_conntrack_cleanup_start();
>         unregister_pernet_subsys(&nf_conntrack_net_ops);
> +       nf_conntrack_cleanup_end();
>  }
>
>  module_init(nf_conntrack_standalone_init);
> --
> 1.7.11.7
>

^ permalink raw reply

* Re: [PATCH 2/2] vhost: handle polling failure
From: Jason Wang @ 2012-12-28  4:29 UTC (permalink / raw)
  To: gaowanlong; +Cc: netdev, virtualization, linux-kernel, kvm, mst
In-Reply-To: <50DC1C8F.3020008@cn.fujitsu.com>

On 12/27/2012 06:01 PM, Wanlong Gao wrote:
> On 12/27/2012 02:39 PM, Jason Wang wrote:
>> > Currently, polling error were ignored in vhost. This may lead some issues (e.g
>> > kenrel crash when passing a tap fd to vhost before calling TUNSETIFF). Fix this
>> > by:
> Can this kernel crash be reproduced by hand?
>
> Thanks,
> Wanlong Gao
>
>> > 
Yes, it can be reproduced simply: open a tap fd but do not call
TUNSETIFF, then pass it to qemu and enable vhost.

^ permalink raw reply

* Re: [PATCH] bnx2x: use ARRAY_SIZE where possible
From: David Miller @ 2012-12-28  4:30 UTC (permalink / raw)
  To: eilong; +Cc: sasha.levin, netdev, linux-kernel
In-Reply-To: <1356252567.15507.1.camel@lb-tlvb-eilong.il.broadcom.com>

From: "Eilon Greenstein" <eilong@broadcom.com>
Date: Sun, 23 Dec 2012 10:49:27 +0200

> On Thu, 2012-12-20 at 14:11 -0500, Sasha Levin wrote:
>> Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
> 
> Acked-by Eilon Greenstein <eilong@broadcom.com>

Applied.

^ permalink raw reply

* net-next is OPEN
From: David Miller @ 2012-12-28  4:31 UTC (permalink / raw)
  To: netdev; +Cc: netfilter-devel, linux-wireless


The net-next tree is now open for submissions.

Thanks.

^ permalink raw reply

* Re: [PATCH 01/19] netfilter: move nf_conntrack initialize out of pernet operations
From: Eric W. Biederman @ 2012-12-28  4:48 UTC (permalink / raw)
  To: canqun zhang
  Cc: Gao feng, netfilter-devel, netdev@vger.kernel.org,
	Patrick McHardy, pablo
In-Reply-To: <CAFFEFTXT_fkF2pPSxDEEgic80NVWLqBWtFuvs6W9uDUW2aCnqw@mail.gmail.com>

canqun zhang <canqunzhang@gmail.com> writes:

> Hi all
> As discussed above,if the host machine create several linux
> containers, there will be several  net namespaces.Resources with "nf
> conntrack" are registered or unregistered on the first net
> namespace(init_net),But init_net is not unregistered lastly,so
> cleanuping other net namespaces  will triger painic.
> If net namespaces are created  with the order of 1,2,...n,they should
> be cleaned with the order of n,...2,1,so in this case init_net will be
> unregistered lastly.

No.  Network namespaces in general can be cleaned up in any order.

In particular you should never ever expect to see the order
n,n-1,n-2,...,2,1.

It may make sense to special case init_net in the cleanup order
but I would really rather not.

Now init_net is special and really should never be cleaned up
for non-modular code.  So it almost makes sense to special
case init_net.

Does anyone know why Alexy decided to do this only for init_net?

My inclination is that Gao Feng is on the right path by just removing
the strange init_net special case and performing the work once
per module load, and once per module unload.

> I fixed it up (see below). I have taken a lot of test!

Thank you.

It is nice to see that we have exposed this mis-assumption.

I am inclined to leave the order of this list as is so that
other assumptions of network namespace unregistration order
are exposed.

Unless there is a truly good reason to perform magic on init_net.

Eric

> diff -r 6a1a258923f5 -r 2667e89e6f50 net/core/net_namespace.c
> --- a/net/core/net_namespace.c  Fri Dec 28 11:01:17 2012 +0800
> +++ b/net/core/net_namespace.c  Fri Dec 28 11:05:12 2012 +0800
> @@ -450,7 +450,7 @@
>
>         list_del(&ops->list);
>         for_each_net(net)
> -               list_add_tail(&net->exit_list, &net_exit_list);
> +              list_add(&net->exit_list, &net_exit_list);
>         ops_exit_list(ops, &net_exit_list);
>         ops_free_list(ops, &net_exit_lis
>

^ permalink raw reply

* Re: [PATCH 1/2] vhost_net: correct error hanlding in vhost_net_set_backend()
From: Jason Wang @ 2012-12-28  4:58 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20121227130305.GE20595@redhat.com>

On 12/27/2012 09:03 PM, Michael S. Tsirkin wrote:
> On Thu, Dec 27, 2012 at 02:39:20PM +0800, Jason Wang wrote:
>> Currently, polling error were ignored in vhost. This may lead some issues (e.g
>> kenrel crash when passing a tap fd to vhost before calling TUNSETIFF). Fix this
>> by:
>>
>> - extend the idea of vhost_net_poll_state to all vhost_polls
>> - change the state only when polling is succeed
>> - make vhost_poll_start() report errors to the caller, which could be used
>>   caller or userspace.
> Maybe it could but this patch just ignores these errors.
> And it's not clear how would userspace handle these errors.

Not all were ignored; one example is vhost_net_enable_vq(), which could
be used to let userspace know the fd was not set up correctly.
> Also, since we have a reference on the fd, it would seem
> that once poll succeeds it can't fail in the future.

Right.
>
> So two other options would make more sense to me:
> - if vhost is bound to tun without SETIFF, fail this immediately
> - if vhost is bound to tun without SETIFF, defer polling
>   until SETIFF
>
> Option 1 would seem much easier to implement, I think it's
> preferable.

Option 1 seems better, since userspace may also disable a queue in the
meantime. Will add a vq_err() and break out of the loop when it fails to
start the polling.
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>  drivers/vhost/net.c   |   75 +++++++++++++++++--------------------------------
>>  drivers/vhost/vhost.c |   16 +++++++++-
>>  drivers/vhost/vhost.h |   11 ++++++-
>>  3 files changed, 50 insertions(+), 52 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 629d6b5..56e7f5a 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -64,20 +64,10 @@ enum {
>>  	VHOST_NET_VQ_MAX = 2,
>>  };
>>  
>> -enum vhost_net_poll_state {
>> -	VHOST_NET_POLL_DISABLED = 0,
>> -	VHOST_NET_POLL_STARTED = 1,
>> -	VHOST_NET_POLL_STOPPED = 2,
>> -};
>> -
>>  struct vhost_net {
>>  	struct vhost_dev dev;
>>  	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
>>  	struct vhost_poll poll[VHOST_NET_VQ_MAX];
>> -	/* Tells us whether we are polling a socket for TX.
>> -	 * We only do this when socket buffer fills up.
>> -	 * Protected by tx vq lock. */
>> -	enum vhost_net_poll_state tx_poll_state;
>>  	/* Number of TX recently submitted.
>>  	 * Protected by tx vq lock. */
>>  	unsigned tx_packets;
>> @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
>>  	}
>>  }
>>  
>> -/* Caller must have TX VQ lock */
>> -static void tx_poll_stop(struct vhost_net *net)
>> -{
>> -	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
>> -		return;
>> -	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
>> -	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
>> -}
>> -
>> -/* Caller must have TX VQ lock */
>> -static void tx_poll_start(struct vhost_net *net, struct socket *sock)
>> -{
>> -	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
>> -		return;
>> -	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
>> -	net->tx_poll_state = VHOST_NET_POLL_STARTED;
>> -}
>> -
>>  /* In case of DMA done not in order in lower device driver for some reason.
>>   * upend_idx is used to track end of used idx, done_idx is used to track head
>>   * of used idx. Once lower device DMA done contiguously, we will signal KVM
>> @@ -252,7 +224,7 @@ static void handle_tx(struct vhost_net *net)
>>  	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
>>  	if (wmem >= sock->sk->sk_sndbuf) {
>>  		mutex_lock(&vq->mutex);
>> -		tx_poll_start(net, sock);
>> +		vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
>>  		mutex_unlock(&vq->mutex);
>>  		return;
>>  	}
>> @@ -261,7 +233,7 @@ static void handle_tx(struct vhost_net *net)
>>  	vhost_disable_notify(&net->dev, vq);
>>  
>>  	if (wmem < sock->sk->sk_sndbuf / 2)
>> -		tx_poll_stop(net);
>> +		vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
>>  	hdr_size = vq->vhost_hlen;
>>  	zcopy = vq->ubufs;
>>  
>> @@ -283,7 +255,8 @@ static void handle_tx(struct vhost_net *net)
>>  
>>  			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
>>  			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
>> -				tx_poll_start(net, sock);
>> +				vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
>> +						 sock->file);
>>  				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>>  				break;
>>  			}
>> @@ -294,7 +267,8 @@ static void handle_tx(struct vhost_net *net)
>>  				    (vq->upend_idx - vq->done_idx) :
>>  				    (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
>>  			if (unlikely(num_pends > VHOST_MAX_PEND)) {
>> -				tx_poll_start(net, sock);
>> +				vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
>> +						 sock->file);
>>  				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>>  				break;
>>  			}
>> @@ -360,7 +334,8 @@ static void handle_tx(struct vhost_net *net)
>>  			}
>>  			vhost_discard_vq_desc(vq, 1);
>>  			if (err == -EAGAIN || err == -ENOBUFS)
>> -				tx_poll_start(net, sock);
>> +				vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
>> +						 sock->file);
>>  			break;
>>  		}
>>  		if (err != len)
>> @@ -623,7 +598,6 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>>  
>>  	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
>>  	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
>> -	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
>>  
>>  	f->private_data = n;
>>  
>> @@ -635,27 +609,26 @@ static void vhost_net_disable_vq(struct vhost_net *n,
>>  {
>>  	if (!vq->private_data)
>>  		return;
>> -	if (vq == n->vqs + VHOST_NET_VQ_TX) {
>> -		tx_poll_stop(n);
>> -		n->tx_poll_state = VHOST_NET_POLL_DISABLED;
>> -	} else
>> +	if (vq == n->vqs + VHOST_NET_VQ_TX)
>> +		vhost_poll_stop(n->poll + VHOST_NET_VQ_TX);
>> +	else
>>  		vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
>>  }
>>  
>> -static void vhost_net_enable_vq(struct vhost_net *n,
>> -				struct vhost_virtqueue *vq)
>> +static int vhost_net_enable_vq(struct vhost_net *n,
>> +			       struct vhost_virtqueue *vq)
>>  {
>> +	int err, index = vq - n->vqs;
>>  	struct socket *sock;
>>  
>>  	sock = rcu_dereference_protected(vq->private_data,
>>  					 lockdep_is_held(&vq->mutex));
>>  	if (!sock)
>> -		return;
>> -	if (vq == n->vqs + VHOST_NET_VQ_TX) {
>> -		n->tx_poll_state = VHOST_NET_POLL_STOPPED;
>> -		tx_poll_start(n, sock);
>> -	} else
>> -		vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
>> +		return 0;
>> +
>> +	n->poll[index].state = VHOST_POLL_STOPPED;
>> +	err = vhost_poll_start(n->poll + index, sock->file);
>> +	return err;
>>  }
>>  
>>  static struct socket *vhost_net_stop_vq(struct vhost_net *n,
>> @@ -831,12 +804,16 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>>  		vq->ubufs = ubufs;
>>  		vhost_net_disable_vq(n, vq);
>>  		rcu_assign_pointer(vq->private_data, sock);
>> -		vhost_net_enable_vq(n, vq);
>> +		r = vhost_net_enable_vq(n, vq);
>> +		if (r) {
>> +			sock = NULL;
>> +			goto err_enable;
>> +		}
>>  
>>  		r = vhost_init_used(vq);
>>  		if (r) {
>>  			sock = NULL;
>> -			goto err_used;
>> +			goto err_enable;
>>  		}
>>  
>>  		n->tx_packets = 0;
>> @@ -861,7 +838,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>>  	mutex_unlock(&n->dev.mutex);
>>  	return 0;
>>  
>> -err_used:
>> +err_enable:
>>  	if (oldubufs)
>>  		vhost_ubuf_put_and_wait(oldubufs);
>>  	if (oldsock)
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 34389f7..1cb2604 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -77,26 +77,36 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
>>  	init_poll_funcptr(&poll->table, vhost_poll_func);
>>  	poll->mask = mask;
>>  	poll->dev = dev;
>> +	poll->state = VHOST_POLL_DISABLED;
>>  
>>  	vhost_work_init(&poll->work, fn);
>>  }
>>  
>>  /* Start polling a file. We add ourselves to file's wait queue. The caller must
>>   * keep a reference to a file until after vhost_poll_stop is called. */
>> -void vhost_poll_start(struct vhost_poll *poll, struct file *file)
>> +int vhost_poll_start(struct vhost_poll *poll, struct file *file)
>>  {
>>  	unsigned long mask;
>> +	if (unlikely(poll->state != VHOST_POLL_STOPPED))
>> +		return 0;
>>  
>>  	mask = file->f_op->poll(file, &poll->table);
>> +	if (mask & POLLERR)
>> +		return -EINVAL;
>>  	if (mask)
>>  		vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
>> +	poll->state = VHOST_POLL_STARTED;
>> +	return 0;
>>  }
>>  
> Hmm, interesting. I note that tun has this:
>
>        if (tun->dev->reg_state != NETREG_REGISTERED)
>                 mask = POLLERR;
>
> So apparently we sometimes return POLLERR when poll
> did succeed, then test below wouldn't remove
> from wqh in this case. Maybe it's a bug in tun,
> need to look into this.

Looks like a bug in tun to me; POLLHUP seems better here.
>
>>  /* Stop polling a file. After this function returns, it becomes safe to drop the
>>   * file reference. You must also flush afterwards. */
>>  void vhost_poll_stop(struct vhost_poll *poll)
>>  {
>> +	if (likely(poll->state != VHOST_POLL_STARTED))
>> +		return;
>>  	remove_wait_queue(poll->wqh, &poll->wait);
>> +	poll->state = VHOST_POLL_STOPPED;
>>  }
>>  
>>  static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
>> @@ -791,8 +801,10 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
>>  	if (filep)
>>  		fput(filep);
>>  
>> -	if (pollstart && vq->handle_kick)
>> +	if (pollstart && vq->handle_kick) {
>> +		vq->poll.state = VHOST_POLL_STOPPED;
>>  		vhost_poll_start(&vq->poll, vq->kick);
>> +	}
>>  
>>  	mutex_unlock(&vq->mutex);
>>  
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 2639c58..98861d9 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -26,6 +26,12 @@ struct vhost_work {
>>  	unsigned		  done_seq;
>>  };
>>  
>> +enum vhost_poll_state {
>> +	VHOST_POLL_DISABLED = 0,
>> +	VHOST_POLL_STARTED = 1,
>> +	VHOST_POLL_STOPPED = 2,
>> +};
>> +
>>  /* Poll a file (eventfd or socket) */
>>  /* Note: there's nothing vhost specific about this structure. */
>>  struct vhost_poll {
>> @@ -35,6 +41,9 @@ struct vhost_poll {
>>  	struct vhost_work	  work;
>>  	unsigned long		  mask;
>>  	struct vhost_dev	 *dev;
>> +	/* Tells us whether we are polling a file.
>> +	 * Protected by tx vq lock. */
> tx vq lock does not make sense in this context.

Yes, thanks for pointing this out.
>> +	enum vhost_poll_state	  state;
>>  };
>>  
>>  void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
>> @@ -42,7 +51,7 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
>>  
>>  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
>>  		     unsigned long mask, struct vhost_dev *dev);
>> -void vhost_poll_start(struct vhost_poll *poll, struct file *file);
>> +int vhost_poll_start(struct vhost_poll *poll, struct file *file);
>>  void vhost_poll_stop(struct vhost_poll *poll);
>>  void vhost_poll_flush(struct vhost_poll *poll);
>>  void vhost_poll_queue(struct vhost_poll *poll);
>> -- 
>> 1.7.1

^ permalink raw reply

* Re: [PATCH 08/14] xen: netback: Remove redundant check on unsigned variable
From: Tushar Behera @ 2012-12-28  5:15 UTC (permalink / raw)
  To: Ian Campbell
  Cc: linux-kernel@vger.kernel.org, patches@linaro.org,
	xen-devel@lists.xensource.com, netdev@vger.kernel.org
In-Reply-To: <1353057394.3499.159.camel@zakaz.uk.xensource.com>

On 11/16/2012 02:46 PM, Ian Campbell wrote:
> On Fri, 2012-11-16 at 06:50 +0000, Tushar Behera wrote:
>> No need to check whether unsigned variable is less than 0.
>>
>> CC: Ian Campbell <ian.campbell@citrix.com>
>> CC: xen-devel@lists.xensource.com
>> CC: netdev@vger.kernel.org
>> Signed-off-by: Tushar Behera <tushar.behera@linaro.org>
> 
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> 
> Thanks.
> 

This patch was not picked up for 3.8-rc1. Any idea who should pick this up?

>> ---
>>  drivers/net/xen-netback/netback.c |    4 ++--
>>  1 files changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
>> index aab8677..515e10c 100644
>> --- a/drivers/net/xen-netback/netback.c
>> +++ b/drivers/net/xen-netback/netback.c
>> @@ -190,14 +190,14 @@ static int get_page_ext(struct page *pg,
>>  
>>  	group = ext.e.group - 1;
>>  
>> -	if (group < 0 || group >= xen_netbk_group_nr)
>> +	if (group >= xen_netbk_group_nr)
>>  		return 0;
>>  
>>  	netbk = &xen_netbk[group];
>>  
>>  	idx = ext.e.idx;
>>  
>> -	if ((idx < 0) || (idx >= MAX_PENDING_REQS))
>> +	if (idx >= MAX_PENDING_REQS)
>>  		return 0;
>>  
>>  	if (netbk->mmap_pages[idx] != pg)
> 
> 


-- 
Tushar Behera

^ permalink raw reply

* Re: [PATCH 10/14] atm: Removed redundant check on unsigned variable
From: Tushar Behera @ 2012-12-28  5:16 UTC (permalink / raw)
  To: linux-kernel; +Cc: patches, Chas Williams, linux-atm-general, netdev
In-Reply-To: <1353048646-10935-11-git-send-email-tushar.behera@linaro.org>

Ping.

On 11/16/2012 12:20 PM, Tushar Behera wrote:
> No need to check whether unsigned variable is less than 0.
> 
> CC: Chas Williams <chas@cmf.nrl.navy.mil>
> CC: linux-atm-general@lists.sourceforge.net
> CC: netdev@vger.kernel.org
> Signed-off-by: Tushar Behera <tushar.behera@linaro.org>
> ---
>  drivers/atm/fore200e.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
> index 361f5ae..fdd3fe7 100644
> --- a/drivers/atm/fore200e.c
> +++ b/drivers/atm/fore200e.c
> @@ -972,7 +972,7 @@ int bsq_audit(int where, struct host_bsq* bsq, int scheme, int magn)
>  		   where, scheme, magn, buffer->index, buffer->scheme);
>  	}
>  
> -	if ((buffer->index < 0) || (buffer->index >= fore200e_rx_buf_nbr[ scheme ][ magn ])) {
> +	if (buffer->index >= fore200e_rx_buf_nbr[ scheme ][ magn ]) {
>  	    printk(FORE200E "bsq_audit(%d): queue %d.%d, out of range buffer index = %ld !\n",
>  		   where, scheme, magn, buffer->index);
>  	}
> 


-- 
Tushar Behera

^ permalink raw reply

* Re: [PATCH 1/2] vhost_net: correct error hanlding in vhost_net_set_backend()
From: Jason Wang @ 2012-12-28  5:31 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20121227131442.GF20595@redhat.com>

On 12/27/2012 09:14 PM, Michael S. Tsirkin wrote:
> On Thu, Dec 27, 2012 at 02:39:19PM +0800, Jason Wang wrote:
>> Fix the leaking of oldubufs and fd refcnt when fail to initialized used ring.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>  drivers/vhost/net.c |   14 +++++++++++---
>>  1 files changed, 11 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index ebd08b2..629d6b5 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -834,8 +834,10 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>>  		vhost_net_enable_vq(n, vq);
>>  
>>  		r = vhost_init_used(vq);
>> -		if (r)
>> -			goto err_vq;
>> +		if (r) {
>> +			sock = NULL;
>> +			goto err_used;
>> +		}
>>  
>>  		n->tx_packets = 0;
>>  		n->tx_zcopy_err = 0;
>> @@ -859,8 +861,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>>  	mutex_unlock(&n->dev.mutex);
>>  	return 0;
>>  
>> +err_used:
>> +	if (oldubufs)
>> +		vhost_ubuf_put_and_wait(oldubufs);
>> +	if (oldsock)
>> +		fput(oldsock->file);
>>  err_ubufs:
>> -	fput(sock->file);
>> +	if (sock)
>> +		fput(sock->file);
>>  err_vq:
>>  	mutex_unlock(&vq->mutex);
>>  err:
> I think it's a real bug, but I don't see how the fix
> makes sense.
> We are returning an error, so we ideally
> revert to the state before the faulty
> operation. So this should put sock and ubufs,
> not oldsock/oldubufs.

Agree.
>
> The best way is probably to change
> vhost_init_used so that it gets private data
> pointer as a parameter.
>
> We can then call it before ubuf alloc.
> You can then add err_used right after err_ubufs
> with no extra logic.
>

Makes more sense, thanks.
>
>

^ permalink raw reply

* Re: [PATCH 01/19] netfilter: move nf_conntrack initialize out of pernet operations
From: canqun zhang @ 2012-12-28  5:32 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Gao feng, netfilter-devel, netdev@vger.kernel.org,
	Patrick McHardy, pablo
In-Reply-To: <87ip7mlr2r.fsf@xmission.com>

Yes, network namespaces in general can be cleaned up in any order, but
when doing "/etc/init.d/iptables restart", the system needs to clean up
all net namespaces, and init_net should be cleaned up last. init_net is
the first namespace, the other net namespaces are copied from it, and it
is responsible for initializing resources, so it is itself special.

2012/12/28 Eric W. Biederman <ebiederm@xmission.com>:
> canqun zhang <canqunzhang@gmail.com> writes:
>
>> Hi all
>> As discussed above,if the host machine create several linux
>> containers, there will be several  net namespaces.Resources with "nf
>> conntrack" are registered or unregistered on the first net
>> namespace(init_net),But init_net is not unregistered lastly,so
>> cleanuping other net namespaces  will triger painic.
>> If net namespaces are created  with the order of 1,2,...n,they should
>> be cleaned with the order of n,...2,1,so in this case init_net will be
>> unregistered lastly.
>
> No.  Network namespaces in general can be cleaned up in any order.
>
> In particular you should never ever expect to see the order
> n,n-1,n-2,...,2,1.
>
> It may make sense to special case init_net in the cleanup order
> but I would really rather not.
>
> Now init_net is special and really should never be cleaned up
> for non-modular code.  So it almost makes sense to special
> case init_net.
>
> Does anyone know why Alexy decided to do this only for init_net?
>
> My inclination is that Gao Feng is on the rigt path by just removing
> the strange init_net special case and performing the work once
> per module load, and once per module unload.
>
>> I fixed it up (see below). I have taken a lot of test!
>
> Thank you.
>
> It is nice to see that we have exposed this mis-assumption.
>
> I am inclined to leave the order of this list as is so that
> other assumptions of network namespace unregistration order
> are exposed.
>
> Unless there is a truly good reason to perform magic on init_net.
>
> Eric
>
>> diff -r 6a1a258923f5 -r 2667e89e6f50 net/core/net_namespace.c
>> --- a/net/core/net_namespace.c  Fri Dec 28 11:01:17 2012 +0800
>> +++ b/net/core/net_namespace.c  Fri Dec 28 11:05:12 2012 +0800
>> @@ -450,7 +450,7 @@
>>
>>         list_del(&ops->list);
>>         for_each_net(net)
>> -               list_add_tail(&net->exit_list, &net_exit_list);
>> +              list_add(&net->exit_list, &net_exit_list);
>>         ops_exit_list(ops, &net_exit_list);
>>         ops_free_list(ops, &net_exit_lis
>>
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: TUN problems (regression?)
From: Jason Wang @ 2012-12-28  5:43 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, Paul Moore, netdev
In-Reply-To: <20121227164106.078604a8@nehalam.linuxnetplumber.net>

On 12/28/2012 08:41 AM, Stephen Hemminger wrote:
> On Fri, 21 Dec 2012 12:26:56 +0800
> Jason Wang <jasowang@redhat.com> wrote:
>
>> On 12/21/2012 11:39 AM, Eric Dumazet wrote:
>>> On Fri, 2012-12-21 at 11:32 +0800, Jason Wang wrote:
>>>> On 12/21/2012 07:50 AM, Stephen Hemminger wrote:
>>>>> On Thu, 20 Dec 2012 15:38:17 -0800
>>>>> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>>
>>>>>> On Thu, 2012-12-20 at 18:16 -0500, Paul Moore wrote:
>>>>>>> [CC'ing netdev in case this is a known problem I just missed ...]
>>>>>>>
>>>>>>> Hi Jason,
>>>>>>>
>>>>>>> I started doing some more testing with the multiqueue TUN changes and I ran 
>>>>>>> into a problem when running tunctl: running it once w/o arguments works as 
>>>>>>> expected, but running it a second time results in failure and a 
>>>>>>> kmem_cache_sanity_check() failure.  The problem appears to be very repeatable 
>>>>>>> on my test VM and happens independent of the LSM/SELinux fixup patches.
>>>>>>>
>>>>>>> Have you seen this before?
>>>>>>>
>>>>>> Obviously code in tun_flow_init() is wrong...
>>>>>>
>>>>>> static int tun_flow_init(struct tun_struct *tun)
>>>>>> {
>>>>>>         int i;
>>>>>>
>>>>>>         tun->flow_cache = kmem_cache_create("tun_flow_cache",
>>>>>>                                             sizeof(struct tun_flow_entry), 0, 0,
>>>>>>                                             NULL);
>>>>>>         if (!tun->flow_cache)
>>>>>>                 return -ENOMEM;
>>>>>> ...
>>>>>> }
>>>>>>
>>>>>>
>>>>>> I have no idea why we would need a kmem_cache per tun_struct,
>>>>>> and why we even need a kmem_cache.
>>>>> Normally flow malloc/free should be good enough.
>>>>> It might make sense to use private kmem_cache if doing hlist_nulls.
>>>>>
>>>>>
>>>>> Acked-by: Stephen Hemminger <shemminger@vyatta.com>
>>>> Should be at least a global cache, I thought I can get some speed-up by
>>>> using kmem_cache.
>>>>
>>>> Acked-by: Jason Wang <jasowang@redhat.com>
>>> Was it with SLUB or SLAB ?
>>>
>>> Using generic kmalloc-64 is better than a dedicated kmem_cache of 48
>>> bytes per object, as we guarantee each object is on a single cache line.
>>>
>>>
>> Right, thanks for the explanation.
>>
> I wonder if TUN would be better if it used a array to translate
> receive hash to receive queue. This is how real hardware works with the
> indirection table, and it would allow RFS acceleration. The current flow
> cache stuff is prone to DoS attack and scaling problems with lots of
> short lived flows.

The problem of indirection table is hash collision which may even happen
when few flows existed.

For the RFS, we can open a API/ioctl for userspace to add or remove a
flow cache.

For the DoS/scaling issue, I have an idea of:
- limit the total number of flow entries in tun/tap
- only update the flow entry every N (say 20 like ixgbe) packets or when
the TCP packet has the SYN flag set
- I'm not sure skb_get_rxhash() is lightweight enough, or change to more
lightweight one?

Any suggestions?

Thanks

^ permalink raw reply

* Re: [PATCH 01/19] netfilter: move nf_conntrack initialize out of pernet operations
From: Eric W. Biederman @ 2012-12-28  6:00 UTC (permalink / raw)
  To: canqun zhang
  Cc: Gao feng, netfilter-devel, netdev@vger.kernel.org,
	Patrick McHardy, pablo
In-Reply-To: <CAFFEFTU8kxXV2pQ3B_goRs2Y7p2ecZ1YuSKSjfYF_58eD1tDqw@mail.gmail.com>

canqun zhang <canqunzhang@gmail.com> writes:

> yes，Network namespaces in general can be cleaned up in any order，but
> when doing /etc/ini.d/iptables restart, the system need cleaning up
> all net namespace,and init_net should be cleanup lastly.init_net is
> the first namespace,other net namespace is copied for it ,and it is
> diuty for Initializing resources,so It in itself is special.

"other net namespaces is copied for it"  I don't have a clue what
you mean by that.  Every network namespace starts out in a default
state not in a copied state.

Nowhere else in the network stack does &init_net have the duty
of initializing or cleaning up resources.

That /etc/init.d/iptables restart removes modules in general is a little
dubious.  That /etc/init.d/iptables restart removes modules when there
are other existing network namespaces using those modules is down right
dangerous.  Dangerous in the anyone can ssh into the machine way.  I
suspect it has taken 5 years for this bug to show up because it is so
idiotic to remove code that someone else is using.

I won't argue that making it so that &init_net is the last network
namespace to go will solve this problem.  But I can't see how adding
the guarantee that &init_net will always be cleaned up last is a good
long term solution.

Removing the init_net special case gives a simpler mental model, and
less to learn and maintain about network namespaces.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: TUN problems (regression?)
From: Stephen Hemminger @ 2012-12-28  6:25 UTC (permalink / raw)
  To: Jason Wang; +Cc: Eric Dumazet, Paul Moore, netdev
In-Reply-To: <50DD319A.5000708@redhat.com>

On Fri, 28 Dec 2012 13:43:54 +0800
Jason Wang <jasowang@redhat.com> wrote:

> On 12/28/2012 08:41 AM, Stephen Hemminger wrote:
> > On Fri, 21 Dec 2012 12:26:56 +0800
> > Jason Wang <jasowang@redhat.com> wrote:
> >
> >> On 12/21/2012 11:39 AM, Eric Dumazet wrote:
> >>> On Fri, 2012-12-21 at 11:32 +0800, Jason Wang wrote:
> >>>> On 12/21/2012 07:50 AM, Stephen Hemminger wrote:
> >>>>> On Thu, 20 Dec 2012 15:38:17 -0800
> >>>>> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >>>>>
> >>>>>> On Thu, 2012-12-20 at 18:16 -0500, Paul Moore wrote:
> >>>>>>> [CC'ing netdev in case this is a known problem I just missed ...]
> >>>>>>>
> >>>>>>> Hi Jason,
> >>>>>>>
> >>>>>>> I started doing some more testing with the multiqueue TUN changes and I ran 
> >>>>>>> into a problem when running tunctl: running it once w/o arguments works as 
> >>>>>>> expected, but running it a second time results in failure and a 
> >>>>>>> kmem_cache_sanity_check() failure.  The problem appears to be very repeatable 
> >>>>>>> on my test VM and happens independent of the LSM/SELinux fixup patches.
> >>>>>>>
> >>>>>>> Have you seen this before?
> >>>>>>>
> >>>>>> Obviously code in tun_flow_init() is wrong...
> >>>>>>
> >>>>>> static int tun_flow_init(struct tun_struct *tun)
> >>>>>> {
> >>>>>>         int i;
> >>>>>>
> >>>>>>         tun->flow_cache = kmem_cache_create("tun_flow_cache",
> >>>>>>                                             sizeof(struct tun_flow_entry), 0, 0,
> >>>>>>                                             NULL);
> >>>>>>         if (!tun->flow_cache)
> >>>>>>                 return -ENOMEM;
> >>>>>> ...
> >>>>>> }
> >>>>>>
> >>>>>>
> >>>>>> I have no idea why we would need a kmem_cache per tun_struct,
> >>>>>> and why we even need a kmem_cache.
> >>>>> Normally flow malloc/free should be good enough.
> >>>>> It might make sense to use private kmem_cache if doing hlist_nulls.
> >>>>>
> >>>>>
> >>>>> Acked-by: Stephen Hemminger <shemminger@vyatta.com>
> >>>> Should be at least a global cache, I thought I can get some speed-up by
> >>>> using kmem_cache.
> >>>>
> >>>> Acked-by: Jason Wang <jasowang@redhat.com>
> >>> Was it with SLUB or SLAB ?
> >>>
> >>> Using generic kmalloc-64 is better than a dedicated kmem_cache of 48
> >>> bytes per object, as we guarantee each object is on a single cache line.
> >>>
> >>>
> >> Right, thanks for the explanation.
> >>
> > I wonder if TUN would be better if it used a array to translate
> > receive hash to receive queue. This is how real hardware works with the
> > indirection table, and it would allow RFS acceleration. The current flow
> > cache stuff is prone to DoS attack and scaling problems with lots of
> > short lived flows.
> 
> The problem of indirection table is hash collision which may even happen
> when few flows existed.

Hash collision is fine; as long as the statistical average of
hash across queues is approximately equal it will be faster. A simple
array indirection is much faster than walking a hash table.

> For the RFS, we can open a API/ioctl for userspace to add or remove a
> flow cache.

RFS acceleration relies on programming the table. It is easier if
TUN looks more like hardware.

> For the DoS/scaling issue, I have an idea of:
> - limit the total number of flow entries in tun/tap
> - only update the flow entry every N (say 20 like ixgbe) packets or the
> the tcp packet has sync flag
> - I'm not sure skb_get_rxhash() is lightweight enough, or change to more
> lightweight one?

Ideally the hash should be programmable L2 vs L3, but that is splitting
hairs at this point.

Flow tables are a scaling problem, especially on highly loaded servers where
they are most needed.

^ permalink raw reply

* [PATCH net-next] bridge: respect RFC2863 operational state
From: Stephen Hemminger @ 2012-12-28  6:28 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

The bridge link detection should follow the operational state
of the lower device, rather than the carrier bit. This allows devices
like tunnels that are controlled by userspace control plane to work
with bridge STP link management.


Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/bridge/br_if.c	2012-10-25 09:11:15.627272524 -0700
+++ b/net/bridge/br_if.c	2012-12-14 08:58:14.329847361 -0800
@@ -66,14 +66,14 @@ void br_port_carrier_check(struct net_br
 	struct net_device *dev = p->dev;
 	struct net_bridge *br = p->br;
 
-	if (netif_running(dev) && netif_carrier_ok(dev))
+	if (netif_running(dev) && netif_oper_up(dev))
 		p->path_cost = port_cost(dev);
 
 	if (!netif_running(br->dev))
 		return;
 
 	spin_lock_bh(&br->lock);
-	if (netif_running(dev) && netif_carrier_ok(dev)) {
+	if (netif_running(dev) && netif_oper_up(dev)) {
 		if (p->state == BR_STATE_DISABLED)
 			br_stp_enable_port(p);
 	} else {
--- a/net/bridge/br_notify.c	2012-10-25 09:11:15.631272484 -0700
+++ b/net/bridge/br_notify.c	2012-12-14 08:57:36.954222724 -0800
@@ -82,7 +82,7 @@ static int br_device_event(struct notifi
 		break;
 
 	case NETDEV_UP:
-		if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) {
+		if (netif_running(br->dev) && netif_oper_up(dev)) {
 			spin_lock_bh(&br->lock);
 			br_stp_enable_port(p);
 			spin_unlock_bh(&br->lock);

^ permalink raw reply

* Re: ppoll() stuck on POLLIN while TCP peer is sending
From: Eric Wong @ 2012-12-28  7:06 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Andreas Voellmy, viro, linux-fsdevel, Junchang(Jason) Wang
In-Reply-To: <20121228014503.GA5017@dcvr.yhbt.net>

Eric Wong <normalperson@yhbt.net> wrote:
> I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> local TCP socket.  The isolated code below can reproduces the issue
> after many minutes (<1 hour).  It might be easier to reproduce on
> a busy system while disk I/O is happening.

Ugh, I can't seem to reproduce this anymore...  Will try something
else tomorrow.

^ permalink raw reply

* [PATCH v2 1/1 net-next] NET: FEC: dynamtic check DMA desc buff type
From: Frank Li @ 2012-12-28  6:29 UTC (permalink / raw)
  To: lznuaa, davem, s.hauer; +Cc: linux-arm-kernel, shawn.guo, netdev, Frank Li

MX6 and MX28 support the enhanced DMA descriptor buffer needed for 1588
PTP, but MX25, MX3x and MX5x can't support the enhanced DMA descriptor buffer.
Check the FEC type and choose the correct DMA descriptor buffer type.

Remove static config CONFIG_FEC_PTP.
ptp function will be auto detected.

Signed-off-by: Frank Li <Frank.Li@freescale.com>
---
change from v1->v2:
 1. remove CONFIG_FEC_PTP
 2. fix code style issue. pass checkpatch
 3. fix issue "return bdp++", return value is not added
 4. remove unnecessary module parameter fec_ptp_enable
 5. change get_nextdesc to fec_enet_get_nextdesc

 drivers/net/ethernet/freescale/Kconfig  |    9 +--
 drivers/net/ethernet/freescale/Makefile |    3 +-
 drivers/net/ethernet/freescale/fec.c    |  175 +++++++++++++++++++------------
 drivers/net/ethernet/freescale/fec.h    |   16 +--
 4 files changed, 116 insertions(+), 87 deletions(-)

diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index ec490d7..6048dc8 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -26,6 +26,7 @@ config FEC
 		   ARCH_MXC || SOC_IMX28)
 	default ARCH_MXC || SOC_IMX28 if ARM
 	select PHYLIB
+	select PTP_1588_CLOCK
 	---help---
 	  Say Y here if you want to use the built-in 10/100 Fast ethernet
 	  controller on some Motorola ColdFire and Freescale i.MX processors.
@@ -92,12 +93,4 @@ config GIANFAR
 	  This driver supports the Gigabit TSEC on the MPC83xx, MPC85xx,
 	  and MPC86xx family of chips, and the FEC on the 8540.
 
-config FEC_PTP
-	bool "PTP Hardware Clock (PHC)"
-	depends on FEC && ARCH_MXC && !SOC_IMX25 && !SOC_IMX27 && !SOC_IMX35 && !SOC_IMX5
-	select PTP_1588_CLOCK
-	--help---
-	  Say Y here if you want to use PTP Hardware Clock (PHC) in the
-	  driver.  Only the basic clock operations have been implemented.
-
 endif # NET_VENDOR_FREESCALE
diff --git a/drivers/net/ethernet/freescale/Makefile b/drivers/net/ethernet/freescale/Makefile
index d4d19b3..b7d58fe 100644
--- a/drivers/net/ethernet/freescale/Makefile
+++ b/drivers/net/ethernet/freescale/Makefile
@@ -2,8 +2,7 @@
 # Makefile for the Freescale network device drivers.
 #
 
-obj-$(CONFIG_FEC) += fec.o
-obj-$(CONFIG_FEC_PTP) += fec_ptp.o
+obj-$(CONFIG_FEC) += fec.o fec_ptp.o
 obj-$(CONFIG_FEC_MPC52xx) += fec_mpc52xx.o
 ifeq ($(CONFIG_FEC_MPC52xx_MDIO),y)
 	obj-$(CONFIG_FEC_MPC52xx) += fec_mpc52xx_phy.o
diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c
index 0704bca..290f91c 100644
--- a/drivers/net/ethernet/freescale/fec.c
+++ b/drivers/net/ethernet/freescale/fec.c
@@ -76,6 +76,8 @@
 #define FEC_QUIRK_USE_GASKET		(1 << 2)
 /* Controller has GBIT support */
 #define FEC_QUIRK_HAS_GBIT		(1 << 3)
+/* Controller has extend desc buffer */
+#define FEC_QUICK_HAS_BUFDESC_EX	(1 << 4)
 
 static struct platform_device_id fec_devtype[] = {
 	{
@@ -93,7 +95,8 @@ static struct platform_device_id fec_devtype[] = {
 		.driver_data = FEC_QUIRK_ENET_MAC | FEC_QUIRK_SWAP_FRAME,
 	}, {
 		.name = "imx6q-fec",
-		.driver_data = FEC_QUIRK_ENET_MAC | FEC_QUIRK_HAS_GBIT,
+		.driver_data = FEC_QUIRK_ENET_MAC | FEC_QUIRK_HAS_GBIT |
+				FEC_QUICK_HAS_BUFDESC_EX,
 	}, {
 		/* sentinel */
 	}
@@ -140,7 +143,7 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address");
 #endif
 #endif /* CONFIG_M5272 */
 
-#if (((RX_RING_SIZE + TX_RING_SIZE) * 8) > PAGE_SIZE)
+#if (((RX_RING_SIZE + TX_RING_SIZE) * 32) > PAGE_SIZE)
 #error "FEC: descriptor ring size constants too large"
 #endif
 
@@ -192,6 +195,24 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address");
 
 static int mii_cnt;
 
+static struct bufdesc *fec_enet_get_nextdesc(struct bufdesc *bdp, int is_ex)
+{
+	struct bufdesc_ex *ex = (struct bufdesc_ex *)bdp;
+	if (is_ex)
+		return (struct bufdesc *)(ex + 1);
+	else
+		return bdp + 1;
+}
+
+static struct bufdesc *fec_enet_get_prevdesc(struct bufdesc *bdp, int is_ex)
+{
+	struct bufdesc_ex *ex = (struct bufdesc_ex *)bdp;
+	if (is_ex)
+		return (struct bufdesc *)(ex - 1);
+	else
+		return bdp - 1;
+}
+
 static void *swap_buffer(void *bufaddr, int len)
 {
 	int i;
@@ -248,7 +269,11 @@ fec_enet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	 */
 	if (((unsigned long) bufaddr) & FEC_ALIGNMENT) {
 		unsigned int index;
-		index = bdp - fep->tx_bd_base;
+		if (fep->bufdesc_ex)
+			index = (struct bufdesc_ex *)bdp -
+				(struct bufdesc_ex *)fep->tx_bd_base;
+		else
+			index = bdp - fep->tx_bd_base;
 		memcpy(fep->tx_bounce[index], skb->data, skb->len);
 		bufaddr = fep->tx_bounce[index];
 	}
@@ -280,17 +305,19 @@ fec_enet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 			| BD_ENET_TX_LAST | BD_ENET_TX_TC);
 	bdp->cbd_sc = status;
 
-#ifdef CONFIG_FEC_PTP
-	bdp->cbd_bdu = 0;
-	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
+	if (fep->bufdesc_ex) {
+
+		struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+		ebdp->cbd_bdu = 0;
+		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP &&
 			fep->hwts_tx_en)) {
-			bdp->cbd_esc = (BD_ENET_TX_TS | BD_ENET_TX_INT);
+			ebdp->cbd_esc = (BD_ENET_TX_TS | BD_ENET_TX_INT);
 			skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-	} else {
+		} else {
 
-		bdp->cbd_esc = BD_ENET_TX_INT;
+			ebdp->cbd_esc = BD_ENET_TX_INT;
+		}
 	}
-#endif
 	/* Trigger transmission start */
 	writel(0, fep->hwp + FEC_X_DES_ACTIVE);
 
@@ -298,7 +325,7 @@ fec_enet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	if (status & BD_ENET_TX_WRAP)
 		bdp = fep->tx_bd_base;
 	else
-		bdp++;
+		bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 
 	if (bdp == fep->dirty_tx) {
 		fep->tx_full = 1;
@@ -359,8 +386,12 @@ fec_restart(struct net_device *ndev, int duplex)
 
 	/* Set receive and transmit descriptor base. */
 	writel(fep->bd_dma, fep->hwp + FEC_R_DES_START);
-	writel((unsigned long)fep->bd_dma + sizeof(struct bufdesc) * RX_RING_SIZE,
-			fep->hwp + FEC_X_DES_START);
+	if (fep->bufdesc_ex)
+		writel((unsigned long)fep->bd_dma + sizeof(struct bufdesc_ex)
+			* RX_RING_SIZE, fep->hwp + FEC_X_DES_START);
+	else
+		writel((unsigned long)fep->bd_dma + sizeof(struct bufdesc)
+			* RX_RING_SIZE,	fep->hwp + FEC_X_DES_START);
 
 	fep->dirty_tx = fep->cur_tx = fep->tx_bd_base;
 	fep->cur_rx = fep->rx_bd_base;
@@ -448,17 +479,16 @@ fec_restart(struct net_device *ndev, int duplex)
 		writel(1 << 8, fep->hwp + FEC_X_WMRK);
 	}
 
-#ifdef CONFIG_FEC_PTP
-	ecntl |= (1 << 4);
-#endif
+	if (fep->bufdesc_ex)
+		ecntl |= (1 << 4);
 
 	/* And last, enable the transmit and receive processing */
 	writel(ecntl, fep->hwp + FEC_ECNTRL);
 	writel(0, fep->hwp + FEC_R_DES_ACTIVE);
 
-#ifdef CONFIG_FEC_PTP
-	fec_ptp_start_cyclecounter(ndev);
-#endif
+	if (fep->bufdesc_ex)
+		fec_ptp_start_cyclecounter(ndev);
+
 	/* Enable interrupts we wish to service */
 	writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
 }
@@ -544,19 +574,20 @@ fec_enet_tx(struct net_device *ndev)
 			ndev->stats.tx_packets++;
 		}
 
-#ifdef CONFIG_FEC_PTP
-		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) {
+		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) &&
+			fep->bufdesc_ex) {
 			struct skb_shared_hwtstamps shhwtstamps;
 			unsigned long flags;
+			struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
 
 			memset(&shhwtstamps, 0, sizeof(shhwtstamps));
 			spin_lock_irqsave(&fep->tmreg_lock, flags);
 			shhwtstamps.hwtstamp = ns_to_ktime(
-				timecounter_cyc2time(&fep->tc, bdp->ts));
+				timecounter_cyc2time(&fep->tc, ebdp->ts));
 			spin_unlock_irqrestore(&fep->tmreg_lock, flags);
 			skb_tstamp_tx(skb, &shhwtstamps);
 		}
-#endif
+
 		if (status & BD_ENET_TX_READY)
 			printk("HEY! Enet xmit interrupt and TX_READY.\n");
 
@@ -575,7 +606,7 @@ fec_enet_tx(struct net_device *ndev)
 		if (status & BD_ENET_TX_WRAP)
 			bdp = fep->tx_bd_base;
 		else
-			bdp++;
+			bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 
 		/* Since we have freed up a buffer, the ring is no longer full
 		 */
@@ -683,21 +714,23 @@ fec_enet_rx(struct net_device *ndev)
 			skb_put(skb, pkt_len - 4);	/* Make room */
 			skb_copy_to_linear_data(skb, data, pkt_len - 4);
 			skb->protocol = eth_type_trans(skb, ndev);
-#ifdef CONFIG_FEC_PTP
+
 			/* Get receive timestamp from the skb */
-			if (fep->hwts_rx_en) {
+			if (fep->hwts_rx_en && fep->bufdesc_ex) {
 				struct skb_shared_hwtstamps *shhwtstamps =
 							    skb_hwtstamps(skb);
 				unsigned long flags;
+				struct bufdesc_ex *ebdp =
+					(struct bufdesc_ex *)bdp;
 
 				memset(shhwtstamps, 0, sizeof(*shhwtstamps));
 
 				spin_lock_irqsave(&fep->tmreg_lock, flags);
 				shhwtstamps->hwtstamp = ns_to_ktime(
-				    timecounter_cyc2time(&fep->tc, bdp->ts));
+				    timecounter_cyc2time(&fep->tc, ebdp->ts));
 				spin_unlock_irqrestore(&fep->tmreg_lock, flags);
 			}
-#endif
+
 			if (!skb_defer_rx_timestamp(skb))
 				netif_rx(skb);
 		}
@@ -712,17 +745,19 @@ rx_processing_done:
 		status |= BD_ENET_RX_EMPTY;
 		bdp->cbd_sc = status;
 
-#ifdef CONFIG_FEC_PTP
-		bdp->cbd_esc = BD_ENET_RX_INT;
-		bdp->cbd_prot = 0;
-		bdp->cbd_bdu = 0;
-#endif
+		if (fep->bufdesc_ex) {
+			struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+
+			ebdp->cbd_esc = BD_ENET_RX_INT;
+			ebdp->cbd_prot = 0;
+			ebdp->cbd_bdu = 0;
+		}
 
 		/* Update BD pointer to next entry */
 		if (status & BD_ENET_RX_WRAP)
 			bdp = fep->rx_bd_base;
 		else
-			bdp++;
+			bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 		/* Doing this here will keep the FEC running while we process
 		 * incoming frames.  On a heavily loaded network, we should be
 		 * able to keep up at the expense of system resources.
@@ -1157,10 +1192,9 @@ static int fec_enet_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 	if (!phydev)
 		return -ENODEV;
 
-#ifdef CONFIG_FEC_PTP
-	if (cmd == SIOCSHWTSTAMP)
+	if (cmd == SIOCSHWTSTAMP && fep->bufdesc_ex)
 		return fec_ptp_ioctl(ndev, rq, cmd);
-#endif
+
 	return phy_mii_ioctl(phydev, rq, cmd);
 }
 
@@ -1180,7 +1214,7 @@ static void fec_enet_free_buffers(struct net_device *ndev)
 					FEC_ENET_RX_FRSIZE, DMA_FROM_DEVICE);
 		if (skb)
 			dev_kfree_skb(skb);
-		bdp++;
+		bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 	}
 
 	bdp = fep->tx_bd_base;
@@ -1207,14 +1241,17 @@ static int fec_enet_alloc_buffers(struct net_device *ndev)
 		bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, skb->data,
 				FEC_ENET_RX_FRSIZE, DMA_FROM_DEVICE);
 		bdp->cbd_sc = BD_ENET_RX_EMPTY;
-#ifdef CONFIG_FEC_PTP
-		bdp->cbd_esc = BD_ENET_RX_INT;
-#endif
-		bdp++;
+
+		if (fep->bufdesc_ex) {
+			struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+			ebdp->cbd_esc = BD_ENET_RX_INT;
+		}
+
+		bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 	}
 
 	/* Set the last buffer to wrap. */
-	bdp--;
+	bdp = fec_enet_get_prevdesc(bdp, fep->bufdesc_ex);
 	bdp->cbd_sc |= BD_SC_WRAP;
 
 	bdp = fep->tx_bd_base;
@@ -1224,14 +1261,16 @@ static int fec_enet_alloc_buffers(struct net_device *ndev)
 		bdp->cbd_sc = 0;
 		bdp->cbd_bufaddr = 0;
 
-#ifdef CONFIG_FEC_PTP
-		bdp->cbd_esc = BD_ENET_RX_INT;
-#endif
-		bdp++;
+		if (fep->bufdesc_ex) {
+			struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
+			ebdp->cbd_esc = BD_ENET_RX_INT;
+		}
+
+		bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 	}
 
 	/* Set the last buffer to wrap. */
-	bdp--;
+	bdp = fec_enet_get_prevdesc(bdp, fep->bufdesc_ex);
 	bdp->cbd_sc |= BD_SC_WRAP;
 
 	return 0;
@@ -1444,7 +1483,11 @@ static int fec_enet_init(struct net_device *ndev)
 
 	/* Set receive and transmit descriptor base. */
 	fep->rx_bd_base = cbd_base;
-	fep->tx_bd_base = cbd_base + RX_RING_SIZE;
+	if (fep->bufdesc_ex)
+		fep->tx_bd_base = (struct bufdesc *)
+			(((struct bufdesc_ex *)cbd_base) + RX_RING_SIZE);
+	else
+		fep->tx_bd_base = cbd_base + RX_RING_SIZE;
 
 	/* The FEC Ethernet specific entries in the device structure */
 	ndev->watchdog_timeo = TX_TIMEOUT;
@@ -1457,11 +1500,11 @@ static int fec_enet_init(struct net_device *ndev)
 
 		/* Initialize the BD for every fragment in the page. */
 		bdp->cbd_sc = 0;
-		bdp++;
+		bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 	}
 
 	/* Set the last buffer to wrap */
-	bdp--;
+	bdp = fec_enet_get_prevdesc(bdp, fep->bufdesc_ex);
 	bdp->cbd_sc |= BD_SC_WRAP;
 
 	/* ...and the same for transmit */
@@ -1471,11 +1514,11 @@ static int fec_enet_init(struct net_device *ndev)
 		/* Initialize the BD for every fragment in the page. */
 		bdp->cbd_sc = 0;
 		bdp->cbd_bufaddr = 0;
-		bdp++;
+		bdp = fec_enet_get_nextdesc(bdp, fep->bufdesc_ex);
 	}
 
 	/* Set the last buffer to wrap */
-	bdp--;
+	bdp = fec_enet_get_prevdesc(bdp, fep->bufdesc_ex);
 	bdp->cbd_sc |= BD_SC_WRAP;
 
 	fec_restart(ndev, 0);
@@ -1574,6 +1617,8 @@ fec_probe(struct platform_device *pdev)
 	fep->pdev = pdev;
 	fep->dev_id = dev_id++;
 
+	fep->bufdesc_ex = 0;
+
 	if (!fep->hwp) {
 		ret = -ENOMEM;
 		goto failed_ioremap;
@@ -1628,19 +1673,19 @@ fec_probe(struct platform_device *pdev)
 		goto failed_clk;
 	}
 
-#ifdef CONFIG_FEC_PTP
 	fep->clk_ptp = devm_clk_get(&pdev->dev, "ptp");
+	fep->bufdesc_ex =
+		pdev->id_entry->driver_data & FEC_QUICK_HAS_BUFDESC_EX;
 	if (IS_ERR(fep->clk_ptp)) {
 		ret = PTR_ERR(fep->clk_ptp);
-		goto failed_clk;
+		fep->bufdesc_ex = 0;
 	}
-#endif
 
 	clk_prepare_enable(fep->clk_ahb);
 	clk_prepare_enable(fep->clk_ipg);
-#ifdef CONFIG_FEC_PTP
-	clk_prepare_enable(fep->clk_ptp);
-#endif
+	if (!IS_ERR(fep->clk_ptp))
+		clk_prepare_enable(fep->clk_ptp);
+
 	reg_phy = devm_regulator_get(&pdev->dev, "phy");
 	if (!IS_ERR(reg_phy)) {
 		ret = regulator_enable(reg_phy);
@@ -1668,9 +1713,8 @@ fec_probe(struct platform_device *pdev)
 	if (ret)
 		goto failed_register;
 
-#ifdef CONFIG_FEC_PTP
-	fec_ptp_init(ndev, pdev);
-#endif
+	if (fep->bufdesc_ex)
+		fec_ptp_init(ndev, pdev);
 
 	return 0;
 
@@ -1681,9 +1725,8 @@ failed_init:
 failed_regulator:
 	clk_disable_unprepare(fep->clk_ahb);
 	clk_disable_unprepare(fep->clk_ipg);
-#ifdef CONFIG_FEC_PTP
-	clk_disable_unprepare(fep->clk_ptp);
-#endif
+	if (!IS_ERR(fep->clk_ptp))
+		clk_disable_unprepare(fep->clk_ptp);
 failed_pin:
 failed_clk:
 	for (i = 0; i < FEC_IRQ_NUM; i++) {
@@ -1716,12 +1759,10 @@ fec_drv_remove(struct platform_device *pdev)
 		if (irq > 0)
 			free_irq(irq, ndev);
 	}
-#ifdef CONFIG_FEC_PTP
 	del_timer_sync(&fep->time_keep);
 	clk_disable_unprepare(fep->clk_ptp);
 	if (fep->ptp_clock)
 		ptp_clock_unregister(fep->ptp_clock);
-#endif
 	clk_disable_unprepare(fep->clk_ahb);
 	clk_disable_unprepare(fep->clk_ipg);
 	iounmap(fep->hwp);
diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index c5a3bc1..4862394 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -13,11 +13,9 @@
 #define	FEC_H
 /****************************************************************************/
 
-#ifdef CONFIG_FEC_PTP
 #include <linux/clocksource.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
-#endif
 
 #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \
     defined(CONFIG_M520x) || defined(CONFIG_M532x) || \
@@ -94,14 +92,17 @@ struct bufdesc {
 	unsigned short cbd_datlen;	/* Data length */
 	unsigned short cbd_sc;	/* Control and status info */
 	unsigned long cbd_bufaddr;	/* Buffer address */
-#ifdef CONFIG_FEC_PTP
+};
+
+struct bufdesc_ex {
+	struct bufdesc desc;
 	unsigned long cbd_esc;
 	unsigned long cbd_prot;
 	unsigned long cbd_bdu;
 	unsigned long ts;
 	unsigned short res0[4];
-#endif
 };
+
 #else
 struct bufdesc {
 	unsigned short	cbd_sc;			/* Control and status info */
@@ -203,9 +204,7 @@ struct fec_enet_private {
 
 	struct clk *clk_ipg;
 	struct clk *clk_ahb;
-#ifdef CONFIG_FEC_PTP
 	struct clk *clk_ptp;
-#endif
 
 	/* The saved address of a sent-in-place packet/buffer, for skfree(). */
 	unsigned char *tx_bounce[TX_RING_SIZE];
@@ -243,8 +242,8 @@ struct fec_enet_private {
 	int	full_duplex;
 	struct	completion mdio_done;
 	int	irq[FEC_IRQ_NUM];
+	int	bufdesc_ex;
 
-#ifdef CONFIG_FEC_PTP
 	struct ptp_clock *ptp_clock;
 	struct ptp_clock_info ptp_caps;
 	unsigned long last_overflow_check;
@@ -257,15 +256,12 @@ struct fec_enet_private {
 	int hwts_rx_en;
 	int hwts_tx_en;
 	struct timer_list time_keep;
-#endif
 
 };
 
-#ifdef CONFIG_FEC_PTP
 void fec_ptp_init(struct net_device *ndev, struct platform_device *pdev);
 void fec_ptp_start_cyclecounter(struct net_device *ndev);
 int fec_ptp_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd);
-#endif
 
 /****************************************************************************/
 #endif /* FEC_H */
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH 01/19] netfilter: move nf_conntrack initialize out of pernet operations
From: Gao feng @ 2012-12-28  7:16 UTC (permalink / raw)
  To: canqun zhang
  Cc: netfilter-devel, netdev@vger.kernel.org, Patrick McHardy, pablo,
	ebiederm
In-Reply-To: <CAFFEFTXT_fkF2pPSxDEEgic80NVWLqBWtFuvs6W9uDUW2aCnqw@mail.gmail.com>

On 12/28/12 11:52, canqun zhang wrote:
> Hi all
> As discussed above,if the host machine create several linux
> containers, there will be several  net namespaces.Resources with "nf
> conntrack" are registered or unregistered on the first net
> namespace(init_net),But init_net is not unregistered lastly,so
> cleanuping other net namespaces  will triger painic.
> If net namespaces are created  with the order of 1,2,...n,they should
> be cleaned with the order of n,...2,1,so in this case init_net will be
> unregistered lastly.
> I fixed it up (see below). I have taken a lot of test!
> 

I think this bug is a netfilter bug, not a netns bug.
Other subsystems that implement netns support don't use init_net to
do special work ((un)register/(un)set).

In fact, we can't use init_net to do this job well. For example, in
nf_conntrack_clean we should set ip_ct_attach to NULL before any
netns does its cleanup, and set nf_ct_destroy to NULL after all
netns have finished their cleanup.

So I think finally we still need this patchset,And this is a regular
way to fix this problem.

Can you help me to test if the panic bug is fixed by this patchset?
and then give me your tested-by?

thank you very much!

^ permalink raw reply

* [PATCH net-next] xfrm: removes a superfluous check and add a statistic
From: roy.qing.li @ 2012-12-28  8:06 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com>

Remove the check of whether x->km.state equals XFRM_STATE_VALID from
xfrm_state_check_expire(); that check will be done before calling
xfrm_state_check_expire().

add a LINUX_MIB_XFRMOUTSTATEINVALID statistic to record the
outbound error due to invalid xfrm state.

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
 include/uapi/linux/snmp.h |    1 +
 net/xfrm/xfrm_output.c    |    6 ++++++
 net/xfrm/xfrm_proc.c      |    1 +
 net/xfrm/xfrm_state.c     |    3 ---
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index fdfba23..b49eab8 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -278,6 +278,7 @@ enum
 	LINUX_MIB_XFRMOUTPOLDEAD,		/* XfrmOutPolDead */
 	LINUX_MIB_XFRMOUTPOLERROR,		/* XfrmOutPolError */
 	LINUX_MIB_XFRMFWDHDRERROR,		/* XfrmFwdHdrError*/
+	LINUX_MIB_XFRMOUTSTATEINVALID,		/* XfrmOutStateInvalid */
 	__LINUX_MIB_XFRMMAX
 };
 
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 95a338c..3670526 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -61,6 +61,12 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
 		}
 
 		spin_lock_bh(&x->lock);
+
+		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID);
+			goto error_nolock;
+		}
+
 		err = xfrm_state_check_expire(x);
 		if (err) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEEXPIRED);
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index d0a1af8..6039038 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -43,6 +43,7 @@ static const struct snmp_mib xfrm_mib_list[] = {
 	SNMP_MIB_ITEM("XfrmOutPolDead", LINUX_MIB_XFRMOUTPOLDEAD),
 	SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR),
 	SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR),
+	SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 3459692..05db236 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1370,9 +1370,6 @@ int xfrm_state_check_expire(struct xfrm_state *x)
 	if (!x->curlft.use_time)
 		x->curlft.use_time = get_seconds();
 
-	if (x->km.state != XFRM_STATE_VALID)
-		return -EINVAL;
-
 	if (x->curlft.bytes >= x->lft.hard_byte_limit ||
 	    x->curlft.packets >= x->lft.hard_packet_limit) {
 		x->km.state = XFRM_STATE_EXPIRED;
-- 
1.7.10.4

^ permalink raw reply related

* [RFC PATCH] ah4/esp4: set transport header correctly for IPsec tunnel mode.
From: roy.qing.li @ 2012-12-28  8:07 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com>

IPsec tunnel does not set ECN field to CE in inner header when
the ECN field in the outer header is CE, and the ECN field in
the inner header is ECT(0) or ECT(1).

The cause is that ipip_hdr() does not return the correct address of the
inner header, since skb->transport_header does not point to the inner
header after esp_input_done2() or ah_input().

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
I know of this bug, but I have no lab in which to verify that my patch
is correct. I hope the netdev experts can inspect this patch carefully;
if it is accepted, I will make the same fix for ah6/esp6.

 net/ipv4/ah4.c  |   11 +++++++++--
 net/ipv4/esp4.c |    5 ++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a0d8392..a154d0a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -269,7 +269,11 @@ static void ah_input_done(struct crypto_async_request *base, int err)
 	skb->network_header += ah_hlen;
 	memcpy(skb_network_header(skb), work_iph, ihl);
 	__skb_pull(skb, ah_hlen + ihl);
-	skb_set_transport_header(skb, -ihl);
+
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		skb_reset_transport_header(skb);
+	else
+		skb_set_transport_header(skb, -ihl);
 out:
 	kfree(AH_SKB_CB(skb)->tmp);
 	xfrm_input_resume(skb, err);
@@ -381,7 +385,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	skb->network_header += ah_hlen;
 	memcpy(skb_network_header(skb), work_iph, ihl);
 	__skb_pull(skb, ah_hlen + ihl);
-	skb_set_transport_header(skb, -ihl);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		skb_reset_transport_header(skb);
+	else
+		skb_set_transport_header(skb, -ihl);
 
 	err = nexthdr;
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b61e9de..fd26ff4 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -346,7 +346,10 @@ static int esp_input_done2(struct sk_buff *skb, int err)
 
 	pskb_trim(skb, skb->len - alen - padlen - 2);
 	__skb_pull(skb, hlen);
-	skb_set_transport_header(skb, -ihl);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		skb_reset_transport_header(skb);
+	else
+		skb_set_transport_header(skb, -ihl);
 
 	err = nexthdr[1];
 
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH net-next] bridge: respect RFC2863 operational state
From: Jiri Pirko @ 2012-12-28  8:40 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev
In-Reply-To: <20121227222854.6ec132dd@nehalam.linuxnetplumber.net>

Fri, Dec 28, 2012 at 07:28:54AM CET, shemminger@vyatta.com wrote:
>The bridge link detection should follow the operational state
>of the lower device, rather than the carrier bit. This allows devices
>like tunnels that are controlled by userspace control plane to work
>with bridge STP link management.
>
>
>Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
>
>--- a/net/bridge/br_if.c	2012-10-25 09:11:15.627272524 -0700
>+++ b/net/bridge/br_if.c	2012-12-14 08:58:14.329847361 -0800
>@@ -66,14 +66,14 @@ void br_port_carrier_check(struct net_br
> 	struct net_device *dev = p->dev;
> 	struct net_bridge *br = p->br;
> 
>-	if (netif_running(dev) && netif_carrier_ok(dev))
>+	if (netif_running(dev) && netif_oper_up(dev))
> 		p->path_cost = port_cost(dev);
> 
> 	if (!netif_running(br->dev))
> 		return;
> 
> 	spin_lock_bh(&br->lock);
>-	if (netif_running(dev) && netif_carrier_ok(dev)) {
>+	if (netif_running(dev) && netif_oper_up(dev))
> 		if (p->state == BR_STATE_DISABLED)
> 			br_stp_enable_port(p);
> 	} else {
>--- a/net/bridge/br_notify.c	2012-10-25 09:11:15.631272484 -0700
>+++ b/net/bridge/br_notify.c	2012-12-14 08:57:36.954222724 -0800
>@@ -82,7 +82,7 @@ static int br_device_event(struct notifi
> 		break;
> 
> 	case NETDEV_UP:
>-		if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) {
>+		if (netif_running(br->dev) && netif_oper_up(dev)) {
> 			spin_lock_bh(&br->lock);
> 			br_stp_enable_port(p);
> 			spin_unlock_bh(&br->lock);
>--
>To unsubscribe from this list: send the line "unsubscribe netdev" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html


Reviewed-by: Jiri Pirko <jiri@resnulli.us>

^ permalink raw reply

* Re: [PATCH 01/19] netfilter: move nf_conntrack initialize out of pernet operations
From: canqun zhang @ 2012-12-28  8:48 UTC (permalink / raw)
  To: Gao feng
  Cc: netfilter-devel, netdev@vger.kernel.org, Patrick McHardy, pablo,
	Eric W. Biederman
In-Reply-To: <50DD4737.2070306@cn.fujitsu.com>

OK, I can help you test it. Please send a single big patch containing
this patchset to my email.


2012/12/28 Gao feng <gaofeng@cn.fujitsu.com>:
> On 12/28/12 11:52, canqun zhang wrote:
>> Hi all
>> As discussed above,if the host machine create several linux
>> containers, there will be several  net namespaces.Resources with "nf
>> conntrack" are registered or unregistered on the first net
>> namespace(init_net),But init_net is not unregistered lastly,so
>> cleanuping other net namespaces  will triger painic.
>> If net namespaces are created  with the order of 1,2,...n,they should
>> be cleaned with the order of n,...2,1,so in this case init_net will be
>> unregistered lastly.
>> I fixed it up (see below). I have taken a lot of test!
>>
>
> I thinks this BUG is a netfilter BUG,not a netns BUG.
> Other subsystems implemented netns support don't use init_net to
> do some special works((un)register/(un)set).
>
> In fact,we can't use init_net to do this job well.such as function
> nf_conntrack_clean,we shoud set ip_ct_attach to NULL before any
> netns doing cleanup jobs, and set nf_ct_destroy to NULL after all of
> netns finish these cleanup jobs.
>
> So I think finally we still need this patchset,And this is a regular
> way to fix this problem.
>
> Can you help me to test if the panic bug is fixed by this patchset?
> and then give me your tested-by?
>
> thank you very much!

^ permalink raw reply

* [patch net-next V3,repost 0/4] net: allow to change carrier from userspace
From: Jiri Pirko @ 2012-12-28  9:49 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, bhutchings, mirqus, shemminger, greearb, fbl,
	john.r.fastabend

This is basically a V3 of a repost of my previous patchset:
"[patch net-next-2.6 0/2] net: allow to change carrier via sysfs" from Aug 30

The way net-sysfs stores values changed and this patchset reflects it.
Also, I exposed carrier via rtnetlink iface.

So far, only the dummy driver uses the carrier change ndo. In the very
near future the team driver will use it as well.

V2->V3:
 - updated ndo_change_carrier comment by Dan Williams

V1->v2:
 - added bigger comment to ndo and also note to operstate.txt documentation
   stating the clear purpose of this iface

Jiri Pirko (4):
  net: add change_carrier netdev op
  net: allow to change carrier via sysfs
  rtnl: expose carrier value with possibility to set it
  dummy: implement carrier change

 Documentation/networking/operstates.txt |  4 ++++
 drivers/net/dummy.c                     | 10 ++++++++++
 include/linux/netdevice.h               | 12 ++++++++++++
 include/uapi/linux/if_link.h            |  1 +
 net/core/dev.c                          | 19 +++++++++++++++++++
 net/core/net-sysfs.c                    | 15 ++++++++++++++-
 net/core/rtnetlink.c                    | 10 ++++++++++
 7 files changed, 70 insertions(+), 1 deletion(-)

-- 
1.8.0

^ permalink raw reply

* [patch net-next 1/4] net: add change_carrier netdev op
From: Jiri Pirko @ 2012-12-28  9:49 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, bhutchings, mirqus, shemminger, greearb, fbl,
	john.r.fastabend
In-Reply-To: <1356688180-3549-1-git-send-email-jiri@resnulli.us>

This allows a driver to register a change_carrier callback which will be
called whenever the user would like to change the carrier state. This is
useful for devices like dummy, gre, team and so on.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 include/linux/netdevice.h | 12 ++++++++++++
 net/core/dev.c            | 19 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c599e47..0e1b92a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -891,6 +891,14 @@ struct netdev_fcoe_hbainfo {
  * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh)
  * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
  *			     struct net_device *dev)
+ *
+ * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
+ *	Called to change device carrier. Soft-devices (like dummy, team, etc)
+ *	which do not represent real hardware may define this to allow their
+ *	userspace components to manage their virtual carrier state. Devices
+ *	that determine carrier state from physical hardware properties (eg
+ *	network cables) or protocol-dependent mechanisms (eg
+ *	USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1008,6 +1016,8 @@ struct net_device_ops {
 	int			(*ndo_bridge_getlink)(struct sk_buff *skb,
 						      u32 pid, u32 seq,
 						      struct net_device *dev);
+	int			(*ndo_change_carrier)(struct net_device *dev,
+						      bool new_carrier);
 };
 
 /*
@@ -2194,6 +2204,8 @@ extern int		dev_set_mtu(struct net_device *, int);
 extern void		dev_set_group(struct net_device *, int);
 extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
+extern int		dev_change_carrier(struct net_device *,
+					   bool new_carrier);
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
 					    struct netdev_queue *txq);
diff --git a/net/core/dev.c b/net/core/dev.c
index 515473e..21c5b97 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5027,6 +5027,25 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 }
 EXPORT_SYMBOL(dev_set_mac_address);
 
+/**
+ *	dev_change_carrier - Change device carrier
+ *	@dev: device
+ *	@new_carrier: new value
+ *
+ *	Change device carrier
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_change_carrier)
+		return -EOPNOTSUPP;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+	return ops->ndo_change_carrier(dev, new_carrier);
+}
+EXPORT_SYMBOL(dev_change_carrier);
+
 /*
  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
  */
-- 
1.8.0

^ permalink raw reply related

* [patch net-next 2/4] net: allow to change carrier via sysfs
From: Jiri Pirko @ 2012-12-28  9:49 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, bhutchings, mirqus, shemminger, greearb, fbl,
	john.r.fastabend
In-Reply-To: <1356688180-3549-1-git-send-email-jiri@resnulli.us>

Make carrier writable

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 net/core/net-sysfs.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 28c5f5a..29c884a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -126,6 +126,19 @@ static ssize_t show_broadcast(struct device *dev,
 	return -EINVAL;
 }
 
+static int change_carrier(struct net_device *net, unsigned long new_carrier)
+{
+	if (!netif_running(net))
+		return -EINVAL;
+	return dev_change_carrier(net, (bool) new_carrier);
+}
+
+static ssize_t store_carrier(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_carrier);
+}
+
 static ssize_t show_carrier(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
@@ -331,7 +344,7 @@ static struct device_attribute net_class_attributes[] = {
 	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
 	__ATTR(address, S_IRUGO, show_address, NULL),
 	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
-	__ATTR(carrier, S_IRUGO, show_carrier, NULL),
+	__ATTR(carrier, S_IRUGO | S_IWUSR, show_carrier, store_carrier),
 	__ATTR(speed, S_IRUGO, show_speed, NULL),
 	__ATTR(duplex, S_IRUGO, show_duplex, NULL),
 	__ATTR(dormant, S_IRUGO, show_dormant, NULL),
-- 
1.8.0

^ permalink raw reply related

* [patch net-next 3/4] rtnl: expose carrier value with possibility to set it
From: Jiri Pirko @ 2012-12-28  9:49 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, bhutchings, mirqus, shemminger, greearb, fbl,
	john.r.fastabend
In-Reply-To: <1356688180-3549-1-git-send-email-jiri@resnulli.us>

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 Documentation/networking/operstates.txt |  4 ++++
 include/uapi/linux/if_link.h            |  1 +
 net/core/rtnetlink.c                    | 10 ++++++++++
 3 files changed, 15 insertions(+)

diff --git a/Documentation/networking/operstates.txt b/Documentation/networking/operstates.txt
index 1a77a3c..9769457 100644
--- a/Documentation/networking/operstates.txt
+++ b/Documentation/networking/operstates.txt
@@ -88,6 +88,10 @@ set this flag. On netif_carrier_off(), the scheduler stops sending
 packets. The name 'carrier' and the inversion are historical, think of
 it as lower layer.
 
+Note that for certain kinds of soft-devices, which do not manage any
+real hardware, it is possible to set this bit from userspace.
+One should use the TLV IFLA_CARRIER to do so.
+
 netif_carrier_ok() can be used to query that bit.
 
 __LINK_STATE_DORMANT, maps to IFF_DORMANT:
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 60f3b6b..c4edfe1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -142,6 +142,7 @@ enum {
 #define IFLA_PROMISCUITY IFLA_PROMISCUITY
 	IFLA_NUM_TX_QUEUES,
 	IFLA_NUM_RX_QUEUES,
+	IFLA_CARRIER,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1868625..2ef7a56 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -780,6 +780,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4) /* IFLA_MTU */
 	       + nla_total_size(4) /* IFLA_LINK */
 	       + nla_total_size(4) /* IFLA_MASTER */
+	       + nla_total_size(1) /* IFLA_CARRIER */
 	       + nla_total_size(4) /* IFLA_PROMISCUITY */
 	       + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
 	       + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
@@ -909,6 +910,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	     nla_put_u32(skb, IFLA_LINK, dev->iflink)) ||
 	    (dev->master &&
 	     nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) ||
+	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
 	    (dev->qdisc &&
 	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
 	    (dev->ifalias &&
@@ -1108,6 +1110,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_MTU]		= { .type = NLA_U32 },
 	[IFLA_LINK]		= { .type = NLA_U32 },
 	[IFLA_MASTER]		= { .type = NLA_U32 },
+	[IFLA_CARRIER]		= { .type = NLA_U8 },
 	[IFLA_TXQLEN]		= { .type = NLA_U32 },
 	[IFLA_WEIGHT]		= { .type = NLA_U32 },
 	[IFLA_OPERSTATE]	= { .type = NLA_U8 },
@@ -1438,6 +1441,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		modified = 1;
 	}
 
+	if (tb[IFLA_CARRIER]) {
+		err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+		if (err)
+			goto errout;
+		modified = 1;
+	}
+
 	if (tb[IFLA_TXQLEN])
 		dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
 
-- 
1.8.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox