Netdev List
 help / color / mirror / Atom feed
* [PATCH 4/9] tproxy: added tproxy sockopt interface in the IPV6 layer
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

Support for the IPV6_RECVORIGDSTADDR sockopt for UDP sockets was contributed by
Harry Mason.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 include/linux/in6.h      |    4 ++++
 include/linux/ipv6.h     |    4 +++-
 net/ipv6/datagram.c      |   19 +++++++++++++++++++
 net/ipv6/ipv6_sockglue.c |   23 +++++++++++++++++++++++
 4 files changed, 49 insertions(+), 1 deletions(-)

diff --git a/include/linux/in6.h b/include/linux/in6.h
index c4bf46f..097a34b 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -268,6 +268,10 @@ struct in6_flowlabel_req {
 /* RFC5082: Generalized Ttl Security Mechanism */
 #define IPV6_MINHOPCOUNT		73
 
+#define IPV6_ORIGDSTADDR        74
+#define IPV6_RECVORIGDSTADDR    IPV6_ORIGDSTADDR
+#define IPV6_TRANSPARENT        75
+
 /*
  * Multicast Routing:
  * see include/linux/mroute6.h.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e62683b..8e429d0 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -341,7 +341,9 @@ struct ipv6_pinfo {
 				odstopts:1,
                                 rxflow:1,
 				rxtclass:1,
-				rxpmtu:1;
+				rxpmtu:1,
+				rxorigdstaddr:1;
+				/* 2 bits hole */
 		} bits;
 		__u16		all;
 	} rxopt;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ef371aa..320bdb8 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -577,6 +577,25 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
 		u8 *ptr = nh + opt->dst1;
 		put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
 	}
+	if (np->rxopt.bits.rxorigdstaddr) {
+		struct sockaddr_in6 sin6;
+		u16 *ports = (u16 *) skb_transport_header(skb);
+
+		if (skb_transport_offset(skb) + 4 <= skb->len) {
+			/* All current transport protocols have the port numbers in the
+			 * first four bytes of the transport header and this function is
+			 * written with this assumption in mind.
+			 */
+
+			sin6.sin6_family = AF_INET6;
+			ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr);
+			sin6.sin6_port = ports[1];
+			sin6.sin6_flowinfo = 0;
+			sin6.sin6_scope_id = 0;
+
+			put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
+		}
+	}
 	return 0;
 }
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a7f66bc..0553867 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -342,6 +342,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		retv = 0;
 		break;
 
+	case IPV6_TRANSPARENT:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		/* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */
+		inet_sk(sk)->transparent = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVORIGDSTADDR:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxorigdstaddr = valbool;
+		retv = 0;
+		break;
+
 	case IPV6_HOPOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	case IPV6_RTHDR:
@@ -1104,6 +1119,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 	}
 
+	case IPV6_TRANSPARENT:
+		val = inet_sk(sk)->transparent;
+		break;
+
+	case IPV6_RECVORIGDSTADDR:
+		val = np->rxopt.bits.rxorigdstaddr;
+		break;
+
 	case IPV6_UNICAST_HOPS:
 	case IPV6_MULTICAST_HOPS:
 	{



^ permalink raw reply related

* [PATCH 5/9] tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 net/ipv6/af_inet6.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6022098..9480572 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -343,7 +343,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			 */
 			v4addr = LOOPBACK4_IPV6;
 			if (!(addr_type & IPV6_ADDR_MULTICAST))	{
-				if (!ipv6_chk_addr(net, &addr->sin6_addr,
+				if (!inet->transparent && !ipv6_chk_addr(net, &addr->sin6_addr,
 						   dev, 0)) {
 					err = -EADDRNOTAVAIL;
 					goto out_unlock;



^ permalink raw reply related

* Re: [PATCH 5/9] tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
From: YOSHIFUJI Hideaki @ 2010-10-20 12:45 UTC (permalink / raw)
  To: KOVACS Krisztian; +Cc: netdev, netfilter-devel, Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.93956.stgit@este.odu>

Hello.

(2010/10/20 20:21), KOVACS Krisztian wrote:
> From: Balazs Scheidler<bazsi@balabit.hu>
> 
> Signed-off-by: Balazs Scheidler<bazsi@balabit.hu>
> Signed-off-by: KOVACS Krisztian<hidden@balabit.hu>
> ---
>   net/ipv6/af_inet6.c |    2 +-
>   1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index 6022098..9480572 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -343,7 +343,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
>   			 */
>   			v4addr = LOOPBACK4_IPV6;
>   			if (!(addr_type&  IPV6_ADDR_MULTICAST))	{
> -				if (!ipv6_chk_addr(net,&addr->sin6_addr,
> +				if (!inet->transparent&&  !ipv6_chk_addr(net,&addr->sin6_addr,
>   						   dev, 0)) {
>   					err = -EADDRNOTAVAIL;
>   					goto out_unlock;
> 
> 

As I wrote before in another thread, this does not seem sufficient --
well, it is sufficient to allow non-local bind, but before we
allow this, we need to add checks of the source address on the sending side.

Regards,

--yoshfuji

^ permalink raw reply

* Hello,
From: Ron Abrahams @ 2010-10-20 13:04 UTC (permalink / raw)



Hello,

Contact me for details to transfer US$ 21,300,000.00 to 

you for us.This fund originally belongs to a client who 

had no blood relation in his account-opening package.

email: ronabrahams.uk@rediff.com 

RON ABRAHAMS

^ permalink raw reply

* Re: [RFC PATCH 3/9] ipvs network name space aware
From: Simon Horman @ 2010-10-20 14:03 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: lvs-devel, netdev, netfilter-devel, ja, wensong, daniel.lezcano
In-Reply-To: <201010081316.57914.hans.schillstrom@ericsson.com>

On Fri, Oct 08, 2010 at 01:16:57PM +0200, Hans Schillstrom wrote:
> 
> This patch just contains ip_vs_conn.c
> and does the normal
>  - moving vars to struct ipvs
>  - adding per netns init and exit
> 
> proc_fs required some extra work with adding/changing private data to get the net ptr.

I am currently working on rebasing this patch against the
current nf-next-2.6 tree, which includes persistence engines,
and I noticed a few things.

> Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>
> 
> diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
> index b71c69a..c47828f 100644
> --- a/net/netfilter/ipvs/ip_vs_conn.c
> +++ b/net/netfilter/ipvs/ip_vs_conn.c
> @@ -47,7 +47,7 @@
> 
>  /*
>   * Connection hash size. Default is what was selected at compile time.
> -*/
> + */
>  int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
>  module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
>  MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");

This fragment is not needed.

> @@ -56,23 +56,12 @@ MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
>  int ip_vs_conn_tab_size;
>  int ip_vs_conn_tab_mask;
> 
> -/*
> - *  Connection hash table: for input and output packets lookups of IPVS
> - */
> -static struct list_head *ip_vs_conn_tab;
> -
> -/*  SLAB cache for IPVS connections */
> -static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
> -
> -/*  counter for current IPVS connections */
> -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
> -
> -/*  counter for no client port connections */
> -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
> -
>  /* random value for IPVS connection hash */
>  static unsigned int ip_vs_conn_rnd;
> 
> +/* cache name cnt */
> +static atomic_t conn_cache_nr = ATOMIC_INIT(0);
> +
>  /*
>   *  Fine locking granularity for big connection hash table
>   */
> @@ -153,7 +142,7 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
>   *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
>   *	returns bool success.
>   */
> -static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
> +static inline int ip_vs_conn_hash(struct net *net, struct ip_vs_conn *cp)
>  {
>  	unsigned hash;
>  	int ret;
> @@ -168,7 +157,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
>  	spin_lock(&cp->lock);
> 
>  	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
> -		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
> +		list_add(&cp->c_list, &net->ipvs->conn_tab[hash]);
>  		cp->flags |= IP_VS_CONN_F_HASHED;
>  		atomic_inc(&cp->refcnt);
>  		ret = 1;
> @@ -221,18 +210,20 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
>   *	s_addr, s_port: pkt source address (foreign host)
>   *	d_addr, d_port: pkt dest address (load balancer)
>   */
> -static inline struct ip_vs_conn *__ip_vs_conn_in_get
> -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> - const union nf_inet_addr *d_addr, __be16 d_port)
> +static inline struct ip_vs_conn *
> +__ip_vs_conn_in_get(struct net *net, int af, int protocol,
> +		    const union nf_inet_addr *s_addr, __be16 s_port,
> +		    const union nf_inet_addr *d_addr, __be16 d_port)
>  {
>  	unsigned hash;
>  	struct ip_vs_conn *cp;
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>  	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
> 
>  	ct_read_lock(hash);
> 
> -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +	list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
>  		if (cp->af == af &&
>  		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
>  		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
> @@ -251,16 +242,18 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
>  	return NULL;
>  }
> 
> -struct ip_vs_conn *ip_vs_conn_in_get
> -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> - const union nf_inet_addr *d_addr, __be16 d_port)
> +struct ip_vs_conn *
> +ip_vs_conn_in_get(struct net *net, int af, int protocol,
> +		  const union nf_inet_addr *s_addr, __be16 s_port,
> +		  const union nf_inet_addr *d_addr, __be16 d_port)
>  {
>  	struct ip_vs_conn *cp;
> 
> -	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
> -	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
> -		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
> -					 d_port);
> +	cp = __ip_vs_conn_in_get(net, af, protocol,
> +				 s_addr, s_port, d_addr, d_port);
> +	if (!cp && atomic_read(&net->ipvs->conn_no_cport_cnt))
> +		cp = __ip_vs_conn_in_get(net, af, protocol,
> +					 s_addr, 0, d_addr, d_port);
> 
>  	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
>  		      ip_vs_proto_name(protocol),
> @@ -278,35 +271,41 @@ ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
>  			unsigned int proto_off, int inverse)
>  {
>  	__be16 _ports[2], *pptr;
> +	struct net *net = dev_net(skb->dev);
> 
>  	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
>  	if (pptr == NULL)
>  		return NULL;
> 
> +	BUG_ON(!net);

Can you explain why BUG_ON is here?

>  	if (likely(!inverse))
> -		return ip_vs_conn_in_get(af, iph->protocol,
> +		return ip_vs_conn_in_get(net, af, iph->protocol,
>  					 &iph->saddr, pptr[0],
>  					 &iph->daddr, pptr[1]);
>  	else
> -		return ip_vs_conn_in_get(af, iph->protocol,
> +		return ip_vs_conn_in_get(net, af, iph->protocol,
>  					 &iph->daddr, pptr[1],
>  					 &iph->saddr, pptr[0]);
>  }
>  EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
> 
> -/* Get reference to connection template */
> -struct ip_vs_conn *ip_vs_ct_in_get
> -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> - const union nf_inet_addr *d_addr, __be16 d_port)
> +/*
> + *  Get reference to connection template
> + */
> +struct ip_vs_conn *
> +ip_vs_ct_in_get(struct net *net, int af, int protocol,
> +		const union nf_inet_addr *s_addr, __be16 s_port,
> +		const union nf_inet_addr *d_addr, __be16 d_port)
>  {
>  	unsigned hash;
>  	struct ip_vs_conn *cp;
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>  	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
> 
>  	ct_read_lock(hash);
> 
> -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +	list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
>  		if (cp->af == af &&
>  		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
>  		    /* protocol should only be IPPROTO_IP if
> @@ -341,12 +340,14 @@ struct ip_vs_conn *ip_vs_ct_in_get
>   *	s_addr, s_port: pkt source address (inside host)
>   *	d_addr, d_port: pkt dest address (foreign host)
>   */
> -struct ip_vs_conn *ip_vs_conn_out_get
> -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> - const union nf_inet_addr *d_addr, __be16 d_port)
> +struct ip_vs_conn *
> +ip_vs_conn_out_get(struct net *net, int af, int protocol,
> +		   const union nf_inet_addr *s_addr, __be16 s_port,
> +		   const union nf_inet_addr *d_addr, __be16 d_port)
>  {
>  	unsigned hash;
>  	struct ip_vs_conn *cp, *ret=NULL;
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>  	/*
>  	 *	Check for "full" addressed entries
> @@ -355,7 +356,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
> 
>  	ct_read_lock(hash);
> 
> -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +	list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
>  		if (cp->af == af &&
>  		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
>  		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
> @@ -386,17 +387,19 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
>  			 unsigned int proto_off, int inverse)
>  {
>  	__be16 _ports[2], *pptr;
> +	struct net *net = dev_net(skb->dev);
> 
>  	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
>  	if (pptr == NULL)
>  		return NULL;
> 
> +	BUG_ON(!net);
>  	if (likely(!inverse))
> -		return ip_vs_conn_out_get(af, iph->protocol,
> +		return ip_vs_conn_out_get(net, af, iph->protocol,
>  					  &iph->saddr, pptr[0],
>  					  &iph->daddr, pptr[1]);
>  	else
> -		return ip_vs_conn_out_get(af, iph->protocol,
> +		return ip_vs_conn_out_get(net, af, iph->protocol,
>  					  &iph->daddr, pptr[1],
>  					  &iph->saddr, pptr[0]);
>  }
> @@ -408,7 +411,7 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
>  void ip_vs_conn_put(struct ip_vs_conn *cp)
>  {
>  	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
> -		0 : cp->timeout;
> +			   0 : cp->timeout;
>  	mod_timer(&cp->timer, jiffies+t);
> 
>  	__ip_vs_conn_put(cp);
> @@ -418,19 +421,19 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
>  /*
>   *	Fill a no_client_port connection with a client port number
>   */
> -void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
> +void ip_vs_conn_fill_cport(struct net *net, struct ip_vs_conn *cp, __be16 cport)
>  {
>  	if (ip_vs_conn_unhash(cp)) {
>  		spin_lock(&cp->lock);
>  		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
> -			atomic_dec(&ip_vs_conn_no_cport_cnt);
> +			atomic_dec(&net->ipvs->conn_no_cport_cnt);
>  			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
>  			cp->cport = cport;
>  		}
>  		spin_unlock(&cp->lock);
> 
>  		/* hash on new dport */
> -		ip_vs_conn_hash(cp);
> +		ip_vs_conn_hash(net, cp);
>  	}
>  }
> 
> @@ -561,12 +564,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
>   * Check if there is a destination for the connection, if so
>   * bind the connection to the destination.
>   */
> -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
> +struct ip_vs_dest *ip_vs_try_bind_dest(struct net *net, struct ip_vs_conn *cp)
>  {
>  	struct ip_vs_dest *dest;
> 
>  	if ((cp) && (!cp->dest)) {
> -		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
> +		dest = ip_vs_find_dest(net, cp->af, &cp->daddr, cp->dport,
>  				       &cp->vaddr, cp->vport,
>  				       cp->protocol);
>  		ip_vs_bind_dest(cp, dest);
> @@ -638,7 +641,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
>   *	If available, return 1, otherwise invalidate this connection
>   *	template and return 0.
>   */
> -int ip_vs_check_template(struct ip_vs_conn *ct)
> +int ip_vs_check_template(struct net *net, struct ip_vs_conn *ct)
>  {
>  	struct ip_vs_dest *dest = ct->dest;
> 
> @@ -647,7 +650,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
>  	 */
>  	if ((dest == NULL) ||
>  	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
> -	    (sysctl_ip_vs_expire_quiescent_template &&
> +	    (net->ipvs->sysctl_expire_quiescent_template &&
>  	     (atomic_read(&dest->weight) == 0))) {
>  		IP_VS_DBG_BUF(9, "check_template: dest not available for "
>  			      "protocol %s s:%s:%d v:%s:%d "
> @@ -668,7 +671,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
>  				ct->dport = htons(0xffff);
>  				ct->vport = htons(0xffff);
>  				ct->cport = 0;
> -				ip_vs_conn_hash(ct);
> +				ip_vs_conn_hash(net, ct);
>  			}
>  		}
> 
> @@ -720,16 +723,17 @@ static void ip_vs_conn_expire(unsigned long data)
>  		if (unlikely(cp->app != NULL))
>  			ip_vs_unbind_app(cp);
>  		ip_vs_unbind_dest(cp);
> +		BUG_ON(!cp->net);
>  		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
> -			atomic_dec(&ip_vs_conn_no_cport_cnt);
> -		atomic_dec(&ip_vs_conn_count);
> +			atomic_dec(&cp->net->ipvs->conn_no_cport_cnt);
> +		atomic_dec(&cp->net->ipvs->conn_count);
> 
> -		kmem_cache_free(ip_vs_conn_cachep, cp);
> +		kmem_cache_free(cp->net->ipvs->conn_cachep, cp);
>  		return;
>  	}
> 
>  	/* hash it back to the table */
> -	ip_vs_conn_hash(cp);
> +	ip_vs_conn_hash(cp->net, cp);
> 
>    expire_later:
>  	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
> @@ -748,18 +752,22 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
> 
> 
>  /*
> - *	Create a new connection entry and hash it into the ip_vs_conn_tab
> + *	Create a new connection entry and hash it into the ip_vs_conn_tab,
> + * 	netns ptr will be stored in ip_vs_con here.
>   */
>  struct ip_vs_conn *
> -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
> +ip_vs_conn_new(struct net *net, int af, int proto,
> +	       const union nf_inet_addr *caddr, __be16 cport,
>  	       const union nf_inet_addr *vaddr, __be16 vport,
> -	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
> -	       struct ip_vs_dest *dest)
> +	       const union nf_inet_addr *daddr, __be16 dport,
> +	       unsigned flags, struct ip_vs_dest *dest)
>  {
>  	struct ip_vs_conn *cp;
> -	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
> +	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, proto);
> +	struct ip_vs_protocol *pp;
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
> -	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
> +	cp = kmem_cache_zalloc(ipvs->conn_cachep, GFP_ATOMIC);
>  	if (cp == NULL) {
>  		IP_VS_ERR_RL("%s(): no memory\n", __func__);
>  		return NULL;
> @@ -790,9 +798,9 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
>  	atomic_set(&cp->n_control, 0);
>  	atomic_set(&cp->in_pkts, 0);
> 
> -	atomic_inc(&ip_vs_conn_count);
> +	atomic_inc(&ipvs->conn_count);
>  	if (flags & IP_VS_CONN_F_NO_CPORT)
> -		atomic_inc(&ip_vs_conn_no_cport_cnt);
> +		atomic_inc(&ipvs->conn_no_cport_cnt);
> 
>  	/* Bind the connection with a destination server */
>  	ip_vs_bind_dest(cp, dest);
> @@ -808,12 +816,14 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
>  	else
>  #endif
>  		ip_vs_bind_xmit(cp);
> -
> -	if (unlikely(pp && atomic_read(&pp->appcnt)))
> -		ip_vs_bind_app(cp, pp);
> -
> +	cp->net = net;	/* netns ptr  needed in timer */
> +	if( pd ) {
> +		pp = pd->pp;
> +		if (unlikely(pp && atomic_read(&pd->appcnt)))
> +			ip_vs_bind_app(net, cp, pp);
> +	}
>  	/* Hash it in the ip_vs_conn_tab finally */
> -	ip_vs_conn_hash(cp);
> +	ip_vs_conn_hash(net, cp);
> 
>  	return cp;
>  }
> @@ -824,16 +834,33 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
>   */
>  #ifdef CONFIG_PROC_FS
> 
> +struct ipvs_private {
> +	struct seq_net_private p;
> +	void *private;
> +};
> +
> +static inline void ipvs_seq_priv_set(struct seq_file *seq, void *data)
> +{
> +	struct ipvs_private *ipriv=(struct ipvs_private *)seq->private;
> +	ipriv->private = data;
> +}
> +static inline void *ipvs_seq_priv_get(struct seq_file *seq)
> +{
> +	return ((struct ipvs_private *)seq->private)->private;
> +}
> +
>  static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
>  {
>  	int idx;
>  	struct ip_vs_conn *cp;
> +	struct net *net = seq_file_net(seq);
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>  	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
>  		ct_read_lock_bh(idx);
> -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> +		list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
>  			if (pos-- == 0) {
> -				seq->private = &ip_vs_conn_tab[idx];
> +				ipvs_seq_priv_set(seq, &ipvs->conn_tab[idx]);
>  				return cp;
>  			}
>  		}
> @@ -845,15 +872,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
> 
>  static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
>  {
> -	seq->private = NULL;
> +	ipvs_seq_priv_set(seq, NULL);
>  	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
>  }
> -
> + /* netns: conn_tab OK */
>  static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>  {
>  	struct ip_vs_conn *cp = v;
> -	struct list_head *e, *l = seq->private;
> +	struct list_head *e, *l = ipvs_seq_priv_get(seq);
>  	int idx;
> +	struct net *net = seq_file_net(seq);
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>  	++*pos;
>  	if (v == SEQ_START_TOKEN)
> @@ -863,27 +892,28 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>  	if ((e = cp->c_list.next) != l)
>  		return list_entry(e, struct ip_vs_conn, c_list);
> 
> -	idx = l - ip_vs_conn_tab;
> +	idx = l - ipvs->conn_tab;
>  	ct_read_unlock_bh(idx);
> 
>  	while (++idx < ip_vs_conn_tab_size) {
>  		ct_read_lock_bh(idx);
> -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> -			seq->private = &ip_vs_conn_tab[idx];
> +		list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
> +			ipvs_seq_priv_set(seq, &ipvs->conn_tab[idx]);
>  			return cp;
>  		}
>  		ct_read_unlock_bh(idx);
>  	}
> -	seq->private = NULL;
> +	ipvs_seq_priv_set(seq, NULL);
>  	return NULL;
>  }
> -
> +/* netns: conn_tab OK */
>  static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
>  {
> -	struct list_head *l = seq->private;
> +	struct list_head *l = ipvs_seq_priv_get(seq);
> +	struct net *net = seq_file_net(seq);
> 
>  	if (l)
> -		ct_read_unlock_bh(l - ip_vs_conn_tab);
> +		ct_read_unlock_bh(l - net->ipvs->conn_tab);
>  }
> 
>  static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
> @@ -928,7 +958,16 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
> 
>  static int ip_vs_conn_open(struct inode *inode, struct file *file)
>  {
> -	return seq_open(file, &ip_vs_conn_seq_ops);
> +	int ret;
> +	struct ipvs_private *priv;
> +
> +	ret = seq_open_net(inode, file, &ip_vs_conn_seq_ops,
> +			   sizeof(struct ipvs_private));
> +	if (!ret) {
> +		priv = ((struct seq_file *)file->private_data)->private;
> +		priv->private = NULL;
> +	}
> +	return ret;
>  }
> 
>  static const struct file_operations ip_vs_conn_fops = {
> @@ -936,7 +975,8 @@ static const struct file_operations ip_vs_conn_fops = {
>  	.open    = ip_vs_conn_open,
>  	.read    = seq_read,
>  	.llseek  = seq_lseek,
> -	.release = seq_release,
> +	.release = seq_release_private,
> +
>  };
> 
>  static const char *ip_vs_origin_name(unsigned flags)
> @@ -991,7 +1031,17 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
> 
>  static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
>  {
> -	return seq_open(file, &ip_vs_conn_sync_seq_ops);
> +	int ret;
> +	struct ipvs_private *ipriv;
> +
> +	ret = seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
> +			   sizeof(struct ipvs_private));
> +	if (!ret) {
> +		ipriv = ((struct seq_file *)file->private_data)->private;
> +		ipriv->private = NULL;
> +	}
> +	return ret;
> +//	return seq_open(file, &ip_vs_conn_sync_seq_ops);
>  }
> 
>  static const struct file_operations ip_vs_conn_sync_fops = {
> @@ -999,7 +1049,7 @@ static const struct file_operations ip_vs_conn_sync_fops = {
>  	.open    = ip_vs_conn_sync_open,
>  	.read    = seq_read,
>  	.llseek  = seq_lseek,
> -	.release = seq_release,
> +	.release = seq_release_private,
>  };
> 
>  #endif
> @@ -1036,11 +1086,14 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
>  	return 1;
>  }
> 
> -/* Called from keventd and must protect itself from softirqs */
> -void ip_vs_random_dropentry(void)
> +/* Called from keventd and must protect itself from softirqs
> + * netns: conn_tab OK
> + */
> +void ip_vs_random_dropentry(struct net *net)
>  {
>  	int idx;
>  	struct ip_vs_conn *cp;
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>  	/*
>  	 * Randomly scan 1/32 of the whole table every second
> @@ -1053,7 +1106,7 @@ void ip_vs_random_dropentry(void)
>  		 */
>  		ct_write_lock_bh(hash);
> 
> -		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +		list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
>  			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
>  				/* connection template */
>  				continue;
> @@ -1091,11 +1144,13 @@ void ip_vs_random_dropentry(void)
> 
>  /*
>   *      Flush all the connection entries in the ip_vs_conn_tab
> + * netns: conn_tab OK
>   */
> -static void ip_vs_conn_flush(void)
> +static void ip_vs_conn_flush(struct net *net)
>  {
>  	int idx;
>  	struct ip_vs_conn *cp;
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>    flush_again:
>  	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> @@ -1104,7 +1159,7 @@ static void ip_vs_conn_flush(void)
>  		 */
>  		ct_write_lock_bh(idx);
> 
> -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> +		list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
> 
>  			IP_VS_DBG(4, "del connection\n");
>  			ip_vs_conn_expire_now(cp);
> @@ -1118,16 +1173,17 @@ static void ip_vs_conn_flush(void)
> 
>  	/* the counter may be not NULL, because maybe some conn entries
>  	   are run by slow timer handler or unhashed but still referred */
> -	if (atomic_read(&ip_vs_conn_count) != 0) {
> +	if (atomic_read(&ipvs->conn_count) != 0) {
>  		schedule();
>  		goto flush_again;
>  	}
>  }
> 
> 
> -int __init ip_vs_conn_init(void)
> +int __net_init __ip_vs_conn_init(struct net *net)
>  {
>  	int idx;
> +	struct netns_ipvs *ipvs = net->ipvs;
> 
>  	/* Compute size and mask */
>  	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
> @@ -1136,19 +1192,26 @@ int __init ip_vs_conn_init(void)
>  	/*
>  	 * Allocate the connection hash table and initialize its list heads
>  	 */
> -	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
> +	ipvs->conn_tab = vmalloc(ip_vs_conn_tab_size *
>  				 sizeof(struct list_head));
> -	if (!ip_vs_conn_tab)
> +	if (!ipvs->conn_tab)
>  		return -ENOMEM;
> 
>  	/* Allocate ip_vs_conn slab cache */
> -	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
> +	/* Todo: find a better way to name the cache */
> +	snprintf(ipvs->conn_cname, sizeof(ipvs->conn_cname)-1,
> +			"ipvs_conn_%d", atomic_read(&conn_cache_nr) );
> +	atomic_inc(&conn_cache_nr);
> +
> +	ipvs->conn_cachep = kmem_cache_create(ipvs->conn_cname,
>  					      sizeof(struct ip_vs_conn), 0,
>  					      SLAB_HWCACHE_ALIGN, NULL);
> -	if (!ip_vs_conn_cachep) {
> -		vfree(ip_vs_conn_tab);
> +	if (!ipvs->conn_cachep) {
> +		vfree(ipvs->conn_tab);
>  		return -ENOMEM;
>  	}
> +	atomic_set(&ipvs->conn_count, 0);
> +	atomic_set(&ipvs->conn_no_cport_cnt, 0);
> 
>  	pr_info("Connection hash table configured "
>  		"(size=%d, memory=%ldKbytes)\n",
> @@ -1158,31 +1221,46 @@ int __init ip_vs_conn_init(void)
>  		  sizeof(struct ip_vs_conn));
> 
>  	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> -		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
> +		INIT_LIST_HEAD(&ipvs->conn_tab[idx]);
>  	}
> 
>  	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
>  		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
>  	}
> 
> -	proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
> -	proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
> -
> -	/* calculate the random value for connection hash */
> -	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
> +	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
> +	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
> 
>  	return 0;
>  }
> +/* Cleanup and release all netns related ... */
> +static void __net_exit __ip_vs_conn_cleanup(struct net *net) {
> 
> +	/* flush all the connection entries first */
> +	ip_vs_conn_flush(net);
> +	/* Release the empty cache */
> +	kmem_cache_destroy(net->ipvs->conn_cachep);
> +	proc_net_remove(net, "ip_vs_conn");
> +	proc_net_remove(net, "ip_vs_conn_sync");
> +	vfree(net->ipvs->conn_tab);
> +}
> +static struct pernet_operations ipvs_conn_ops = {
> +	.init = __ip_vs_conn_init,
> +	.exit = __ip_vs_conn_cleanup,
> +};
> 
> -void ip_vs_conn_cleanup(void)
> +int __init ip_vs_conn_init(void)
>  {
> -	/* flush all the connection entries first */
> -	ip_vs_conn_flush();
> +	int rv;
> 
> -	/* Release the empty cache */
> -	kmem_cache_destroy(ip_vs_conn_cachep);
> -	proc_net_remove(&init_net, "ip_vs_conn");
> -	proc_net_remove(&init_net, "ip_vs_conn_sync");
> -	vfree(ip_vs_conn_tab);
> +	rv = register_pernet_subsys(&ipvs_conn_ops);
> +
> +	/* calculate the random value for connection hash */
> +	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
> +	return rv;
> +}
> +
> +void ip_vs_conn_cleanup(void)
> +{
> +	unregister_pernet_subsys(&ipvs_conn_ops);
>  }
> 
> -- 
> Regards
> Hans Schillstrom <hans.schillstrom@ericsson.com>
> --
> To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH 5/9] tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
From: Balazs Scheidler @ 2010-10-20 14:07 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki
  Cc: KOVACS Krisztian, netdev, netfilter-devel, Patrick McHardy,
	David Miller
In-Reply-To: <4CBEE45D.2080201@linux-ipv6.org>

On Wed, 2010-10-20 at 21:45 +0900, YOSHIFUJI Hideaki wrote:
> (2010/10/20 20:21), KOVACS Krisztian wrote:
> > From: Balazs Scheidler<bazsi@balabit.hu>
> > 
> > Signed-off-by: Balazs Scheidler<bazsi@balabit.hu>
> > Signed-off-by: KOVACS Krisztian<hidden@balabit.hu>
> > ---
> >   net/ipv6/af_inet6.c |    2 +-
> >   1 files changed, 1 insertions(+), 1 deletions(-)
> > 
> > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> > index 6022098..9480572 100644
> > --- a/net/ipv6/af_inet6.c
> > +++ b/net/ipv6/af_inet6.c
> > @@ -343,7 +343,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
> >   			 */
> >   			v4addr = LOOPBACK4_IPV6;
> >   			if (!(addr_type&  IPV6_ADDR_MULTICAST))	{
> > -				if (!ipv6_chk_addr(net,&addr->sin6_addr,
> > +				if (!inet->transparent&&  !ipv6_chk_addr(net,&addr->sin6_addr,
> >   						   dev, 0)) {
> >   					err = -EADDRNOTAVAIL;
> >   					goto out_unlock;
> > 
> > 
> 
> As I wrote before in another thread, this does not seem sufficient --
> well, it is sufficient to allow non-local bind, but before we
> allow this, we need to add checks of the source address on the sending side.

Can you please elaborate or point us to the other thread? Is it some
kind of address-type check that we miss?

-- 
Bazsi



^ permalink raw reply

* [PATCH -next 1/2] ibmveth: Cleanup error handling inside ibmveth_open
From: Denis Kirjanov @ 2010-10-20 14:21 UTC (permalink / raw)
  To: davem; +Cc: rcj, netdev

Remove duplicated code in one place.

Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
---
 drivers/net/ibmveth.c |   44 ++++++++++++++++++++------------------------
 1 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index b3e157e..2ae8336 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -546,9 +546,8 @@ static int ibmveth_open(struct net_device *netdev)
 	if (!adapter->buffer_list_addr || !adapter->filter_list_addr) {
 		netdev_err(netdev, "unable to allocate filter or buffer list "
 			   "pages\n");
-		ibmveth_cleanup(adapter);
-		napi_disable(&adapter->napi);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto err_out;
 	}
 
 	adapter->rx_queue.queue_len = sizeof(struct ibmveth_rx_q_entry) *
@@ -558,9 +557,8 @@ static int ibmveth_open(struct net_device *netdev)
 
 	if (!adapter->rx_queue.queue_addr) {
 		netdev_err(netdev, "unable to allocate rx queue pages\n");
-		ibmveth_cleanup(adapter);
-		napi_disable(&adapter->napi);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto err_out;
 	}
 
 	dev = &adapter->vdev->dev;
@@ -578,9 +576,8 @@ static int ibmveth_open(struct net_device *netdev)
 	    (dma_mapping_error(dev, adapter->rx_queue.queue_dma))) {
 		netdev_err(netdev, "unable to map filter or buffer list "
 			   "pages\n");
-		ibmveth_cleanup(adapter);
-		napi_disable(&adapter->napi);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto err_out;
 	}
 
 	adapter->rx_queue.index = 0;
@@ -611,9 +608,8 @@ static int ibmveth_open(struct net_device *netdev)
 				     adapter->filter_list_dma,
 				     rxq_desc.desc,
 				     mac_address);
-		ibmveth_cleanup(adapter);
-		napi_disable(&adapter->napi);
-		return -ENONET;
+		rc = -ENONET;
+		goto err_out;
 	}
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
@@ -622,9 +618,8 @@ static int ibmveth_open(struct net_device *netdev)
 		if (ibmveth_alloc_buffer_pool(&adapter->rx_buff_pool[i])) {
 			netdev_err(netdev, "unable to alloc pool\n");
 			adapter->rx_buff_pool[i].active = 0;
-			ibmveth_cleanup(adapter);
-			napi_disable(&adapter->napi);
-			return -ENOMEM ;
+			rc = -ENOMEM;
+			goto err_out;
 		}
 	}
 
@@ -638,27 +633,23 @@ static int ibmveth_open(struct net_device *netdev)
 			rc = h_free_logical_lan(adapter->vdev->unit_address);
 		} while (H_IS_LONG_BUSY(rc) || (rc == H_BUSY));
 
-		ibmveth_cleanup(adapter);
-		napi_disable(&adapter->napi);
-		return rc;
+		goto err_out;
 	}
 
 	adapter->bounce_buffer =
 	    kmalloc(netdev->mtu + IBMVETH_BUFF_OH, GFP_KERNEL);
 	if (!adapter->bounce_buffer) {
 		netdev_err(netdev, "unable to allocate bounce buffer\n");
-		ibmveth_cleanup(adapter);
-		napi_disable(&adapter->napi);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto err_out;
 	}
 	adapter->bounce_buffer_dma =
 	    dma_map_single(&adapter->vdev->dev, adapter->bounce_buffer,
 			   netdev->mtu + IBMVETH_BUFF_OH, DMA_BIDIRECTIONAL);
 	if (dma_mapping_error(dev, adapter->bounce_buffer_dma)) {
 		netdev_err(netdev, "unable to map bounce buffer\n");
-		ibmveth_cleanup(adapter);
-		napi_disable(&adapter->napi);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto err_out;
 	}
 
 	netdev_dbg(netdev, "initial replenish cycle\n");
@@ -669,6 +660,11 @@ static int ibmveth_open(struct net_device *netdev)
 	netdev_dbg(netdev, "open complete\n");
 
 	return 0;
+
+err_out:
+	ibmveth_cleanup(adapter);
+	napi_disable(&adapter->napi);
+	return rc;
 }
 
 static int ibmveth_close(struct net_device *netdev)
-- 
1.7.2.2


^ permalink raw reply related

* [PATCH -next 2/2] ibmveth: Free irq on error path
From: Denis Kirjanov @ 2010-10-20 14:21 UTC (permalink / raw)
  To: davem; +Cc: rcj, netdev

Free irq on error path.

Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
---
 drivers/net/ibmveth.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 2ae8336..c454b45 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -641,7 +641,7 @@ static int ibmveth_open(struct net_device *netdev)
 	if (!adapter->bounce_buffer) {
 		netdev_err(netdev, "unable to allocate bounce buffer\n");
 		rc = -ENOMEM;
-		goto err_out;
+		goto err_out_free_irq;
 	}
 	adapter->bounce_buffer_dma =
 	    dma_map_single(&adapter->vdev->dev, adapter->bounce_buffer,
@@ -649,7 +649,7 @@ static int ibmveth_open(struct net_device *netdev)
 	if (dma_mapping_error(dev, adapter->bounce_buffer_dma)) {
 		netdev_err(netdev, "unable to map bounce buffer\n");
 		rc = -ENOMEM;
-		goto err_out;
+		goto err_out_free_irq;
 	}
 
 	netdev_dbg(netdev, "initial replenish cycle\n");
@@ -661,6 +661,8 @@ static int ibmveth_open(struct net_device *netdev)
 
 	return 0;
 
+err_out_free_irq:
+	free_irq(netdev->irq, netdev);
 err_out:
 	ibmveth_cleanup(adapter);
 	napi_disable(&adapter->napi);
-- 
1.7.2.2


^ permalink raw reply related

* Re: [RFC PATCH 5/9] ipvs network name space aware
From: Simon Horman @ 2010-10-20 15:21 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: lvs-devel, netdev, netfilter-devel, ja, wensong, daniel.lezcano
In-Reply-To: <201010081317.04167.hans.schillstrom@ericsson.com>

On Fri, Oct 08, 2010 at 01:17:02PM +0200, Hans Schillstrom wrote:
> This patch just contains ip_vs_ctl
> 
> Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>
> 
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index ca8ec8c..7e99cbc 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c

[ snip ]

> @@ -3377,62 +3383,131 @@ static void ip_vs_genl_unregister(void)
>  }
> 
>  /* End of Generic Netlink interface definitions */
> +/*
> + * per netns intit/exit func.
> + */
> +int /*__net_init*/ __ip_vs_control_init(struct net *net)

Can you describe why __net_init is commented out?

[ snip ]

^ permalink raw reply

* dead code in networking core
From: Stephen Hemminger @ 2010-10-20 15:52 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

The following API's are exported but unused in current code:
  net/core/dev_addr_lists.o
    __hw_addr_del_multiple
    dev_addr_add_multiple
    dev_addr_del_multiple

  net/core/timestamping.o
    skb_clone_tx_timestamp
    skb_complete_tx_timestamp

Any plans to use these soon?

^ permalink raw reply

* Re: [RFC PATCH 1/9] ipvs network name space aware
From: Paul E. McKenney @ 2010-10-20 16:02 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: Daniel Lezcano, lvs-devel@vger.kernel.org, netdev@vger.kernel.org,
	netfilter-devel@vger.kernel.org, horms@verge.net.au, ja@ssi.bg,
	wensong@linux-vs.org
In-Reply-To: <201010201025.20950.hans.schillstrom@ericsson.com>

On Wed, Oct 20, 2010 at 10:25:19AM +0200, Hans Schillstrom wrote:
> On Tuesday 19 October 2010 20:44:36 Paul E. McKenney wrote:
> > On Mon, Oct 18, 2010 at 03:23:48PM +0200, Hans Schillstrom wrote:
> > > On Monday 18 October 2010 13:37:38 Daniel Lezcano wrote:
> > > > On 10/18/2010 11:54 AM, Hans Schillstrom wrote:
> > > > > On Monday 18 October 2010 10:59:25 Daniel Lezcano wrote:
> > > > >
> > > > >> On 10/08/2010 01:16 PM, Hans Schillstrom wrote:
> > > > >>
> > > > >>> This part contains the include files
> > > > >>> where include/net/netns/ip_vs.h is new and contains all moved vars.
> > > > >>>
> > > > >>> SUMMARY
> > > > >>>
> > > > >>>    include/net/ip_vs.h                     |  136 ++++---
> > > > >>>    include/net/net_namespace.h             |    2 +
> > > > >>>    include/net/netns/ip_vs.h               |  112 +++++
> > > > >>>
> > > > >>> Signed-off-by:Hans Schillstrom<hans.schillstrom@ericsson.com>
> > > > >>> ---
> > > > >>>
> > > > >>>
> > > > >>>
> > > > >> [ ... ]
> > > > >>
> > > > >>
> > > > >>>    #ifdef CONFIG_IP_VS_IPV6
> > > > >>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> > > > >>> index bd10a79..b59cdc5 100644
> > > > >>> --- a/include/net/net_namespace.h
> > > > >>> +++ b/include/net/net_namespace.h
> > > > >>> @@ -15,6 +15,7 @@
> > > > >>>    #include<net/netns/ipv4.h>
> > > > >>>    #include<net/netns/ipv6.h>
> > > > >>>    #include<net/netns/dccp.h>
> > > > >>> +#include<net/netns/ip_vs.h>
> > > > >>>    #include<net/netns/x_tables.h>
> > > > >>>    #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> > > > >>>    #include<net/netns/conntrack.h>
> > > > >>> @@ -91,6 +92,7 @@ struct net {
> > > > >>>    	struct sk_buff_head	wext_nlevents;
> > > > >>>    #endif
> > > > >>>    	struct net_generic	*gen;
> > > > >>> +	struct netns_ipvs       *ipvs;
> > > > >>>    };
> > > > >>>
> > > > >>>
> > > > >> IMHO, it would be better to use the net_generic infra-structure instead
> > > > >> of adding a new field in the netns structure.
> > > > >>
> > > > >>
> > > > >>
> > > > > I realized that to, but the performance penalty is quite high with net_generic :-(
> > > > > But on the other hand if you are going to backport it, (without recompiling the kernel)
> > > > > you gonna need it!
> > > > >
> > > >
> > > > Hmm, yes. We don't want to have the init_net_ns performances to be impacted.
> > > >
> > > > You use here a pointer which will be dereferenced like the net_generic,
> > > > I don't think there will be
> > > > a big difference between using net_generic and using a pointer in the
> > > > net namespace structure.
> > > >
> > > > The difference is the id usage, but this one is based on the idr which
> > > > is quite fast.
> > > >
> > >
> > > I'm not so sure about that, have a look at net_generic and rcu_read_lock
> > > and compare
> > >  ipvs = net->ipvs;
> > > vs.
> > >  ipvs = net_generic(net, id)
> > >
> > > static inline void *net_generic(struct net *net, int id)
> > > {
> > > 	struct net_generic *ng;
> > > 	void *ptr;
> > >
> > > 	rcu_read_lock();
> > > 	ng = rcu_dereference(net->gen);
> > > 	BUG_ON(id == 0 || id > ng->len);
> > > 	ptr = ng->ptr[id - 1];
> > > 	rcu_read_unlock();
> > >
> > > 	return ptr;
> > > }
> > > ...
> > > static inline void rcu_read_lock(void)
> > > {
> > >         __rcu_read_lock();
> > >         __acquire(RCU);
> > >         rcu_read_acquire();
> > > }
> > >
> > > Another way of doing it is to pass the ipvs ptr instead of the net ptr,
> > > and add *net to the ipvs struct.
> > >
> > > > We should experiment a bit here to compare both solutions.
> > > Agre
> > > >
> > > I single stepped through the rcu_read_lock() on a x86_64
> > > and it's quite many "stepi" that you need to enter :-(
> >
> > Was this by chance with lockdep enabled?  If not, could you please send
> > your .config?
> >
> > 							Thanx, Paul
> 
> No lockdep, but what I ment is that net_generic is not as fast as a plain ptr->xxx.
> IPVS has hooks in the netfilter chain, and gets a huge amount of packets .
> 
> I don't think IPVS is a candidate for net_generic, it should have its own part in "struct net"
> That was my point.
> ( No critic to locking or net_generic)

You said that there were a lot of "stepi" commands to get through
rcu_read_lock() on x86_64.  This is quite surprising, especially if you
built with CONFIG_RCU_TREE.  Even if you built with CONFIG_PREEMPT_RCU_TREE,
you should only see something like the following from rcu_read_lock():

000000b7 <__rcu_read_lock>:
      b7:	55                   	push   %ebp
      b8:	64 a1 00 00 00 00    	mov    %fs:0x0,%eax
      be:	ff 80 80 01 00 00    	incl   0x180(%eax)
      c4:	89 e5                	mov    %esp,%ebp
      c6:	5d                   	pop    %ebp
      c7:	c3                   	ret    

Unless you have some sort of debugging options turned on.  Or unless
six instructions counts for "quite many" stepi commands.  ;-)

So I am quite curious, independent of whether or not IPVS is a candidate
for net_generic.  That choice for IPVS is not mine to make, and I will
trust the relevant developers and maintainers to make the right choice,
whether that be RCU or something else.  Even I do not claim that RCU
is the right tool for all jobs!  ;-)

							Thanx, Paul

^ permalink raw reply

* Re: [Ksummit-2010-discuss] [v2] Remaining BKL users, what to do
From: Ville Syrjälä @ 2010-10-20 16:14 UTC (permalink / raw)
  To: Dave Airlie
  Cc: Arnd Bergmann, Jan Kara, Greg KH, Anders Larsen, dri-devel,
	ksummit-2010-discuss, Mikulas Patocka, codalist, Theodore Kilgore,
	Bryan Schumaker, Christoph Hellwig, Petr Vandrovec,
	Arnaldo Carvalho de Melo, linux-media, Samuel Ortiz,
	Evgeniy Dushistov, Steven Rostedt, autofs, Jan Harkes, netdev,
	linux-kernel, linux-fsdevel, Andrew Hendry
In-Reply-To: <AANLkTinw=Wzh2Ucj6zKSoqC8J3Yq9xDr3mKMUB7K6Yyo@mail.gmail.com>

On Wed, Oct 20, 2010 at 06:50:58AM +1000, Dave Airlie wrote:
> On Tue, Oct 19, 2010 at 11:26 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> > On Tuesday 19 October 2010, Arnd Bergmann wrote:
> >> On Tuesday 19 October 2010 06:52:32 Dave Airlie wrote:
> >> > > I might be able to find some hardware still lying around here that uses an
> >> > > i810. Not sure unless I go hunting it. But I get the impression that if
> >> > > the kernel is a single-CPU kernel there is not any problem anyway? Don't
> >> > > distros offer a non-smp kernel as an installation option in case the user
> >> > > needs it? So in reality how big a problem is this?
> >> >
> >> > Not anymore, which is my old point of making a fuss. Nowadays in the
> >> > modern distro world, we supply a single kernel that can at runtime
> >> > decide if its running on SMP or UP and rewrite the text section
> >> > appropriately with locks etc. Its like magic, and something like
> >> > marking drivers as BROKEN_ON_SMP at compile time is really wrong when
> >> > what you want now is a runtime warning if someone tries to hotplug a
> >> > CPU with a known iffy driver loaded or if someone tries to load the
> >> > driver when we are already in SMP mode.
> >>
> >> We could make the driver run-time non-SMP by adding
> >>
> >>       if (num_present_cpus() > 1) {
> >>               pr_err("i810 no longer supports SMP\n");
> >>               return -EINVAL;
> >>       }
> >>
> >> to the init function. That would cover the vast majority of the
> >> users of i810 hardware, I guess.
> >
> > Some research showed that Intel never support i810/i815 SMP setups,
> > but there was indeed one company (http://www.acorpusa.com at the time,
> > now owned by a domain squatter) that made i815E based dual Pentium-III
> > boards like this one: http://cgi.ebay.com/280319795096
> 
> Also that board has no on-board GPU enabled i815EP (P means no on-board GPU).

A quick search seems to indicate that an i815E variant also existed.

-- 
Ville Syrjälä
syrjala@sci.fi
http://www.sci.fi/~syrjala/
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Future of the Wimedia LLC Protocol (WLP) subsystem/drivers
From: Randy Dunlap @ 2010-10-20 16:15 UTC (permalink / raw)
  To: David Vrabel; +Cc: netdev, gregkh
In-Reply-To: <4CBDC7B7.5070108@csr.com>

On Tue, 19 Oct 2010 17:30:47 +0100 David Vrabel wrote:

> Hi,
> 
> I've have been nominally the maintainer of the Wimedia LLC Protocol
> (WLP) subsystem and driver since it was originally submitted.  I am no
> longer in a position to even pretend to be a maintainer.
> 
> The only usable hardware was an Intel i1480 devices with beta firmware
> that was never released as a product.  Intel have since sold all there
> UWB/WLP IP and I see little prospect of there ever being hardware
> commercially available for WLP.
> 
> Here are a number of options:
> 
> 1. Someone else maintains it.  Any volunteers?
> 
> 2. It gets labelled as Orphaned in MAINTAINERS.
> 
> 3. It gets moved to staging.
> 
> 4, It gets removed.
> 
> If no one says anything I'll submit a patch to Linus to mark it as Orphaned.

I'd say either 3 or 4.

It could go to staging on its way to removal, but that's not really necessary.


cc: gregkh


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Leandro Lucarella @ 2010-10-20 17:20 UTC (permalink / raw)
  To: Neil Horman
  Cc: jon.maloy, netdev, linux-kernel, tipc-discussion, David Miller
In-Reply-To: <20101019201841.GC14410@hmsreliant.think-freely.org>

Neil Horman, el 19 de octubre a las 16:18 me escribiste:
> 	Heres what I have so far.  Dave as a heads up please don't apply this
> yet.  I'd like to go over it a bit more and be sure of the implications here
> before I post it for inclusion officially.  I wanted Leandro to have a copy
> though so he could confirm functionality for us.  Leandro, This patch lets me
> pass the tipc test code for TIPC 1.6 that you posted earlier this morning.  If
> you could confirm that it works for you that would be grand.  While your doing
> that, I want to read over the spec for TIPC and make sure that I'm not breaking
> anything new with this patch.

I tried the patch (swapping the values of TIPC_SUB_SERVICE and
TIPC_SUB_PORTS) based on 2.6.35.4 and it didn't work. dmesg says:
NOT Swapping endianess in subscr_subscribe
NOT Swapping endianess in subscr_subscribe
TIPC: Subscription rejected, illegal request

I tried with a binary compiled with an older tipc.h header; I didn't
try to recompile it using the new tipc.h (on purpose, as the thing that
should be fixed is backwards compatibility).

I've read the TIPC 2.0 specification[1] a little more, and as I see, the
subscription messages are not supposed to go through the wire[2].

	8.  Topology Service

	TIPC provides a message-based mechanism for an application to
	learn about the port names that are visible to its node. This is
	achieved by communicating with a Topology Service that has
	knowledge of the contents of the node's name table.

So, if the idea is to comply with TIPC 2.0, the topology service should
accept the new TIPC_SUB_SERVICE and TIPC_SUB_PORTS values (0 and
1 in NBO respectively), and all the fields in the subscr struct should
be filled in NBO too.

However, if the idea is to keep backwards compatibility too, HBO should
be accepted as well as the old TIPC_SUB_SERVICE and TIPC_SUB_PORTS
values (2 and 1 in HBO respectively).

The real problem is, we can't figure out the endianness of the subscr
struct because 0x0 is a valid filter in TIPC 2.0.

The only solution I see is to change the TIPC 2.0 specification (which
is a "work-in-progress") to make the topology service use the port name
{2,2}, leaving {1,1} for backwards compatibility. Then add the constants
TIPC_SUB_SERVICE2 (0) and TIPC_TOP_SRV2 (2), or similar, to use the TIPC
2.0 interface and leave TIPC_SUB_SERVICE and TIPC_TOP_SRV for the TIPC
1.x interface.

Another option is to change the TIPC 2.0 specification to use the old
format (use HBO in subscriptions and keep TIPC_SUB_SERVICE as a separate
flag with value 2) and forget about all this. After all, I can't see
what advantage there is in having to change the BO for internal messages
between the applications and the stack.

[1] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html
[2] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html#anchor92

-- 
Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
----------------------------------------------------------------------
GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
----------------------------------------------------------------------
CARANCHO OBNUBILADO APARECE EN PARQUE CHACABUCO!
	-- Crónica TV

------------------------------------------------------------------------------
Nokia and AT&T present the 2010 Calling All Innovators-North America contest
Create new apps & games for the Nokia N8 for consumers in  U.S. and Canada
$10 million total in prizes - $4M cash, 500 devices, nearly $6M in marketing
Develop with Nokia Qt SDK, Web Runtime, or Java and Publish to Ovi Store 
http://p.sf.net/sfu/nokia-dev2dev
_______________________________________________
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion

^ permalink raw reply

* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Joe Buehler @ 2010-10-20 17:43 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1286905245.2703.3.camel@edumazet-laptop>

Eric Dumazet wrote:

> 2.6.27 is a bit old, you might try :
> 
> commit 7fa7cb7109d07c29ab28bb877bc7049a0150dbe5
> Author: Eric Dumazet <eric.dumazet@gmail.com>
> Date:   Mon Sep 27 04:18:27 2010 +0000

Alas, after more load testing I find that the panic still occurs:

CPU 1 Unable to handle kernel paging request at virtual address
0000000000000000, epc == ffffffff8146728c, ra == ffffffff81467258
Oops[#1]:
Cpu 1
$ 0   : 0000000000000000 0000000000000000 0000000000000000 0000000000000000
$ 4   : ffffffffffffffff a80000008c9d78f0 a80000009acd9880 000000000a205a7c
$ 8   : 0000000000000000 0000000000000020 a80000009a9c49d0 0000000000000000
$12   : ffffffff8155de00 0000000000000004 0000000000000001 0000000000000000
$16   : 0000000000000000 a80000008c9d78f0 0000000000000002 a80000009b94ed80
$20   : a80000009b94edf8 0000000000000000 0000000000000003 a80000008c9d78a0
$24   : 0000000000000000 ffffffff812df388
$28   : a80000008c9d4000 a80000008c9d7840 fffffffffffffff5 ffffffff81467258
Hi    : 0000000000000000
Lo    : 0000000000000000
epc   : ffffffff8146728c fib_rules_lookup+0x11c/0x260
    Not tainted
ra    : ffffffff81467258 fib_rules_lookup+0xe8/0x260
Status: 1010cce3    KX SX UX KERNEL EXL IE
Cause : 00800008
BadVA : 0000000000000000
PrId  : 000d0409 (Cavium Octeon)
Modules linked in: x_tables ip_tables iptable_filter nf_conntrack
nf_conntrack_ipv4 nf_nat iptable_nat tun xt_tcpudp xt_state ipt_REJECT
ipv6 ip6_tables ip6table_filter ip6t_ipv6header ip6t_REJECT
Process qscope7500 (pid: 1343, threadinfo=a80000008c9d4000,
task=a80000008c068ac0, tls=000000002d51e920)
Stack : ffffffffffffffff 0000000000000003 a80000008c9d78d8 a80000008c9d79d8
        a80000008c9d78f0 0000000000000000 ffffffff816c39c0 ffffffffffffffff
        0000000000000003 00000000000004a6 0000000000000000 ffffffff814bd2a4
        0000000000000000 a80000008c9d78d8 0000000000000000 ffffffffc001aa24
        a80000008c9d78d8 ffffffff81478a38 0000000000000003 0000000000000001
        0000000000000000 0000000000000000 0000000000000001 000000000a205a7c
        0a2059bf00000000 0000000000000000 0000000000000000 0000000000000000
        0000000000000000 0000000000000000 a80000008c9d79d0 0000000000000000
        a80000008c9d79d8 a8000000994ff380 0000000000000000 0000000000000000
        a80000008c9d79d0 ffffffff816c39c0 0000000000000003 00000000000004a6
        ...
Call Trace:
[<ffffffff8146728c>] fib_rules_lookup+0x11c/0x260
[<ffffffff814bd2a4>] fib_lookup+0x2c/0x48
[<ffffffff81478a38>] __ip_route_output_key+0x918/0xf38
[<ffffffff81479090>] ip_route_output_flow+0x38/0x2e8
[<ffffffff81482c44>] ip_queue_xmit+0x38c/0x3a8
[<ffffffff81497f5c>] tcp_transmit_skb+0x3f4/0x7d0
[<ffffffff8149af2c>] __tcp_push_pending_frames+0x1fc/0x9e0
[<ffffffff8148cf50>] tcp_sendmsg+0x900/0xe00
[<ffffffff81441f7c>] sock_aio_write+0x16c/0x190
[<ffffffff811d0424>] do_sync_write+0xbc/0x130
[<ffffffff811d10a8>] vfs_write+0x150/0x158
[<ffffffff811d124c>] sys_write+0x5c/0x118
[<ffffffff8114532c>] handle_sys+0x12c/0x148


Code: 0040282d  00000000  de100000 <de020000> cc400000  1614ffce
00000000  2405fffd  dfbf0058
Fatal exception: panic in 5 seconds
Kernel panic - not syncing: Fatal exception
Rebooting in 1 seconds..

I'll have a go at adding a spinlock.

Joe Buehler

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Jon Maloy @ 2010-10-20 17:57 UTC (permalink / raw)
  To: Leandro Lucarella, Neil Horman
  Cc: tipc-discussion@lists.sourceforge.net, David Miller,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20101020172018.GN8781@llucax.com.ar>

<...>
> subscr struct because 0x0 is a valid filter in TIPC 2.0.
> 

> 
> Another option is to change the TIPC 2.0 specification to use 
> the old format (use HBO in subscriptions and keep 
> TIPC_SUB_SERVICE as a separate flag with value 2) and forget 
> about all this. After all, I can't see what advantages gives 
> having to change the BO for internal messages between the 
> applications and the stack.

I agree with this. I have no problems with changing the draft 
(which as Leandro already noted is "work-in-progress") to specify that 
both HBO and NBO are permitted over the wire, and that it is the
topology server's task to keep track of which one is used.

Remember, permitting both is a superset of the current one (NBO only)
so it is fully backwards compatible. We break absolutely nothing by
permitting this. 


> 
> [1] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html
> [2] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html#anchor92
> 
> -- 
> Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
> ----------------------------------------------------------------------
> GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
> ----------------------------------------------------------------------
> CARANCHO OBNUBILADO APARECE EN PARQUE CHACABUCO!
> 	-- Crónica TV
> --
> To unsubscribe from this list: send the line "unsubscribe 
> netdev" in the body of a message to majordomo@vger.kernel.org 
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
------------------------------------------------------------------------------
Nokia and AT&T present the 2010 Calling All Innovators-North America contest
Create new apps & games for the Nokia N8 for consumers in  U.S. and Canada
$10 million total in prizes - $4M cash, 500 devices, nearly $6M in marketing
Develop with Nokia Qt SDK, Web Runtime, or Java and Publish to Ovi Store 
http://p.sf.net/sfu/nokia-dev2dev

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Neil Horman @ 2010-10-20 17:57 UTC (permalink / raw)
  To: Leandro Lucarella
  Cc: David Miller, paul.gortmaker, jon.maloy, netdev, linux-kernel,
	tipc-discussion
In-Reply-To: <20101020172018.GN8781@llucax.com.ar>

On Wed, Oct 20, 2010 at 02:20:18PM -0300, Leandro Lucarella wrote:
> Neil Horman, el 19 de octubre a las 16:18 me escribiste:
> > 	Heres what I have so far.  Dave as a heads up please don't apply this
> > yet.  I'd like to go over it a bit more and be sure of the implications here
> > before I post it for inclusion officially.  I wanted Leandro to have a copy
> > though so he could confirm functionality for us.  Leandro, This patch lets me
> > pass the tipc test code for TIPC 1.6 that you posted earlier this morning.  If
> > you could confirm that it works for you that would be grand.  While your doing
> > that, I want to read over the spec for TIPC and make sure that I'm not breaking
> > anything new with this patch.
> 
> I tried the patch (swapping the values of TIPC_SUB_SERVICE and
> TIPC_SUB_PORTS) based on 2.6.35.4 and it didn't worked. dmesg sais:
> NOT Swapping endianess in subscr_subscribe
> NOT Swapping endianess in subscr_subscribe
> TIPC: Subscription rejected, illegal request
That's odd, it worked fine for me.  I wonder if I had an older tipc.h header that
set the flags properly in user space.

> 
> I tried with a binary compiled with an older tipc.h header, I didn't
> tried to recompile it using the new tipc.h (on purpose as the thing that
> should be fixed is backwards compatibility).
> 
> I've read the TIPC 2.0 specification[1] a little more, and as I see, the
> subscription messages are not supposed to go through the wire[2].
> 
> 	8.  Topology Service
> 
> 	TIPC provides a message-based mechanism for an application to
> 	learn about the port names that are visible to its node. This is
> 	achieved by communicating with a Topology Service that has
> 	knowledge of the contents of the node's name table.
> 
> So, if the idea is to comply with TIPC 2.0, the topology service should
> accept the new TIPC_SUB_SERVICE and TIPC_SUB_PORTS values (0 and
> 1 in NBO respectively), and all the fields in the subscr struct should
> be filled in NBO too.
> 
> However, if the idea is to keep backwards compatibility too, HBO should
> be accepted as well as the old TIPC_SUB_SERVICE and TIPC_SUB_PORTS
> values (2 and 1 in HBO respectively).
> 
> The real problem is, we can't figure out the endianess of the subscr
> struct because 0x0 is a valid filter in TIPC 2.0.
> 
> The only solution I see is to change the TIPC 2.0 specification (which
> is a "work-in-progress") to make the topology service use the port name
> {2,2}, leaving {1,1} for backwards compatibility. Then add the constants
> TIPC_SUB_SERVICE2 (0) and TIPC_TOP_SRV2 (2), or similar, to use the TIPC
> 2.0 interface and leave TIPC_SUB_SERVICE and TIPC_TOP_SRV for the TIPC
> 1.x interface.
> 
> Another option is to change the TIPC 2.0 specification to use the old
> format (use HBO in subscriptions and keep TIPC_SUB_SERVICE as a separate
> flag with value 2) and forget about all this. After all, I can't see
> what advantages gives having to change the BO for internal messages
> between the applications and the stack.
> 
> [1] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html
> [2] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html#anchor92
> 
Ugh, the tipc 'spec' is just a mess (note that sections 2.4.2, 8.2.1, etc. also indicate that all
messages should be in network byte order).

What we should probably do is, for the time being, just revert my endian swap
commit, plus Paul's bit field change, to get us back to a point where we're no
longer breaking user space.  Then we can take our time to find a way to conform
to the spec in a backwards compatible manner.  I'll send patches to do that
shortly.

Neil

> -- 
> Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
> ----------------------------------------------------------------------
> GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
> ----------------------------------------------------------------------
> CARANCHO OBNUBILADO APARECE EN PARQUE CHACABUCO!
> 	-- Crónica TV
> 

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Neil Horman @ 2010-10-20 18:04 UTC (permalink / raw)
  To: Jon Maloy
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	Leandro Lucarella, tipc-discussion@lists.sourceforge.net,
	David Miller
In-Reply-To: <0434463FDA60A94FA978ACA44617682DEE84668199@EUSAACMS0702.eamcs.ericsson.se>

On Wed, Oct 20, 2010 at 01:57:06PM -0400, Jon Maloy wrote:
> <...>
> > subscr struct because 0x0 is a valid filter in TIPC 2.0.
> > 
> 
> > 
> > Another option is to change the TIPC 2.0 specification to use 
> > the old format (use HBO in subscriptions and keep 
> > TIPC_SUB_SERVICE as a separate flag with value 2) and forget 
> > about all this. After all, I can't see what advantages gives 
> > having to change the BO for internal messages between the 
> > applications and the stack.
> 
> I agree with this. I have no problems with changing the draft 
> (which as Leandro already noted is "work-in-progress") to specify that 
> both HBO and NBO are permitted over the wire, and that it is the
> topology server's task to keep track of which one is used.
> 
> Remember, permitting both is a superset of the current one (NBO only)
> so it is fully backwards compatible. We break absolutly nothing by 
> permitting this. 
> 
That's effectively reverting both our patches though, isn't it (not that I'm
disagreeing with it, just looking for clarification).  If we revert my patch and
reintroduce the htohl mechanism which tracks endianness, we might as well revert
the TIPC_SUB_SERVICE flag as well, yeah?

Neil

> 
> > 
> > [1] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html
> > [2] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html#anchor92
> > 
> > -- 
> > Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
> > ----------------------------------------------------------------------
> > GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
> > ----------------------------------------------------------------------
> > CARANCHO OBNUBILADO APARECE EN PARQUE CHACABUCO!
> > 	-- Crónica TV
> > --
> > To unsubscribe from this list: send the line "unsubscribe 
> > netdev" in the body of a message to majordomo@vger.kernel.org 
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

------------------------------------------------------------------------------
Nokia and AT&T present the 2010 Calling All Innovators-North America contest
Create new apps & games for the Nokia N8 for consumers in  U.S. and Canada
$10 million total in prizes - $4M cash, 500 devices, nearly $6M in marketing
Develop with Nokia Qt SDK, Web Runtime, or Java and Publish to Ovi Store 
http://p.sf.net/sfu/nokia-dev2dev

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Jon Maloy @ 2010-10-20 18:10 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	Leandro Lucarella, tipc-discussion@lists.sourceforge.net,
	David Miller
In-Reply-To: <20101020180454.GC14407@hmsreliant.think-freely.org>

<...>
> > 
> > Remember, permitting both is a superset of the current one 
> (NBO only) 
> > so it is fully backwards compatible. We break absolutly nothing by 
> > permitting this.
> > 
> Thats effectively reverting both our patches though, isn't it 
> (not that I'm disagreeing with it, just looking for 
> clarification).  If we revert my patch and reintroduce the 
> htohl mechanism which tracks endianess, we might as well 
> revert the TIPC_SUB_SERVICE flag as well, yeah?

Absolutely. I think it was a mistake to change that value.
But I don't think we need to reintroduce the htohl(). That
was just one way of doing it. If I understood your suggestion
from yesterday correctly you converted the whole message within
one if() clause, without any htohl(). I have no problem with
that approach.

///jon

> 
> Neil
> 
> > 
> > > 
> > > [1] http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html
> > > [2] 
> http://tipc.sourceforge.net/doc/draft-spec-tipc-06.html#anchor92
> > > 
> > > -- 
> > > Leandro Lucarella (AKA luca)                     
> http://llucax.com.ar/
> > > 
> --------------------------------------------------------------------
> > > -- GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E 
> BFB6 5F5A 
> > > 8D05)
> > > 
> --------------------------------------------------------------------
> > > -- CARANCHO OBNUBILADO APARECE EN PARQUE CHACABUCO!
> > > 	-- Crónica TV
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe 
> netdev" in 
> > > the body of a message to majordomo@vger.kernel.org More majordomo 
> > > info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > To unsubscribe from this list: send the line "unsubscribe 
> netdev" in 
> > the body of a message to majordomo@vger.kernel.org More 
> majordomo info 
> > at  http://vger.kernel.org/majordomo-info.html
> > 
> 
------------------------------------------------------------------------------
Nokia and AT&T present the 2010 Calling All Innovators-North America contest
Create new apps & games for the Nokia N8 for consumers in  U.S. and Canada
$10 million total in prizes - $4M cash, 500 devices, nearly $6M in marketing
Develop with Nokia Qt SDK, Web Runtime, or Java and Publish to Ovi Store 
http://p.sf.net/sfu/nokia-dev2dev

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Leandro Lucarella @ 2010-10-20 18:24 UTC (permalink / raw)
  To: Jon Maloy
  Cc: Neil Horman, David Miller, paul.gortmaker@windriver.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	tipc-discussion@lists.sourceforge.net
In-Reply-To: <0434463FDA60A94FA978ACA44617682DEE84668199@EUSAACMS0702.eamcs.ericsson.se>

Jon Maloy, el 20 de octubre a las 13:57 me escribiste:
> > Another option is to change the TIPC 2.0 specification to use 
> > the old format (use HBO in subscriptions and keep 
> > TIPC_SUB_SERVICE as a separate flag with value 2) and forget 
> > about all this. After all, I can't see what advantages gives 
> > having to change the BO for internal messages between the 
> > applications and the stack.
> 
> I agree with this. I have no problems with changing the draft 
> (which as Leandro already noted is "work-in-progress") to specify that 
> both HBO and NBO are permitted over the wire, and that it is the
> topology server's task to keep track of which one is used.

Just to try to understand better how things work, or are supposed to
work: do the subscription and event messages (and I mean the struct
tipc_subscr and tipc_event published in tipc.h) really go over the wire,
or are they only used for communication between the stack and the
application inside a node?

I think this is a crucial matter, since it determines whether the changes
cross only the kernel/userspace boundary or also the kernel/network
boundary.

> Remember, permitting both is a superset of the current one (NBO only)
> so it is fully backwards compatible. We break absolutly nothing by
> permitting this.

I think if they really go through the wire, it should be in NBO, and if
tipc_subscr and tipc_event are used only internally, we can still fix
the userspace messages when sending them through the wire.

In any case, I agree that the patches should be reverted and a solution
should be planned with more time and consensus.

Thanks.

-- 
Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
----------------------------------------------------------------------
GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
----------------------------------------------------------------------
The world's best known word is "okay"
The second most well-known word is "Coca-Cola"

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Leandro Lucarella @ 2010-10-20 18:28 UTC (permalink / raw)
  To: Jon Maloy
  Cc: Neil Horman, netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	tipc-discussion@lists.sourceforge.net, David Miller
In-Reply-To: <0434463FDA60A94FA978ACA44617682DEE846681B9@EUSAACMS0702.eamcs.ericsson.se>

Jon Maloy, el 20 de octubre a las 14:10 me escribiste:
> <...>
> > > 
> > > Remember, permitting both is a superset of the current one 
> > (NBO only) 
> > > so it is fully backwards compatible. We break absolutly nothing by 
> > > permitting this.
> > > 
> > Thats effectively reverting both our patches though, isn't it 
> > (not that I'm disagreeing with it, just looking for 
> > clarification).  If we revert my patch and reintroduce the 
> > htohl mechanism which tracks endianess, we might as well 
> > revert the TIPC_SUB_SERVICE flag as well, yeah?
> 
> Absolutely. I think it was a mistake to change that value.
> But I don't think we need to reintroduce the htohl(). That
> was just one way of doing it. If I understood your suggestion
> from yesterday correctly you converted the whole message within
> one if()clause, without any htohl(). I have have no problem with 
> that approach.

There is a difference between the two solutions: the htohl() version
tracked the need for swap as a struct subscription member (which was
used when sending back events). Neil's patch doesn't do that tracking.
I don't really know the implications of this, but maybe it would be
a wise idea to stay on the safe side and revert both patches for now.

-- 
Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
----------------------------------------------------------------------
GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
----------------------------------------------------------------------
It's not a lie, if you believe it.
	-- George Constanza

------------------------------------------------------------------------------
Nokia and AT&T present the 2010 Calling All Innovators-North America contest
Create new apps & games for the Nokia N8 for consumers in  U.S. and Canada
$10 million total in prizes - $4M cash, 500 devices, nearly $6M in marketing
Develop with Nokia Qt SDK, Web Runtime, or Java and Publish to Ovi Store 
http://p.sf.net/sfu/nokia-dev2dev

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Jon Maloy @ 2010-10-20 18:37 UTC (permalink / raw)
  To: Leandro Lucarella
  Cc: Neil Horman, netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	tipc-discussion@lists.sourceforge.net, David Miller
In-Reply-To: <20101020182411.GO8781@llucax.com.ar>

<...>

> 
> Just to try to understand better how things works, or are supposed to
> work: do the subscription and event messages (and I mean the 
> struct tipc_subscr and tipc_event published in tipc.h) really 
> go over the wire or are only used to communicate the stack to 
> the application inside a node?

Both. And, given TIPC's fundamental "location transparency" principle,
the sender (or receiver) at user level does not need to know the
difference.
For a TIPC user, all messages are "local", insofar as they stay within
the same cluster.

> 
> I think this is a crucial matter, since it defines if the 
> changes cross kernel/userspace boundaries only or it also 
> crosses the kernel/network boundaries.
> 
> > Remember, permitting both is a superset of the current one 
> (NBO only) 
> > so it is fully backwards compatible. We break absolutly nothing by 
> > permitting this.
> 
> I think if they really go through the wire, it should be in 
> NBO, and if tipc_subscr and tipc_event are used only 
> internally, we can still fix the userspace messages when 
> sending them through the wire.

There are plenty of protocols around not using NBO over the wire.
This is not a must. 

> 
> In any case, I agree that the patches should be reverted and 
> a solution should be planned with more time and consensus.
> 
> Thanks.
> 
> -- 
> Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
> ----------------------------------------------------------------------
> GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
> ----------------------------------------------------------------------
> The world's best known word is "okay"
> The second most well-known word is "Coca-Cola"
> 
------------------------------------------------------------------------------
Nokia and AT&T present the 2010 Calling All Innovators-North America contest
Create new apps & games for the Nokia N8 for consumers in  U.S. and Canada
$10 million total in prizes - $4M cash, 500 devices, nearly $6M in marketing
Develop with Nokia Qt SDK, Web Runtime, or Java and Publish to Ovi Store 
http://p.sf.net/sfu/nokia-dev2dev

^ permalink raw reply

* Re: Linux 2.6.35/TIPC 2.0 ABI breaking changes
From: Leandro Lucarella @ 2010-10-20 18:44 UTC (permalink / raw)
  To: Jon Maloy
  Cc: Neil Horman, netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	tipc-discussion@lists.sourceforge.net, David Miller
In-Reply-To: <0434463FDA60A94FA978ACA44617682DEE8466821E@EUSAACMS0702.eamcs.ericsson.se>

Jon Maloy, el 20 de octubre a las 14:37 me escribiste:
> > I think if they really go through the wire, it should be in 
> > NBO, and if tipc_subscr and tipc_event are used only 
> > internally, we can still fix the userspace messages when 
> > sending them through the wire.
> 
> There are plenty of protocols around not using NBO over the wire.
> This is not a must. 

Of course, but it is harder to sniff and debug if you don't have a fixed
BO, so, if it's easy to adjust transparently to userspace, I think it
could be worth the trouble.

But my main concern is backwards compatibility, everything else is
secondary :)

-- 
Leandro Lucarella (AKA luca)                     http://llucax.com.ar/
----------------------------------------------------------------------
GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145  104C 949E BFB6 5F5A 8D05)
----------------------------------------------------------------------
The Muppet show was banned from TV in Saudi Arabia
Because one of its stars was a pig

------------------------------------------------------------------------------
Nokia and AT&T present the 2010 Calling All Innovators-North America contest
Create new apps & games for the Nokia N8 for consumers in  U.S. and Canada
$10 million total in prizes - $4M cash, 500 devices, nearly $6M in marketing
Develop with Nokia Qt SDK, Web Runtime, or Java and Publish to Ovi Store 
http://p.sf.net/sfu/nokia-dev2dev

^ permalink raw reply

* RE: [PATCH] PCI: MSI: Remove unsafe and unnecessary hardware access
From: Tantilov, Emil S @ 2010-10-20 19:05 UTC (permalink / raw)
  To: Jesse Barnes, Emil S Tantilov
  Cc: Ben Hutchings, Michael Chan, Matthew Wilcox,
	linux-pci@vger.kernel.org, NetDev, Brandeburg, Jesse,
	Kirsher, Jeffrey T
In-Reply-To: <20101015130629.046d3357@jbarnes-desktop>

>-----Original Message-----
>From: Jesse Barnes [mailto:jbarnes@virtuousgeek.org]
>Sent: Friday, October 15, 2010 1:06 PM
>To: Emil S Tantilov
>Cc: Ben Hutchings; Michael Chan; Matthew Wilcox; linux-pci@vger.kernel.org;
>NetDev; Tantilov, Emil S; Brandeburg, Jesse; Kirsher, Jeffrey T
>Subject: Re: [PATCH] PCI: MSI: Remove unsafe and unnecessary hardware
>access
>
>On Fri, 15 Oct 2010 11:26:08 -0700
>Emil S Tantilov <emils.tantilov@gmail.com> wrote:
>
>> On Thu, Jun 17, 2010 at 12:16 PM, Ben Hutchings
>> <bhutchings@solarflare.com> wrote:
>> > During suspend on an SMP system, {read,write}_msi_msg_desc() may be
>> > called to mask and unmask interrupts on a device that is already in a
>> > reduced power state.  At this point memory-mapped registers including
>> > MSI-X tables are not accessible, and config space may not be fully
>> > functional either.
>> >
>> > While a device is in a reduced power state its interrupts are
>> > effectively masked and its MSI(-X) state will be restored when it is
>> > brought back to D0.  Therefore these functions can simply read and
>> > write msi_desc::msg for devices not in D0.
>> >
>> > Further, read_msi_msg_desc() should only ever be used to update a
>> > previously written message, so it can always read msi_desc::msg
>> > and never needs to touch the hardware.
>> >
>> > Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
>> > ---
>> > On Mon, 2010-06-14 at 18:13 -0700, Michael Chan wrote:
>> >> I'm debugging the bnx2 driver which doesn't work after suspend/resume
>if
>> >> it is running in MSI-X mode.  The problem is that during suspend, the
>> >> MSI-X vectors are disabled by the following sequence on x86:
>> >>
>> >> take_cpu_down() -> cpu_disable_common() -> fixup_irqs()
>> >>
>> >> The MSI-X address/data used to disable the vectors are remembered in
>the
>> >> above sequence. During resume, these address/data are then programmed
>> >> back to the device during pci_restore_state(), causing all the vectors
>> >> to remain disabled.
>> >
>> > That's not quite what I see.  What I see is that the message is read
>> > back from the table *after* the driver's suspend method has been
>called.
>> > At this point the device is already in D3 and memory-mapped registers
>> > are not accessible, so we get random bits as the message.  At least,
>> > that's what I see happening with the sfc driver.
>> >
>> >> Some drivers call free_irq() during suspend and request_irq() during
>> >> resume, and that should avoid the problem.  bnx2 and some other
>drivers
>> >> do not do that.  These drivers rely on pci_restore_state() to restore
>> >> the MSI-X vectors to the same working state before suspend.
>> >>
>> >> What's the right way to fix this?  Thanks.
>> >
>> > This is my attempt, which works for sfc.  See if it works for bnx2.
>> >
>> > Ben.
>> >
>> >  drivers/pci/msi.c |   34 +++++++++++-----------------------
>> >  1 files changed, 11 insertions(+), 23 deletions(-)
>> >
>> > diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
>> > index 77b68ea..03f04dc 100644
>> > --- a/drivers/pci/msi.c
>> > +++ b/drivers/pci/msi.c
>> > @@ -196,30 +196,15 @@ void unmask_msi_irq(unsigned int irq)
>> >  void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
>> >  {
>> >        struct msi_desc *entry = get_irq_desc_msi(desc);
>> > -       if (entry->msi_attrib.is_msix) {
>> > -               void __iomem *base = entry->mask_base +
>> > -                       entry->msi_attrib.entry_nr *
>PCI_MSIX_ENTRY_SIZE;
>> >
>> > -               msg->address_lo = readl(base +
>PCI_MSIX_ENTRY_LOWER_ADDR);
>> > -               msg->address_hi = readl(base +
>PCI_MSIX_ENTRY_UPPER_ADDR);
>> > -               msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
>> > -       } else {
>> > -               struct pci_dev *dev = entry->dev;
>> > -               int pos = entry->msi_attrib.pos;
>> > -               u16 data;
>> > +       /* We do not touch the hardware (which may not even be
>> > +        * accessible at the moment) but return the last message
>> > +        * written.  Assert that this is valid, assuming that
>> > +        * valid messages are not all-zeroes. */
>> > +       BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
>> > +                entry->msg.data));
>> >
>> > -               pci_read_config_dword(dev, msi_lower_address_reg(pos),
>> > -                                       &msg->address_lo);
>> > -               if (entry->msi_attrib.is_64) {
>> > -                       pci_read_config_dword(dev,
>msi_upper_address_reg(pos),
>> > -                                               &msg->address_hi);
>> > -                       pci_read_config_word(dev, msi_data_reg(pos, 1),
>&data);
>> > -               } else {
>> > -                       msg->address_hi = 0;
>> > -                       pci_read_config_word(dev, msi_data_reg(pos, 0),
>&data);
>> > -               }
>> > -               msg->data = data;
>> > -       }
>> > +       *msg = entry->msg;
>> >  }
>> >
>> >  void read_msi_msg(unsigned int irq, struct msi_msg *msg)
>> > @@ -232,7 +217,10 @@ void read_msi_msg(unsigned int irq, struct msi_msg
>*msg)
>> >  void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
>> >  {
>> >        struct msi_desc *entry = get_irq_desc_msi(desc);
>> > -       if (entry->msi_attrib.is_msix) {
>> > +
>> > +       if (entry->dev->current_state != PCI_D0) {
>>
>> This check exposed a problem in ixgb (patch is on the way) where
>> pci_disable_device() was not being called in ixgb_remove(). As a
>> result the current_state was set to PCI_UNKNOWN and the interface
>> failed to work on subsequent load of the driver.
>>
>> Even though the problem was in ixgb, it made me wonder about this
>> check as the presumption here (low power state) may not always be
>> true. Like in the case of unloading a driver, which sets
>> dev->current_state to PCI_UNKNOWN which is not a representation of the
>> _real_ state of the device (actual state could be D0).
>>
>> BTW - quick search shows other drivers that could potentially suffer
>> the faith of ixgb due to lack of pci_disable_device() call on removal.
>
>Yeah we just ran into this in the DRM layer as well; which does a
>pci_enable_device but never calls _disable, so we're stuck with
>potentially stale state.
>
>I came up with the below to address that, but really I don't like the
>idea of nested pci_enable_device() calls at all.  But I haven't looked
>at the latest Wireless USB stuff to see if those drivers still rely on
>it.
>
>--
>Jesse Barnes, Intel Open Source Technology Center
>
>diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>index 7fa3cbd..37facc1 100644
>--- a/drivers/pci/pci.c
>+++ b/drivers/pci/pci.c
>@@ -994,6 +994,18 @@ static int __pci_enable_device_flags(struct pci_dev
>*dev,
> 	int err;
> 	int i, bars = 0;
>
>+	/*
>+	 * Power state could be unknown at this point, either due to a fresh
>+	 * boot or a device removal call.  So get the current power state
>+	 * so that things like MSI message writing will behave as expected
>+	 * (e.g. if the device really is in D0 at enable time).
>+	 */
>+	if (dev->pm_cap) {
>+		u16 pmcsr;
>+		pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
>+		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
>+	}
>+
> 	if (atomic_add_return(1, &dev->enable_cnt) > 1)
> 		return 0;		/* already enabled */
>

With this patch applied I could reload the driver, and I confirmed that current_state is set to the actual power state.

Thanks,
Emil


^ permalink raw reply

* Re: Future of the Wimedia LLC Protocol (WLP) subsystem/drivers
From: Greg KH @ 2010-10-20 19:22 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: David Vrabel, netdev
In-Reply-To: <20101020091541.ec00fe96.randy.dunlap@oracle.com>

On Wed, Oct 20, 2010 at 09:15:41AM -0700, Randy Dunlap wrote:
> On Tue, 19 Oct 2010 17:30:47 +0100 David Vrabel wrote:
> 
> > Hi,
> > 
> > I've have been nominally the maintainer of the Wimedia LLC Protocol
> > (WLP) subsystem and driver since it was originally submitted.  I am no
> > longer in a position to even pretend to be a maintainer.
> > 
> > The only usable hardware was an Intel i1480 devices with beta firmware
> > that was never released as a product.  Intel have since sold all there
> > UWB/WLP IP and I see little prospect of there ever being hardware
> > commercially available for WLP.
> > 
> > Here are a number of options:
> > 
> > 1. Someone else maintains it.  Any volunteers?
> > 
> > 2. It gets labelled as Orphaned in MAINTAINERS.
> > 
> > 3. It gets moved to staging.
> > 
> > 4, It gets removed.
> > 
> > If no one says anything I'll submit a patch to Linus to mark it as Orphaned.
> 
> I'd say either 3 or 4.
> 
> It could go to staging on it way to removal, but that's not really necessary.
> 
> 
> cc: gregkh

3 or 4 is fine with me, whichever David wants.

thanks,

greg k-h

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox