Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 1/2 v2] af-packet:  Use existing netdev reference for bound sockets.
From: Ben Greear @ 2011-05-27  4:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1306467745.2543.60.camel@edumazet-laptop>

On 05/26/2011 08:42 PM, Eric Dumazet wrote:
> Le jeudi 26 mai 2011 à 16:55 -0700, greearb@candelatech.com a écrit :

>>   out_free:
>>   	kfree_skb(skb);
>>   out_unlock:
>> -	if (dev)
>> +	if (dev&&  need_rls_dev)
>>   		dev_put(dev);
>>   out:
>>   	return err;
>
> Hmmm, I wonder why you want this Ben.
>
> IMHO this is buggy, because we can sleep in this function.
>
> We must take a ref on device (its really cheap these days, now we have a
> percpu device refcnt)

Why must you take the reference?  And if we must, why isn't the
current code that assigns the prot_hook.dev without taking a
reference OK?

It seems a waste to do the lookup and free if we don't have to,
and with thousands of devices, the lookup might take a reasonable
amount of effort?

Thanks,
Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: [PATCH 2/2 v2] af-packet:  Add flag to distinguish VID 0 from no-vlan.
From: Eric Dumazet @ 2011-05-27  3:46 UTC (permalink / raw)
  To: greearb; +Cc: netdev
In-Reply-To: <1306454141-1634-2-git-send-email-greearb@candelatech.com>

Le jeudi 26 mai 2011 à 16:55 -0700, greearb@candelatech.com a écrit :
> From: Ben Greear <greearb@candelatech.com>
> 
> Currently, user-space cannot determine if a 0 tcp_vlan_tci
> means there is no VLAN tag or the VLAN ID was zero.
> 
> Add flag to make this explicit.  User-space can check for
> TP_STATUS_VLAN_VALID || tp_vlan_tci > 0, which will be backwards
> compatible. Older could would have just checked for tp_vlan_tci,
> so it will work no worse than before.
> 
> Signed-off-by: Ben Greear <greearb@candelatech.com>
> ---
> v2:  add logic to tpacket_rcv too.
> 
> :100644 100644 72bfa5a... 6d66ce1... M	include/linux/if_packet.h
> :100644 100644 f9b807d... 9c2d068... M	net/packet/af_packet.c
>  include/linux/if_packet.h |    1 +
>  net/packet/af_packet.c    |   14 ++++++++++++--
>  2 files changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> index 72bfa5a..6d66ce1 100644
> --- a/include/linux/if_packet.h
> +++ b/include/linux/if_packet.h
> @@ -70,6 +70,7 @@ struct tpacket_auxdata {
>  #define TP_STATUS_COPY		0x2
>  #define TP_STATUS_LOSING	0x4
>  #define TP_STATUS_CSUMNOTREADY	0x8
> +#define TP_STATUS_VLAN_VALID   0x10 /* auxdata has valid tp_vlan_tci */
>  
>  /* Tx ring - header status */
>  #define TP_STATUS_AVAILABLE	0x0
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index f9b807d..9c2d068 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -818,7 +818,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>  			getnstimeofday(&ts);
>  		h.h2->tp_sec = ts.tv_sec;
>  		h.h2->tp_nsec = ts.tv_nsec;
> -		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
> +		if (vlan_tx_tag_present(skb)) {
> +			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
> +			status |= TP_STATUS_VLAN_VALID;
> +		}
> +		else
> +			h.h2->tp_vlan_tci = 0;

	if (...) {
		...
	} else {
		...
	}

>  		hdrlen = sizeof(*h.h2);
>  		break;
>  	default:
> @@ -1763,7 +1768,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
>  		aux.tp_snaplen = skb->len;
>  		aux.tp_mac = 0;
>  		aux.tp_net = skb_network_offset(skb);
> -		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
> +		if (vlan_tx_tag_present(skb)) {
> +			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
> +			aux.tp_status |= TP_STATUS_VLAN_VALID;
> +		}
> +		else
> +			aux.tp_vlan_tci = 0;

Same codingstyle comment here.

Reviewed-by: Eric Dumazet <eric.dumazet@gmail.com>

Thanks



^ permalink raw reply

* Re: [PATCH 1/2 v2] af-packet:  Use existing netdev reference for bound sockets.
From: Eric Dumazet @ 2011-05-27  3:42 UTC (permalink / raw)
  To: greearb; +Cc: netdev
In-Reply-To: <1306454141-1634-1-git-send-email-greearb@candelatech.com>

Le jeudi 26 mai 2011 à 16:55 -0700, greearb@candelatech.com a écrit :
> From: Ben Greear <greearb@candelatech.com>
> 
> This saves a network device lookup on each packet transmitted,
> for sockets that are bound to a network device.
> 
> Signed-off-by: Ben Greear <greearb@candelatech.com>
> ---
> 
> v2:  Remove un-used ifindex, lookup dev in else branch.
> 
> :100644 100644 4005b24... f9b807d... M	net/packet/af_packet.c
>  net/packet/af_packet.c |   27 +++++++++++++++------------
>  1 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 4005b24..f9b807d 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -989,7 +989,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>  	struct sk_buff *skb;
>  	struct net_device *dev;
>  	__be16 proto;
> -	int ifindex, err, reserve = 0;
> +	bool need_rls_dev = false;
> +	int err, reserve = 0;
>  	void *ph;
>  	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
>  	int tp_len, size_max;
> @@ -1001,7 +1002,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>  
>  	err = -EBUSY;
>  	if (saddr == NULL) {
> -		ifindex	= po->ifindex;
> +		dev = po->prot_hook.dev;
>  		proto	= po->num;
>  		addr	= NULL;
>  	} else {
> @@ -1012,12 +1013,12 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>  					+ offsetof(struct sockaddr_ll,
>  						sll_addr)))
>  			goto out;
> -		ifindex	= saddr->sll_ifindex;
>  		proto	= saddr->sll_protocol;
>  		addr	= saddr->sll_addr;
> +		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
> +		need_rls_dev = true;
>  	}
>  
> -	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
>  	err = -ENXIO;
>  	if (unlikely(dev == NULL))
>  		goto out;
> @@ -1103,7 +1104,8 @@ out_status:
>  	__packet_set_status(po, ph, status);
>  	kfree_skb(skb);
>  out_put:
> -	dev_put(dev);
> +	if (need_rls_dev)
> +		dev_put(dev);
>  out:
>  	mutex_unlock(&po->pg_vec_lock);
>  	return err;
> @@ -1141,8 +1143,9 @@ static int packet_snd(struct socket *sock,
>  	struct sk_buff *skb;
>  	struct net_device *dev;
>  	__be16 proto;
> +	bool need_rls_dev = false;
>  	unsigned char *addr;
> -	int ifindex, err, reserve = 0;
> +	int err, reserve = 0;
>  	struct virtio_net_hdr vnet_hdr = { 0 };
>  	int offset = 0;
>  	int vnet_hdr_len;
> @@ -1160,7 +1163,7 @@ static int packet_snd(struct socket *sock,
>  	 */
>  
>  	if (saddr == NULL) {
> -		ifindex	= po->ifindex;
> +		dev = po->prot_hook.dev;
>  		proto	= po->num;
>  		addr	= NULL;
>  	} else {
> @@ -1169,13 +1172,12 @@ static int packet_snd(struct socket *sock,
>  			goto out;
>  		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
>  			goto out;
> -		ifindex	= saddr->sll_ifindex;
>  		proto	= saddr->sll_protocol;
>  		addr	= saddr->sll_addr;
> +		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
> +		need_rls_dev = true;
>  	}
>  
> -
> -	dev = dev_get_by_index(sock_net(sk), ifindex);
>  	err = -ENXIO;
>  	if (dev == NULL)
>  		goto out_unlock;
> @@ -1315,14 +1317,15 @@ static int packet_snd(struct socket *sock,
>  	if (err > 0 && (err = net_xmit_errno(err)) != 0)
>  		goto out_unlock;
>  
> -	dev_put(dev);
> +	if (need_rls_dev)
> +		dev_put(dev);
>  
>  	return len;
>  
>  out_free:
>  	kfree_skb(skb);
>  out_unlock:
> -	if (dev)
> +	if (dev && need_rls_dev)
>  		dev_put(dev);
>  out:
>  	return err;

Hmmm, I wonder why you want this Ben.

IMHO this is buggy, because we can sleep in this function.

We must take a ref on device (its really cheap these days, now we have a
percpu device refcnt)




^ permalink raw reply

* Re: Kernel crash after using new Intel NIC (igb)
From: Eric Dumazet @ 2011-05-27  3:27 UTC (permalink / raw)
  To: Arun Sharma, David Miller
  Cc: Maximilian Engelhardt, linux-kernel, netdev, StuStaNet Vorstand,
	Yann Dupont, Denys Fedoryshchenko
In-Reply-To: <4DDEEBC5.80804@fb.com>

Le jeudi 26 mai 2011 à 17:09 -0700, Arun Sharma a écrit :
> On 5/26/11 3:01 PM, Eric Dumazet wrote:
> 
> 
> >> Yeah - using the refcnt seems better than list_empty(), but I'm not sure
> >> that your patch addresses the race above.
> >
> > It does.
> 
> True. I can't find any holes in this method and it resolves the "failure 
> to unlink from unused" case.
> 
> Perhaps wrap the while(1) loop into its own primitive in atomic.h or use 
> an existing primitive?
> 

Sure, here is a formal submission I cooked.

Thanks

[PATCH] inetpeer: fix race in unused_list manipulations

Several crashes in cleanup_once() were reported in recent kernels.

Commit d6cc1d642de9 (inetpeer: various changes) added a race in
unlink_from_unused().

One way to avoid taking unused_peers.lock before doing the list_empty()
test is to catch 0->1 refcnt transitions, using full barrier atomic
operations variants (atomic_cmpxchg() and atomic_inc_return()) instead
of previous atomic_inc() and atomic_add_unless() variants.

We then call unlink_from_unused() only for the owner of the 0->1
transition.

Add a new atomic_add_unless_return() static helper

With help from Arun Sharma.

Refs: https://bugzilla.kernel.org/show_bug.cgi?id=32772

Reported-by: Arun Sharma <asharma@fb.com>
Reported-by: Maximilian Engelhardt <maxi@daemonizer.de>
Reported-by: Yann Dupont <Yann.Dupont@univ-nantes.fr>
Reported-by: Denys Fedoryshchenko <denys@visp.net.lb>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/inetpeer.c |   42 +++++++++++++++++++++++++++---------------
 1 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9df4e63..ce616d9 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -154,11 +154,9 @@ void __init inet_initpeers(void)
 /* Called with or without local BH being disabled. */
 static void unlink_from_unused(struct inet_peer *p)
 {
-	if (!list_empty(&p->unused)) {
-		spin_lock_bh(&unused_peers.lock);
-		list_del_init(&p->unused);
-		spin_unlock_bh(&unused_peers.lock);
-	}
+	spin_lock_bh(&unused_peers.lock);
+	list_del_init(&p->unused);
+	spin_unlock_bh(&unused_peers.lock);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
@@ -205,6 +203,20 @@ static int addr_compare(const struct inetpeer_addr *a,
 	u;							\
 })
 
+static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
+{
+	int cur, old = atomic_read(ptr);
+
+	while (old != u) {
+		*newv = old + a;
+		cur = atomic_cmpxchg(ptr, old, *newv);
+		if (cur == old)
+			return true;
+		old = cur;
+	}
+	return false;
+}
+
 /*
  * Called with rcu_read_lock()
  * Because we hold no lock against a writer, its quite possible we fall
@@ -213,7 +225,8 @@ static int addr_compare(const struct inetpeer_addr *a,
  * We exit from this function if number of links exceeds PEER_MAXDEPTH
  */
 static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
-				    struct inet_peer_base *base)
+				    struct inet_peer_base *base,
+				    int *newrefcnt)
 {
 	struct inet_peer *u = rcu_dereference(base->root);
 	int count = 0;
@@ -226,7 +239,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
 			 * distinction between an unused entry (refcnt=0) and
 			 * a freed one.
 			 */
-			if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
+			if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
 				u = NULL;
 			return u;
 		}
@@ -465,22 +478,23 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 	struct inet_peer_base *base = family_to_base(daddr->family);
 	struct inet_peer *p;
 	unsigned int sequence;
-	int invalidated;
+	int invalidated, newrefcnt = 0;
 
 	/* Look up for the address quickly, lockless.
 	 * Because of a concurrent writer, we might not find an existing entry.
 	 */
 	rcu_read_lock();
 	sequence = read_seqbegin(&base->lock);
-	p = lookup_rcu(daddr, base);
+	p = lookup_rcu(daddr, base, &newrefcnt);
 	invalidated = read_seqretry(&base->lock, sequence);
 	rcu_read_unlock();
 
 	if (p) {
-		/* The existing node has been found.
+found:		/* The existing node has been found.
 		 * Remove the entry from unused list if it was there.
 		 */
-		unlink_from_unused(p);
+		if (newrefcnt == 1)
+			unlink_from_unused(p);
 		return p;
 	}
 
@@ -494,11 +508,9 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 	write_seqlock_bh(&base->lock);
 	p = lookup(daddr, stack, base);
 	if (p != peer_avl_empty) {
-		atomic_inc(&p->refcnt);
+		newrefcnt = atomic_inc_return(&p->refcnt);
 		write_sequnlock_bh(&base->lock);
-		/* Remove the entry from unused list if it was there. */
-		unlink_from_unused(p);
-		return p;
+		goto found;
 	}
 	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
 	if (p) {

^ permalink raw reply related

* Re: [patch 1/1] net: convert %p usage to %pK
From: Eric Dumazet @ 2011-05-27  3:10 UTC (permalink / raw)
  To: David Miller
  Cc: kees.cook, joe, mingo, akpm, netdev, drosenberg, a.p.zijlstra,
	eparis, eugeneteo, jmorris, tgraf
In-Reply-To: <20110526.224427.433160775431725543.davem@davemloft.net>

Le jeudi 26 mai 2011 à 22:44 -0400, David Miller a écrit :
> From: Kees Cook <kees.cook@canonical.com>
> Date: Thu, 26 May 2011 17:14:49 -0700
> > 
> > We got this dropped from the /proc view; why can't we do the same for
> > this netlink interface?
> 
> Because it's not only an opaque "output" blob, it's also an input key
> for lookups which the user can trigger.

Yes, we wan add a layer to obfuscate the real pointers. We dont trust
values given by user, only match them.

Either we use a XOR with a boot time random value (but let the NULL
cookie being the NULL one), or we generate an unique 64bit socket id for
the cookie (and keep a 64bit cookie in all sockets, increasing ram
usage)




^ permalink raw reply

* RE: [PATCH 2/6] unicore32: add pkunity-v3 mac/net driver (umal)
From: Guan Xuetao @ 2011-05-27  3:08 UTC (permalink / raw)
  To: 'Arnd Bergmann'; +Cc: linux-kernel, linux-arch, greg, netdev
In-Reply-To: <201105261443.06896.arnd@arndb.de>



> -----Original Message-----
> From: Arnd Bergmann [mailto:arnd@arndb.de]
> Sent: Thursday, May 26, 2011 8:43 PM
> To: GuanXuetao
> Cc: linux-kernel@vger.kernel.org; linux-arch@vger.kernel.org; greg@kroah.com; netdev@vger.kernel.org
> Subject: Re: [PATCH 2/6] unicore32: add pkunity-v3 mac/net driver (umal)
> 
> On Thursday 26 May 2011, GuanXuetao wrote:
> > From: Guan Xuetao <gxt@mprc.pku.edu.cn>
> >
> > Signed-off-by: Guan Xuetao <gxt@mprc.pku.edu.cn>
> 
> No objections to this driver, just remember that network drivers should
> go through the netdev tree, and if it wasn't in there before the merge
> window, you missed your chance for this time.
Yes.
I hope someone could help me review the network driver.
Perhaps it's  late for adding this driver during this merge window time.

Regards.
Guan Xuetao

^ permalink raw reply

* Re: [patch 1/1] net: convert %p usage to %pK
From: David Miller @ 2011-05-27  2:44 UTC (permalink / raw)
  To: kees.cook
  Cc: eric.dumazet, joe, mingo, akpm, netdev, drosenberg, a.p.zijlstra,
	eparis, eugeneteo, jmorris, tgraf
In-Reply-To: <20110527001449.GS19633@outflux.net>

From: Kees Cook <kees.cook@canonical.com>
Date: Thu, 26 May 2011 17:14:49 -0700

> On Wed, May 25, 2011 at 09:50:40PM -0400, David Miller wrote:
>> From: Kees Cook <kees.cook@canonical.com>
>> Date: Wed, 25 May 2011 16:29:21 -0700
>> 
>> > Hi David,
>> > 
>> > On Tue, May 24, 2011 at 03:58:01AM -0400, David Miller wrote:
>> >> From: Eric Dumazet <eric.dumazet@gmail.com>
>> >> Date: Tue, 24 May 2011 09:45:01 +0200
>> >> 
>> >> > Le mardi 24 mai 2011 à 00:35 -0700, Joe Perches a écrit :
>> >> > 
>> >> >> I think it's be better without the casts
>> >> >> using the standard kernel.h macros.
>> >> >> 
>> >> >> 	void *ptr;
>> >> >> 
>> >> >> 	ptr = maybe_hide_ptr(sk);
>> >> >> 	r->id.idiag_cookie[0] = lower_32_bits(ptr);
>> >> >> 	r->id.idiag_cookie[1] = upper_32_bits(ptr);
>> >> >> 
>> >> > 
>> >> > I am not sure I want to patch lower_32_bits() and upper_32_bits() for
>> >> > this.
>> >> > 
>> >> > They dont work on pointers, but on "numbers", according to kerneldoc
>> >> > Andrew wrote years ago. gcc agrees :
>> >> > 
>> >> > net/ipv4/inet_diag.c: In function ‘inet_csk_diag_fill’:
>> >> > net/ipv4/inet_diag.c:119: warning: cast from pointer to integer of different size
>> >> > net/ipv4/inet_diag.c:120: error: invalid operands to binary >>
>> >> > make[1]: *** [net/ipv4/inet_diag.o] Error 1
>> >> 
>> >> Also you can't do this, the "cookie" is used by the kernel future
>> >> lookups to find sockets.
>> >> 
>> >> The kernel pointer is part of the API, so sorry you can't "hide"
>> >> kernel pointers in this case without really breaking user visible
>> >> things.
>> > 
>> > But this is precisely what we're trying to control with kptr_restrict.
>> > Setting kptr_restrict will make inet_diag (and some details of similar
>> > things in /proc) meaningless. Based on the name, "diag" isn't going to be
>> > used in normal operation, and kptr_restrict is 0 by default, so only system
>> > owners interested in this will enable it and effectively disable inet_diag.
>> 
>> Are you kidding me?
>> 
>> inet_diag is the standard way to dump sockets using netlink.
>> It's not a special obscure debugging facility, it's for real
>> users.
>> 
>> And the encoded kernel pointer here is used as a shortcut to looking
>> up precise sockets.
> 
> We got this dropped from the /proc view; why can't we do the same for
> this netlink interface?

Because it's not only an opaque "output" blob, it's also an input key
for lookups which the user can trigger.

^ permalink raw reply

* Re: [PATCH 3/3] net: Kill ratelimit.h dependency in linux/net.h
From: David Miller @ 2011-05-27  2:40 UTC (permalink / raw)
  To: joe; +Cc: netdev, mingo
In-Reply-To: <1306445044.25717.0.camel@Joe-Laptop>

From: Joe Perches <joe@perches.com>
Date: Thu, 26 May 2011 14:24:04 -0700

> On Thu, 2011-05-26 at 16:43 -0400, David Miller wrote:
>> Ingo Molnar noticed that we have this unnecessary ratelimit.h
>> dependency in linux/net.h, which hid compilation problems from
>> people doing builds only with CONFIG_NET enabled.
>> 
>> Move this stuff out to a seperate net/net_ratelimit.h file and
>> include that in the only two places where this thing is needed.
> 
> git add net/net_ratelimit.h?

Good catch :-)  It's contents are essentially what were being
deleted from linux/net.h :-)

^ permalink raw reply

* Re: linux-next: Tree for May 26 (drivers/net/caif)
From: Stephen Rothwell @ 2011-05-27  1:01 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: netdev, linux-next, LKML, Sjur Braendeland
In-Reply-To: <20110526120702.fac24de0.randy.dunlap@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 615 bytes --]

Hi Randy,

On Thu, 26 May 2011 12:07:02 -0700 Randy Dunlap <randy.dunlap@oracle.com> wrote:
>
> drivers/net/caif/caif_serial.c:194: warning: 'return' with no value, in function returning non-void
> drivers/net/caif/caif_serial.c:202: warning: 'return' with no value, in function returning non-void
> 
> I'm curious:  how do warnings like this get overlooked?
> too much noise in the build messages?  or it wasn't overlooked, just ignored?

For me, too much noise :-(  I catch some, but not all.

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* Re: [PATCH] iproute2: Add processless network namespace support.
From: Eric W. Biederman @ 2011-05-27  0:45 UTC (permalink / raw)
  To: Chris Friesen; +Cc: shemminger, netdev, Linux Containers
In-Reply-To: <4DDEC793.8040502@genband.com>

Chris Friesen <chris.friesen@genband.com> writes:

> On 05/26/2011 02:58 PM, Eric W. Biederman wrote:
>>
>> The goal of this code change is to implement a mechanism such that it is
>> simple to work with a kernel that is using multiple network namespaces
>> at once.
>>
>> This comes in handy for interacting with vpns where there may be rfc1918
>> address overlaps, and different policies default routes, name servers
>> and the like.
>>
>> Configuration specific to a network namespace that would ordinarily be
>> stored under /etc/ is stored under /etc/netns/<name>.  For example if
>> the dns server configuration is different for your vpn you would create
>> a file /etc/netns/myvpn/resolv.conf.
>
> That would be interesting.  Currently I use dnsmasq to keep my DNS servers
> straight when connecting in to two different corporate VPNs simultaneously.
>
> I could see it being interesting trying to remember which shell sessions were
> running in which network namespace...

Keeping shells straight is keeping shells straight, and I haven't had
an particular trouble with that.  I can always look at ifconfig or
/etc/resolv.conf if I am wondering which namespace a shell is in.

Now that I think about it there is some ongoing work to test two
network namespaces for identity and as that matures it probably
makes sense to extend this code to add a "ip netns id" command.

For me there has been real benefit in not having to deal with
conflicting default routes, conflicting ip addresses, or conflicting
hostnames (assuming you put all of the right domain names in your search
path), and I don't have to worry about accidentally bridging traffic.

In general I think it is simpler model to get right than natting remote
networks into my own network, using multiple ip tables so I can cope
with multiple default routes, and setting dnsmasq, and a long search
path in resolv.conf so I can resolve host names that are not fully
qualified that might come over the vpn.  Not that there aren't
benefits to doing all of that.

In practice after I had ip patched all it took was a longish openvpn
up-script (an afternoons hack) (so I could move the tap device to the
new network namespace and intercept all of the ifconfig or ip commands
that openvpn would normally run and run them instead in the network
namespace).

Eric

^ permalink raw reply

* Re: ip fragments sent in reverse order?
From: Changli Gao @ 2011-05-27  0:20 UTC (permalink / raw)
  To: Chris Friesen; +Cc: netdev
In-Reply-To: <4DDE5BF9.6090508@genband.com>

On Thu, May 26, 2011 at 9:56 PM, Chris Friesen
<chris.friesen@genband.com> wrote:
>
> I read that Linux intentionally sends IP fragments in reverse order to allow
> for some optimization on the receive side.  I'm trying to find this code,
> but what I see in ip_fragment() seems to be creating/sending the fragments
> in order.  Has this changed over the years?
>

Yes. Now the fragments are sent in order.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [patch 1/1] net: convert %p usage to %pK
From: Kees Cook @ 2011-05-27  0:14 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, joe, mingo, akpm, netdev, drosenberg, a.p.zijlstra,
	eparis, eugeneteo, jmorris, tgraf
In-Reply-To: <20110525.215040.47969627064310792.davem@davemloft.net>

On Wed, May 25, 2011 at 09:50:40PM -0400, David Miller wrote:
> From: Kees Cook <kees.cook@canonical.com>
> Date: Wed, 25 May 2011 16:29:21 -0700
> 
> > Hi David,
> > 
> > On Tue, May 24, 2011 at 03:58:01AM -0400, David Miller wrote:
> >> From: Eric Dumazet <eric.dumazet@gmail.com>
> >> Date: Tue, 24 May 2011 09:45:01 +0200
> >> 
> >> > Le mardi 24 mai 2011 à 00:35 -0700, Joe Perches a écrit :
> >> > 
> >> >> I think it's be better without the casts
> >> >> using the standard kernel.h macros.
> >> >> 
> >> >> 	void *ptr;
> >> >> 
> >> >> 	ptr = maybe_hide_ptr(sk);
> >> >> 	r->id.idiag_cookie[0] = lower_32_bits(ptr);
> >> >> 	r->id.idiag_cookie[1] = upper_32_bits(ptr);
> >> >> 
> >> > 
> >> > I am not sure I want to patch lower_32_bits() and upper_32_bits() for
> >> > this.
> >> > 
> >> > They dont work on pointers, but on "numbers", according to kerneldoc
> >> > Andrew wrote years ago. gcc agrees :
> >> > 
> >> > net/ipv4/inet_diag.c: In function ‘inet_csk_diag_fill’:
> >> > net/ipv4/inet_diag.c:119: warning: cast from pointer to integer of different size
> >> > net/ipv4/inet_diag.c:120: error: invalid operands to binary >>
> >> > make[1]: *** [net/ipv4/inet_diag.o] Error 1
> >> 
> >> Also you can't do this, the "cookie" is used by the kernel future
> >> lookups to find sockets.
> >> 
> >> The kernel pointer is part of the API, so sorry you can't "hide"
> >> kernel pointers in this case without really breaking user visible
> >> things.
> > 
> > But this is precisely what we're trying to control with kptr_restrict.
> > Setting kptr_restrict will make inet_diag (and some details of similar
> > things in /proc) meaningless. Based on the name, "diag" isn't going to be
> > used in normal operation, and kptr_restrict is 0 by default, so only system
> > owners interested in this will enable it and effectively disable inet_diag.
> 
> Are you kidding me?
> 
> inet_diag is the standard way to dump sockets using netlink.
> It's not a special obscure debugging facility, it's for real
> users.
> 
> And the encoded kernel pointer here is used as a shortcut to looking
> up precise sockets.

We got this dropped from the /proc view; why can't we do the same for
this netlink interface?

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply

* Re: Kernel crash after using new Intel NIC (igb)
From: Arun Sharma @ 2011-05-27  0:09 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Maximilian Engelhardt, linux-kernel, netdev, StuStaNet Vorstand
In-Reply-To: <1306447292.2543.32.camel@edumazet-laptop>

On 5/26/11 3:01 PM, Eric Dumazet wrote:


>> Yeah - using the refcnt seems better than list_empty(), but I'm not sure
>> that your patch addresses the race above.
>
> It does.

True. I can't find any holes in this method and it resolves the "failure 
to unlink from unused" case.

Perhaps wrap the while(1) loop into its own primitive in atomic.h or use 
an existing primitive?

  -Arun


^ permalink raw reply

* [PATCH 2/2 v2] af-packet:  Add flag to distinguish VID 0 from no-vlan.
From: greearb @ 2011-05-26 23:55 UTC (permalink / raw)
  To: netdev; +Cc: Ben Greear
In-Reply-To: <1306454141-1634-1-git-send-email-greearb@candelatech.com>

From: Ben Greear <greearb@candelatech.com>

Currently, user-space cannot determine if a 0 tcp_vlan_tci
means there is no VLAN tag or the VLAN ID was zero.

Add flag to make this explicit.  User-space can check for
TP_STATUS_VLAN_VALID || tp_vlan_tci > 0, which will be backwards
compatible. Older could would have just checked for tp_vlan_tci,
so it will work no worse than before.

Signed-off-by: Ben Greear <greearb@candelatech.com>
---
v2:  add logic to tpacket_rcv too.

:100644 100644 72bfa5a... 6d66ce1... M	include/linux/if_packet.h
:100644 100644 f9b807d... 9c2d068... M	net/packet/af_packet.c
 include/linux/if_packet.h |    1 +
 net/packet/af_packet.c    |   14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 72bfa5a..6d66ce1 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -70,6 +70,7 @@ struct tpacket_auxdata {
 #define TP_STATUS_COPY		0x2
 #define TP_STATUS_LOSING	0x4
 #define TP_STATUS_CSUMNOTREADY	0x8
+#define TP_STATUS_VLAN_VALID   0x10 /* auxdata has valid tp_vlan_tci */
 
 /* Tx ring - header status */
 #define TP_STATUS_AVAILABLE	0x0
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f9b807d..9c2d068 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -818,7 +818,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			getnstimeofday(&ts);
 		h.h2->tp_sec = ts.tv_sec;
 		h.h2->tp_nsec = ts.tv_nsec;
-		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
+		if (vlan_tx_tag_present(skb)) {
+			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
+			status |= TP_STATUS_VLAN_VALID;
+		}
+		else
+			h.h2->tp_vlan_tci = 0;
 		hdrlen = sizeof(*h.h2);
 		break;
 	default:
@@ -1763,7 +1768,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		aux.tp_snaplen = skb->len;
 		aux.tp_mac = 0;
 		aux.tp_net = skb_network_offset(skb);
-		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
+		if (vlan_tx_tag_present(skb)) {
+			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
+			aux.tp_status |= TP_STATUS_VLAN_VALID;
+		}
+		else
+			aux.tp_vlan_tci = 0;
 
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}
-- 
1.7.3.4


^ permalink raw reply related

* [PATCH 1/2 v2] af-packet:  Use existing netdev reference for bound sockets.
From: greearb @ 2011-05-26 23:55 UTC (permalink / raw)
  To: netdev; +Cc: Ben Greear

From: Ben Greear <greearb@candelatech.com>

This saves a network device lookup on each packet transmitted,
for sockets that are bound to a network device.

Signed-off-by: Ben Greear <greearb@candelatech.com>
---

v2:  Remove un-used ifindex, lookup dev in else branch.

:100644 100644 4005b24... f9b807d... M	net/packet/af_packet.c
 net/packet/af_packet.c |   27 +++++++++++++++------------
 1 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4005b24..f9b807d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -989,7 +989,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	struct sk_buff *skb;
 	struct net_device *dev;
 	__be16 proto;
-	int ifindex, err, reserve = 0;
+	bool need_rls_dev = false;
+	int err, reserve = 0;
 	void *ph;
 	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
 	int tp_len, size_max;
@@ -1001,7 +1002,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
 	err = -EBUSY;
 	if (saddr == NULL) {
-		ifindex	= po->ifindex;
+		dev = po->prot_hook.dev;
 		proto	= po->num;
 		addr	= NULL;
 	} else {
@@ -1012,12 +1013,12 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 					+ offsetof(struct sockaddr_ll,
 						sll_addr)))
 			goto out;
-		ifindex	= saddr->sll_ifindex;
 		proto	= saddr->sll_protocol;
 		addr	= saddr->sll_addr;
+		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
+		need_rls_dev = true;
 	}
 
-	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
 	err = -ENXIO;
 	if (unlikely(dev == NULL))
 		goto out;
@@ -1103,7 +1104,8 @@ out_status:
 	__packet_set_status(po, ph, status);
 	kfree_skb(skb);
 out_put:
-	dev_put(dev);
+	if (need_rls_dev)
+		dev_put(dev);
 out:
 	mutex_unlock(&po->pg_vec_lock);
 	return err;
@@ -1141,8 +1143,9 @@ static int packet_snd(struct socket *sock,
 	struct sk_buff *skb;
 	struct net_device *dev;
 	__be16 proto;
+	bool need_rls_dev = false;
 	unsigned char *addr;
-	int ifindex, err, reserve = 0;
+	int err, reserve = 0;
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int offset = 0;
 	int vnet_hdr_len;
@@ -1160,7 +1163,7 @@ static int packet_snd(struct socket *sock,
 	 */
 
 	if (saddr == NULL) {
-		ifindex	= po->ifindex;
+		dev = po->prot_hook.dev;
 		proto	= po->num;
 		addr	= NULL;
 	} else {
@@ -1169,13 +1172,12 @@ static int packet_snd(struct socket *sock,
 			goto out;
 		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
 			goto out;
-		ifindex	= saddr->sll_ifindex;
 		proto	= saddr->sll_protocol;
 		addr	= saddr->sll_addr;
+		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
+		need_rls_dev = true;
 	}
 
-
-	dev = dev_get_by_index(sock_net(sk), ifindex);
 	err = -ENXIO;
 	if (dev == NULL)
 		goto out_unlock;
@@ -1315,14 +1317,15 @@ static int packet_snd(struct socket *sock,
 	if (err > 0 && (err = net_xmit_errno(err)) != 0)
 		goto out_unlock;
 
-	dev_put(dev);
+	if (need_rls_dev)
+		dev_put(dev);
 
 	return len;
 
 out_free:
 	kfree_skb(skb);
 out_unlock:
-	if (dev)
+	if (dev && need_rls_dev)
 		dev_put(dev);
 out:
 	return err;
-- 
1.7.3.4


^ permalink raw reply related

* Re: [PATCH 1/1] IPVS : bug in ip_vs_ftp, same list heaad used in all netns.
From: Simon Horman @ 2011-05-26 23:37 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Hans Schillstrom, ja, wensong, lvs-devel, netdev, netfilter-devel,
	hans
In-Reply-To: <4DDE8B3E.30009@netfilter.org>

On Thu, May 26, 2011 at 07:17:50PM +0200, Pablo Neira Ayuso wrote:
> On 24/05/11 14:11, Hans Schillstrom wrote:
> > When ip_vs was adapted to netns the ftp application was not adapted
> > in a correct way.
> > However this is a fix to avoid kernel errors. In the long term another solution
> > might be chosen.  I.e the ports that the ftp appl, uses should be per netns.
> 
> applied, thanks.

Thanks Pablo.

Hans, is this appropriate for -stable (i.e. 2.6.39.x) ?


^ permalink raw reply

* ip fragments sent in reverse order?
From: Chris Friesen @ 2011-05-26 13:56 UTC (permalink / raw)
  To: netdev


I read that Linux intentionally sends IP fragments in reverse order to 
allow for some optimization on the receive side.  I'm trying to find 
this code, but what I see in ip_fragment() seems to be creating/sending 
the fragments in order.  Has this changed over the years?

Thanks,
Chris


	
-- 
Chris Friesen
Software Developer
GENBAND
chris.friesen@genband.com
www.genband.com

^ permalink raw reply

* Re: Kernel crash after using new Intel NIC (igb)
From: Eric Dumazet @ 2011-05-26 22:01 UTC (permalink / raw)
  To: Arun Sharma
  Cc: Maximilian Engelhardt, linux-kernel, netdev, StuStaNet Vorstand
In-Reply-To: <4DDECA9B.8080206@fb.com>

Le jeudi 26 mai 2011 à 14:48 -0700, Arun Sharma a écrit :
> On 5/26/11 12:47 PM, Eric Dumazet wrote:
> 
> > You dont get the problem. Problem is : We can do the empty() test only
> > if protected by the lock.
> >
> > If not locked, result can be wrong. [ false positive or negative ]
> >
> 
> 
> Agreed. Failing to unlink from unused list when we should have sounds wrong.
> 
> >> The list modification under unused_peers.lock looks generally safe. But
> >> the control flow (based on refcnt) done outside the lock might have races.
> >>
> >
> > "might" is not a good word when dealing with this ;)
> 
> Potential race in the current code:
> 
> initial refcnt = 1
> 
>         T1:                                        T2
> 
> atomic_dec_and_lock(refcnt)
> // refcnt == 0
> 
>                                         atomic_add_unless(refcnt)
>                                         unlink_from_unused()
> 
> list_add_tail(unused)
>                                         // T2 using "unused" entry
> 
> 
> > Did you test my fix ?
> 
> I could try it on one or two machines - but it won't tell us anything 
> for weeks if not months. Unfortunately my next window to try a new 
> kernel on a large enough sample is several months away.
> 
> >
> > Its doing the right thing : Using refcnt as the only marker to say if
> > the item must be removed from unused list (and lock the central lock
> > protecting this list only when needed)
> >
> > Since we already must do an atomic operation on refcnt, using
> > atomic_inc_return [ or similar full barrier op ] is enough to tell us
> > the truth.
> 
> Yeah - using the refcnt seems better than list_empty(), but I'm not sure 
> that your patch addresses the race above.

It does.

It becomes

        T1:                                        T2
> 
> atomic_dec_and_lock(refcnt)
> // refcnt == 0
> 
>                          newref = atomic_add_unless_and_return(refcnt)
> 
> list_add_tail(unused)
  unlock();
>                                         
> 			   if (newref == 1) {
				   lock()
                                   unlink_from_unused()
				   unlock()
			   }




^ permalink raw reply

* Re: Kernel crash after using new Intel NIC (igb)
From: Arun Sharma @ 2011-05-26 21:48 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Maximilian Engelhardt, linux-kernel, netdev, StuStaNet Vorstand
In-Reply-To: <1306439246.2543.10.camel@edumazet-laptop>

On 5/26/11 12:47 PM, Eric Dumazet wrote:

> You dont get the problem. Problem is : We can do the empty() test only
> if protected by the lock.
>
> If not locked, result can be wrong. [ false positive or negative ]
>


Agreed. Failing to unlink from unused list when we should have sounds wrong.

>> The list modification under unused_peers.lock looks generally safe. But
>> the control flow (based on refcnt) done outside the lock might have races.
>>
>
> "might" is not a good word when dealing with this ;)

Potential race in the current code:

initial refcnt = 1

        T1:                                        T2

atomic_dec_and_lock(refcnt)
// refcnt == 0

                                        atomic_add_unless(refcnt)
                                        unlink_from_unused()

list_add_tail(unused)
                                        // T2 using "unused" entry


> Did you test my fix ?

I could try it on one or two machines - but it won't tell us anything 
for weeks if not months. Unfortunately my next window to try a new 
kernel on a large enough sample is several months away.

>
> Its doing the right thing : Using refcnt as the only marker to say if
> the item must be removed from unused list (and lock the central lock
> protecting this list only when needed)
>
> Since we already must do an atomic operation on refcnt, using
> atomic_inc_return [ or similar full barrier op ] is enough to tell us
> the truth.

Yeah - using the refcnt seems better than list_empty(), but I'm not sure 
that your patch addresses the race above.

  -Arun

^ permalink raw reply

* Re: [PATCH] iproute2: Add processless network namespace support.
From: Chris Friesen @ 2011-05-26 21:35 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: shemminger, netdev, Linux Containers
In-Reply-To: <m1hb8hyyaj.fsf@fess.ebiederm.org>

On 05/26/2011 02:58 PM, Eric W. Biederman wrote:
>
> The goal of this code change is to implement a mechanism such that it is
> simple to work with a kernel that is using multiple network namespaces
> at once.
>
> This comes in handy for interacting with vpns where there may be rfc1918
> address overlaps, and different policies default routes, name servers
> and the like.
>
> Configuration specific to a network namespace that would ordinarily be
> stored under /etc/ is stored under /etc/netns/<name>.  For example if
> the dns server configuration is different for your vpn you would create
> a file /etc/netns/myvpn/resolv.conf.

That would be interesting.  Currently I use dnsmasq to keep my DNS 
servers straight when connecting in to two different corporate VPNs 
simultaneously.

I could see it being interesting trying to remember which shell sessions 
were running in which network namespace...

Chris

-- 
Chris Friesen
Software Developer
GENBAND
chris.friesen@genband.com
www.genband.com

^ permalink raw reply

* Re: linux-next: Tree for May 26 (drivers/net/caif)
From: Randy Dunlap @ 2011-05-26 21:30 UTC (permalink / raw)
  To: Sjur Brændeland
  Cc: balbi, Stephen Rothwell, netdev, linux-next, LKML, gregkh
In-Reply-To: <BANLkTikik8+f1D4D_Dv_rFK=QQjdce9rwg@mail.gmail.com>

On Thu, 26 May 2011 23:15:34 +0200 Sjur Brændeland wrote:

> Hi Randy,
> 
> [Randy]
> > (not new)
> >
> > drivers/net/caif/caif_serial.c:194: warning: 'return' with no value, in function returning non-void
> > drivers/net/caif/caif_serial.c:202: warning: 'return' with no value, in function returning non-void
> >
> >
> > I'm curious:  how do warnings like this get overlooked?
> > too much noise in the build messages?  or it wasn't overlooked, just ignored?
> 
> The patch introducing this problem is "tty: make receive_buf()
> return the amout of bytes received" by Felipe Balbi <balbi@ti.com>.
> It seems that this patch has been posted to linux-serial and not to
> the netdev list or the gits net-next-2.6 or net-2.6 that I follow.
> It's probably no excuse, but this problem is news to me.

I see.  Thanks for the explanation.

> Should I post a patch on linux-next anyway for this issue?

You or Felipe should, but he recently sent this note to the USB
mailing list:

"I'll be away until 31st of May on a business travel. Don't expect me to
look into patches during that period."

so it looks like it would better for you to do it, please.

thanks,
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply

* Re: [PATCH 3/3] net: Kill ratelimit.h dependency in linux/net.h
From: Joe Perches @ 2011-05-26 21:24 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, mingo
In-Reply-To: <20110526.164330.2076981043593394399.davem@davemloft.net>

On Thu, 2011-05-26 at 16:43 -0400, David Miller wrote:
> Ingo Molnar noticed that we have this unnecessary ratelimit.h
> dependency in linux/net.h, which hid compilation problems from
> people doing builds only with CONFIG_NET enabled.
> 
> Move this stuff out to a seperate net/net_ratelimit.h file and
> include that in the only two places where this thing is needed.

git add net/net_ratelimit.h?

> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  include/linux/net.h        |    6 ------
>  net/core/sysctl_net_core.c |    1 +
>  net/core/utils.c           |    1 +



^ permalink raw reply

* Re: linux-next: Tree for May 26 (drivers/net/caif)
From: Sjur Brændeland @ 2011-05-26 21:15 UTC (permalink / raw)
  To: Randy Dunlap, balbi; +Cc: Stephen Rothwell, netdev, linux-next, LKML, gregkh
In-Reply-To: <20110526120702.fac24de0.randy.dunlap@oracle.com>

Hi Randy,

[Randy]
> (not new)
>
> drivers/net/caif/caif_serial.c:194: warning: 'return' with no value, in function returning non-void
> drivers/net/caif/caif_serial.c:202: warning: 'return' with no value, in function returning non-void
>
>
> I'm curious:  how do warnings like this get overlooked?
> too much noise in the build messages?  or it wasn't overlooked, just ignored?

The patch introducing this problem is "tty: make receive_buf()
return the amout of bytes received" by Felipe Balbi <balbi@ti.com>.
It seems that this patch has been posted to linux-serial and not to
the netdev list or the gits net-next-2.6 or net-2.6 that I follow.
It's probably no excuse, but this problem is news to me.

Should I post a patch on linux-next anyway for this issue?

Regards,
Sjur

^ permalink raw reply

* Re: [PATCH V6 2/4 net-next] skbuff: Add userspace zero-copy buffers in skb
From: Shirley Ma @ 2011-05-26 21:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, mst, Avi Kivity, Arnd Bergmann, netdev, kvm,
	linux-kernel
In-Reply-To: <1306443325.2543.22.camel@edumazet-laptop>

On Thu, 2011-05-26 at 22:55 +0200, Eric Dumazet wrote:
> Le jeudi 26 mai 2011 à 13:24 -0700, Shirley Ma a écrit :
> 
> > I could reduce callback pointer by moving it to *arg, but not desc,
> this
> > indicates that which buffer DMA hasn't done yet in *arg.
> 
> 
> I guess you dont need to use skb itself to hold all your states ?
> 
> I understand its convenient for you, but I believe its worth the pain
> to
> use only one pointer to a (small) object where you put all your stuff.
> 
> Some machines alloc/free millions of skbs per second. 
> 
> If/when most skb uses are for userspace zero-copy buffers we can
> embbed
> your small object in skb itself ;)

You are right, w/o this desc, there will be lots of alloc/dealloc in the
caller side. To have better performance, maybe I should have changed
most skbs to use zero-copy buffers. :)

Ok, for now let me try to move it to caller, just leave one pointer
*uarg here.

Thanks
Shirley

^ permalink raw reply

* [PATCH net-next when it opens] net: 8021q: Add pr_fmt
From: Joe Perches @ 2011-05-26 20:58 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Ben Greear, David S. Miller, netdev, linux-kernel
In-Reply-To: <20110526.145601.735416831290933990.davem@davemloft.net>

Use the current logging style.

Add #define pr_fmt and remove embedded prefix from formats.

Not converting the current pr_<level> uses to netdev_<level>
because all the output here is nicely prefaced with "8021q: ".

Remove __func__ use from proc registration failure message.

Signed-off-by: Joe Perches <joe@perches.com>
---
 net/8021q/vlan.c     |   15 ++++++++-------
 net/8021q/vlan_dev.c |    4 +++-
 net/8021q/vlanproc.c |    4 +++-
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index c7a581a..cfa9afe 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -18,6 +18,8 @@
  *		2 of the License, or (at your option) any later version.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
@@ -149,13 +151,13 @@ int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
 	const struct net_device_ops *ops = real_dev->netdev_ops;
 
 	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
-		pr_info("8021q: VLANs not supported on %s\n", name);
+		pr_info("VLANs not supported on %s\n", name);
 		return -EOPNOTSUPP;
 	}
 
 	if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) &&
 	    (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) {
-		pr_info("8021q: Device %s has buggy VLAN hw accel\n", name);
+		pr_info("Device %s has buggy VLAN hw accel\n", name);
 		return -EOPNOTSUPP;
 	}
 
@@ -344,13 +346,12 @@ static void __vlan_device_event(struct net_device *dev, unsigned long event)
 	case NETDEV_CHANGENAME:
 		vlan_proc_rem_dev(dev);
 		if (vlan_proc_add_dev(dev) < 0)
-			pr_warning("8021q: failed to change proc name for %s\n",
-					dev->name);
+			pr_warn("failed to change proc name for %s\n",
+				dev->name);
 		break;
 	case NETDEV_REGISTER:
 		if (vlan_proc_add_dev(dev) < 0)
-			pr_warning("8021q: failed to add proc entry for %s\n",
-					dev->name);
+			pr_warn("failed to add proc entry for %s\n", dev->name);
 		break;
 	case NETDEV_UNREGISTER:
 		vlan_proc_rem_dev(dev);
@@ -374,7 +375,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	if ((event == NETDEV_UP) &&
 	    (dev->features & NETIF_F_HW_VLAN_FILTER) &&
 	    dev->netdev_ops->ndo_vlan_rx_add_vid) {
-		pr_info("8021q: adding VLAN 0 to HW filter on device %s\n",
+		pr_info("adding VLAN 0 to HW filter on device %s\n",
 			dev->name);
 		dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0);
 	}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index f247f5b..4225e24 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -20,6 +20,8 @@
  *		2 of the License, or (at your option) any later version.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/skbuff.h>
@@ -55,7 +57,7 @@ static int vlan_dev_rebuild_header(struct sk_buff *skb)
 		return arp_find(veth->h_dest, skb);
 #endif
 	default:
-		pr_debug("%s: unable to resolve type %X addresses.\n",
+		pr_debug("%s: unable to resolve type %X addresses\n",
 			 dev->name, ntohs(veth->h_vlan_encapsulated_proto));
 
 		memcpy(veth->h_source, dev->dev_addr, ETH_ALEN);
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index d940c49..016d7f4 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -17,6 +17,8 @@
  * Jan 20, 1998        Ben Greear     Initial Version
  *****************************************************************************/
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
@@ -155,7 +157,7 @@ int __net_init vlan_proc_init(struct net *net)
 	return 0;
 
 err:
-	pr_err("%s: can't create entry in proc filesystem!\n", __func__);
+	pr_err("can't create entry in proc filesystem!\n");
 	vlan_proc_cleanup(net);
 	return -ENOBUFS;
 }
-- 
1.7.5.rc3.dirty

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox