Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: Add IPSec IP Range in Linux kernel
From: Herbert Xu @ 2011-11-09  3:27 UTC (permalink / raw)
  To: Daniil Stolnikov
  Cc: davem, linux-kernel, netdev, linux-crypto, linux-security-module,
	adobriyan, peter.p.waskiewicz.jr
In-Reply-To: <552673196.20111109103207@mail.ru>

Daniil Stolnikov <danila.st@mail.ru> wrote:
>> Like I said, if you want address ranges, ask the userland IPSEC daemon
>> authors to synthesize it.
> 
> In this letter, the mailing list http://marc.info/?l=strongswan-users&m=130613736616488&w=4 strongswan-users say that their product has support for IP ranges, but the stack of Linux is based on network masks. So I do not understand how this would work without the support at the kernel level? How will coordination of policies?

Simple, you break a range policy into parts that can be expressed
as network/mask and install multiple policies.  The actual policies
in the kernel just have to have the same effect as the one you
negotiated with the other side, it does not have to look the same.

This is also why you can do the same thing with masks + netfilter.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH] net/usb: Misc. fixes for the LG-VL600 LTE USB modem
From: Mark Kamichoff @ 2011-11-09  3:10 UTC (permalink / raw)
  To: oliver, gregkh; +Cc: netdev, linux-kernel, Mark Kamichoff

Add checking for valid magic values (needed for stability in the event
corrupted packets are received) and remove some other unneeded checks.
Also, fix flagging device as WWAN (Bugzilla bug #39952).

Signed-off-by: Mark Kamichoff <prox@prolixium.com>
---
 drivers/net/usb/cdc_ether.c |    2 +-
 drivers/net/usb/lg-vl600.c  |   30 ++++++++++++++----------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index c924ea2..99ed6eb 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -567,7 +567,7 @@ static const struct usb_device_id	products [] = {
 {
 	USB_DEVICE_AND_INTERFACE_INFO(0x1004, 0x61aa, USB_CLASS_COMM,
 			USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long)&wwan_info,
+	.driver_info = 0,
 },
 
 /*
diff --git a/drivers/net/usb/lg-vl600.c b/drivers/net/usb/lg-vl600.c
index d43db32..b975a39 100644
--- a/drivers/net/usb/lg-vl600.c
+++ b/drivers/net/usb/lg-vl600.c
@@ -144,10 +144,11 @@ static int vl600_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
 	}
 
 	frame = (struct vl600_frame_hdr *) buf->data;
-	/* NOTE: Should check that frame->magic == 0x53544448?
-	 * Otherwise if we receive garbage at the beginning of the frame
-	 * we may end up allocating a huge buffer and saving all the
-	 * future incoming data into it.  */
+	/* Yes, check that frame->magic == 0x53544448 (or 0x44544d48),
+	 * otherwise we may run out of memory w/a bad packet */
+	if (ntohl(frame->magic) != 0x53544448 &&
+			ntohl(frame->magic) != 0x44544d48)
+		goto error;
 
 	if (buf->len < sizeof(*frame) ||
 			buf->len != le32_to_cpup(&frame->len)) {
@@ -209,8 +210,9 @@ static int vl600_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
 			 * for IPv6 packets, and set the ethertype to IPv6
 			 * (0x86dd) so Linux can understand it.
 			 */
-			if ((buf->data[sizeof(*ethhdr)] & 0xf0) == 0x60)
-				ethhdr->h_proto = __constant_htons(ETH_P_IPV6);
+			if ((buf->data[sizeof(*ethhdr)] & 0xf0) == 0x60) {
+				ethhdr->h_proto = htons(ETH_P_IPV6);
+			}
 		}
 
 		if (count) {
@@ -296,6 +298,11 @@ encapsulate:
 	 * overwrite the remaining fields.
 	 */
 	packet = (struct vl600_pkt_hdr *) skb->data;
+	/* The VL600 wants IPv6 packets to have an IPv4 ethertype
+	 * Since this modem only supports IPv4 and IPv6, just set all
+	 * frames to 0x0800 (ETH_P_IP)
+	 */
+	packet->h_proto = htons(ETH_P_IP);
 	memset(&packet->dummy, 0, sizeof(packet->dummy));
 	packet->len = cpu_to_le32(orig_len);
 
@@ -308,21 +315,12 @@ encapsulate:
 	if (skb->len < full_len) /* Pad */
 		skb_put(skb, full_len - skb->len);
 
-	/* The VL600 wants IPv6 packets to have an IPv4 ethertype
-	 * Check if this is an IPv6 packet, and set the ethertype
-	 * to 0x800
-	 */
-	if ((skb->data[sizeof(struct vl600_pkt_hdr *) + 0x22] & 0xf0) == 0x60) {
-		skb->data[sizeof(struct vl600_pkt_hdr *) + 0x20] = 0x08;
-		skb->data[sizeof(struct vl600_pkt_hdr *) + 0x21] = 0;
-	}
-
 	return skb;
 }
 
 static const struct driver_info	vl600_info = {
 	.description	= "LG VL600 modem",
-	.flags		= FLAG_ETHER | FLAG_RX_ASSEMBLE,
+	.flags		= FLAG_RX_ASSEMBLE | FLAG_WWAN,
 	.bind		= vl600_bind,
 	.unbind		= vl600_unbind,
 	.status		= usbnet_cdc_status,
-- 
1.7.5.4

^ permalink raw reply related

* Re: Add IPSec IP Range in Linux kernel
From: Daniil Stolnikov @ 2011-11-09  2:43 UTC (permalink / raw)
  To: Herbert Xu
  Cc: linux-kernel, netdev, linux-crypto, linux-security-module, davem,
	adobriyan, peter.p.waskiewicz.jr, davem
In-Reply-To: <20111109015406.GA10800@gondor.apana.org.au>

Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Alternatively you can do this with marking and use netfilter
> to set the mark.

> Cheers,

We focus on connections to ZyWALL devices. If an IP range is selected on the ZyWALL, the policies will not match those of the remote side, so the connection is not established. This alternative therefore makes no sense for us.

Regards
Daniil Stolnikov

^ permalink raw reply

* Re: Add IPSec IP Range in Linux kernel
From: Daniil Stolnikov @ 2011-11-09  2:32 UTC (permalink / raw)
  To: David Miller
  Cc: linux-kernel, netdev, linux-crypto, linux-security-module, davem,
	adobriyan, peter.p.waskiewicz.jr, herbert
In-Reply-To: <20111108.204253.891598837549584662.davem@davemloft.net>

> Like I said, if you want address ranges, ask the userland IPSEC daemon
> authors to synthesize it.

In this letter, the mailing list http://marc.info/?l=strongswan-users&m=130613736616488&w=4 strongswan-users say that their product has support for IP ranges, but the stack of Linux is based on network masks. So I do not understand how this would work without the support at the kernel level? How will coordination of policies?

^ permalink raw reply

* Re: Add IPSec IP Range in Linux kernel
From: Herbert Xu @ 2011-11-09  1:54 UTC (permalink / raw)
  To: David Miller
  Cc: danila.st, linux-kernel, netdev, linux-crypto,
	linux-security-module, adobriyan, peter.p.waskiewicz.jr
In-Reply-To: <20111108.204253.891598837549584662.davem@davemloft.net>

David Miller <davem@davemloft.net> wrote:
>
> Like I said, if you want address ranges, ask the userland IPSEC daemon
> authors to synthesize it.

Alternatively you can do this with marking and use netfilter
to set the mark.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: Add IPSec IP Range in Linux kernel
From: David Miller @ 2011-11-09  1:42 UTC (permalink / raw)
  To: danila.st
  Cc: linux-kernel, netdev, linux-crypto, linux-security-module,
	adobriyan, peter.p.waskiewicz.jr
In-Reply-To: <1289495586.20111109093607@mail.ru>

From: Daniil Stolnikov <danila.st@mail.ru>
Date: Wed, 9 Nov 2011 09:36:07 +0800

> I never imagined that it will cause some difficulties.

Every feature has side effects and costs associated with it.  Some of
which can be non-trivial.

Like I said, if you want address ranges, ask the userland IPSEC daemon
authors to synthesize it.

I'm really not able to devote the time necessary to explain every
nuance of how we store IPSEC rules in the kernel side database and
what implications that has for expanding the kind of match keys we
support.

^ permalink raw reply

* Re: Add IPSec IP Range in Linux kernel
From: Daniil Stolnikov @ 2011-11-09  1:36 UTC (permalink / raw)
  To: David Miller
  Cc: linux-kernel, netdev, linux-crypto, linux-security-module, davem,
	adobriyan, peter.p.waskiewicz.jr
In-Reply-To: <20111108.121620.2044664919065812135.davem@davemloft.net>

> From: Daniil Stolnikov <danila.st@mail.ru>
> Date: Tue, 08 Nov 2011 12:40:13 +0400

>> I turned to you, the developers, but rather to urge you to implement
>> this feature using IP range.

> This won't be implemented, the keys used for IPSEC rule lookups supported by
> the kernel are already way too complex.

> From: Alexey Dobriyan <adobriyan@gmail.com>
> Date: Tue, 8 Nov 2011 14:08:24 +0200

>> changing addr_match() is trivial for ipv4 and easy for ipv6. :-)

> No, this is not happening.  This added complexity screws up all the hash table
> and lookup optimizations we have in the XFRM layer.

I never imagined that it will cause some difficulties. Several questions arise:

1) How complex is this implementation?
2) How to do this time?
3) Will this feature be implemented after all? If so, how soon and what will it take?

> Ranges can be synthesized by userspace, and that's the way it has to
> be supported.

That is, you want to say that all this can be done at the user level? How so?

In general, if there are alternative implementations of this feature without support at the kernel level? What are some loopholes, tricks? It is meant to create multiple connections to the same subnet subranges without the use of masks such as / 29. Perhaps this can be achieved through l2tp? There, in the present setup IP range. Or is it both?

^ permalink raw reply

* Re: [PATCH] tcp: fixes for DSACK-based undo of cwnd reduction during fast recovery
From: Yuchung Cheng @ 2011-11-09  1:18 UTC (permalink / raw)
  To: Neal Cardwell
  Cc: David Miller, netdev, ilpo.jarvinen, Nandita Dukkipati,
	Tom Herbert
In-Reply-To: <1320775631-16341-1-git-send-email-ncardwell@google.com>

On Tue, Nov 8, 2011 at 10:07 AM, Neal Cardwell <ncardwell@google.com> wrote:
> Fixes for some issues that prevent DSACKs from allowing TCP senders to
> undo cwnd reductions made during fast recovery.
>
> There were a few related bugs/issues:
>
> 1) Senders ignored DSACKs after recovery when there were no
> outstanding packets (a common scenario for HTTP servers).
>
> 2) When the ACK field is below snd_una (which can happen when ACKs are
> reordered), senders ignored DSACKs (preventing undo) and passed up
> chances to send out more packets based on any newly-SACKed packets.
>
> 3) Senders were overriding cwnd values picked during an undo by
> calling tcp_moderate_cwnd() in tcp_try_to_open().
>
> The fixes:
>
> (1) When there are no outstanding packets (the "no_queue" goto label),
> use DSACKs to undo congestion window reductions.
>
> (2) When the ACK field is below snd_una (the "old_ack" goto label),
> process any DSACKs and try to send out more packets based on
> newly-SACKed packets.
>
> (3) Don't moderate cwnd in tcp_try_to_open() if we're in TCP_CA_Open,
> since doing so is generally unnecessary and specifically would
> override a DSACK-based undo of a cwnd reduction made in fast recovery.
>
> (4) Simplify the congestion avoidance state machine by removing the
> behavior where SACK-enabled connections hung around in the
> TCP_CA_Disorder state just waiting for DSACKs. Instead, when snd_una
> advances to high_seq or beyond we typically move to TCP_CA_Open
> immediately and allow an undo in either TCP_CA_Open or TCP_CA_Disorder
> if we later receive enough DSACKs. Previously, SACK-enabled
> connections hung around in TCP_CA_Disorder state while
> snd_una==high_seq, just waiting to accumulate DSACKs and hopefully
> undo a cwnd reduction. This could and did lead to the following
> unfortunate scenario: if some incoming ACKs advance snd_una beyond
> high_seq then we were setting undo_marker to 0 and moving to
> TCP_CA_Open, so if (due to reordering in the ACK return path) we
> shortly thereafter received a DSACK then we were no longer able to
> undo the cwnd reduction.
>
> Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>


FWIW. This undo-fix patch on Google Web servers increased the undos in
loss by 46% and in disorder by 17%. It also corrects the SNMP stats
TCPTimeouts, TCPRenoFailures, TCPSackFailures by moving state into
open, instead of disorder, after recovery.

^ permalink raw reply

* Re: [PATCH] neigh: replace unres_qlen by unres_qlen_bytes
From: Stephen Hemminger @ 2011-11-09  1:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1320797656.26025.43.camel@edumazet-laptop>

On Wed, 09 Nov 2011 01:14:16 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> [PATCH V2 net-next] neigh: replace unres_qlen by unres_qlen_bytes
> 
> unres_qlen is the number of frames we are able to queue per unresolved
> neighbour. Its default value (3) was never changed and is responsible
> for strange drops, especially if IP fragments are used, or multiple
> sessions start in parallel. TCP initial congestion window is now bigger
> than 3.

I don't understand this argument.
TCP won't send data until the initial SYN is acked. And the SYN can't
be sent until the ARP is resolved.

The only use case for this is for applications that open lots of connections
to the same destination at once (a.k.a. TCP accelerators) to get around
TCP slow start.

^ permalink raw reply

* Re: [GIT PULL nf-next] IPVS
From: Simon Horman @ 2011-11-09  0:58 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Krzysztof Wilczynski
In-Reply-To: <20111107082956.GA32329@1984>

Hi Pablo,

On Mon, Nov 07, 2011 at 09:29:56AM +0100, Pablo Neira Ayuso wrote:
> Hi Simon,
> 
> On Mon, Nov 07, 2011 at 12:07:01PM +0900, Simon Horman wrote:
> > Hi Pablo,
> > 
> > I am a little confused. The nf-next branch seems to have disappeared.
> > 
> > Could you consider pulling git://github.com/horms/ipvs-next.git master
> > to get the following changes that were in your nf-next branch.
> 
> I was late to get it into net-next. Since net-next became net after
> the 3.1 release, I moved those changes to net to get them into 3.2
> once Linus announced that the merge window was opened again.
> 
> > Or would
> > you like me to rebase the ipvs patches (9 or the 11 changes below) on
> > top of git://1984.lsi.us.es/net-next/.git master ?
> 
> They are already in net davem's tree, they will be included in the
> upcoming 3.2 release.
> 
> http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fdavem%2Fnet.git&a=search&h=HEAD&st=commit&s=Neira

Thanks, and sorry for missing that when I checked yesterday.

Could you suggest which tree and branch I should base the master branch of my
ipvs and ipvs-next trees on? Their purposes are to provide a reference for
people wishing to fix or enhance IPVS and a mechanism to send pull requests to
you. As of now I am using the master branch of your net tree for both.


^ permalink raw reply

* [PATCH] neigh: replace unres_qlen by unres_qlen_bytes
From: Eric Dumazet @ 2011-11-09  0:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111108.174820.558780148839093199.davem@davemloft.net>

Le mardi 08 novembre 2011 à 17:48 -0500, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 08 Nov 2011 23:45:01 +0100
> 
> > Maybe we can do the same for unres_qlen, and setup a byte limit instead
> > of 3 packets limit (say 64Kbytes of truesize per destination)
> 
> That makes sense because what typically gets blocked are initial TCP SYNs,
> small UDP requests, and ICMPs.

OK, here is V2 of the patch doing this.

One point not addressed yet is the removal of /proc/sys/.../unres_qlen

Not sure if we want a fallback [ I did one for the netlink part ].

[PATCH V2 net-next] neigh: replace unres_qlen by unres_qlen_bytes

unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. TCP initial congestion window is now bigger
than 3.

$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms

--- 192.168.20.108 ping statistics ---
2 packets transmitted, 1 received, 50% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.322/0.322/0.322/0.000 ms

Increasing unres_qlen can be dangerous, since an attacker might try to
fill many queues with many packets and consume all memory.

Switch to a bytes limit (limiting queued skbs truesize), and allow a
default limit of 64Kbytes per unresolved neighbour. This new limit seems
big, but as a packet can consume 64Kbytes, it reduces the memory window
offered to attackers.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 Documentation/networking/ip-sysctl.txt |    5 ++++
 include/linux/neighbour.h              |    1 
 include/net/neighbour.h                |    3 +-
 net/atm/clip.c                         |    2 -
 net/core/neighbour.c                   |   27 +++++++++++++++++------
 net/decnet/dn_neigh.c                  |    2 -
 net/ipv4/arp.c                         |    2 -
 net/ipv6/ndisc.c                       |    2 -
 8 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f049a1c..34b8728 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -31,6 +31,11 @@ neigh/default/gc_thresh3 - INTEGER
 	when using large numbers of interfaces and when communicating
 	with large numbers of directly-connected peers.
 
+neigh/default/unres_qlen_bytes - INTEGER
+	The maximum number of bytes which may be used by packets
+	queued for each	unresolved address by other network layers.
+	(added in linux 3.3)
+
 mtu_expires - INTEGER
 	Time, in seconds, that cached PMTU information is kept.
 
diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h
index a7003b7..b188f68 100644
--- a/include/linux/neighbour.h
+++ b/include/linux/neighbour.h
@@ -116,6 +116,7 @@ enum {
 	NDTPA_PROXY_DELAY,		/* u64, msecs */
 	NDTPA_PROXY_QLEN,		/* u32 */
 	NDTPA_LOCKTIME,			/* u64, msecs */
+	NDTPA_QUEUE_LENBYTES,		/* u32 */
 	__NDTPA_MAX
 };
 #define NDTPA_MAX (__NDTPA_MAX - 1)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 2720884..7ae5acf 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -59,7 +59,7 @@ struct neigh_parms {
 	int	reachable_time;
 	int	delay_probe_time;
 
-	int	queue_len;
+	int	queue_len_bytes;
 	int	ucast_probes;
 	int	app_probes;
 	int	mcast_probes;
@@ -99,6 +99,7 @@ struct neighbour {
 	rwlock_t		lock;
 	atomic_t		refcnt;
 	struct sk_buff_head	arp_queue;
+	unsigned int		arp_queue_len_bytes;
 	struct timer_list	timer;
 	unsigned long		used;
 	atomic_t		probes;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 8523940..50ebab6 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -329,7 +329,7 @@ static struct neigh_table clip_tbl = {
 		.gc_staletime 		= 60 * HZ,
 		.reachable_time 	= 30 * HZ,
 		.delay_probe_time 	= 5 * HZ,
-		.queue_len 		= 3,
+		.queue_len_bytes 	= 64*1024,
 		.ucast_probes 		= 3,
 		.mcast_probes 		= 3,
 		.anycast_delay 		= 1 * HZ,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 039d51e..9a47b0a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 				   it to safe state.
 				 */
 				skb_queue_purge(&n->arp_queue);
+				n->arp_queue_len_bytes = 0;
 				n->output = neigh_blackhole;
 				if (n->nud_state & NUD_VALID)
 					n->nud_state = NUD_NOARP;
@@ -702,6 +703,7 @@ void neigh_destroy(struct neighbour *neigh)
 		printk(KERN_WARNING "Impossible event.\n");
 
 	skb_queue_purge(&neigh->arp_queue);
+	neigh->arp_queue_len_bytes = 0;
 
 	dev_put(neigh->dev);
 	neigh_parms_put(neigh->parms);
@@ -842,6 +844,7 @@ static void neigh_invalidate(struct neighbour *neigh)
 		write_lock(&neigh->lock);
 	}
 	skb_queue_purge(&neigh->arp_queue);
+	neigh->arp_queue_len_bytes = 0;
 }
 
 static void neigh_probe(struct neighbour *neigh)
@@ -980,15 +983,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 
 	if (neigh->nud_state == NUD_INCOMPLETE) {
 		if (skb) {
-			if (skb_queue_len(&neigh->arp_queue) >=
-			    neigh->parms->queue_len) {
+			while (neigh->arp_queue_len_bytes + skb->truesize >
+			       neigh->parms->queue_len_bytes) {
 				struct sk_buff *buff;
+
 				buff = __skb_dequeue(&neigh->arp_queue);
+				if (!buff)
+					break;
+				neigh->arp_queue_len_bytes -= buff->truesize;
 				kfree_skb(buff);
 				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
 			}
 			skb_dst_force(skb);
 			__skb_queue_tail(&neigh->arp_queue, skb);
+			neigh->arp_queue_len_bytes += skb->truesize;
 		}
 		rc = 1;
 	}
@@ -1175,6 +1183,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 			write_lock_bh(&neigh->lock);
 		}
 		skb_queue_purge(&neigh->arp_queue);
+		neigh->arp_queue_len_bytes = 0;
 	}
 out:
 	if (update_isrouter) {
@@ -1747,7 +1756,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 		NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
 
 	NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
-	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+	NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes);
+	/* approximative value for deprecated QUEUE_LEN (in packets) */
+	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN));
 	NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
 	NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
 	NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
@@ -1974,7 +1985,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 
 			switch (i) {
 			case NDTPA_QUEUE_LEN:
-				p->queue_len = nla_get_u32(tbp[i]);
+				p->queue_len_bytes = nla_get_u32(tbp[i]) *
+						     SKB_TRUESIZE(ETH_FRAME_LEN);
+				break;
+			case NDTPA_QUEUE_LENBYTES:
+				p->queue_len_bytes = nla_get_u32(tbp[i]);
 				break;
 			case NDTPA_PROXY_QLEN:
 				p->proxy_qlen = nla_get_u32(tbp[i]);
@@ -2686,7 +2701,7 @@ static struct neigh_sysctl_table {
 			.proc_handler	= proc_dointvec_jiffies,
 		},
 		{
-			.procname	= "unres_qlen",
+			.procname	= "unres_qlen_bytes",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
@@ -2785,7 +2800,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 	t->neigh_vars[4].data  = &p->base_reachable_time;
 	t->neigh_vars[5].data  = &p->delay_probe_time;
 	t->neigh_vars[6].data  = &p->gc_staletime;
-	t->neigh_vars[7].data  = &p->queue_len;
+	t->neigh_vars[7].data  = &p->queue_len_bytes;
 	t->neigh_vars[8].data  = &p->proxy_qlen;
 	t->neigh_vars[9].data  = &p->anycast_delay;
 	t->neigh_vars[10].data = &p->proxy_delay;
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 7f0eb08..fb8b096 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -107,7 +107,7 @@ struct neigh_table dn_neigh_table = {
 		.gc_staletime =	60 * HZ,
 		.reachable_time =		30 * HZ,
 		.delay_probe_time =	5 * HZ,
-		.queue_len =		3,
+		.queue_len =		64*1024,
 		.ucast_probes =	0,
 		.app_probes =		0,
 		.mcast_probes =	0,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96a164a..d732827 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -177,7 +177,7 @@ struct neigh_table arp_tbl = {
 		.gc_staletime		= 60 * HZ,
 		.reachable_time		= 30 * HZ,
 		.delay_probe_time	= 5 * HZ,
-		.queue_len		= 3,
+		.queue_len_bytes	= 64*1024,
 		.ucast_probes		= 3,
 		.mcast_probes		= 3,
 		.anycast_delay		= 1 * HZ,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 44e5b7f..4a20982 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -141,7 +141,7 @@ struct neigh_table nd_tbl = {
 		.gc_staletime		= 60 * HZ,
 		.reachable_time		= ND_REACHABLE_TIME,
 		.delay_probe_time	= 5 * HZ,
-		.queue_len		= 3,
+		.queue_len_bytes	= 64*1024,
 		.ucast_probes		= 3,
 		.mcast_probes		= 3,
 		.anycast_delay		= 1 * HZ,

^ permalink raw reply related

* [net-next-2.6 PATCH 5/6 v4] macvlan: Add support for netdev ops to set MAC/VLAN filters
From: Roopa Prabhu @ 2011-11-09  7:56 UTC (permalink / raw)
  To: netdev, davem
  Cc: chrisw, sri, dragos.tatulea, kvm, arnd, mst, gregory.v.rose,
	mchan, dwang2, shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111109075449.13549.58135.stgit@rhel6.1>

From: Roopa Prabhu <roprabhu@cisco.com>

This patch adds support for MAC and VLAN filter netdev ops
on a macvlan interface. It adds support for set_rx_filter_addr and
set_rx_filter_vlan netdev operations. It currently supports only macvlan
PASSTHRU mode. It also removes the code that puts the lowerdev in promiscuous mode.

For passthru mode,
	For both Address and vlan filters set, lowerdev
	netdev_ops->set_rx_filter_addr and netdev_ops->set_rx_filter_vlan
	are called if the lowerdev supports these ops.

	Else parse the filter data and update the lowerdev filters:
	 - Address filters: macvlan netdev uc and mc lists and flags are
	updated to reflect the addresses and address filter flags that came
	in the filter, which in turn results in calls to macvlan_set_rx_mode and
	macvlan_change_rx_flags. These functions pass the filter addresses
	and flags to lowerdev netdev. And the lowerdev driver will pass it
	to the hw.

	- VLAN filter: Currently applied vlan bitmap is cached in
	struct macvlan_dev->vlan_filter. This vlan bitmap is updated to
	reflect the new bitmap that came in the netlink vlan filter msg.
	macvlan_vlan_rx_add_vid and macvlan_vlan_rx_kill_vid are called
	to update the vlan ids on the macvlan netdev, which in turn results in
	passing the vlan ids to the lowerdev using netdev_ops
	ndo_vlan_rx_add_vid and ndo_vlan_rx_kill_vid


Note: In future if most lowerdev drivers find use for these ops and start
supporting them, we could remove the local handling of filters for passthru
mode in macvlan

Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
Signed-off-by: Christian Benvenuti <benve@cisco.com>
Signed-off-by: David Wang <dwang2@cisco.com>
---
 drivers/net/macvlan.c      |  331 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/if_macvlan.h |    2 
 2 files changed, 300 insertions(+), 33 deletions(-)


diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 7413497..c2dea97 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -309,30 +309,37 @@ static int macvlan_open(struct net_device *dev)
 	struct net_device *lowerdev = vlan->lowerdev;
 	int err;
 
-	if (vlan->port->passthru) {
-		dev_set_promiscuity(lowerdev, 1);
-		goto hash_add;
-	}
+	if (!vlan->port->passthru) {
+		err = -EBUSY;
+		if (macvlan_addr_busy(vlan->port, dev->dev_addr))
+			goto out;
 
-	err = -EBUSY;
-	if (macvlan_addr_busy(vlan->port, dev->dev_addr))
-		goto out;
+		err = dev_uc_add(lowerdev, dev->dev_addr);
+		if (err < 0)
+			goto out;
+	}
 
-	err = dev_uc_add(lowerdev, dev->dev_addr);
-	if (err < 0)
-		goto out;
 	if (dev->flags & IFF_ALLMULTI) {
 		err = dev_set_allmulti(lowerdev, 1);
 		if (err < 0)
 			goto del_unicast;
 	}
 
-hash_add:
+	if (dev->flags & IFF_PROMISC) {
+		err = dev_set_promiscuity(lowerdev, 1);
+		if (err < 0)
+			goto unset_allmulti;
+	}
+
 	macvlan_hash_add(vlan);
 	return 0;
 
+unset_allmulti:
+	dev_set_allmulti(lowerdev, -1);
+
 del_unicast:
-	dev_uc_del(lowerdev, dev->dev_addr);
+	if (!vlan->port->passthru)
+		dev_uc_del(lowerdev, dev->dev_addr);
 out:
 	return err;
 }
@@ -342,18 +349,16 @@ static int macvlan_stop(struct net_device *dev)
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct net_device *lowerdev = vlan->lowerdev;
 
-	if (vlan->port->passthru) {
-		dev_set_promiscuity(lowerdev, -1);
-		goto hash_del;
-	}
-
+	dev_uc_unsync(lowerdev, dev);
 	dev_mc_unsync(lowerdev, dev);
 	if (dev->flags & IFF_ALLMULTI)
 		dev_set_allmulti(lowerdev, -1);
+	if (dev->flags & IFF_PROMISC)
+		dev_set_promiscuity(lowerdev, -1);
 
-	dev_uc_del(lowerdev, dev->dev_addr);
+	if (!vlan->port->passthru)
+		dev_uc_del(lowerdev, dev->dev_addr);
 
-hash_del:
 	macvlan_hash_del(vlan, !dev->dismantle);
 	return 0;
 }
@@ -394,12 +399,16 @@ static void macvlan_change_rx_flags(struct net_device *dev, int change)
 
 	if (change & IFF_ALLMULTI)
 		dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+	if (change & IFF_PROMISC)
+		dev_set_promiscuity(lowerdev,
+			dev->flags & IFF_PROMISC ? 1 : -1);
 }
 
-static void macvlan_set_multicast_list(struct net_device *dev)
+static void macvlan_set_rx_mode(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
+	dev_uc_sync(vlan->lowerdev, dev);
 	dev_mc_sync(vlan->lowerdev, dev);
 }
 
@@ -542,6 +551,257 @@ static void macvlan_vlan_rx_kill_vid(struct net_device *dev,
 		ops->ndo_vlan_rx_kill_vid(lowerdev, vid);
 }
 
+static inline void macvlan_set_filter_vlan(struct net_device *dev, int vid)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	set_bit(vid, vlan->vlan_filter);
+	macvlan_vlan_rx_add_vid(dev, vid);
+}
+
+static inline void macvlan_clear_filter_vlan(struct net_device *dev, int vid)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	clear_bit(vid, vlan->vlan_filter);
+	macvlan_vlan_rx_kill_vid(dev, vid);
+}
+
+static int macvlan_set_rx_filter_vlan_passthru(struct net_device *dev, int vf,
+					       struct nlattr *tb[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+	const struct net_device_ops *ops = lowerdev->netdev_ops;
+	unsigned long *vlans;
+	u16 vid;
+
+	if (ops->ndo_set_rx_filter_vlan)
+		return ops->ndo_set_rx_filter_vlan(dev, vf, tb);
+
+	if (!tb[IFLA_RX_FILTER_VLAN_BITMAP])
+		return -EINVAL;
+
+	vlans = nla_data(tb[IFLA_RX_FILTER_VLAN_BITMAP]);
+
+	/*
+	 *	Clear vlans that are not present in the new filter
+	 */
+	for_each_set_bit(vid, vlan->vlan_filter, VLAN_N_VID) {
+		if (!test_bit(vid, vlans))
+			macvlan_clear_filter_vlan(dev, vid);
+	}
+
+	/*
+	 *	Set new vlans that came in the filter
+	 */
+	for_each_set_bit(vid, vlans, VLAN_N_VID) {
+		if (!test_bit(vid, vlan->vlan_filter))
+			macvlan_set_filter_vlan(dev, vid);
+	}
+
+	return 0;
+}
+
+static int macvlan_set_rx_filter_vlan(struct net_device *dev, int vf,
+				      struct nlattr *tb[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	int err;
+
+	if (vf != SELF_VF)
+		return -EINVAL;
+
+	switch (vlan->mode) {
+	case MACVLAN_MODE_PASSTHRU:
+		return macvlan_set_rx_filter_vlan_passthru(dev, vf, tb);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int macvlan_addr_in_hw_list(struct netdev_hw_addr_list *list,
+				   u8 *addr, int addrlen)
+{
+	struct netdev_hw_addr *ha;
+
+	netdev_hw_addr_list_for_each(ha, list) {
+		if (!memcmp(ha->addr, addr, addrlen))
+			return 1;
+	}
+
+	return 0;
+}
+
+static int macvlan_addr_in_attrs(struct nlattr *addr_list, u8 *addr,
+				 int addrlen)
+{
+	struct nlattr *addr_attr;
+	int addr_rem;
+
+	nla_for_each_nested(addr_attr, addr_list, addr_rem) {
+		if (!memcmp(nla_data(addr_attr), addr, addrlen))
+			return 1;
+	}
+
+	return 0;
+}
+
+static int macvlan_update_hw_addr_list(struct net_device *dev,
+				struct netdev_hw_addr_list *curr_addr_list,
+				int addr_list_type,
+				struct nlattr *new_addr_attrs)
+{
+	struct nlattr *addr_attr;
+	int addr_rem;
+	u8 *addr;
+	int alen, i;
+	int err = 0;
+
+	if (!netdev_hw_addr_list_empty(curr_addr_list)) {
+		struct netdev_hw_addr *ha;
+		u8 *del_addrlist;
+		int del_addr_count = 0;
+
+		alen = ETH_ALEN * netdev_hw_addr_list_count(curr_addr_list);
+		del_addrlist = kmalloc(alen, GFP_KERNEL);
+		if (!del_addrlist) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+
+		/*
+		 *	Get the addresses that need to be deleted
+		 */
+		netdev_hw_addr_list_for_each(ha, curr_addr_list) {
+			if (!macvlan_addr_in_attrs(new_addr_attrs, ha->addr,
+				ETH_ALEN))
+				memcpy(del_addrlist + (del_addr_count++ *
+					ETH_ALEN), ha->addr, ETH_ALEN);
+		}
+
+		/*
+		 * Delete addresses
+		 */
+		for (i = 0, addr = del_addrlist; i < del_addr_count && addr;
+			i++, addr += ETH_ALEN) {
+			if (addr_list_type == NETDEV_HW_ADDR_T_UNICAST)
+				dev_uc_del(dev, addr);
+			else if (addr_list_type == NETDEV_HW_ADDR_T_MULTICAST)
+				dev_mc_del(dev, addr);
+		}
+		kfree(del_addrlist);
+	}
+
+	/* Add new addresses */
+	nla_for_each_nested(addr_attr, new_addr_attrs, addr_rem) {
+		if (!macvlan_addr_in_hw_list(curr_addr_list,
+			nla_data(addr_attr), ETH_ALEN)) {
+			if (addr_list_type == NETDEV_HW_ADDR_T_UNICAST)
+				dev_uc_add(dev, nla_data(addr_attr));
+			else if (addr_list_type == NETDEV_HW_ADDR_T_MULTICAST)
+				dev_mc_add(dev, nla_data(addr_attr));
+		}
+	}
+
+	return 0;
+
+err_out:
+	return err;
+}
+
+static int macvlan_set_rx_filter_addr_passthru(struct net_device *dev,
+					       int vf, struct nlattr *tb[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+	const struct net_device_ops *ops = lowerdev->netdev_ops;
+	unsigned int flags, flags_changed;
+	int err;
+
+	if (ops->ndo_set_rx_filter_addr)
+		return ops->ndo_set_rx_filter_addr(vlan->lowerdev, vf, tb);
+
+	if (tb[IFLA_RX_FILTER_ADDR_FLAGS]) {
+		flags = nla_get_u32(tb[IFLA_RX_FILTER_ADDR_FLAGS]);
+
+		flags_changed = (dev->flags ^ flags) & RX_FILTER_FLAGS;
+		if (flags_changed)
+			dev_change_flags(dev, dev->flags ^ flags_changed);
+	}
+
+	if (tb[IFLA_RX_FILTER_ADDR_UC_LIST]) {
+		err = macvlan_update_hw_addr_list(dev, &dev->uc,
+				NETDEV_HW_ADDR_T_UNICAST,
+				tb[IFLA_RX_FILTER_ADDR_UC_LIST]);
+		if (err)
+			return err;
+	}
+
+	if (tb[IFLA_RX_FILTER_ADDR_MC_LIST]) {
+		err = macvlan_update_hw_addr_list(dev, &dev->mc,
+				NETDEV_HW_ADDR_T_MULTICAST,
+				tb[IFLA_RX_FILTER_ADDR_MC_LIST]);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int macvlan_validate_rx_filter_addr(struct net_device *dev, int vf,
+					   struct nlattr *tb[])
+{
+	struct nlattr *addr_attr;
+	int addr_rem;
+
+	if (vf != SELF_VF)
+		return -EINVAL;
+
+	if (tb[IFLA_RX_FILTER_ADDR_UC_LIST]) {
+		nla_for_each_nested(addr_attr, tb[IFLA_RX_FILTER_ADDR_UC_LIST],
+				    addr_rem) {
+			if ((nla_type(addr_attr) != IFLA_ADDR_LIST_ENTRY) ||
+				!is_unicast_ether_addr(nla_data(addr_attr)))
+				return -EINVAL;
+		}
+	}
+
+	if (tb[IFLA_RX_FILTER_ADDR_MC_LIST]) {
+		nla_for_each_nested(addr_attr, tb[IFLA_RX_FILTER_ADDR_MC_LIST],
+				    addr_rem) {
+			if ((nla_type(addr_attr) != IFLA_ADDR_LIST_ENTRY) ||
+				!is_multicast_ether_addr(nla_data(addr_attr)))
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int macvlan_set_rx_filter_addr(struct net_device *dev, int vf,
+				      struct nlattr *tb[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	int err;
+
+	err = macvlan_validate_rx_filter_addr(dev, vf, tb);
+	if (err)
+		return err;
+
+	switch (vlan->mode) {
+	case MACVLAN_MODE_PASSTHRU:
+		return macvlan_set_rx_filter_addr_passthru(dev, vf, tb);
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static void macvlan_ethtool_get_drvinfo(struct net_device *dev,
 					struct ethtool_drvinfo *drvinfo)
 {
@@ -564,19 +824,21 @@ static const struct ethtool_ops macvlan_ethtool_ops = {
 };
 
 static const struct net_device_ops macvlan_netdev_ops = {
-	.ndo_init		= macvlan_init,
-	.ndo_uninit		= macvlan_uninit,
-	.ndo_open		= macvlan_open,
-	.ndo_stop		= macvlan_stop,
-	.ndo_start_xmit		= macvlan_start_xmit,
-	.ndo_change_mtu		= macvlan_change_mtu,
-	.ndo_change_rx_flags	= macvlan_change_rx_flags,
-	.ndo_set_mac_address	= macvlan_set_mac_address,
-	.ndo_set_rx_mode	= macvlan_set_multicast_list,
-	.ndo_get_stats64	= macvlan_dev_get_stats64,
-	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_vlan_rx_add_vid	= macvlan_vlan_rx_add_vid,
-	.ndo_vlan_rx_kill_vid	= macvlan_vlan_rx_kill_vid,
+	.ndo_init			= macvlan_init,
+	.ndo_uninit			= macvlan_uninit,
+	.ndo_open			= macvlan_open,
+	.ndo_stop			= macvlan_stop,
+	.ndo_start_xmit			= macvlan_start_xmit,
+	.ndo_change_mtu			= macvlan_change_mtu,
+	.ndo_change_rx_flags		= macvlan_change_rx_flags,
+	.ndo_set_mac_address		= macvlan_set_mac_address,
+	.ndo_set_rx_mode		= macvlan_set_rx_mode,
+	.ndo_get_stats64		= macvlan_dev_get_stats64,
+	.ndo_validate_addr		= eth_validate_addr,
+	.ndo_vlan_rx_add_vid		= macvlan_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid		= macvlan_vlan_rx_kill_vid,
+	.ndo_set_rx_filter_addr		= macvlan_set_rx_filter_addr,
+	.ndo_set_rx_filter_vlan		= macvlan_set_rx_filter_vlan,
 };
 
 void macvlan_common_setup(struct net_device *dev)
@@ -584,6 +846,7 @@ void macvlan_common_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->priv_flags	       &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+	dev->priv_flags	       |= IFF_UNICAST_FLT;
 	dev->netdev_ops		= &macvlan_netdev_ops;
 	dev->destructor		= free_netdev;
 	dev->header_ops		= &macvlan_hard_header_ops,
@@ -711,6 +974,8 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	if (data && data[IFLA_MACVLAN_MODE])
 		vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
 
+	memset(vlan->vlan_filter, 0, VLAN_BITMAP_SIZE);
+
 	if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
 		if (port->count)
 			return -EINVAL;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index d103dca..c0d84a5 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -7,6 +7,7 @@
 #include <linux/netlink.h>
 #include <net/netlink.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/if_vlan.h>
 
 #if defined(CONFIG_MACVTAP) || defined(CONFIG_MACVTAP_MODULE)
 struct socket *macvtap_get_socket(struct file *);
@@ -65,6 +66,7 @@ struct macvlan_dev {
 	struct macvtap_queue	*taps[MAX_MACVTAP_QUEUES];
 	int			numvtaps;
 	int			minor;
+	unsigned long		vlan_filter[BITS_TO_LONGS(VLAN_N_VID)];
 };
 
 static inline void macvlan_count_rx(const struct macvlan_dev *vlan,

^ permalink raw reply related

* [net-next-2.6 PATCH 2/6 v4] net: Add netdev_ops to set and get MAC/VLAN rx filters
From: Roopa Prabhu @ 2011-11-09  7:55 UTC (permalink / raw)
  To: netdev, davem
  Cc: chrisw, sri, dragos.tatulea, kvm, arnd, mst, gregory.v.rose,
	mchan, dwang2, shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111109075449.13549.58135.stgit@rhel6.1>

From: Roopa Prabhu <roprabhu@cisco.com>

This patch adds the following netdev_ops to set and get MAC/VLAN
filters on a SRIOV VF or any netdev interface. Each op takes a vf argument.
vf value of SELF_VF or -1 is for applying the operation directly on the
interface.

ndo_set_rx_filter_addr - to set address filter
ndo_get_rx_filter_addr_size - to get address filter size
ndo_get_rx_filter_addr - to get address filter

ndo_set_rx_filter_vlan - to set vlan filter
ndo_get_rx_filter_vlan_size - to get vlan filter size
ndo_get_rx_filter_vlan - to get vlan filter

Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
Signed-off-by: Christian Benvenuti <benve@cisco.com>
Signed-off-by: David Wang <dwang2@cisco.com>
---
 include/linux/netdevice.h |   32 ++++++++++++++++++++++++++++++++
 1 files changed, 32 insertions(+), 0 deletions(-)


diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cbeb586..3cbd700 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -855,6 +855,20 @@ struct netdev_tc_txq {
  *	feature set might be less than what was returned by ndo_fix_features()).
  *	Must return >0 or -errno if it changed dev->features itself.
  *
+ * Address Filter management functions:
+ * int (*ndo_set_rx_filter_addr)(struct net_device *dev, int vf,
+ *				 struct nlattr *tb[]);
+ * size_t (*ndo_get_rx_filter_addr_size)(const struct net_device *dev, int vf);
+ * int (*ndo_get_rx_filter_addr)(const struct net_device *dev, int vf,
+ *				 struct sk_buff *skb);
+ *
+ * Vlan Filter management functions:
+ * int (*ndo_set_rx_filter_vlan)(struct net_device *dev, int vf,
+ *				 struct nlattr *tb[]);
+ * size_t (*ndo_get_rx_filter_vlan_size)(const struct net_device *dev, int vf);
+ * int (*ndo_get_rx_filter_vlan)(const struct net_device *dev, int vf,
+ *				 struct sk_buff *skb);
+ *
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -948,6 +962,24 @@ struct net_device_ops {
 						    u32 features);
 	int			(*ndo_set_features)(struct net_device *dev,
 						    u32 features);
+	int			(*ndo_set_rx_filter_addr)(
+						struct net_device *dev, int vf,
+						struct nlattr *tb[]);
+	size_t			(*ndo_get_rx_filter_addr_size)(
+						const struct net_device *dev,
+						int vf);
+	int			(*ndo_get_rx_filter_addr)(
+						const struct net_device *dev,
+						int vf, struct sk_buff *skb);
+	int			(*ndo_set_rx_filter_vlan)(
+						struct net_device *dev, int vf,
+						struct nlattr *tb[]);
+	size_t			(*ndo_get_rx_filter_vlan_size)(
+						const struct net_device *dev,
+						int vf);
+	int			(*ndo_get_rx_filter_vlan)(
+						const struct net_device *dev,
+						int vf, struct sk_buff *skb);
 };
 
 /*

^ permalink raw reply related

* [net-next-2.6 PATCH 1/6 v4] rtnetlink: Netlink interface for setting MAC and VLAN filters
From: Roopa Prabhu @ 2011-11-09  7:55 UTC (permalink / raw)
  To: netdev, davem
  Cc: chrisw, sri, dragos.tatulea, kvm, arnd, mst, gregory.v.rose,
	mchan, dwang2, shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111109075449.13549.58135.stgit@rhel6.1>

From: Roopa Prabhu <roprabhu@cisco.com>

This patch introduces the following netlink interface to set
MAC and VLAN filters on a network interface. It can be used to
set an RX filter on any network interface (if supported by the driver) and
also on an SRIOV VF via its PF.

Interface to set RX filter on a SRIOV VF
[IFLA_VF_RX_FILTERS] = {
	[IFLA_VF_RX_FILTER] = {
		[IFLA_RX_FILTER_VF]
		[IFLA_RX_FILTER_ADDR] = {
			[IFLA_RX_FILTER_ADDR_FLAGS]
			[IFLA_RX_FILTER_ADDR_UC_LIST] = {
				[IFLA_ADDR_LIST_ENTRY]
			}
			[IFLA_RX_FILTER_ADDR_MC_LIST] = {
				[IFLA_ADDR_LIST_ENTRY]
			}
		}
		[IFLA_RX_FILTER_VLAN] = {
			[IFLA_RX_FILTER_VLAN_BITMAP]
		}
	}
	...
}

Interface to set RX filter on any network interface:
[IFLA_RX_FILTER] = {
	[IFLA_RX_FILTER_VF]
	[IFLA_RX_FILTER_ADDR] = {
		[IFLA_RX_FILTER_ADDR_FLAGS]
		[IFLA_RX_FILTER_ADDR_UC_LIST] = {
			[IFLA_ADDR_LIST_ENTRY]
		}
		[IFLA_RX_FILTER_ADDR_MC_LIST] = {
			[IFLA_ADDR_LIST_ENTRY]
		}
	}
	[IFLA_RX_FILTER_VLAN] = {
		[IFLA_RX_FILTER_VLAN_BITMAP]
	}
}

Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
Signed-off-by: Christian Benvenuti <benve@cisco.com>
Signed-off-by: David Wang <dwang2@cisco.com>
---
 include/linux/if_link.h |   61 +++++++++++++++++++++++++++++++++++++++++++++++
 net/core/rtnetlink.c    |   20 +++++++++++++++
 2 files changed, 81 insertions(+), 0 deletions(-)


diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index c52d4b5..74a9f17 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -137,6 +137,8 @@ enum {
 	IFLA_AF_SPEC,
 	IFLA_GROUP,		/* Group the device belongs to */
 	IFLA_NET_NS_FD,
+	IFLA_VF_RX_FILTERS,
+	IFLA_RX_FILTER,
 	__IFLA_MAX
 };
 
@@ -390,4 +392,63 @@ struct ifla_port_vsi {
 	__u8 pad[3];
 };
 
+/* VF rx filters management section
+ *
+ *	Nested layout of set/get msg is:
+ *
+ *	[IFLA_VF_RX_FILTERS]
+ *		[IFLA_VF_RX_FILTER]
+ *			[IFLA_RX_FILTER_*], ...
+ *		[IFLA_VF_RX_FILTER]
+ *			[IFLA_RX_FILTER_*], ...
+ *		...
+ *	[IFLA_RX_FILTER]
+ *		[IFLA_RX_FILTER_*], ...
+ */
+enum {
+	IFLA_VF_RX_FILTER_UNSPEC,
+	IFLA_VF_RX_FILTER,			/* nest */
+	__IFLA_VF_RX_FILTER_MAX,
+};
+
+#define IFLA_VF_RX_FILTER_MAX (__IFLA_VF_RX_FILTER_MAX - 1)
+
+enum {
+	IFLA_RX_FILTER_UNSPEC,
+	IFLA_RX_FILTER_VF,		/* __u32 */
+	IFLA_RX_FILTER_ADDR,
+	IFLA_RX_FILTER_VLAN,
+	__IFLA_RX_FILTER_MAX,
+};
+#define IFLA_RX_FILTER_MAX (__IFLA_RX_FILTER_MAX - 1)
+
+enum {
+	IFLA_RX_FILTER_ADDR_UNSPEC,
+	IFLA_RX_FILTER_ADDR_FLAGS,
+	IFLA_RX_FILTER_ADDR_UC_LIST,
+	IFLA_RX_FILTER_ADDR_MC_LIST,
+	__IFLA_RX_FILTER_ADDR_MAX,
+};
+#define IFLA_RX_FILTER_ADDR_MAX (__IFLA_RX_FILTER_ADDR_MAX - 1)
+
+#define RX_FILTER_FLAGS (IFF_UP | IFF_BROADCAST | IFF_MULTICAST | \
+				IFF_PROMISC | IFF_ALLMULTI)
+
+enum {
+	IFLA_ADDR_LIST_UNSPEC,
+	IFLA_ADDR_LIST_ENTRY,
+	__IFLA_ADDR_LIST_MAX,
+};
+#define IFLA_ADDR_LIST_MAX (__IFLA_ADDR_LIST_MAX - 1)
+
+enum {
+	IFLA_RX_FILTER_VLAN_UNSPEC,
+	IFLA_RX_FILTER_VLAN_BITMAP,
+	__IFLA_RX_FILTER_VLAN_MAX,
+};
+#define IFLA_RX_FILTER_VLAN_MAX (__IFLA_RX_FILTER_VLAN_MAX - 1)
+
+#define VLAN_BITMAP_SPLIT_MAX 8
+#define VLAN_BITMAP_SIZE	(VLAN_N_VID/VLAN_BITMAP_SPLIT_MAX)
+
 #endif /* _LINUX_IF_LINK_H */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9083e82..9eead8e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -42,6 +42,7 @@
 
 #include <linux/inet.h>
 #include <linux/netdevice.h>
+#include <linux/if_vlan.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/arp.h>
@@ -1097,6 +1098,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },
 	[IFLA_PORT_SELF]	= { .type = NLA_NESTED },
 	[IFLA_AF_SPEC]		= { .type = NLA_NESTED },
+	[IFLA_VF_RX_FILTERS]	= { .type = NLA_NESTED },
+	[IFLA_RX_FILTER]	= { .type = NLA_NESTED },
 };
 EXPORT_SYMBOL(ifla_policy);
 
@@ -1132,6 +1135,23 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 	[IFLA_PORT_RESPONSE]	= { .type = NLA_U16, },
 };
 
+static const struct nla_policy ifla_rx_filter_policy[IFLA_RX_FILTER_MAX+1] = {
+	[IFLA_RX_FILTER_VF]	= { .type = NLA_U32 },
+	[IFLA_RX_FILTER_ADDR]	= { .type = NLA_NESTED },
+	[IFLA_RX_FILTER_VLAN]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_addr_filter_policy[IFLA_RX_FILTER_ADDR_MAX+1] = {
+	[IFLA_RX_FILTER_ADDR_FLAGS]	= { .type = NLA_U32 },
+	[IFLA_RX_FILTER_ADDR_UC_LIST]	= { .type = NLA_NESTED },
+	[IFLA_RX_FILTER_ADDR_MC_LIST]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vlan_filter_policy[IFLA_RX_FILTER_VLAN_MAX+1] = {
+	[IFLA_RX_FILTER_VLAN_BITMAP]	= { .type = NLA_BINARY,
+					    .len = VLAN_BITMAP_SIZE },
+};
+
 struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
 {
 	struct net *net;

^ permalink raw reply related

* [PATCH net-next] ipv4: reduce percpu needs for icmpmsg mibs
From: Eric Dumazet @ 2011-11-08 23:04 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Reading /proc/net/snmp on a machine with a lot of cpus is very expensive
(can be ~88000 us).

This is because ICMPMSG MIB uses 4096 bytes per cpu, and folding values
for all possible cpus can read 16 Mbytes of memory.

ICMP messages are not considered fast path on a typical server, and
only a few cpus handle them anyway. We can afford an atomic
operation instead of using percpu data.

This saves 4096 bytes per cpu and per network namespace.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
If this patch is accepted, I'll submit the IPv6 part as well.

 include/net/icmp.h      |    4 ++--
 include/net/netns/mib.h |    2 +-
 include/net/snmp.h      |    2 +-
 net/ipv4/af_inet.c      |    8 ++++----
 net/ipv4/proc.c         |    9 ++++-----
 5 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/include/net/icmp.h b/include/net/icmp.h
index f0698b9..75d6156 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -31,8 +31,8 @@ struct icmp_err {
 extern const struct icmp_err icmp_err_convert[];
 #define ICMP_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.icmp_statistics, field)
 #define ICMP_INC_STATS_BH(net, field)	SNMP_INC_STATS_BH((net)->mib.icmp_statistics, field)
-#define ICMPMSGOUT_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.icmpmsg_statistics, field+256)
-#define ICMPMSGIN_INC_STATS_BH(net, field)	SNMP_INC_STATS_BH((net)->mib.icmpmsg_statistics, field)
+#define ICMPMSGOUT_INC_STATS(net, field)	SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, field+256)
+#define ICMPMSGIN_INC_STATS_BH(net, field)	SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, field)
 
 struct dst_entry;
 struct net_proto_family;
diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h
index 0b44112..f360135 100644
--- a/include/net/netns/mib.h
+++ b/include/net/netns/mib.h
@@ -10,7 +10,7 @@ struct netns_mib {
 	DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
 	DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics);
 	DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
-	DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics);
+	DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics);
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct proc_dir_entry *proc_net_devsnmp6;
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 8f0f9ac..0feafa6 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -67,7 +67,7 @@ struct icmp_mib {
 
 #define ICMPMSG_MIB_MAX	__ICMPMSG_MIB_MAX
 struct icmpmsg_mib {
-	unsigned long	mibs[ICMPMSG_MIB_MAX];
+	atomic_long_t	mibs[ICMPMSG_MIB_MAX];
 };
 
 /* ICMP6 (IPv6-ICMP) */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1b5096a..b2bbcd0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1572,9 +1572,9 @@ static __net_init int ipv4_mib_init_net(struct net *net)
 			  sizeof(struct icmp_mib),
 			  __alignof__(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
-			  sizeof(struct icmpmsg_mib),
-			  __alignof__(struct icmpmsg_mib)) < 0)
+	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+					      GFP_KERNEL);
+	if (!net->mib.icmpmsg_statistics)
 		goto err_icmpmsg_mib;
 
 	tcp_mib_init(net);
@@ -1598,7 +1598,7 @@ err_tcp_mib:
 
 static __net_exit void ipv4_mib_exit_net(struct net *net)
 {
-	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+	kfree(net->mib.icmpmsg_statistics);
 	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
 	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
 	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 466ea8b..961eed4 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -288,7 +288,7 @@ static void icmpmsg_put(struct seq_file *seq)
 
 	count = 0;
 	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
-		val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
+		val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
 		if (val) {
 			type[count] = i;
 			vals[count++] = val;
@@ -307,6 +307,7 @@ static void icmp_put(struct seq_file *seq)
 {
 	int i;
 	struct net *net = seq->private;
+	atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
 
 	seq_puts(seq, "\nIcmp: InMsgs InErrors");
 	for (i=0; icmpmibmap[i].name != NULL; i++)
@@ -319,15 +320,13 @@ static void icmp_put(struct seq_file *seq)
 		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
-				icmpmibmap[i].index));
+			   atomic_long_read(ptr + icmpmibmap[i].index));
 	seq_printf(seq, " %lu %lu",
 		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
 		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
-				icmpmibmap[i].index | 0x100));
+			   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
 }
 
 /*

^ permalink raw reply related

* Re: [PATCH] neigh: increase unres_qlen by one magnitude
From: David Miller @ 2011-11-08 22:48 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1320792301.26025.21.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 08 Nov 2011 23:45:01 +0100

> Maybe we can do the same for unres_qlen, and set up a byte limit instead
> of a 3-packet limit (say 64Kbytes of truesize per destination)

That makes sense because what typically gets blocked are initial TCP SYNs,
small UDP requests, and ICMPs.

^ permalink raw reply

* Re: [PATCH] neigh: increase unres_qlen by one magnitude
From: Eric Dumazet @ 2011-11-08 22:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111108.172448.2101648110512252549.davem@davemloft.net>

Le mardi 08 novembre 2011 à 17:24 -0500, David Miller a écrit :

> Pretty risky don't you think?
> 

Yes :)

> So now we'll allow essentially any remote machine to force us to hold
> on to memory on the order of (32 * num_ips_in_subnet) for each IP
> address configured.
> 

Exact limit is 32 * min(num_ips_in_subnet, 1024) : 32768 packets

(because gc_thresh3 is 1024 : max allowed number of neighbors)

> Just spam UDP or ICMP packets with a source address iterating over
> addresses in one of the host's subnets.  If the subnet space is
> relatively large, chances are 99% of those IPs won't respond to ARP
> and we'll queue up the ICMP replies.
> 
> Probably what will trigger first, actually, is we'll hit the per-cpu
> ICMP socket send buffer limit.  Because we won't even get to the
> point in the TX path where we will early orphan the SKB.
> 
> So essentially this will stop ICMP responses completely for all
> traffic processed on that cpu.
> 
> I realize you're trying to address a very real problem, but I'm just
> not sure at all that unilaterally increasing the value like this is
> safe.

Since you speak of icmp sock, its limit is more governed by cumulative
skb truesizes.

Maybe we can do the same for unres_qlen, and set up a byte limit instead
of a 3-packet limit (say 64Kbytes of truesize per destination)

^ permalink raw reply

* [PATCH] r8169: more driver shutdown WoL regression.
From: Francois Romieu @ 2011-11-08 22:35 UTC (permalink / raw)
  To: netdev; +Cc: Stefan Becker, David Miller, Hayes

Almost the same narrative as 649b3b8c4e8681de443b4dc9e387c3036369e02e
but with more experimental data.

Stefan Becker has reported that the same kind of fix as the one
introduced in 649b3b8c4e8681de443b4dc9e387c3036369e02e ("r8169: fix
driver shutdown WoL regression") before 3.1 was released is required
for his 8168c (RTL_GIGA_MAC_VER_22).

I have tested a few chipsets as well:
- without patch, shutdown + WoL works fine for :
  o RTL_GIGA_MAC_VER_30 (8105e and 8105evc)
  o RTL_GIGA_MAC_VER_33 (8168ed)
  o RTL_GIGA_MAC_VER_34 (8168evl)
  o RTL_GIGA_MAC_VER_35 (8168f)
  o RTL_GIGA_MAC_VER_06 (plain old PCI 8169sc)
- without patch, shutdown + WoL is broken with :
  o RTL_GIGA_MAC_VER_26 (8168d-vb-gr)
  o RTL_GIGA_MAC_VER_25 (8168d-gr)
  o RTL_GIGA_MAC_VER_12 (8168b)
  o RTL_GIGA_MAC_VER_09 (both 8102e-vb-gr and 8103e-gr)

I have widened rtl_wol_suspend_quirk a bit beyond those data to include
a broader subset of chipsets from the same families, thus including the
8168cp and 8168dp.

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
Tested-by: Stefan Becker <chemobejk@gmail.com>
Cc: Hayes <hayeswang@realtek.com>
---

 Hayes, any insight ?

 drivers/net/ethernet/realtek/r8169.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 92b45f0..829674d 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -3496,6 +3496,18 @@ static void rtl_wol_suspend_quirk(struct rtl8169_private *tp)
 	void __iomem *ioaddr = tp->mmio_addr;

 	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_07:
+	case RTL_GIGA_MAC_VER_08:
+	case RTL_GIGA_MAC_VER_09:
+	case RTL_GIGA_MAC_VER_11:
+	case RTL_GIGA_MAC_VER_12:
+	case RTL_GIGA_MAC_VER_17:
+	case RTL_GIGA_MAC_VER_19:
+	case RTL_GIGA_MAC_VER_20:
+	case RTL_GIGA_MAC_VER_21:
+	case RTL_GIGA_MAC_VER_22:
+	case RTL_GIGA_MAC_VER_25:
+	case RTL_GIGA_MAC_VER_26:
 	case RTL_GIGA_MAC_VER_29:
 	case RTL_GIGA_MAC_VER_30:
 	case RTL_GIGA_MAC_VER_32:
-- 
1.7.6.4

^ permalink raw reply related

* Re: [PATCH] neigh: increase unres_qlen by one magnitude
From: David Miller @ 2011-11-08 22:24 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1320783085.2588.17.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 08 Nov 2011 21:11:25 +0100

> unres_qlen is the number of frames we are able to queue per unresolved
> neighbour. Its default value (3) was never changed and is responsible
> for strange drops, especially if IP fragments are used, or multiple
> sessions start in parallel. TCP initial congestion window is now bigger
> than 3.
> 
> $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
> PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
> 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
> 
> --- 192.168.20.108 ping statistics ---
> 2 packets transmitted, 1 received, 50% packet loss, time 1001ms
> rtt min/avg/max/mdev = 0.322/0.322/0.322/0.000 ms
> 
> Since available memory per machine increased quite a lot since 1999, I
> believe it's safe to expand unres_qlen to a more reasonable value.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Pretty risky don't you think?

So now we'll allow essentially any remote machine to force us to hold
on to memory on the order of (32 * num_ips_in_subnet) for each IP
address configured.

Just spam UDP or ICMP packets with a source address iterating over
addresses in one of the host's subnets.  If the subnet space is
relatively large, chances are 99% of those IPs won't respond to ARP
and we'll queue up the ICMP replies.

Probably what will trigger first, actually, is we'll hit the per-cpu
ICMP socket send buffer limit.  Because we won't even get to the
point in the TX path where we will early orphan the SKB.

So essentially this will stop ICMP responses completely for all
traffic processed on that cpu.

I realize you're trying to address a very real problem, but I'm just
not sure at all that unilaterally increasing the value like this is
safe.

^ permalink raw reply

* Re: [PATCH net-next] net: rename sk_clone to sk_clone_lock
From: David Miller @ 2011-11-08 22:14 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1320789608.26025.14.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 08 Nov 2011 23:00:08 +0100

> Make clear that sk_clone() and inet_csk_clone() return a locked socket.
> 
> Add _lock suffix and kerneldoc.
> 
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks for following up on this Eric.

^ permalink raw reply

* [PATCH 2/2] ah: Read nexthdr value before overwriting it in ahash input callback.
From: Nick Bowler @ 2011-11-08 22:12 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: David S. Miller
In-Reply-To: <1320790365-29152-1-git-send-email-nbowler@elliptictech.com>

The AH4/6 ahash input callbacks read out the nexthdr field from the AH
header *after* they overwrite that header.  This is obviously not going
to end well.  Fix it up.

Signed-off-by: Nick Bowler <nbowler@elliptictech.com>
---
 net/ipv4/ah4.c |    4 ++--
 net/ipv6/ah6.c |    4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 33ca186..c7056b2 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -262,12 +262,12 @@ static void ah_input_done(struct crypto_async_request *base, int err)
 	if (err)
 		goto out;
 
+	err = ah->nexthdr;
+
 	skb->network_header += ah_hlen;
 	memcpy(skb_network_header(skb), work_iph, ihl);
 	__skb_pull(skb, ah_hlen + ihl);
 	skb_set_transport_header(skb, -ihl);
-
-	err = ah->nexthdr;
 out:
 	kfree(AH_SKB_CB(skb)->tmp);
 	xfrm_input_resume(skb, err);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index ede4d9d..7a33aaa 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -464,12 +464,12 @@ static void ah6_input_done(struct crypto_async_request *base, int err)
 	if (err)
 		goto out;
 
+	err = ah->nexthdr;
+
 	skb->network_header += ah_hlen;
 	memcpy(skb_network_header(skb), work_iph, hdr_len);
 	__skb_pull(skb, ah_hlen + hdr_len);
 	skb_set_transport_header(skb, -hdr_len);
-
-	err = ah->nexthdr;
 out:
 	kfree(AH_SKB_CB(skb)->tmp);
 	xfrm_input_resume(skb, err);
-- 
1.7.3.4

^ permalink raw reply related

* [PATCH 1/2] ah: Correctly pass error codes in ahash output callback.
From: Nick Bowler @ 2011-11-08 22:12 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: David S. Miller
In-Reply-To: <1320790365-29152-1-git-send-email-nbowler@elliptictech.com>

The AH4/6 ahash output callbacks pass nexthdr to xfrm_output_resume
instead of the error code.  This appears to be a copy+paste error from
the input case, where nexthdr is expected.  This causes the driver to
continuously add AH headers to the datagram until either an allocation
fails and the packet is dropped or the ahash driver hits a synchronous
fallback and the resulting monstrosity is transmitted.

Correct this issue by simply passing the error code unadulterated.

Signed-off-by: Nick Bowler <nbowler@elliptictech.com>
---
 net/ipv4/ah4.c |    2 --
 net/ipv6/ah6.c |    2 --
 2 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index c1f4154..33ca186 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -136,8 +136,6 @@ static void ah_output_done(struct crypto_async_request *base, int err)
 		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
 	}

-	err = ah->nexthdr;
-
 	kfree(AH_SKB_CB(skb)->tmp);
 	xfrm_output_resume(skb, err);
 }
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 2195ae6..ede4d9d 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -324,8 +324,6 @@ static void ah6_output_done(struct crypto_async_request *base, int err)
 #endif
 	}

-	err = ah->nexthdr;
-
 	kfree(AH_SKB_CB(skb)->tmp);
 	xfrm_output_resume(skb, err);
 }
-- 
1.7.3.4

^ permalink raw reply related

* [PATCH 0/2] AH fixes for asynchronous hash algorithms.
From: Nick Bowler @ 2011-11-08 22:12 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: David S. Miller

Here are two fixes for AH when using an asynchronous hmac driver.  Both
are -stable candidates as these problems appear to have been present
since AH was converted to use ahash way back in 2.6.33.

These code paths are not exercised when using the default software hash
implementations which do not use the ahash callbacks, but the issues can be
reproduced by using cryptd to create an asynchronous hash algorithm for
testing.

This driver could probably do with some cleanups to reduce the code
duplication (and thus improve test coverage) between the asynchronous
callbacks and synchronous code paths, which should help avoid these kinds
of problems in the future.  These code paths apparently do not see a
lot of testing.  But that's for a later patch series.

Nick Bowler (2):
  ah: Correctly pass error codes in ahash output callback.
  ah: Read nexthdr value before overwriting it in ahash input callback.

 net/ipv4/ah4.c |    6 ++----
 net/ipv6/ah6.c |    6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

-- 
1.7.3.4

^ permalink raw reply

* Re: [PATCH net-next] sch_choke: use skb_header_pointer()
From: David Miller @ 2011-11-08 22:04 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1320785104.26025.5.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 08 Nov 2011 21:45:04 +0100

> Remove the assumption that skb_get_rxhash() makes IP header and ports
> linear, and use skb_header_pointer() instead in choke_match_flow()
> 
> This permits __skb_get_rxhash() to use skb_header_pointer() eventually.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks Eric.

^ permalink raw reply

* [PATCH net-next] net: rename sk_clone to sk_clone_lock
From: Eric Dumazet @ 2011-11-08 22:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Make clear that sk_clone() and inet_csk_clone() return a locked socket.

Add _lock suffix and kerneldoc.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/inet_connection_sock.h |    6 +++---
 include/net/sock.h                 |    4 ++--
 net/core/sock.c                    |   11 +++++++++--
 net/dccp/minisocks.c               |    2 +-
 net/ipv4/inet_connection_sock.c    |   17 +++++++++++++----
 net/ipv4/tcp_minisocks.c           |    2 +-
 6 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e6db62e..dbf9aab 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -143,9 +143,9 @@ static inline void *inet_csk_ca(const struct sock *sk)
 	return (void *)inet_csk(sk)->icsk_ca_priv;
 }
 
-extern struct sock *inet_csk_clone(struct sock *sk,
-				   const struct request_sock *req,
-				   const gfp_t priority);
+extern struct sock *inet_csk_clone_lock(const struct sock *sk,
+					const struct request_sock *req,
+					const gfp_t priority);
 
 enum inet_csk_ack_state_t {
 	ICSK_ACK_SCHED	= 1,
diff --git a/include/net/sock.h b/include/net/sock.h
index abb6e0f..67cd458 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1089,8 +1089,8 @@ extern struct sock		*sk_alloc(struct net *net, int family,
 					  struct proto *prot);
 extern void			sk_free(struct sock *sk);
 extern void			sk_release_kernel(struct sock *sk);
-extern struct sock		*sk_clone(const struct sock *sk,
-					  const gfp_t priority);
+extern struct sock		*sk_clone_lock(const struct sock *sk,
+					       const gfp_t priority);
 
 extern struct sk_buff		*sock_wmalloc(struct sock *sk,
 					      unsigned long size, int force,
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ed7b1d..e419061 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1204,7 +1204,14 @@ void sk_release_kernel(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_release_kernel);
 
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+/**
+ *	sk_clone_lock - clone a socket, and lock its clone
+ *	@sk: the socket to clone
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 {
 	struct sock *newsk;
 
@@ -1297,7 +1304,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 out:
 	return newsk;
 }
-EXPORT_SYMBOL_GPL(sk_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index d7041a0..563b7c7 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -100,7 +100,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 	 *   (* Generate a new socket and switch to that socket *)
 	 *   Set S := new socket for this port pair
 	 */
-	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
 
 	if (newsk != NULL) {
 		struct dccp_request_sock *dreq = dccp_rsk(req);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c14d88a..a598768 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -588,10 +588,19 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 }
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
 
-struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
-			    const gfp_t priority)
+/**
+ *	inet_csk_clone_lock - clone an inet socket, and lock its clone
+ *	@sk: the socket to clone
+ *	@req: request_sock
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+				 const struct request_sock *req,
+				 const gfp_t priority)
 {
-	struct sock *newsk = sk_clone(sk, priority);
+	struct sock *newsk = sk_clone_lock(sk, priority);
 
 	if (newsk != NULL) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -615,7 +624,7 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 	}
 	return newsk;
 }
-EXPORT_SYMBOL_GPL(inet_csk_clone);
+EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
 
 /*
  * At this point, there should be no process reference to this
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 66363b6..0a7e339 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -425,7 +425,7 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
-	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
 
 	if (newsk != NULL) {
 		const struct inet_request_sock *ireq = inet_rsk(req);

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox