Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/26] rdma/cm: define native IB address
From: Sean Hefty @ 2010-04-01 17:08 UTC (permalink / raw)
  To: linux-rdma, 'Linux Netdev List'

Define AF_IB and sockaddr_ib to allow the rdma_cm to use native IB
addressing.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
---

 include/linux/socket.h |    2 +
 include/rdma/ib.h      |   89 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 0 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 7b3aae2..966e268 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -181,6 +181,7 @@ struct ucred {
 #define AF_PPPOX	24	/* PPPoX sockets		*/
 #define AF_WANPIPE	25	/* Wanpipe API Sockets */
 #define AF_LLC		26	/* Linux LLC			*/
+#define AF_IB		27	/* Native InfiniBand address	*/
 #define AF_CAN		29	/* Controller Area Network      */
 #define AF_TIPC		30	/* TIPC sockets			*/
 #define AF_BLUETOOTH	31	/* Bluetooth sockets 		*/
@@ -221,6 +222,7 @@ struct ucred {
 #define PF_PPPOX	AF_PPPOX
 #define PF_WANPIPE	AF_WANPIPE
 #define PF_LLC		AF_LLC
+#define PF_IB		AF_IB
 #define PF_CAN		AF_CAN
 #define PF_TIPC		AF_TIPC
 #define PF_BLUETOOTH	AF_BLUETOOTH
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
new file mode 100644
index 0000000..cf8f9e7
--- /dev/null
+++ b/include/rdma/ib.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(_RDMA_IB_H)
+#define _RDMA_IB_H
+
+#include <linux/types.h>
+
+struct ib_addr {
+	union {
+		__u8		uib_addr8[16];
+		__be16		uib_addr16[8];
+		__be32		uib_addr32[4];
+		__be64		uib_addr64[2];
+	} ib_u;
+#define sib_addr8		ib_u.uib_addr8
+#define sib_addr16		ib_u.uib_addr16
+#define sib_addr32		ib_u.uib_addr32
+#define sib_addr64		ib_u.uib_addr64
+#define sib_raw			ib_u.uib_addr8
+#define sib_subnet_prefix	ib_u.uib_addr64[0]
+#define sib_interface_id	ib_u.uib_addr64[1]
+};
+
+static inline int ib_addr_any(const struct ib_addr *a)
+{
+	return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0);
+}
+
+static inline int ib_addr_loopback(const struct ib_addr *a)
+{
+	return ((a->sib_addr32[0] | a->sib_addr32[1] |
+		 a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0);
+}
+
+static inline void ib_addr_set(struct ib_addr *addr,
+			       __be32 w1, __be32 w2, __be32 w3, __be32 w4)
+{
+	addr->sib_addr32[0] = w1;
+	addr->sib_addr32[1] = w2;
+	addr->sib_addr32[2] = w3;
+	addr->sib_addr32[3] = w4;
+}
+
+static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2)
+{
+	return memcmp(a1, a2, sizeof(struct ib_addr));
+}
+
+struct sockaddr_ib {
+	unsigned short int	sib_family;	/* AF_IB */
+	__be16			sib_pkey;
+	__be32			sib_flowinfo;
+	struct ib_addr		sib_addr;
+	__be64			sib_sid;
+	__be64			sib_sid_mask;
+	__u64			sib_scope_id;
+};
+
+#endif /* _RDMA_IB_H */




^ permalink raw reply related

* Re: [PATCH 1/2] phylib: Support phy module autoloading
From: David Woodhouse @ 2010-04-01 17:03 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: davem, netdev, 553024
In-Reply-To: <1270096441.12516.18.camel@localhost>

On Thu, 2010-04-01 at 05:34 +0100, Ben Hutchings wrote:
> On Wed, 2010-03-31 at 02:18 +0100, David Woodhouse wrote:
> > We don't use the normal hotplug mechanism because it doesn't work. It will
> > load the module some time after the device appears, but that's not good
> > enough for us -- we need the driver loaded _immediately_ because otherwise
> > the NIC driver may just abort and then the phy 'device' goes away.
> > 
> > Instead, we just issue a request_module() directly in phy_device_create().
> [...]
> 
> Thanks for doing this, David.  I had a stab at it earlier when this
> problem was reported in Debian <http://bugs.debian.org/553024>.  I
> didn't complete this because (a) I didn't understand all the details of
> adding new device table type, and (b) I tried to avoid duplicating
> information, which turns out to be rather difficult in modules with
> multiple drivers.

It shouldn't be _that_ hard.

You could contrive a macro which you use inside the driver definition
and which takes the phy_id and phy_id_mask as arguments, and has the
side-effect of setting up the MODULE_DEVICE_TABLE data.

> Since you've dealt with (a), and (b) is not really as important, I would
> just like to suggest some minor changes to your patch 1 (see below).
> Feel free to fold them in.  Your patch 2 would then need the
> substitutions s/phy_device_id/mdio_device_id/; s/TABLE(phy/TABLE(mdio/.

I'll tolerate the silly __u32 crap if I must for consistency, but
normally I prefer to write in C.

I did think about 'mdio:' for the module alias, but I decided that
'phy:' probably made more sense since these are PHY driver modules and
the number is the phy_id.

Kernel-doc is good though.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation


^ permalink raw reply

* [PATCH net-next-2.6] igb: restrict WoL for 82576 ET2 Quad Port Server Adapter
From: Stefan Assmann @ 2010-04-01 17:00 UTC (permalink / raw)
  To: netdev; +Cc: Duyck, Alexander H, jeffrey.t.kirsher

From: Stefan Assmann <sassmann@redhat.com>

Restrict Wake-on-LAN to first port on 82576 ET2 quad port NICs, as it is
only supported there.

Signed-off-by: Stefan Assmann <sassmann@redhat.com>
---
 drivers/net/igb/igb_ethtool.c |    1 +
 drivers/net/igb/igb_main.c    |    1 +
 2 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index 1d4ee41..cdebfbf 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -1863,6 +1863,7 @@ static int igb_wol_exclusion(struct igb_adapter *adapter,
 		retval = 0;
 		break;
 	case E1000_DEV_ID_82576_QUAD_COPPER:
+	case E1000_DEV_ID_82576_QUAD_COPPER_ET2:
 		/* quad port adapters only support WoL on port A */
 		if (!(adapter->flags & IGB_FLAG_QUAD_PORT_A)) {
 			wol->supported = 0;
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index ea87570..5426f41 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -1597,6 +1597,7 @@ static int __devinit igb_probe(struct pci_dev *pdev,
 			adapter->eeprom_wol = 0;
 		break;
 	case E1000_DEV_ID_82576_QUAD_COPPER:
+	case E1000_DEV_ID_82576_QUAD_COPPER_ET2:
 		/* if quad port adapter, disable WoL on all but port A */
 		if (global_quad_port_a != 0)
 			adapter->eeprom_wol = 0;
-- 
1.6.6.1


^ permalink raw reply related

* Re: [netfilter / iptables] question
From: Jan Engelhardt @ 2010-04-01 16:57 UTC (permalink / raw)
  To: thomas yang; +Cc: Patrick McHardy, netdev
In-Reply-To: <v2if4f837ab1004010944yd08c37c5u23d6f2d07df7ccc6@mail.gmail.com>

On Thursday 2010-04-01 18:44, thomas yang wrote:

>Hi,
>
>I want to copy/ clone a  sk_buff  *skb  to  *skb2  on my Linux router,
>  then transmit both  skb2   and  skb  .
>
>How to do this with   netfilter hook function  / iptables  ?
>
Wait for xt_TEE to be merged.

^ permalink raw reply

* [netfilter / iptables] question
From: thomas yang @ 2010-04-01 16:44 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netdev, jengelh

Hi,

I want to copy/ clone a  sk_buff  *skb  to  *skb2  on my Linux router,
  then transmit both  skb2   and  skb  .

How to do this with   netfilter hook function  / iptables  ?

--
Tom

^ permalink raw reply

* Re: [net-next-2.6 PATCH] netfilter: ctnetlink: compute message size properly
From: Jiri Pirko @ 2010-04-01 10:43 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, netdev, netfilter-devel, netfilter, davem,
	Krzysztof Oledzki
In-Reply-To: <4BB47817.4060902@trash.net>

Thu, Apr 01, 2010 at 12:40:23PM CEST, kaber@trash.net wrote:
>Eric Dumazet wrote:
>> Le mercredi 31 mars 2010 à 20:21 +0200, Jiri Pirko a écrit :
>>> Okay, I see your point. How about this:
>>>
>>> Subject: [net-next-2.6 PATCH] netfilter: ctnetlink: compute message size properly V2
>>>
>>> Message size should be dependent on net->ct.sysctl_acct, not on
>>> CONFIG_NF_CT_ACCT definition.
>>>
>> 
>> Then Changelog is not updated with the actual test :)
>
>I've fixed up the changelog and applied the patch, thanks.

Cand I ask on which tree?

thanks

Jirka

^ permalink raw reply

* CAIF device
From: Alan @ 2010-04-01 15:09 UTC (permalink / raw)
  To: netdev, sjur.brandeland

I was reading through the CAIF code and I noticed a couple of bugs

Doesn't check there is a write method so set on a read only
device it's not good news (doubly so as there seem to be no
permission checks ?) plus no permissions checks and also the
following which looks unsafe

        dev_close(ser->dev);
        unregister_netdevice(ser->dev);
        list_del(&ser->node);
        debugfs_deinit(ser);

Now ser is the netdev private data so what stops it going away when
unregister_netdev is called ?

Secondly tty devices are ref counted and this for some reason didn't get
fixed in the driver yet.

[Patches to follow for the write and kref bugs, the others need the
authors and someone who knows the netdev code these days to fix]

^ permalink raw reply

* [PATCH] r8169: clean up my printk uglyness
From: Neil Horman @ 2010-04-01 14:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, romieu, nhorman, brandon

Fix formatting on r8169 printk

Brandon Philips noted that I had a spacing issue in my printk for the last r8169
patch that made it quite ugly.  Fix that up and add the PFX macro to it as well
so it looks like the other r8169 printks

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>


 r8169.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 9674005..a119885 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3227,8 +3227,8 @@ static void rtl8169_set_rxbufsize(struct rtl8169_private *tp,
 	unsigned int max_frame = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
 
 	if (max_frame != 16383)
-		printk(KERN_WARNING "WARNING! Changing of MTU on this NIC"
-			"May lead to frame reception errors!\n");
+		printk(KERN_WARNING PFX "WARNING! Changing of MTU on this "
+			"NIC May lead to frame reception errors!\n");
 
 	tp->rx_buf_sz = (max_frame > RX_BUF_SIZE) ? max_frame : RX_BUF_SIZE;
 }

^ permalink raw reply related

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Patrick McHardy @ 2010-04-01 14:03 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: netfilter-devel, netdev
In-Reply-To: <alpine.LSU.2.01.1004011557270.5368@obet.zrqbmnf.qr>

Jan Engelhardt wrote:
> On Thursday 2010-04-01 15:48, Patrick McHardy wrote:
>> Jan Engelhardt wrote:
>>> On Thursday 2010-04-01 15:22, Patrick McHardy wrote:
>>>>>>> Conntrack loops are prevented by using a dummy conntrack, just as 
>>>>>>> NOTRACK does.
>>>>>> [...]
>>>>>>>  - When the cloned packets gets XFRMed or tunneled, its status switches 
>>>>>>>    from "special" to "plain". Doing policy routing on them does not seem 
>>>>>>>    so far-fetched.
>>>>>> My question was about the case without conntrack.
>>>>> Hm. Do you have any suggestion in countering a case whereby a user
>>>>> does -I OUTPUT -j TEE without conntrack?
>>>>>
>>>>> Perhaps making nesting a feature that requires conntrack, such that the 
>>>>> non-CT case can't loop?
>>>> If we drop the reentrancy thing, what should work is to prevent
>>>> using loopback as output device and using something similar to
>>>> the recursion counters tunnel devices used to have.
>>> Nah. I'm going to pick a bit from struct skbuff to indicate the
>>> packet was teed so as to avoid that loop.
>> That's a bad idea, we shouldn't be adding new skb members for something
>> as peripheral as this module.
> 
> I would have done this, which does not add a member:
> 
> 	IP6CB(skb)->flags |= IPSKB_CLONED;

This doesn't work, the CB is not preserved across layers
(which f.i. matters if you allow loopback destinations).
Its also not preserved for clones.

>> What's wrong with adding a reentrancy counter?
> 
> Sounds like a plan.



^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Jan Engelhardt @ 2010-04-01 13:59 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, netdev
In-Reply-To: <4BB4A41C.7030107@trash.net>


On Thursday 2010-04-01 15:48, Patrick McHardy wrote:
>Jan Engelhardt wrote:
>> On Thursday 2010-04-01 15:22, Patrick McHardy wrote:
>>>>>> Conntrack loops are prevented by using a dummy conntrack, just as 
>>>>>> NOTRACK does.
>>>>> [...]
>>>>>>  - When the cloned packets gets XFRMed or tunneled, its status switches 
>>>>>>    from "special" to "plain". Doing policy routing on them does not seem 
>>>>>>    so far-fetched.
>>>>> My question was about the case without conntrack.
>>>> Hm. Do you have any suggestion in countering a case whereby a user
>>>> does -I OUTPUT -j TEE without conntrack?
>>>>
>>>> Perhaps making nesting a feature that requires conntrack, such that the 
>>>> non-CT case can't loop?
>>> If we drop the reentrancy thing, what should work is to prevent
>>> using loopback as output device and using something similar to
>>> the recursion counters tunnel devices used to have.
>> 
>> Nah. I'm going to pick a bit from struct skbuff to indicate the
>> packet was teed so as to avoid that loop.
>
>That's a bad idea, we shouldn't be adding new skb members for something
>as peripheral as this module.

I would have done this, which does not add a member:

	IP6CB(skb)->flags |= IPSKB_CLONED;

>What's wrong with adding a reentrancy counter?

Sounds like a plan.

^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Patrick McHardy @ 2010-04-01 13:48 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: netfilter-devel, netdev
In-Reply-To: <alpine.LSU.2.01.1004011543260.1174@obet.zrqbmnf.qr>

Jan Engelhardt wrote:
> On Thursday 2010-04-01 15:22, Patrick McHardy wrote:
>>>>> Conntrack loops are prevented by using a dummy conntrack, just as 
>>>>> NOTRACK does.
>>>> [...]
>>>>>  - When the cloned packets gets XFRMed or tunneled, its status switches 
>>>>>    from "special" to "plain". Doing policy routing on them does not seem 
>>>>>    so far-fetched.
>>>> My question was about the case without conntrack.
>>> Hm. Do you have any suggestion in countering a case whereby a user
>>> does -I OUTPUT -j TEE without conntrack?
>>>
>>> Perhaps making nesting a feature that requires conntrack, such that the 
>>> non-CT case can't loop?
>> If we drop the reentrancy thing, what should work is to prevent
>> using loopback as output device and using something similar to
>> the recursion counters tunnel devices used to have.
> 
> Nah. I'm going to pick a bit from struct skbuff to indicate the
> packet was teed so as to avoid that loop.

That's a bad idea, we shouldn't be adding new skb members for something
as peripheral as this module.

What's wrong with adding a reentrancy counter?

^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Jan Engelhardt @ 2010-04-01 13:44 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, netdev
In-Reply-To: <4BB49E10.8080608@trash.net>


On Thursday 2010-04-01 15:22, Patrick McHardy wrote:
>>>> Conntrack loops are prevented by using a dummy conntrack, just as 
>>>> NOTRACK does.
>>> [...]
>>>>  - When the cloned packets gets XFRMed or tunneled, its status switches 
>>>>    from "special" to "plain". Doing policy routing on them does not seem 
>>>>    so far-fetched.
>>> My question was about the case without conntrack.
>> 
>> Hm. Do you have any suggestion in countering a case whereby a user
>> does -I OUTPUT -j TEE without conntrack?
>> 
>> Perhaps making nesting a feature that requires conntrack, such that the 
>> non-CT case can't loop?
>
>If we drop the reentrancy thing, what should work is to prevent
>using loopback as output device and using something similar to
>the recursion counters tunnel devices used to have.

Nah. I'm going to pick a bit from struct skbuff to indicate the
packet was teed so as to avoid that loop.


^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Patrick McHardy @ 2010-04-01 13:22 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: netfilter-devel, netdev
In-Reply-To: <alpine.LSU.2.01.1004011511180.1174@obet.zrqbmnf.qr>

Jan Engelhardt wrote:
> On Thursday 2010-04-01 13:09, Patrick McHardy wrote:
> 
>>> Conntrack loops are prevented by using a dummy conntrack, just as 
>>> NOTRACK does.
>> [...]
>>>  - When the cloned packets gets XFRMed or tunneled, its status switches 
>>>    from "special" to "plain". Doing policy routing on them does not seem 
>>>    so far-fetched.
>> My question was about the case without conntrack.
> 
> Hm. Do you have any suggestion in countering a case whereby a user
> does -I OUTPUT -j TEE without conntrack?
> 
> Perhaps making nesting a feature that requires conntrack, such that the 
> non-CT case can't loop?

If we drop the reentrancy thing, what should work is to prevent
using loopback as output device and using something similar to
the recursion counters tunnel devices used to have.

>>> I can think of a handful of applications:
>>>  - CLASSIFY
>> Good point, you should probably reset a couple of skb members
>> after the skb_copy().
> 
> I take it you mean
> 
>  nf_reset(skb)
>  skb->mark = 0;
>  skb_init_secmark(nskb);

Yes, basically. Although I believe the selinux people would be
happier if you kept the original secmark for the copied packets :)

> Or should we be using skb_alloc and copying the data portion over, like 
> ipt_REJECT does since v2.6.24-2931-g9ba99b0?

I guess pskb_copy() would be most optimal since we can modify
the header, but the non-linear area could be shared

^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Jan Engelhardt @ 2010-04-01 13:15 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, netdev
In-Reply-To: <4BB47EEA.4020809@trash.net>

On Thursday 2010-04-01 13:09, Patrick McHardy wrote:

>> Conntrack loops are prevented by using a dummy conntrack, just as 
>> NOTRACK does.
>[...]
>>  - When the cloned packets gets XFRMed or tunneled, its status switches 
>>    from "special" to "plain". Doing policy routing on them does not seem 
>>    so far-fetched.
>
>My question was about the case without conntrack.

Hm. Do you have any suggestion in countering a case whereby a user
does -I OUTPUT -j TEE without conntrack?

Perhaps making nesting a feature that requires conntrack, such that the 
non-CT case can't loop?

>> I can think of a handful of applications:
>>  - CLASSIFY
>
>Good point, you should probably reset a couple of skb members
>after the skb_copy().

I take it you mean

 nf_reset(skb)
 skb->mark = 0;
 skb_init_secmark(nskb);

Or should we be using skb_alloc and copying the data portion over, like 
ipt_REJECT does since v2.6.24-2931-g9ba99b0?

^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Timo Teräs @ 2010-04-01 13:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Herbert Xu
In-Reply-To: <1270127120.2229.93.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le jeudi 01 avril 2010 à 15:52 +0300, Timo Teras a écrit :
>> @@ -481,6 +482,7 @@ struct xfrm_policy {
>>  	atomic_t		refcnt;
>>  	struct timer_list	timer;
>>  
>> +	struct flow_cache_entry_ops *fc_ops;
>>  	u32			priority;
>>  	u32			index;
>>  	struct xfrm_mark	mark;
> ...
> 
>> +static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
>> +	.get = xfrm_policy_get_fce,
>> +	.check = xfrm_policy_check_fce,
>> +	.delete = xfrm_policy_delete_fce,
>> +};
>>  
> 
> Any particular reason these flow_cache_entry_ops are not const ?

Umm... No. I'll constify them. Thanks.

^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: Eric Dumazet @ 2010-04-01 13:05 UTC (permalink / raw)
  To: Timo Teras; +Cc: netdev, Herbert Xu
In-Reply-To: <1270126340-30181-2-git-send-email-timo.teras@iki.fi>

Le jeudi 01 avril 2010 à 15:52 +0300, Timo Teras a écrit :
> This allows to validate the cached object before returning it.
> It also allows to destruct object properly, if the last reference
> was held in flow cache. This is also a prepartion for caching
> bundles in the flow cache.
> 
> In return for virtualizing the methods, we save on:
> - not having to regenerate the whole flow cache on policy removal:
>   each flow matching a killed policy gets refreshed as the getter
>   function notices it smartly.
> - we do not have to call flow_cache_flush from policy gc, since the
>   flow cache now properly deletes the object if it had any references
> 
> Signed-off-by: Timo Teras <timo.teras@iki.fi>
> ---

...

> @@ -481,6 +482,7 @@ struct xfrm_policy {
>  	atomic_t		refcnt;
>  	struct timer_list	timer;
>  
> +	struct flow_cache_entry_ops *fc_ops;
>  	u32			priority;
>  	u32			index;
>  	struct xfrm_mark	mark;
...

> +static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
> +	.get = xfrm_policy_get_fce,
> +	.check = xfrm_policy_check_fce,
> +	.delete = xfrm_policy_delete_fce,
> +};
>  

Any particular reason these flow_cache_entry_ops are not const ?



^ permalink raw reply

* [PATCH 4/4] flow: delayed deletion of flow cache entries
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270126340-30181-1-git-send-email-timo.teras@iki.fi>

Speed up lookups by freeing flow cache entries later. After
virtualizing flow cache entry operations, the flow cache may now
end up calling policy or bundle destructor which can be slowish.

As gc_list is more effective with double linked list, the flow cache
is converted to use common hlist and list macroes where appropriate.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/core/flow.c |  101 ++++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 70 insertions(+), 31 deletions(-)

diff --git a/net/core/flow.c b/net/core/flow.c
index f938137..d3a8f80 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,7 +26,10 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry *	next;
+	union {
+		struct hlist_node	hlist;
+		struct list_head	gc_list;
+	} u;
 	u16				family;
 	u8				dir;
 	u32				genid;
@@ -35,7 +38,7 @@ struct flow_cache_entry {
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry **	hash_table;
+	struct hlist_head *		hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -62,6 +65,9 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0);
 static struct flow_cache flow_cache_global;
 static struct kmem_cache *flow_cachep;
 
+static DEFINE_SPINLOCK(flow_cache_gc_lock);
+static LIST_HEAD(flow_cache_gc_list);
+
 #define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
 #define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
@@ -86,38 +92,66 @@ static int flow_entry_valid(struct flow_cache_entry *fle)
 	return 1;
 }
 
-static void flow_entry_kill(struct flow_cache *fc,
-			    struct flow_cache_percpu *fcp,
-			    struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle)
 {
 	if (fle->ops)
 		(*fle->ops)->delete(fle->ops);
 	kmem_cache_free(flow_cachep, fle);
-	fcp->hash_count--;
+}
+
+static void flow_cache_gc_task(struct work_struct *work)
+{
+	struct list_head gc_list;
+	struct flow_cache_entry *fce, *n;
+
+	INIT_LIST_HEAD(&gc_list);
+	spin_lock_bh(&flow_cache_gc_lock);
+	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
+	spin_unlock_bh(&flow_cache_gc_lock);
+
+	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+		flow_entry_kill(fce);
+}
+static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
+
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+				     int deleted, struct list_head *gc_list)
+{
+	if (deleted) {
+		fcp->hash_count -= deleted;
+		spin_lock_bh(&flow_cache_gc_lock);
+		list_splice_tail(gc_list, &flow_cache_gc_list);
+		spin_unlock_bh(&flow_cache_gc_lock);
+		schedule_work(&flow_cache_gc_work);
+	}
 }
 
 static void __flow_cache_shrink(struct flow_cache *fc,
 				struct flow_cache_percpu *fcp,
 				int shrink_to)
 {
-	struct flow_cache_entry *fle, **flp;
-	int i;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int saved = 0;
 
-		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
 			if (saved < shrink_to &&
 			    flow_entry_valid(fle)) {
 				saved++;
-				flp = &fle->next;
 			} else {
-				*flp = fle->next;
-				flow_entry_kill(fc, fcp, fle);
+				deleted++;
+				hlist_del(&fle->u.hlist);
+				list_add_tail(&fle->u.gc_list, &gc_list);
 			}
 		}
 	}
+
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
 }
 
 static void flow_cache_shrink(struct flow_cache *fc,
@@ -182,8 +216,9 @@ struct flow_cache_entry_ops **flow_cache_lookup(
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
-	struct flow_cache_entry *fle, **head;
 	struct flow_cache_entry_ops **ops;
+	struct flow_cache_entry *fle, *tfle;
+	struct hlist_node *entry;
 	unsigned int hash;
 
 	local_bh_disable();
@@ -200,12 +235,13 @@ struct flow_cache_entry_ops **flow_cache_lookup(
 		flow_new_hash_rnd(fc, fcp);
 
 	hash = flow_hash_code(fc, fcp, key);
-	head = &fcp->hash_table[hash];
-	for (fle = *head; fle; fle = fle->next) {
-		if (fle->family == family &&
-		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0)
+	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
+		if (tfle->family == family &&
+		    tfle->dir == dir &&
+		    flow_key_compare(key, &tfle->key) == 0) {
+			fle = tfle;
 			break;
+		}
 	}
 
 	if (!fle) {
@@ -214,13 +250,13 @@ struct flow_cache_entry_ops **flow_cache_lookup(
 
 		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
-			fle->next = *head;
-			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
 			fle->ops = NULL;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->ops = NULL;
+
+			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
 			fcp->hash_count++;
 		}
 	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
@@ -256,23 +292,26 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	struct flow_flush_info *info = (void *)data;
 	struct flow_cache *fc = info->cache;
 	struct flow_cache_percpu *fcp;
-	int i;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		struct flow_cache_entry *fle;
-
-		fle = fcp->hash_table[i];
-		for (; fle; fle = fle->next) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
 			if (flow_entry_valid(fle))
 				continue;
 
-			if (fle->ops)
-				(*fle->ops)->delete(fle->ops);
-			fle->ops = NULL;
+			deleted++;
+			hlist_del(&fle->u.hlist);
+			list_add_tail(&fle->u.gc_list, &gc_list);
 		}
 	}
 
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
 }
@@ -314,7 +353,7 @@ void flow_cache_flush(void)
 static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
 					  struct flow_cache_percpu *fcp)
 {
-	fcp->hash_table = (struct flow_cache_entry **)
+	fcp->hash_table = (struct hlist_head *)
 		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
 	if (!fcp->hash_table)
 		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
@@ -348,7 +387,7 @@ static int flow_cache_init(struct flow_cache *fc)
 
 	for (order = 0;
 	     (PAGE_SIZE << order) <
-		     (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc));
+		     (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
 	     order++)
 		/* NOTHING */;
 	fc->order = order;
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 3/4] xfrm: remove policy garbage collection
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270126340-30181-1-git-send-email-timo.teras@iki.fi>

Policies are now properly reference counted and destroyed from
all code paths. The delayed gc is just an overhead now and can
be removed.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/xfrm/xfrm_policy.c |   39 +++++----------------------------------
 1 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 3489e2d..0a80978 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,9 +46,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 
-static HLIST_HEAD(xfrm_policy_gc_list);
-static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
-
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
@@ -289,32 +286,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 }
 EXPORT_SYMBOL(xfrm_policy_destroy);
 
-static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
-{
-	atomic_inc(&policy->genid);
-
-	if (del_timer(&policy->timer))
-		atomic_dec(&policy->refcnt);
-
-	xfrm_pol_put(policy);
-}
-
-static void xfrm_policy_gc_task(struct work_struct *work)
-{
-	struct xfrm_policy *policy;
-	struct hlist_node *entry, *tmp;
-	struct hlist_head gc_list;
-
-	spin_lock_bh(&xfrm_policy_gc_lock);
-	gc_list.first = xfrm_policy_gc_list.first;
-	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
-	spin_unlock_bh(&xfrm_policy_gc_lock);
-
-	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
-		xfrm_policy_gc_kill(policy);
-}
-static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
-
 /* Rule must be locked. Release descentant resources, announce
  * entry dead. The rule must be unlinked from lists to the moment.
  */
@@ -323,11 +294,12 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 {
 	policy->walk.dead = 1;
 
-	spin_lock_bh(&xfrm_policy_gc_lock);
-	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
-	spin_unlock_bh(&xfrm_policy_gc_lock);
+	atomic_inc(&policy->genid);
 
-	schedule_work(&xfrm_policy_gc_work);
+	if (del_timer(&policy->timer))
+		xfrm_pol_put(policy);
+
+	xfrm_pol_put(policy);
 }
 
 static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
@@ -2605,7 +2577,6 @@ static void xfrm_policy_fini(struct net *net)
 	audit_info.sessionid = -1;
 	audit_info.secid = 0;
 	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
-	flush_work(&xfrm_policy_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.policy_all));
 
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 2/4] xfrm: cache bundles instead of policies for outgoing flows
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270126340-30181-1-git-send-email-timo.teras@iki.fi>

__xfrm_lookup() is called for each packet transmitted out of
system. The xfrm_find_bundle() does a linear search which can
kill system performance depending on how many bundles are
required per policy.

This modifies __xfrm_lookup() to store bundles directly in
the flow cache. If we did not get a hit, we just create a new
bundle instead of doing slow search. This means that we can now
get multiple xfrm_dst's for same flow (on per-cpu basis).

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 include/net/xfrm.h      |   10 +-
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  712 +++++++++++++++++++++++++----------------------
 4 files changed, 386 insertions(+), 389 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index cb8934b..6ce593f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -267,7 +267,6 @@ struct xfrm_policy_afinfo {
 					       xfrm_address_t *saddr,
 					       xfrm_address_t *daddr);
 	int			(*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr);
-	struct dst_entry	*(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy);
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
@@ -483,13 +482,13 @@ struct xfrm_policy {
 	struct timer_list	timer;
 
 	struct flow_cache_entry_ops *fc_ops;
+	atomic_t		genid;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
 	struct xfrm_selector	selector;
 	struct xfrm_lifetime_cfg lft;
 	struct xfrm_lifetime_cur curlft;
-	struct dst_entry       *bundles;
 	struct xfrm_policy_walk_entry walk;
 	u8			type;
 	u8			action;
@@ -879,11 +878,15 @@ struct xfrm_dst {
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+	struct flow_cache_entry_ops *fc_ops;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	int num_pols, num_xfrms;
 #ifdef CONFIG_XFRM_SUB_POLICY
 	struct flowi *origin;
 	struct xfrm_selector *partner;
 #endif
-	u32 genid;
+	u32 xfrm_genid;
+	u32 policy_genid;
 	u32 route_mtu_cached;
 	u32 child_mtu_cached;
 	u32 route_cookie;
@@ -893,6 +896,7 @@ struct xfrm_dst {
 #ifdef CONFIG_XFRM
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 {
+	xfrm_pols_put(xdst->pols, xdst->num_pols);
 	dst_release(xdst->route);
 	if (likely(xdst->u.dst.xfrm))
 		xfrm_state_put(xdst->u.dst.xfrm);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e4a1483..1705476 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -59,27 +59,6 @@ static int xfrm4_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-		if (xdst->u.rt.fl.oif == fl->oif &&	/*XXX*/
-		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
-		    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
-		    xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm4_get_tos(struct flowi *fl)
 {
 	return fl->fl4_tos;
@@ -259,7 +238,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
-	.find_bundle = 		__xfrm4_find_bundle,
 	.decode_session =	_decode_session4,
 	.get_tos =		xfrm4_get_tos,
 	.init_path =		xfrm4_init_path,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ae18165..8c452fd 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -67,36 +67,6 @@ static int xfrm6_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	/* Still not clear if we should set fl->fl6_{src,dst}... */
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
-		struct in6_addr fl_dst_prefix, fl_src_prefix;
-
-		ipv6_addr_prefix(&fl_dst_prefix,
-				 &fl->fl6_dst,
-				 xdst->u.rt6.rt6i_dst.plen);
-		ipv6_addr_prefix(&fl_src_prefix,
-				 &fl->fl6_src,
-				 xdst->u.rt6.rt6i_src.plen);
-		if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
-		    ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET6,
-				   (xdst->u.rt6.rt6i_dst.plen != 128 ||
-				    xdst->u.rt6.rt6i_src.plen != 128))) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm6_get_tos(struct flowi *fl)
 {
 	return 0;
@@ -291,7 +261,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.get_saddr = 		xfrm6_get_saddr,
-	.find_bundle =		__xfrm6_find_bundle,
 	.decode_session =	_decode_session6,
 	.get_tos =		xfrm6_get_tos,
 	.init_path =		xfrm6_init_path,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 20f5b01..3489e2d 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,6 +37,8 @@
 DEFINE_MUTEX(xfrm_cfg_mutex);
 EXPORT_SYMBOL(xfrm_cfg_mutex);
 
+static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
+static struct dst_entry *xfrm_policy_sk_bundles;
 static DEFINE_RWLOCK(xfrm_policy_lock);
 
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
@@ -50,6 +52,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
+static int stale_bundle(struct dst_entry *dst);
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
@@ -278,8 +281,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 {
 	BUG_ON(!policy->walk.dead);
 
-	BUG_ON(policy->bundles);
-
 	if (del_timer(&policy->timer))
 		BUG();
 
@@ -290,12 +291,7 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
 
 static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 {
-	struct dst_entry *dst;
-
-	while ((dst = policy->bundles) != NULL) {
-		policy->bundles = dst->next;
-		dst_free(dst);
-	}
+	atomic_inc(&policy->genid);
 
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
@@ -573,7 +569,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	struct xfrm_policy *delpol;
 	struct hlist_head *chain;
 	struct hlist_node *entry, *newpos;
-	struct dst_entry *gc_list;
 	u32 mark = policy->mark.v & policy->mark.m;
 
 	write_lock_bh(&xfrm_policy_lock);
@@ -624,33 +619,11 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		schedule_work(&net->xfrm.policy_hash_work);
 
 	read_lock_bh(&xfrm_policy_lock);
-	gc_list = NULL;
 	entry = &policy->bydst;
-	hlist_for_each_entry_continue(policy, entry, bydst) {
-		struct dst_entry *dst;
-
-		write_lock(&policy->lock);
-		dst = policy->bundles;
-		if (dst) {
-			struct dst_entry *tail = dst;
-			while (tail->next)
-				tail = tail->next;
-			tail->next = gc_list;
-			gc_list = dst;
-
-			policy->bundles = NULL;
-		}
-		write_unlock(&policy->lock);
-	}
+	hlist_for_each_entry_continue(policy, entry, bydst)
+		atomic_inc(&policy->genid);
 	read_unlock_bh(&xfrm_policy_lock);
 
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-
-		gc_list = dst->next;
-		dst_free(dst);
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
@@ -999,6 +972,20 @@ fail:
 	return ret;
 }
 
+static struct xfrm_policy *__xfrm_policy_lookup(
+		struct net *net, struct flowi *fl,
+		u16 family, u8 dir)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_policy *pol;
+
+	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+	if (pol != NULL)
+		return pol;
+#endif
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+}
+
 static struct flow_cache_entry_ops **xfrm_policy_lookup(
 		struct net *net, struct flowi *fl, u16 family,
 		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx)
@@ -1008,21 +995,10 @@ static struct flow_cache_entry_ops **xfrm_policy_lookup(
 	if (old_ops)
 		xfrm_pol_put(container_of(old_ops, struct xfrm_policy, fc_ops));
 
-#ifdef CONFIG_XFRM_SUB_POLICY
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol))
+	pol = __xfrm_policy_lookup(net, fl, family, dir);
+	if (IS_ERR_OR_NULL(pol))
 		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-#endif
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol))
-		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-	return NULL;
 
-found:
 	/* Resolver returns two references:
 	 * one for cache and one for caller of flow_cache_lookup() */
 	xfrm_pol_hold(pol);
@@ -1314,18 +1290,6 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
  * still valid.
  */
 
-static struct dst_entry *
-xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
-{
-	struct dst_entry *x;
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return ERR_PTR(-EINVAL);
-	x = afinfo->find_bundle(fl, policy);
-	xfrm_policy_put_afinfo(afinfo);
-	return x;
-}
-
 static inline int xfrm_get_tos(struct flowi *fl, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1341,6 +1305,55 @@ static inline int xfrm_get_tos(struct flowi *fl, int family)
 	return tos;
 }
 
+static struct flow_cache_entry_ops **xfrm_bundle_get_fce(
+		struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (xdst->route == NULL) {
+		/* Dummy bundle - if it has xfrms we were not
+		 * able to build bundle as template resolution failed.
+		 * It means we need to try again resolving. */
+		if (xdst->num_xfrms > 0)
+			return NULL;
+	} else {
+		/* Real bundle */
+		if (stale_bundle(dst))
+			return NULL;
+	}
+
+	dst_hold(dst);
+	return ops;
+}
+
+static int xfrm_bundle_check_fce(struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (!xdst->route)
+		return 0;
+	if (stale_bundle(dst))
+		return 0;
+
+	return 1;
+}
+
+static void xfrm_bundle_delete_fce(struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	dst_free(dst);
+}
+
+static struct flow_cache_entry_ops xfrm_bundle_fc_ops __read_mostly = {
+	.get = xfrm_bundle_get_fce,
+	.check = xfrm_bundle_check_fce,
+	.delete = xfrm_bundle_delete_fce,
+};
+
 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1363,9 +1376,10 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 		BUG();
 	}
 	xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS);
-
 	xfrm_policy_put_afinfo(afinfo);
 
+	xdst->fc_ops = &xfrm_bundle_fc_ops;
+
 	return xdst;
 }
 
@@ -1403,6 +1417,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	return err;
 }
 
+
 /* Allocate chain of dst_entry's, attach known xfrm's, calculate
  * all the metrics... Shortly, bundle a bundle.
  */
@@ -1466,7 +1481,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			dst_hold(dst);
 
 		dst1->xfrm = xfrm[i];
-		xdst->genid = xfrm[i]->genid;
+		xdst->xfrm_genid = xfrm[i]->genid;
 
 		dst1->obsolete = -1;
 		dst1->flags |= DST_HOST;
@@ -1559,7 +1574,185 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
 #endif
 }
 
-static int stale_bundle(struct dst_entry *dst);
+static int xfrm_expand_policies(struct flowi *fl, u16 family,
+				struct xfrm_policy **pols,
+				int *num_pols, int *num_xfrms)
+{
+	int i;
+
+	if (*num_pols == 0 || !pols[0]) {
+		*num_pols = 0;
+		*num_xfrms = 0;
+		return 0;
+	}
+	if (IS_ERR(pols[0]))
+		return PTR_ERR(pols[0]);
+
+	*num_xfrms = pols[0]->xfrm_nr;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
+	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
+						    XFRM_POLICY_TYPE_MAIN,
+						    fl, family,
+						    XFRM_POLICY_OUT);
+		if (pols[1]) {
+			if (IS_ERR(pols[1])) {
+				xfrm_pols_put(pols, *num_pols);
+				return PTR_ERR(pols[1]);
+			}
+			(*num_pols) ++;
+			(*num_xfrms) += pols[1]->xfrm_nr;
+		}
+	}
+#endif
+	for (i = 0; i < *num_pols; i++) {
+		if (pols[i]->action != XFRM_POLICY_ALLOW) {
+			*num_xfrms = -1;
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+static struct xfrm_dst *xfrm_resolve_and_create_bundle(
+		struct xfrm_policy **pols, int num_pols,
+		struct flowi *fl, u16 family, struct dst_entry *dst_orig)
+{
+	struct net *net = xp_net(pols[0]);
+	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct dst_entry *dst;
+	struct xfrm_dst *xdst;
+	int err;
+
+	/* Try to instantiate a bundle */
+	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
+	if (err < 0) {
+		if (err != -EAGAIN)
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		return ERR_PTR(err);
+	}
+
+	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+	if (IS_ERR(dst)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+		return ERR_CAST(dst);
+	}
+
+	xdst = (struct xfrm_dst *)dst;
+	xdst->num_xfrms = err;
+	if (num_pols > 1)
+		err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+	else
+		err = xfrm_dst_update_origin(dst, fl);
+	if (unlikely(err)) {
+		dst_free(dst);
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+		return ERR_PTR(err);
+	}
+
+	xdst->num_pols = num_pols;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+	xdst->policy_genid = atomic_read(&pols[0]->genid);
+
+	return xdst;
+}
+
+static struct flow_cache_entry_ops **xfrm_bundle_lookup(
+		struct net *net, struct flowi *fl, u16 family, u8 dir,
+		struct flow_cache_entry_ops **old_ops, void *ctx)
+{
+	struct dst_entry *dst_orig = (struct dst_entry *)ctx;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	struct xfrm_dst *xdst, *new_xdst;
+	int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
+
+	/* Check if the policies from old bundle are usable */
+	xdst = NULL;
+	if (old_ops) {
+		xdst = container_of(old_ops, struct xfrm_dst, fc_ops);
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		pol_dead = 0;
+		for (i = 0; i < num_pols; i++) {
+			pols[i] = xdst->pols[i];
+			pol_dead |= pols[i]->walk.dead;
+		}
+		if (pol_dead) {
+			dst_free(&xdst->u.dst);
+			xdst = NULL;
+			num_pols = 0;
+			num_xfrms = 0;
+			old_ops = NULL;
+		}
+	}
+
+	/* Resolve policies to use if we couldn't get them from
+	 * previous cache entry */
+	if (xdst == NULL) {
+		num_pols = 1;
+		pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
+			goto inc_error;
+		if (num_pols == 0)
+			return NULL;
+		if (num_xfrms <= 0)
+			goto make_dummy_bundle;
+	}
+
+	new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig);
+	if (IS_ERR(new_xdst)) {
+		err = PTR_ERR(new_xdst);
+		if (err != -EAGAIN)
+			goto error;
+		if (old_ops == NULL)
+			goto make_dummy_bundle;
+		dst_hold(&xdst->u.dst);
+		return old_ops;
+	}
+
+	/* Kill the previous bundle */
+	if (xdst) {
+		/* The policies were stolen for newly generated bundle */
+		xdst->num_pols = 0;
+		dst_free(&xdst->u.dst);
+	}
+
+	/* Flow cache does not have reference, it dst_free()'s,
+	 * but we do need to return one reference for original caller */
+	dst_hold(&new_xdst->u.dst);
+	return &new_xdst->fc_ops;
+
+make_dummy_bundle:
+	/* We found policies, but there's no bundles to instantiate:
+	 * either because the policy blocks, has no transformations or
+	 * we could not build template (no xfrm_states).*/
+	xdst = xfrm_alloc_dst(net, family);
+	if (IS_ERR(xdst)) {
+		xfrm_pols_put(pols, num_pols);
+		return ERR_CAST(xdst);
+	}
+	xdst->num_pols = num_pols;
+	xdst->num_xfrms = num_xfrms;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+
+	dst_hold(&xdst->u.dst);
+	return &xdst->fc_ops;
+
+inc_error:
+	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+error:
+	if (xdst != NULL)
+		dst_free(&xdst->u.dst);
+	else
+		xfrm_pols_put(pols, num_pols);
+	return ERR_PTR(err);
+}
 
 /* Main function: finds/creates a bundle for given flow.
  *
@@ -1569,248 +1762,152 @@ static int stale_bundle(struct dst_entry *dst);
 int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 		  struct sock *sk, int flags)
 {
-	struct xfrm_policy *policy;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-	int npols;
-	int pol_dead;
-	int xfrm_nr;
-	int pi;
-	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
-	struct dst_entry *dst, *dst_orig = *dst_p;
-	int nx = 0;
-	int err;
-	u32 genid;
-	u16 family;
+	struct flow_cache_entry_ops **ops;
+	struct xfrm_dst *xdst;
+	struct dst_entry *dst, *dst_orig = *dst_p, *route;
+	u16 family = dst_orig->ops->family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	int i, err, num_pols, num_xfrms, drop_pols = 0;
 
 restart:
-	genid = atomic_read(&flow_cache_genid);
-	policy = NULL;
-	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
-		pols[pi] = NULL;
-	npols = 0;
-	pol_dead = 0;
-	xfrm_nr = 0;
+	dst = NULL;
+	xdst = NULL;
+	route = NULL;
 
 	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
-		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		num_pols = 1;
+		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
 			goto dropdst;
+
+		if (num_pols) {
+			if (num_xfrms <= 0) {
+				drop_pols = num_pols;
+				goto no_transform;
+			}
+
+			xdst = xfrm_resolve_and_create_bundle(
+					pols, num_pols, fl,
+					family, dst_orig);
+			if (IS_ERR(xdst)) {
+				xfrm_pols_put(pols, num_pols);
+				err = PTR_ERR(xdst);
+				goto dropdst;
+			}
+
+			spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+			xdst->u.dst.next = xfrm_policy_sk_bundles;
+			xfrm_policy_sk_bundles = &xdst->u.dst;
+			spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+
+			route = xdst->route;
 		}
 	}
 
-	if (!policy) {
-		struct flow_cache_entry_ops **ops;
-
+	if (xdst == NULL) {
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		ops = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					dir, xfrm_policy_lookup, NULL);
-		err = PTR_ERR(ops);
+		ops = flow_cache_lookup(net, fl, family, dir,
+					xfrm_bundle_lookup, dst_orig);
+		if (ops == NULL)
+			goto nopol;
 		if (IS_ERR(ops)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+			err = PTR_ERR(ops);
 			goto dropdst;
 		}
-		if (ops)
-			policy = container_of(ops, struct xfrm_policy, fc_ops);
-		else
-			policy = NULL;
+		xdst = container_of(ops, struct xfrm_dst, fc_ops);
+
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols);
+		route = xdst->route;
+	}
+
+	dst = &xdst->u.dst;
+	if (route == NULL && num_xfrms > 0) {
+		/* The only case when xfrm_bundle_lookup() returns a
+		 * bundle with null route, is when the template could
+		 * not be resolved. It means policies are there, but
+		 * bundle could not be created, since we don't yet
+		 * have the xfrm_state's. We need to wait for KM to
+		 * negotiate new SA's or bail out with error.*/
+		if (net->xfrm.sysctl_larval_drop) {
+			/* EREMOTE tells the caller to generate
+			 * a one-shot blackhole route. */
+			dst_release(dst);
+			xfrm_pols_put(pols, num_pols);
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+			return -EREMOTE;
+		}
+		if (flags & XFRM_LOOKUP_WAIT) {
+			DECLARE_WAITQUEUE(wait, current);
+
+			add_wait_queue(&net->xfrm.km_waitq, &wait);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&net->xfrm.km_waitq, &wait);
+
+			if (!signal_pending(current)) {
+				dst_release(dst);
+				goto restart;
+			}
+
+			err = -ERESTART;
+		} else
+			err = -EAGAIN;
+
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+		goto error;
 	}
 
-	if (!policy)
+no_transform:
+	if (num_pols == 0)
 		goto nopol;
 
-	family = dst_orig->ops->family;
-	pols[0] = policy;
-	npols ++;
-	xfrm_nr += pols[0]->xfrm_nr;
-
-	err = -ENOENT;
-	if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP))
+	if ((flags & XFRM_LOOKUP_ICMP) &&
+	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
+		err = -ENOENT;
 		goto error;
+	}
 
-	policy->curlft.use_time = get_seconds();
+	for (i = 0; i < num_pols; i++)
+		pols[i]->curlft.use_time = get_seconds();
 
-	switch (policy->action) {
-	default:
-	case XFRM_POLICY_BLOCK:
+	if (num_xfrms < 0) {
 		/* Prohibit the flow */
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
 		err = -EPERM;
 		goto error;
-
-	case XFRM_POLICY_ALLOW:
-#ifndef CONFIG_XFRM_SUB_POLICY
-		if (policy->xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pol_put(policy);
-			return 0;
-		}
-#endif
-
-		/* Try to find matching bundle.
-		 *
-		 * LATER: help from flow cache. It is optional, this
-		 * is required only for output policy.
-		 */
-		dst = xfrm_find_bundle(fl, policy, family);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = PTR_ERR(dst);
-			goto error;
-		}
-
-		if (dst)
-			break;
-
-#ifdef CONFIG_XFRM_SUB_POLICY
-		if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
-			pols[1] = xfrm_policy_lookup_bytype(net,
-							    XFRM_POLICY_TYPE_MAIN,
-							    fl, family,
-							    XFRM_POLICY_OUT);
-			if (pols[1]) {
-				if (IS_ERR(pols[1])) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
-					err = PTR_ERR(pols[1]);
-					goto error;
-				}
-				if (pols[1]->action == XFRM_POLICY_BLOCK) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
-					err = -EPERM;
-					goto error;
-				}
-				npols ++;
-				xfrm_nr += pols[1]->xfrm_nr;
-			}
-		}
-
-		/*
-		 * Because neither flowi nor bundle information knows about
-		 * transformation template size. On more than one policy usage
-		 * we can realize whether all of them is bypass or not after
-		 * they are searched. See above not-transformed bypass
-		 * is surrounded by non-sub policy configuration, too.
-		 */
-		if (xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-#endif
-		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-		if (unlikely(nx<0)) {
-			err = nx;
-			if (err == -EAGAIN && net->xfrm.sysctl_larval_drop) {
-				/* EREMOTE tells the caller to generate
-				 * a one-shot blackhole route.
-				 */
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				xfrm_pol_put(policy);
-				return -EREMOTE;
-			}
-			if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) {
-				DECLARE_WAITQUEUE(wait, current);
-
-				add_wait_queue(&net->xfrm.km_waitq, &wait);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule();
-				set_current_state(TASK_RUNNING);
-				remove_wait_queue(&net->xfrm.km_waitq, &wait);
-
-				nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-				if (nx == -EAGAIN && signal_pending(current)) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-					err = -ERESTART;
-					goto error;
-				}
-				if (nx == -EAGAIN ||
-				    genid != atomic_read(&flow_cache_genid)) {
-					xfrm_pols_put(pols, npols);
-					goto restart;
-				}
-				err = nx;
-			}
-			if (err < 0) {
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				goto error;
-			}
-		}
-		if (nx == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-		dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig);
-		err = PTR_ERR(dst);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
-			goto error;
-		}
-
-		for (pi = 0; pi < npols; pi++)
-			pol_dead |= pols[pi]->walk.dead;
-
-		write_lock_bh(&policy->lock);
-		if (unlikely(pol_dead || stale_bundle(dst))) {
-			/* Wow! While we worked on resolving, this
-			 * policy has gone. Retry. It is not paranoia,
-			 * we just cannot enlist new bundle to dead object.
-			 * We can't enlist stable bundles either.
-			 */
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-
-			if (pol_dead)
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLDEAD);
-			else
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = -EHOSTUNREACH;
-			goto error;
-		}
-
-		if (npols > 1)
-			err = xfrm_dst_update_parent(dst, &pols[1]->selector);
-		else
-			err = xfrm_dst_update_origin(dst, fl);
-		if (unlikely(err)) {
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			goto error;
-		}
-
-		dst->next = policy->bundles;
-		policy->bundles = dst;
-		dst_hold(dst);
-		write_unlock_bh(&policy->lock);
+	} else if (num_xfrms > 0) {
+		/* Flow transformed */
+		*dst_p = dst;
+		dst_release(dst_orig);
+	} else {
+		/* Flow passes untransformed */
+		dst_release(dst);
 	}
-	*dst_p = dst;
-	dst_release(dst_orig);
-	xfrm_pols_put(pols, npols);
+ok:
+	xfrm_pols_put(pols, drop_pols);
 	return 0;
 
+nopol:
+	if (!(flags & XFRM_LOOKUP_ICMP))
+		goto ok;
+	err = -ENOENT;
 error:
-	xfrm_pols_put(pols, npols);
+	dst_release(dst);
 dropdst:
 	dst_release(dst_orig);
 	*dst_p = NULL;
+	xfrm_pols_put(pols, drop_pols);
 	return err;
-
-nopol:
-	err = -ENOENT;
-	if (flags & XFRM_LOOKUP_ICMP)
-		goto dropdst;
-	return 0;
 }
 EXPORT_SYMBOL(__xfrm_lookup);
 
@@ -2162,71 +2259,24 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
-static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
-{
-	struct dst_entry *dst, **dstp;
-
-	write_lock(&pol->lock);
-	dstp = &pol->bundles;
-	while ((dst=*dstp) != NULL) {
-		if (func(dst)) {
-			*dstp = dst->next;
-			dst->next = *gc_list_p;
-			*gc_list_p = dst;
-		} else {
-			dstp = &dst->next;
-		}
-	}
-	write_unlock(&pol->lock);
-}
-
-static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *))
+static void __xfrm_garbage_collect(struct net *net)
 {
-	struct dst_entry *gc_list = NULL;
-	int dir;
+	struct dst_entry *head, *next;
 
-	read_lock_bh(&xfrm_policy_lock);
-	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
-		struct xfrm_policy *pol;
-		struct hlist_node *entry;
-		struct hlist_head *table;
-		int i;
+	flow_cache_flush();
 
-		hlist_for_each_entry(pol, entry,
-				     &net->xfrm.policy_inexact[dir], bydst)
-			prune_one_bundle(pol, func, &gc_list);
+	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+	head = xfrm_policy_sk_bundles;
+	xfrm_policy_sk_bundles = NULL;
+	spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
 
-		table = net->xfrm.policy_bydst[dir].table;
-		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
-			hlist_for_each_entry(pol, entry, table + i, bydst)
-				prune_one_bundle(pol, func, &gc_list);
-		}
-	}
-	read_unlock_bh(&xfrm_policy_lock);
-
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-		gc_list = dst->next;
-		dst_free(dst);
+	while (head) {
+		next = head->next;
+		dst_free(head);
+		head = next;
 	}
 }
 
-static int unused_bundle(struct dst_entry *dst)
-{
-	return !atomic_read(&dst->__refcnt);
-}
-
-static void __xfrm_garbage_collect(struct net *net)
-{
-	xfrm_prune_bundles(net, unused_bundle);
-}
-
-static int xfrm_flush_bundles(struct net *net)
-{
-	xfrm_prune_bundles(net, stale_bundle);
-	return 0;
-}
-
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
@@ -2284,7 +2334,9 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 			return 0;
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
-		if (xdst->genid != dst->xfrm->genid)
+		if (xdst->xfrm_genid != dst->xfrm->genid)
+			return 0;
+		if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
 		if (strict && fl &&
@@ -2445,11 +2497,9 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 
 static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
-	struct net_device *dev = ptr;
-
 	switch (event) {
 	case NETDEV_DOWN:
-		xfrm_flush_bundles(dev_net(dev));
+		flow_cache_flush();
 	}
 	return NOTIFY_DONE;
 }
@@ -2781,7 +2831,6 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       struct xfrm_migrate *m, int num_migrate)
 {
 	struct xfrm_migrate *mp;
-	struct dst_entry *dst;
 	int i, j, n = 0;
 
 	write_lock_bh(&pol->lock);
@@ -2806,10 +2855,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       sizeof(pol->xfrm_vec[i].saddr));
 			pol->xfrm_vec[i].encap_family = mp->new_family;
 			/* flush bundles */
-			while ((dst = pol->bundles) != NULL) {
-				pol->bundles = dst->next;
-				dst_free(dst);
-			}
+			atomic_inc(&pol->genid);
 		}
 	}
 
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 1/4] flow: virtualize flow cache entry methods
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270126340-30181-1-git-send-email-timo.teras@iki.fi>

This allows to validate the cached object before returning it.
It also allows to destruct object properly, if the last reference
was held in flow cache. This is also a prepartion for caching
bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed as the getter
  function notices it smartly.
- we do not have to call flow_cache_flush from policy gc, since the
  flow cache now properly deletes the object if it had any references

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 include/net/flow.h     |   18 ++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  118 ++++++++++++++++++++++++-----------------------
 net/xfrm/xfrm_policy.c |  113 ++++++++++++++++++++++++++++++---------------
 4 files changed, 151 insertions(+), 100 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..65d68a6 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,21 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
+struct flow_cache_entry_ops {
+	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
+	int (*check)(struct flow_cache_entry_ops **);
+	void (*delete)(struct flow_cache_entry_ops **);
+};
+
+typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);
+
+extern struct flow_cache_entry_ops **flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
+
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..cb8934b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@ struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_entry_ops *fc_ops;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..f938137 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,13 +26,12 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry *	next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_entry_ops **	ops;
 };
 
 struct flow_cache_percpu {
@@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->ops && !(*fle->ops)->check(fle->ops))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
-	if (fle->object)
-		atomic_dec(fle->object_ref);
+	if (fle->ops)
+		(*fle->ops)->delete(fle->ops);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_entry_ops **flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family, u8 dir,
+		flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_entry_ops **ops;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	ops = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,24 +198,14 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
 	if (!fle) {
@@ -215,37 +218,37 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
+			fle->ops = NULL;
 			memcpy(&fle->key, key, sizeof(*key));
-			fle->object = NULL;
+			fle->ops = NULL;
 			fcp->hash_count++;
 		}
+	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
+		ops = fle->ops;
+		if (!ops)
+			goto ret_ops;
+		ops = (*ops)->get(ops);
+		if (ops)
+			goto ret_ops;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
+	ops = resolver(net, key, family, dir, fle ? fle->ops : NULL, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (IS_ERR(ops)) {
+			fle->genid--;
+			fle->ops = NULL;
+		} else {
+			fle->ops = ops;
 		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	} else {
+		if (ops && !IS_ERR(ops))
+			(*ops)->delete(ops);
 	}
+ret_ops:
+	local_bh_enable();
+	return ops;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +264,12 @@ static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
-			fle->object = NULL;
-			atomic_dec(fle->object_ref);
+			if (fle->ops)
+				(*fle->ops)->delete(fle->ops);
+			fle->ops = NULL;
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..20f5b01 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,36 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_entry_ops **xfrm_policy_get_fce(
+		struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_policy *pol = container_of(ops, struct xfrm_policy, fc_ops);
+
+	if (unlikely(pol->walk.dead))
+		ops = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return ops;
+}
+
+static int xfrm_policy_check_fce(struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_policy *pol = container_of(ops, struct xfrm_policy, fc_ops);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_delete_fce(struct flow_cache_entry_ops **ops)
+{
+	xfrm_pol_put(container_of(ops, struct xfrm_policy, fc_ops));
+}
+
+static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
+	.get = xfrm_policy_get_fce,
+	.check = xfrm_policy_check_fce,
+	.delete = xfrm_policy_delete_fce,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +266,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->fc_ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +300,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +689,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +729,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +846,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +999,35 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_entry_ops **xfrm_policy_lookup(
+		struct net *net, struct flowi *fl, u16 family,
+		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_ops)
+		xfrm_pol_put(container_of(old_ops, struct xfrm_policy, fc_ops));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->fc_ops;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1117,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1602,24 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_entry_ops **ops;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		ops = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(ops);
+		if (IS_ERR(ops)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (ops)
+			policy = container_of(ops, struct xfrm_policy, fc_ops);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1969,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_entry_ops **ops;
+
+		ops = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR(ops))
+			pol = ERR_CAST(ops);
+		else if (ops)
+			pol = container_of(ops, struct xfrm_policy, fc_ops);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 0/4] caching bundles, iteration 3
From: Timo Teras @ 2010-04-01 12:52 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

Applies on top of the previous patches I sent.

Changes to previous iteration:
 - "flow: delayed deletion of flow cache entries" refactored to go
   after the other patches per Herbert's request
 - fixed hlist searching in the above mentioned patch
 - uses now ERR_CAST and other similar functions for better readability
 - added basic gc for per-socket bundles
 - some other minor clean ups

I'm now running this on a test box, with my specific setup, and it
seems to be working pretty well. However, this changes quite a bit
of things, so detailed review is needed.

Timo Teras (4):
  flow: virtualize flow cache entry methods
  xfrm: cache bundles instead of policies for outgoing flows
  xfrm: remove policy garbage collection
  flow: delayed deletion of flow cache entries

 include/net/flow.h      |   18 +-
 include/net/xfrm.h      |   12 +-
 net/core/flow.c         |  203 +++++++-----
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  822 +++++++++++++++++++++++++----------------------
 6 files changed, 583 insertions(+), 525 deletions(-)


^ permalink raw reply

* Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
From: Patrick McHardy @ 2010-04-01 12:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jorrit Kronjee, netfilter-devel, netdev
In-Reply-To: <1270123804.2229.91.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le jeudi 01 avril 2010 à 13:03 +0200, Patrick McHardy a écrit :
>>> -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
>>> -static struct dsthash_ent *
>>> -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
>>> -		   const struct dsthash_dst *dst)
>> Is there a reason for moving this function downwards in the file?
>> That unnecessarily increases the diff and makes the patch harder to
>> review. For review purposes I moved it back up, resulting in 42
>> lines less diff.
>> --
> 
> Well, this is because I had to move inside this function various
> initializations and these inits use user2credits() which was defined
> after dsthash_alloc_init().

I thought I also tried to compile it after my change, but apparently
I made some mistake :)

> But we can avoid this since we hold the entry spinlock, and before hash
> insertion.
> 
> Only the lookup fields and the spinlock MUST be committed to memory
> before the insert. Other fields can be initialized later by caller.
> 
> Here is V2 of patch, I added locking as well in dl_seq_real_show()
> because we call rateinfo_recalc(). htable main lock doesnt anymore
> protects each entry rateinfo.
> 
> Thanks
> 
> [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
> 
> xt_hashlimit uses a central lock per hash table and suffers from
> contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> 
> After RCU conversion, central lock is only used when a writer wants to
> add or delete an entry.
> 
> For 'readers', updating an existing entry, they use an individual lock
> per entry.

Applied, thanks a lot Eric.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC] SPD basic actions per netdev
From: jamal @ 2010-04-01 12:34 UTC (permalink / raw)
  To: Timo Teräs; +Cc: Herbert Xu, David S. Miller, Patrick McHardy, netdev
In-Reply-To: <4BB48D1C.80205@iki.fi>

On Thu, 2010-04-01 at 15:10 +0300, Timo Teräs wrote:

> On entry to ip_forward the routing decision has already been made.
> Both oif and iif are valid on entry.

ah, ok - yes;->

> Currently policy_check() uses oif for SPD matching.

indeed it does.

So i can see the dilemma with fwd path. It would be nice
to be able to classify on both iif and oif.

So that leaves only IN direction. If i only worried about that
and use skb->skb_iif then at least i wont be breaking the semantics
for FWD/OUT (i.e the patch without check for FWD). 

That would make semantics for selector ifindex as follows:
table    current    patch
----------------------------
OUT       fl->oif   fl->oif
FWD       fl->oif   fl->oif
IN        N/A       skb->skb_iif

By "N/A" it means really you cant set it. If you do it doesnt work.

cheers,
jamal

^ permalink raw reply

* Re: [PATCH] xtables: make XT_ALIGN() usable in exported headers
From: "YOSHIFUJI Hideaki (吉藤 英明)" @ 2010-04-01 12:25 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Alexey Dobriyan, Stephen Hemminger, Ben Hutchings,
	Andreas Henriksson, jamal, netdev
In-Reply-To: <4BB48C50.2000000@trash.net>

I think it is better to solicit comments on LKML.

This is not a new issue but rather a common one
(e.g. CMSG_ALIGN uses its own definition).

Regards,

--yoshfuji

(2010/04/01 21:06), Patrick McHardy wrote:
> Alexey Dobriyan wrote:
>> On Thu, Apr 1, 2010 at 1:50 PM, Patrick McHardy<kaber@trash.net>  wrote:
>>> I can't think of anything but to restore the XT_ALIGN macro.
>>> We could add a XT_ALIGN definition to xtables.h, but that might
>>> still leave problems for other users.
>>>
>>> Alexey, do you have any better suggestions?
>>
>> I like __KERNEL_ALIGN trick.
>>
>> Sorry for attachment, my patch sending facility is broke.
>> Tested on iptables compilation.
>>
>
> Seems fine to me, thanks. I'll wait a bit for others to comment.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
From: Eric Dumazet @ 2010-04-01 12:10 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Jorrit Kronjee, netfilter-devel, netdev
In-Reply-To: <4BB47D81.3060400@trash.net>

Le jeudi 01 avril 2010 à 13:03 +0200, Patrick McHardy a écrit :
> Eric Dumazet wrote:
> > xt_hashlimit uses a central lock per hash table and suffers from
> > contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> > 
> > After RCU conversion, central lock is only used when a writer wants to
> > add or delete an entry.
> > 
> > For 'readers', updating an existing entry, they use an individual lock
> > per entry.
> 
> Looks good to me, thanks Eric.
> 
> > -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
> > -static struct dsthash_ent *
> > -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
> > -		   const struct dsthash_dst *dst)
> 
> Is there a reason for moving this function downwards in the file?
> That unnecessarily increases the diff and makes the patch harder to
> review. For review purposes I moved it back up, resulting in 42
> lines less diff.
> --

Well, this is because I had to move inside this function various
initializations and these inits use user2credits() which was defined
after dsthash_alloc_init().

But we can avoid this since we hold the entry spinlock, and before hash
insertion.

Only the lookup fields and the spinlock MUST be committed to memory
before the insert. Other fields can be initialized later by caller.

Here is V2 of patch, I added locking as well in dl_seq_real_show()
because we call rateinfo_recalc(). htable main lock doesnt anymore
protects each entry rateinfo.

Thanks

[PATCH nf-next-2.6] xt_hashlimit: RCU conversion

xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads. (Multiqueue NIC or if RPS is enabled)

After RCU conversion, central lock is only used when a writer wants to
add or delete an entry.

For 'readers', updating an existing entry, they use an individual lock
per entry.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 5470bb0..453178d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -81,12 +81,14 @@ struct dsthash_ent {
 	struct dsthash_dst dst;
 
 	/* modified structure members in the end */
+	spinlock_t lock;
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
 		u_int32_t credit;
 		u_int32_t credit_cap, cost;
 	} rateinfo;
+	struct rcu_head rcu;
 };
 
 struct xt_hashlimit_htable {
@@ -143,9 +145,11 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
 	u_int32_t hash = hash_dst(ht, dst);
 
 	if (!hlist_empty(&ht->hash[hash])) {
-		hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
-			if (dst_cmp(ent, dst))
+		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+			if (dst_cmp(ent, dst)) {
+				spin_lock(&ent->lock);
 				return ent;
+			}
 	}
 	return NULL;
 }
@@ -157,9 +161,10 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 {
 	struct dsthash_ent *ent;
 
+	spin_lock(&ht->lock);
 	/* initialize hash with random val at the time we allocate
 	 * the first hashtable entry */
-	if (!ht->rnd_initialized) {
+	if (unlikely(!ht->rnd_initialized)) {
 		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
 		ht->rnd_initialized = true;
 	}
@@ -168,27 +173,36 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 		/* FIXME: do something. question is what.. */
 		if (net_ratelimit())
 			pr_err("max count of %u reached\n", ht->cfg.max);
-		return NULL;
-	}
-
-	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+		ent = NULL;
+	} else
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
 	if (!ent) {
 		if (net_ratelimit())
 			pr_err("cannot allocate dsthash_ent\n");
-		return NULL;
-	}
-	memcpy(&ent->dst, dst, sizeof(ent->dst));
+	} else {
+		memcpy(&ent->dst, dst, sizeof(ent->dst));
+		spin_lock_init(&ent->lock);
 
-	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
-	ht->count++;
+		spin_lock(&ent->lock);
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+		ht->count++;
+	}
+	spin_unlock(&ht->lock);
 	return ent;
 }
 
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
+
+	kmem_cache_free(hashlimit_cachep, ent);
+}
+
 static inline void
 dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 {
-	hlist_del(&ent->node);
-	kmem_cache_free(hashlimit_cachep, ent);
+	hlist_del_rcu(&ent->node);
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);
 	ht->count--;
 }
 static void htable_gc(unsigned long htlong);
@@ -512,15 +526,14 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
+	rcu_read_lock_bh();
 	dh = dsthash_find(hinfo, &dst);
 	if (dh == NULL) {
 		dh = dsthash_alloc_init(hinfo, &dst);
 		if (dh == NULL) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
-
 		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
 		dh->rateinfo.prev = jiffies;
 		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
@@ -537,11 +550,13 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* below the limit */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
 	}
 
-	spin_unlock_bh(&hinfo->lock);
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
@@ -666,12 +681,15 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				   struct seq_file *s)
 {
+	int res;
+
+	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
 	rateinfo_recalc(ent, jiffies);
 
 	switch (family) {
 	case NFPROTO_IPV4:
-		return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip.src,
 				 ntohs(ent->dst.src_port),
@@ -679,9 +697,10 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip6.src,
 				 ntohs(ent->dst.src_port),
@@ -689,11 +708,14 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #endif
 	default:
 		BUG();
-		return 0;
+		res = 0;
 	}
+	spin_unlock(&ent->lock);
+	return res;
 }
 
 static int dl_seq_show(struct seq_file *s, void *v)
@@ -817,9 +839,11 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 	unregister_pernet_subsys(&hashlimit_net_ops);
+
+	rcu_barrier_bh();
+	kmem_cache_destroy(hashlimit_cachep);
 }
 
 module_init(hashlimit_mt_init);



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox