Netdev List
 help / color / mirror / Atom feed
* [PATCH] bonding: fix to rejoin multicast groups immediately
From: Flavio Leitner @ 2010-09-29  7:12 UTC (permalink / raw)
  To: netdev; +Cc: Flavio Leitner

It should rejoin multicast groups immediately when
the failover happens to restore the multicast traffic.

Signed-off-by: Flavio Leitner <fleitner@redhat.com>
---
 net/ipv4/igmp.c |   16 ++++++++--------
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1fdcacd..b81d674 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1257,14 +1257,14 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
 
-	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
-		igmp_mod_timer(im, IGMP_Initial_Report_Delay);
-		return;
-	}
-	/* else, v3 */
-	im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
-		IGMP_Unsolicited_Report_Count;
-	igmp_ifc_event(in_dev);
+	/* a failover is happening and switches
+	 * must be notified immediately */
+	if (IGMP_V1_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+	else if (IGMP_V2_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+	else
+		igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
 #endif
 }
 EXPORT_SYMBOL(ip_mc_rejoin_group);
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH] bonding: rejoin multicast groups on VLANs
From: Flavio Leitner @ 2010-09-29  7:12 UTC (permalink / raw)
  To: netdev; +Cc: Flavio Leitner

It fixes bonding to rejoin multicast groups added
to VLAN devices on top of bonding when a failover
happens.

The first packet may be discarded, so the timer
ensures that at least 3 Reports are sent.

Signed-off-by: Flavio Leitner <fleitner@redhat.com>
---
 drivers/net/bonding/bond_main.c |   59 +++++++++++++++++++++++++++++++++-----
 drivers/net/bonding/bonding.h   |    2 +
 2 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3b16f62..a23a5fa 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -865,18 +865,14 @@ static void bond_mc_del(struct bonding *bond, void *addr)
 }
 
 
-/*
- * Retrieve the list of registered multicast addresses for the bonding
- * device and retransmit an IGMP JOIN request to the current active
- * slave.
- */
-static void bond_resend_igmp_join_requests(struct bonding *bond)
+static void __bond_resend_igmp_join_requests(struct net_device *dev)
 {
 	struct in_device *in_dev;
 	struct ip_mc_list *im;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(bond->dev);
+
+	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		for (im = in_dev->mc_list; im; im = im->next)
 			ip_mc_rejoin_group(im);
@@ -885,6 +881,42 @@ static void bond_resend_igmp_join_requests(struct bonding *bond)
 	rcu_read_unlock();
 }
 
+
+/*
+ * Retrieve the list of registered multicast addresses for the bonding
+ * device and retransmit an IGMP JOIN request to the current active
+ * slave.
+ */
+static void bond_resend_igmp_join_requests(struct bonding *bond)
+{
+	struct net_device *vlan_dev;
+	struct vlan_entry *vlan;
+
+	read_lock(&bond->lock);
+	if (bond->kill_timers)
+		goto out;
+
+	/* rejoin all groups on bond device */
+	__bond_resend_igmp_join_requests(bond->dev);
+
+	if (!bond->vlgrp)
+		goto reschedule;
+
+	/* rejoin all groups on vlan devices */
+	list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
+		vlan_dev = vlan_group_get_device(bond->vlgrp, vlan->vlan_id);
+		if (vlan_dev)
+			__bond_resend_igmp_join_requests(vlan_dev);
+	}
+
+reschedule:
+	if (--bond->resend_igmp > 0)
+		mod_timer(&bond->mc_timer, jiffies + HZ/5);
+
+out:
+	read_unlock(&bond->lock);
+}
+
 /*
  * flush all members of flush->mc_list from device dev->mc_list
  */
@@ -944,7 +976,10 @@ static void bond_mc_swap(struct bonding *bond, struct slave *new_active,
 
 		netdev_for_each_mc_addr(ha, bond->dev)
 			dev_mc_add(new_active->dev, ha->addr);
-		bond_resend_igmp_join_requests(bond);
+
+		/* rejoin multicast groups */
+		bond->resend_igmp = 3;
+		mod_timer(&bond->mc_timer, jiffies + 1);
 	}
 }
 
@@ -3741,9 +3776,15 @@ static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
 static int bond_open(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
+	struct timer_list *mc_timer = &bond->mc_timer;
 
 	bond->kill_timers = 0;
 
+	/* multicast */
+	init_timer(mc_timer);
+	mc_timer->data = (unsigned long)bond;
+	mc_timer->function = (void *)&bond_resend_igmp_join_requests;
+
 	if (bond_is_lb(bond)) {
 		/* bond_alb_initialize must be called before the timer
 		 * is started.
@@ -3808,6 +3849,8 @@ static int bond_close(struct net_device *bond_dev)
 
 	write_unlock_bh(&bond->lock);
 
+	del_timer_sync(&bond->mc_timer);
+
 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
 		cancel_delayed_work(&bond->mii_work);
 	}
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index c6fdd85..5fd4164 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -198,6 +198,8 @@ struct bonding {
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
+	struct   timer_list mc_timer;
+	s8       resend_igmp;
 	s8       kill_timers;
 	s8	 send_grat_arp;
 	s8	 send_unsol_na;
-- 
1.7.2.3


^ permalink raw reply related

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: David Miller @ 2010-09-29  7:25 UTC (permalink / raw)
  To: eric.dumazet; +Cc: herbert, netdev, kaber
In-Reply-To: <1285742848.22570.53.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 29 Sep 2010 08:47:28 +0200

> Still, this does not solve the problem for people wanting to disable
> ipv6 module load or disable it ?
> 
> install ipv6 /bin/true
> 
> or
> 
> options ipv6 disable=1

If you set the disable option, it should do the right thing.  Since
the ipv6 symbols will be available, yet the ipv6 stack won't by
default bring up ipv6 addresses onto interfaces etc.

^ permalink raw reply

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: David Miller @ 2010-09-29  7:26 UTC (permalink / raw)
  To: herbert; +Cc: eric.dumazet, netdev, kaber
In-Reply-To: <20100929071158.GA4684@gondor.apana.org.au>

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Sep 2010 16:11:58 +0900

> On Wed, Sep 29, 2010 at 08:24:03AM +0200, Eric Dumazet wrote:
> .
>> Well, sometimes people wants to :
>> 
>> /etc/modprobe.conf
>> install ipv6 /bin/true
>> 
>> 
>> Yet, be able to load ip_gre as a module
>> 
>> so IPV6=m, GRE=m,
> 
> I think if we cared about this we should provide ways to disable
> IPv6 (if there aren't any already) even when the module is loaded
> or the code is built-in.

We do provide this functionality already.

^ permalink raw reply

* Re: [PATCH 4/7] net: emaclite: Add support for little-endian platforms
From: Michal Simek @ 2010-09-29  7:33 UTC (permalink / raw)
  To: microblaze-uclinux-rVRm/Wmeqae7NGdpmJTKYQ
  Cc: eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, duyl-gjFFaj9aHVfQT0dZR+AlfA,
	linnj-gjFFaj9aHVfQT0dZR+AlfA,
	edgar.iglesias-Re5JQEeQqe8AvxtiuMwx3w,
	john.williams-g5w7nrANp4BDPfheJLI6IQ
In-Reply-To: <20100928.232722.189705899.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>



David Miller wrote:
> From: Grant Likely <grant.likely-s3s/WqlpOiPyB63q8FvJNQ@public.gmane.org>
> Date: Wed, 29 Sep 2010 15:11:46 +0900
> 
>> On Wed, Sep 29, 2010 at 03:52:15PM +1000, Michal Simek wrote:
>>> Upcomming Microblaze is little endian that's why is necessary
>>> to fix protocol and length loading.
>>>
>>> Signed-off-by: Michal Simek <monstr-pSz03upnqPeHXe+LvDLADg@public.gmane.org>
>>> CC: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
>>> CC: Grant Likely <grant.likely-s3s/WqlpOiPyB63q8FvJNQ@public.gmane.org>
>>> CC: Eric Dumazet <eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>> CC: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> CC: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> CC: devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ@public.gmane.org
>> This should go via davem, but it looks correct to me.
>>
>> Acked-by: Grant Likely <grant.likely-s3s/WqlpOiPyB63q8FvJNQ@public.gmane.org>
> 
> It doesn't need to, the microblaze guys can integrate this directly:
> 
> Acked-by: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

Thanks for the ACKs. I will add them and take care of it.

Michal


-- 
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian

^ permalink raw reply

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: Eric Dumazet @ 2010-09-29  7:45 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, kaber
In-Reply-To: <20100929.002544.183044247.davem@davemloft.net>

Le mercredi 29 septembre 2010 à 00:25 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 29 Sep 2010 08:47:28 +0200
> 
> > Still, this does not solve the problem for people wanting to disable
> > ipv6 module load or disable it ?
> > 
> > install ipv6 /bin/true
> > 
> > or
> > 
> > options ipv6 disable=1
> 
> If you set the disable option, it should do the right thing.  Since
> the ipv6 symbols will be available, yet the ipv6 stack won't by
> default bring up ipv6 addresses onto interfaces etc.

I must be missing something obvious.

David, with your patch, I can't :

install ipv6 /bin/true
modprobe ip_gre

FATAL: Error inserting ip_gre
(/lib/modules/2.6.36-rc6-dirty/kernel/net/ipv4/ip_gre.ko): Unknown
symbol in module, or unknown parameter (see dmesg)

[  223.150774] ip_gre: Unknown symbol icmpv6_send (err 0)

Thanks



^ permalink raw reply

* Re: [PATCH] ipv4: remove all rt cache entries on UNREGISTER event
From: Nicolas Dichtel @ 2010-09-29  7:49 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Octavian Purdila
In-Reply-To: <1285692969.3154.86.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le mardi 28 septembre 2010 à 18:45 +0200, Nicolas Dichtel a écrit :
>> Eric Dumazet wrote:
>>> Le mardi 28 septembre 2010 à 17:24 +0200, Nicolas Dichtel a écrit :
>>>> Hi,
>>>>
>>>> I face a problem when I try to remove an interface, 
>>>> netdev_wait_allrefs() complains about refcount.
>>>>
>>>> Here is a trivial scenario to reproduce the problem:
>>>> # ip tunnel add mode ipip remote 10.16.0.164 local 10.16.0.72 dev eth0
>>>> # ./a.out tunl1
>>>> # ip tunnel del tunl1
>>>>
>>>> Note: a.out binary create an IPv4 raw socket, attach it to tunl1 
>>>> (SO_BINDTODEVICE), set it as multicast (IP_MULTICAST_LOOP), set the 
>>>> multicast interface to tunl1 (IP_MULTICAST_IF), build the IP header 
>>>> (IP_HDRINCL) and then send a single packet (192.168.6.1 -> 224.0.0.18).
>>>>
>>>> Note2: when a.out is executed, tunl1 has no ip address and is down.
>>>>
>>> CC Octavian Purdila, the patch author.
>>>
>>> I am just wondering why this route is created in the first place.
The route is created because no function will check interface status (up 
and running or down). Just at the end, the packet will be enqueued in 
the noop qdisc.

>> At first, I asked myself the same question, but it seems that this is 
>> allowed to send a packet through this kind of socket, even if interface 
>> is down. Packet will be destroyed by the noop qdisk.
>> But I agree that it is strange to perform route lookup and everything to 
>>    destroy the packet at the end ...
>> Maybe raw_sendmsg() can delete it directly ;-) ... or maybe 
>> ip_route_output_flow().
>>
>> Any suggestions welcome.
>>
> 
> Hmm...
> 
> One way to track this kind of problem would be to add a WARN_ON() in
> dev_hold()
> 
> -> Check that when a reference on dev is taken, we are in a known state.
> 
> Something like this ?
dev_hold() is done when interface is down, but before unregistering 
process start.

Regards,
Nicolas

> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 83de0eb..54bef78 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1773,6 +1774,7 @@ static inline void dev_put(struct net_device *dev)
>   */
>  static inline void dev_hold(struct net_device *dev)
>  {
> +	WARN_ON(dev->reg_state != NETREG_REGISTERED);
>  	atomic_inc(&dev->refcnt);
>  }
>  
> 
> 

^ permalink raw reply

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: Herbert Xu @ 2010-09-29  7:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, kaber
In-Reply-To: <1285746352.2615.11.camel@edumazet-laptop>

On Wed, Sep 29, 2010 at 09:45:52AM +0200, Eric Dumazet wrote:
.
> > > install ipv6 /bin/true
> > > 
> > > or
> > > 
> > > options ipv6 disable=1
> > 
> > If you set the disable option, it should do the right thing.  Since
> > the ipv6 symbols will be available, yet the ipv6 stack won't by
> > default bring up ipv6 addresses onto interfaces etc.
> 
> I must miss something obvious.
> 
> David, with your patch, I cant :
> 
> install ipv6 /bin/true
> modprobe ip_gre

Does it work if you use

	options ipv6 disable=1

If so then that's what should be used in this case.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: David Miller @ 2010-09-29  7:59 UTC (permalink / raw)
  To: eric.dumazet; +Cc: herbert, netdev, kaber
In-Reply-To: <1285746352.2615.11.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 29 Sep 2010 09:45:52 +0200

> I must miss something obvious.
> 
> David, with your patch, I cant :
> 
> install ipv6 /bin/true
> modprobe ip_gre
> 
> FATAL: Error inserting ip_gre
> (/lib/modules/2.6.36-rc6-dirty/kernel/net/ipv4/ip_gre.ko): Unknown
> symbol in module, or unknown parameter (see dmesg)
> 
> [  223.150774] ip_gre: Unknown symbol icmpv6_send (err 0)
> 

Try the options disable=1, that works.

We already prevent the "/bin/true" stupidity from working in
the bonding driver, that's why we added the disable
module option to ipv6.

^ permalink raw reply

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: David Miller @ 2010-09-29  8:00 UTC (permalink / raw)
  To: herbert; +Cc: eric.dumazet, netdev, kaber
In-Reply-To: <20100929075758.GA5188@gondor.apana.org.au>

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Sep 2010 16:57:58 +0900

> On Wed, Sep 29, 2010 at 09:45:52AM +0200, Eric Dumazet wrote:
> .
>> > > install ipv6 /bin/true
>> > > 
>> > > or
>> > > 
>> > > options ipv6 disable=1
>> > 
>> > If you set the disable option, it should do the right thing.  Since
>> > the ipv6 symbols will be available, yet the ipv6 stack won't by
>> > default bring up ipv6 addresses onto interfaces etc.
>> 
>> I must miss something obvious.
>> 
>> David, with your patch, I cant :
>> 
>> install ipv6 /bin/true
>> modprobe ip_gre
> 
> Does it work if you use
> 
> 	options ipv6 disable=1
> 
> If so then that's what should be used in this case.

That's what one should use "always" :-)

Bonding has the same issue, and it's why we added the
disable module option, so that the /bin/true stupidity
could stop.

^ permalink raw reply

* Re: [PATCH] net: Implement Any-IP support for IPv6.
From: David Miller @ 2010-09-29  8:02 UTC (permalink / raw)
  To: zenczykowski; +Cc: netdev, maze
In-Reply-To: <1285582022-30787-1-git-send-email-zenczykowski@gmail.com>

From: Maciej Żenczykowski <zenczykowski@gmail.com>
Date: Mon, 27 Sep 2010 03:07:02 -0700

> From: Maciej Żenczykowski <maze@google.com>
> 
> AnyIP is the capability to receive packets and establish incoming
> connections on IPs we have not explicitly configured on the machine.
> 
> An example use case is to configure a machine to accept all incoming
> traffic on eth0, and leave the policy of whether traffic for a given IP
> should be delivered to the machine up to the load balancer.
> 
> Can be setup as follows:
>   ip -6 rule from all iif eth0 lookup 200
>   ip -6 route add local default dev lo table 200
> (in this case for all IPv6 addresses)
> 
> Signed-off-by: Maciej Żenczykowski <maze@google.com>

Ok, I applied this and Tom's ipv4-side patch and pushed it all
out to net-next-2.6

Thanks!

^ permalink raw reply

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: Herbert Xu @ 2010-09-29  8:03 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev, kaber
In-Reply-To: <20100929.010056.115931816.davem@davemloft.net>

On Wed, Sep 29, 2010 at 01:00:56AM -0700, David Miller wrote:
.
> > 	options ipv6 disable=1
> > 
> > If so then that's what should be used in this case.
> 
> That's what one should use "always" :-)
> 
> Bonding has the same issue, and it's why we added the
> disable module option, so that the /bin/true stupidity
> could stop.

Right, that settles it :)
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH net-next-2.6] ip_gre: lockless xmit
From: Nicolas Dichtel @ 2010-09-29  8:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1285664747.2607.48.camel@edumazet-laptop>

NETIF_F_LLTX is marked as deprecated:

include/linux/netdevice.h:
#define NETIF_F_LLTX            4096    /* LockLess TX - deprecated. 
Please */
                                         /* do not use LLTX in new 
drivers */

Is it right to use it?

Regards,
Nicolas

Eric Dumazet wrote:
> GRE tunnels can benefit from lockless xmits, using NETIF_F_LLTX
> 
> Note: If tunnels are created with the "oseq" option, LLTX is not
> enabled :
> 
> Even using an atomic_t o_seq, we would increase chance for packets being
> out of order at receiver.
> 
> Bench on a 16 cpus machine (dual E5540 cpus), 16 threads sending
> 10000000 UDP frames via one gre tunnel (size:200 bytes per frame)
> 
> Before patch : 
> real	3m0.094s
> user	0m9.365s
> sys	47m50.103s
> 
> After patch:
> real	0m29.756s
> user	0m11.097s
> sys	7m33.012s
> 
> Last problem to solve is the contention on dst :
> 
> 
> 38660.00 21.4% __ip_route_output_key          vmlinux             
> 20786.00 11.5% dst_release                    vmlinux             
> 14191.00  7.8% __xfrm_lookup                  vmlinux             
> 12410.00  6.9% ip_finish_output               vmlinux             
>  4540.00  2.5% ip_push_pending_frames         vmlinux             
>  4427.00  2.4% ip_append_data                 vmlinux             
>  4265.00  2.4% __alloc_skb                    vmlinux             
>  4140.00  2.3% __ip_local_out                 vmlinux             
>  3991.00  2.2% dev_queue_xmit                 vmlinux     
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  net/ipv4/ip_gre.c |    4 ++++
>  1 files changed, 4 insertions(+)
> 
> diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
> index a1b5d5e..035db63 100644
> --- a/net/ipv4/ip_gre.c
> +++ b/net/ipv4/ip_gre.c
> @@ -1557,6 +1557,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
>  	if (!tb[IFLA_MTU])
>  		dev->mtu = mtu;
>  
> +	/* Can use a lockless transmit, unless we generate output sequences */
> +	if (!(nt->parms.o_flags & GRE_SEQ))
> +		dev->features |= NETIF_F_LLTX;
> +
>  	err = register_netdevice(dev);
>  	if (err)
>  		goto out;
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel
From: Michael S. Tsirkin @ 2010-09-29  8:16 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Arnd Bergmann, Avi Kivity, Xin, Xiaohui, David Miller, netdev,
	kvm, linux-kernel
In-Reply-To: <1285730669.31343.7.camel@localhost.localdomain>

On Tue, Sep 28, 2010 at 08:24:29PM -0700, Shirley Ma wrote:
> Hello Michael,
> 
> On Wed, 2010-09-15 at 07:52 -0700, Shirley Ma wrote:
> > > >  Don't you think once I address vhost_add_used_and_signal update
> > > > issue, it is a simple and complete patch for macvtap TX zero copy?
> > > > 
> > > > Thanks
> > > > Shirley
> > > 
> > > I like the fact that the patch is simple. Unfortunately
> > > I suspect it'll stop being simple by the time it's complete :) 
> > 
> > I can make a try. :)
> 
> I compared several approaches for addressing the issue being raised here
> on how/when to update vhost_add_used_and_signal. The simple approach I
> have found is:
> 
> 1. Adding completion field in struct virtqueue;
> 2. when it is a zero copy packet, put vhost thread wait for completion
> to update vhost_add_used_and_signal;
> 3. passing vq from vhost to macvtap as skb destruct_arg;
> 4. when skb is freed for the last reference, signal vq completion
> The test results show same performance as the original patch. How do you
> think? If it sounds good to you. I will resubmit this reversion patch.
> The patch still keeps as simple as it was before. :)
> 
> Thanks
> Shirley

If you look at dev_hard_start_xmit you will see a call
to skb_orphan_try which often calls the skb destructor.
So I suspect this is almost equivalent to your original patch,
and has the same correctness issue.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next-2.6] ip_gre: lockless xmit
From: Eric Dumazet @ 2010-09-29  8:18 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: David Miller, netdev
In-Reply-To: <4CA2F46B.1070607@6wind.com>

Le mercredi 29 septembre 2010 à 10:10 +0200, Nicolas Dichtel a écrit :
> NETIF_F_LLTX is marked as deprecated:
> 
> include/linux/netdevice.h:
> #define NETIF_F_LLTX            4096    /* LockLess TX - deprecated. 
> Please */
>                                          /* do not use LLTX in new 
> drivers */
> 
> Is it right to use it?
> 

In this particular case (and drivers/net/loopback.c), yes.

This is the only way to avoid the locking in core network
(net/core/dev.c)

What is deprecated is to assert NETIF_F_LLTX and yet, use a lock in the
ndo_xmit() driver method.




^ permalink raw reply

* Re: [PATCH net-next-2.6] ip_gre: lockless xmit
From: David Miller @ 2010-09-29  8:21 UTC (permalink / raw)
  To: eric.dumazet; +Cc: nicolas.dichtel, netdev
In-Reply-To: <1285748288.2615.15.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 29 Sep 2010 10:18:08 +0200

> What is deprecated is to assert NETIF_F_LLTX and yet, use a lock in the
> ndo_xmit() driver method.

Also we've discussed recently to do away with the NETIF_F_LLTX
flag entirely for queue-less devices such as loopback and
sw tunnels.

Then all will be left are the truly "deprecated" cases Eric
mentions.

^ permalink raw reply

* Re: [PATCH] ip_gre: CONFIG_IPV6_MODULE support
From: Eric Dumazet @ 2010-09-29  8:22 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, netdev, kaber
In-Reply-To: <20100929080303.GA5318@gondor.apana.org.au>

Le mercredi 29 septembre 2010 à 17:03 +0900, Herbert Xu a écrit :
> On Wed, Sep 29, 2010 at 01:00:56AM -0700, David Miller wrote:
> .
> > > 	options ipv6 disable=1
> > > 
> > > If so then that's what should be used in this case.
> > 
> > That's what one should use "always" :-)
> > 
> > Bonding has the same issue, and it's why we added the
> > disable module option, so that the /bin/true stupidity
> > could stop.
> 
> Right, that settles it :)

OK guys, you convinced me ;)



^ permalink raw reply

* Re: [RFC PATCH 2/2] macvtap: TX zero copy between guest and host kernel
From: Michael S. Tsirkin @ 2010-09-29  8:28 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Arnd Bergmann, Avi Kivity, Xin, Xiaohui, David Miller, netdev,
	kvm, linux-kernel
In-Reply-To: <20100929081645.GA21195@redhat.com>

On Wed, Sep 29, 2010 at 10:16:45AM +0200, Michael S. Tsirkin wrote:
> On Tue, Sep 28, 2010 at 08:24:29PM -0700, Shirley Ma wrote:
> > Hello Michael,
> > 
> > On Wed, 2010-09-15 at 07:52 -0700, Shirley Ma wrote:
> > > > >  Don't you think once I address vhost_add_used_and_signal update
> > > > > issue, it is a simple and complete patch for macvtap TX zero copy?
> > > > > 
> > > > > Thanks
> > > > > Shirley
> > > > 
> > > > I like the fact that the patch is simple. Unfortunately
> > > > I suspect it'll stop being simple by the time it's complete :) 
> > > 
> > > I can make a try. :)
> > 
> > I compared several approaches for addressing the issue being raised here
> > on how/when to update vhost_add_used_and_signal. The simple approach I
> > have found is:
> > 
> > 1. Adding completion field in struct virtqueue;
> > 2. when it is a zero copy packet, put vhost thread wait for completion
> > to update vhost_add_used_and_signal;
> > 3. passing vq from vhost to macvtap as skb destruct_arg;
> > 4. when skb is freed for the last reference, signal vq completion
> > The test results show same performance as the original patch. How do you
> > think? If it sounds good to you. I will resubmit this reversion patch.
> > The patch still keeps as simple as it was before. :)
> > 
> > Thanks
> > Shirley
> 
> If you look at dev_hard_start_xmit you will see a call
> to skb_orphan_try which often calls the skb destructor.
> So I suspect this is almost equivalent to your original patch,
> and has the same correctness issue.

So you could try doing skb_tx(skb)->prevent_sk_orphan = 1
just to see what will happen. Might be interesting - just
make sure the device doesn't orphan the skb first thing.
I suspect lack of parallelism will result in bad throughput
esp for small messages.

Note this still won't make it correct (this has module unloading
issue, and devices might still orphan skb, clone it, or hang on to
paged data in some other way) but at least closer.

I think you should try testing with guest to external communication,
this will uncover some of these correctness issues for you.
I think netperf also has some flag to check data, might
be a good idea to use it for testing.

> -- 
> MST

^ permalink raw reply

* Re: [PATCH] ipv4: remove all rt cache entries on UNREGISTER event
From: Eric Dumazet @ 2010-09-29  8:35 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: netdev, Octavian Purdila
In-Reply-To: <4CA2EF9C.9040909@6wind.com>

Le mercredi 29 septembre 2010 à 09:49 +0200, Nicolas Dichtel a écrit :
> Eric Dumazet wrote:
> > Le mardi 28 septembre 2010 à 18:45 +0200, Nicolas Dichtel a écrit :
> >> Eric Dumazet wrote:
> >>> Le mardi 28 septembre 2010 à 17:24 +0200, Nicolas Dichtel a écrit :
> >>>> Hi,
> >>>>
> >>>> I face a problem when I try to remove an interface, 
> >>>> netdev_wait_allrefs() complains about refcount.
> >>>>
> >>>> Here is a trivial scenario to reproduce the problem:
> >>>> # ip tunnel add mode ipip remote 10.16.0.164 local 10.16.0.72 dev eth0
> >>>> # ./a.out tunl1
> >>>> # ip tunnel del tunl1
> >>>>
> >>>> Note: a.out binary create an IPv4 raw socket, attach it to tunl1 
> >>>> (SO_BINDTODEVICE), set it as multicast (IP_MULTICAST_LOOP), set the 
> >>>> multicast interface to tunl1 (IP_MULTICAST_IF), build the IP header 
> >>>> (IP_HDRINCL) and then send a single packet (192.168.6.1 -> 224.0.0.18).
> >>>>
> >>>> Note2: when a.out is executed, tunl1 has no ip address and is down.
> >>>>
> >>> CC Octavian Purdila, the patch author.
> >>>
> >>> I am just wondering why this route is created in the first place.
> The route is created because no function will check interface status (up 
> and running or down). Just at the end, the packet will be enqueued in 
> the noop qdisc.
> 

In your case maybe, but I think there is another point where we can call
dev_hold() while device is in dismantle phase.


> >> At first, I asked myself the same question, but it seems that this is 
> >> allowed to send a packet through this kind of socket, even if interface 
> >> is down. Packet will be destroyed by the noop qdisk.
> >> But I agree that it is strange to perform route lookup and everything to 
> >>    destroy the packet at the end ...
> >> Maybe raw_sendmsg() can delete it directly ;-) ... or maybe 
> >> ip_route_output_flow().
> >>
> >> Any suggestions welcome.
> >>
> > 
> > Hmm...
> > 
> > One way to track this kind of problem would be to add a WARN_ON() in
> > dev_hold()
> > 
> > -> Check that when a reference on dev is taken, we are in a known state.
> > 
> > Something like this ?
> dev_hold() is done when interface is down, but before unregistering 
> process start.

Not on my machine. I can see the backtrace sometimes.

There is a race somewhere (maybe several), and your patch only reduces
the window of this race.

I am working on it.



^ permalink raw reply

* Re: [PATCHv2 net-next-2.6 0/5] XFRM,IPv6: Removal of RH2/HAO from IPsec-protected MIPv6 traffic
From: Arnaud Ebalard @ 2010-09-29  9:04 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, herbert, yoshfuji, netdev
In-Reply-To: <20100928.144005.260072379.davem@davemloft.net>

Hi,

David Miller <davem@davemloft.net> writes:

> From: arno@natisbad.org (Arnaud Ebalard)
> Date: Tue, 28 Sep 2010 23:33:16 +0200
>
>> Before following the (dumb) #ifdef path, I was about to do that but
>> worried about the penalty of the additional xfrm_state_get/put_afinfo()
>> calls on each packet I was about to add. Should I just reduce my amount
>> of coffee or is it a valid concern?
>
> Indeed, it is.
>
> Even without the concern of afinfo refcounting, this test is very
> heavy handed for the packet path.
>
> Can you make it small enough that it can reasonably be inlined?

I came up with an idea. A v3 follows; the cover letter details that.

Cheers,

a+

^ permalink raw reply

* [PATCHv3 net-next-2.6 0/5] XFRM,IPv6: Removal of RH2/HAO from IPsec-protected MIPv6 traffic
From: Arnaud Ebalard @ 2010-09-29  9:05 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Herbert Xu, Hideaki YOSHIFUJI; +Cc: netdev

Hi,

This is an updated version of the patches. For reference, introduction of
the feature is here http://thread.gmane.org/gmane.linux.network/172941

This version 3 now also builds with ipv6 modular. To do that, a helper
(input_addr_check()) has been added to struct xfrm_state_afinfo. To
avoid the penalty of xfrm_state_get/put_afinfo() calls from xfrm_input(),
I spent some time in the sources and came up with the idea of accessing
it safely as follows:

 x = xfrm_state_lookup(net, skb->mark, NULL, spi, nexthdr, family);
 if (x == NULL ||
     x->outer_mode->afinfo->input_addr_check(skb, x)) {
     ...

Tell me if I missed something.

Comments welcome.

Cheers,

a+

^ permalink raw reply

* [PATCHv3 net-next-2.6 1/5] XFRM,IPv6: Remove xfrm_spi_hash() dependency on destination address
From: Arnaud Ebalard @ 2010-09-29  9:05 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Herbert Xu, Hideaki YOSHIFUJI; +Cc: netdev
In-Reply-To: <cover.1285749610.git.arno@natisbad.org>


In the new IPsec architecture [RFC4301], "for an SA used to carry
unicast traffic, the Security Parameters Index (SPI) by itself
suffices to specify an SA".  Section 4.1 of [RFC4301] provides
additional guidance on the topic.

In the old IPsec architecture [RFC2401], a SA "is uniquely identified
by a triple consisting of a Security Parameter Index (SPI), an IP
Destination Address and a security protocol (AH or ESP) identifier".

If an IPsec stack only supports the behavior mandated by the old
IPsec architecture, SAD lookup on inbound packets requires the use of
both the SPI and the destination address of the SA.

For inbound IPsec traffic, IRO remapping rules may exist on the MN to
remap the destination address (CoA) into the HoA.  In that case, by
design, the address found in the destination address field of the
packet (CoA) does not match the one in the SA (HoA).

At the moment, the Linux XFRM stack includes the address when computing
the hash to perform state lookup by SPI. This patch changes the XFRM
state hash computation so that the destination address is no longer
used. This will later allow finding states for packets w/ mangled
destination addresses.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
---
 net/xfrm/xfrm_hash.h  |   21 +--------------------
 net/xfrm/xfrm_state.c |   20 ++++++++------------
 2 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index 8e69533..19eeee7 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -4,16 +4,6 @@
 #include <linux/xfrm.h>
 #include <linux/socket.h>
 
-static inline unsigned int __xfrm4_addr_hash(xfrm_address_t *addr)
-{
-	return ntohl(addr->a4);
-}
-
-static inline unsigned int __xfrm6_addr_hash(xfrm_address_t *addr)
-{
-	return ntohl(addr->a6[2] ^ addr->a6[3]);
-}
-
 static inline unsigned int __xfrm4_daddr_saddr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr)
 {
 	u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4;
@@ -60,18 +50,9 @@ static inline unsigned __xfrm_src_hash(xfrm_address_t *daddr,
 }
 
 static inline unsigned int
-__xfrm_spi_hash(xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family,
-		unsigned int hmask)
+__xfrm_spi_hash(__be32 spi, u8 proto, unsigned int hmask)
 {
 	unsigned int h = (__force u32)spi ^ proto;
-	switch (family) {
-	case AF_INET:
-		h ^= __xfrm4_addr_hash(daddr);
-		break;
-	case AF_INET6:
-		h ^= __xfrm6_addr_hash(daddr);
-		break;
-	}
 	return (h ^ (h >> 10) ^ (h >> 20)) & hmask;
 }
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index eb96ce5..b6a4d8d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -30,7 +30,7 @@
 
 /* Each xfrm_state may be linked to two tables:
 
-   1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
+   1. Hash table by (spi,ah/esp) to find SA by SPI. (input,ctl)
    2. Hash table by (daddr,family,reqid) to find what SAs exist for given
       destination/tunnel endpoint. (output)
  */
@@ -67,9 +67,9 @@ static inline unsigned int xfrm_src_hash(struct net *net,
 }
 
 static inline unsigned int
-xfrm_spi_hash(struct net *net, xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family)
+xfrm_spi_hash(struct net *net, __be32 spi, u8 proto)
 {
-	return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask);
+	return __xfrm_spi_hash(spi, proto, net->xfrm.state_hmask);
 }
 
 static void xfrm_hash_transfer(struct hlist_head *list,
@@ -95,9 +95,7 @@ static void xfrm_hash_transfer(struct hlist_head *list,
 		hlist_add_head(&x->bysrc, nsrctable+h);
 
 		if (x->id.spi) {
-			h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
-					    x->id.proto, x->props.family,
-					    nhashmask);
+			h = __xfrm_spi_hash(x->id.spi, x->id.proto, nhashmask);
 			hlist_add_head(&x->byspi, nspitable+h);
 		}
 	}
@@ -679,7 +677,7 @@ xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
 
 static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family)
 {
-	unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
+	unsigned int h = xfrm_spi_hash(net, spi, proto);
 	struct xfrm_state *x;
 	struct hlist_node *entry;
 
@@ -868,7 +866,7 @@ found:
 			h = xfrm_src_hash(net, daddr, saddr, encap_family);
 			hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
 			if (x->id.spi) {
-				h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
+				h = xfrm_spi_hash(net, x->id.spi, x->id.proto);
 				hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
 			}
 			x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
@@ -942,9 +940,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 	hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
 
 	if (x->id.spi) {
-		h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
-				  x->props.family);
-
+		h = xfrm_spi_hash(net, x->id.spi, x->id.proto);
 		hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
 	}
 
@@ -1535,7 +1531,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
 	}
 	if (x->id.spi) {
 		spin_lock_bh(&xfrm_state_lock);
-		h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
+		h = xfrm_spi_hash(net, x->id.spi, x->id.proto);
 		hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
 		spin_unlock_bh(&xfrm_state_lock);
 
-- 
1.7.1



^ permalink raw reply related

* [PATCHv3 net-next-2.6 2/5] XFRM,IPv6: Introduce receive sockopts to access IRO remapped src/dst addresses
From: Arnaud Ebalard @ 2010-09-29  9:05 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Herbert Xu, Hideaki YOSHIFUJI; +Cc: netdev
In-Reply-To: <cover.1285749610.git.arno@natisbad.org>


This patch introduces IRO recv sockopts, in order for userland processes
(e.g. UMIP) to access on-wire source or destination addresses found in
incoming (IPsec-protected) packets as they were before remapping by IRO.
The socket options are respectively IPV6_RECVIROSRC and IPV6_RECVIRODST.

Basically, the two recv socket options are similar in their purpose to
their generic RH2/HAO counterparts defined in RFC 3542 (IPV6_RECVIROSRC
<->  IPV6_RECVDSTOPTS, IPV6_RECVIRODST <-> IPV6_RECVRTHDR). They differ
on the following aspects:

 - IRO reporting sockopts only work on incoming IPsec-protected packets.
   Userspace will never get an IRO remapped address report for common
   (non-protected) packets.
 - The receiver gets the original source/destination address (IRO
   remapping) from its IPsec stack.
 - as IRO sockopts only deal with addresses, no specific structure is
   defined, i.e. struct in6_addr is used to pass info.

As we only interact with IPsec protected packets, struct sec_path is
used to carry information (addresses) for incoming packets that have
undergone remapping process.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
---
 include/linux/in6.h      |    7 +++++++
 include/linux/ipv6.h     |    4 +++-
 include/net/xfrm.h       |    5 +++++
 net/ipv6/datagram.c      |   18 ++++++++++++++++++
 net/ipv6/ipv6_sockglue.c |   26 ++++++++++++++++++++++++++
 5 files changed, 59 insertions(+), 1 deletions(-)

diff --git a/include/linux/in6.h b/include/linux/in6.h
index c4bf46f..52a98ab 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -283,4 +283,11 @@ struct in6_flowlabel_req {
  * MRT6_PIM			208
  * (reserved)			209
  */
+
+/* IRO (IPsec Route Optimization) sockopts */
+#define IPV6_RECVIROSRC         74
+#define IPV6_IROSRC		75
+#define IPV6_RECVIRODST         76
+#define IPV6_IRODST		77
+
 #endif
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e62683b..55289ee 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -341,7 +341,9 @@ struct ipv6_pinfo {
 				odstopts:1,
                                 rxflow:1,
 				rxtclass:1,
-				rxpmtu:1;
+				rxpmtu:1,
+				irosrc:1,
+				irodst:1;
 		} bits;
 		__u16		all;
 	} rxopt;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4f53532..e6a753c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -909,6 +909,11 @@ struct sec_path {
 	atomic_t		refcnt;
 	int			len;
 	struct xfrm_state	*xvec[XFRM_MAX_DEPTH];
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct in6_addr         irosrc;
+	struct in6_addr         irodst;
+#endif
 };
 
 static inline struct sec_path *
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ef371aa..2952c9e 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -29,6 +29,7 @@
 #include <net/transp_v6.h>
 #include <net/ip6_route.h>
 #include <net/tcp_states.h>
+#include <net/xfrm.h>
 
 #include <linux/errqueue.h>
 #include <asm/uaccess.h>
@@ -504,6 +505,23 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
 		put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
 	}
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+	/* If access to IRO-remapped source or destination address has been
+	 * requested and it has indeed been remapped, provide the on-wire
+	 * address to userland */
+	if (skb_sec_path(skb)) {
+		struct sec_path *sp = skb_sec_path(skb);
+
+		if (np->rxopt.bits.irosrc && !ipv6_addr_any(&sp->irosrc))
+			put_cmsg(msg, SOL_IPV6, IPV6_IROSRC,
+				 sizeof(sp->irosrc), &sp->irosrc);
+
+		if (np->rxopt.bits.irodst && !ipv6_addr_any(&sp->irodst))
+			put_cmsg(msg, SOL_IPV6, IPV6_IRODST,
+				 sizeof(sp->irodst), &sp->irodst);
+	}
+#endif
+
 	if (opt->lastopt &&
 	    (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) {
 		/*
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a7f66bc..722a49f 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -302,6 +302,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		retv = 0;
 		break;
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+	case IPV6_RECVIROSRC:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.irosrc = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVIRODST:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.irodst = valbool;
+		retv = 0;
+		break;
+#endif
+
 	case IPV6_2292DSTOPTS:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1056,6 +1072,16 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = np->rxopt.bits.dstopts;
 		break;
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+	case IPV6_RECVIROSRC:
+		val = np->rxopt.bits.irosrc;
+		break;
+
+	case IPV6_RECVIRODST:
+		val = np->rxopt.bits.irodst;
+		break;
+#endif
+
 	case IPV6_2292DSTOPTS:
 		val = np->rxopt.bits.odstopts;
 		break;
-- 
1.7.1



^ permalink raw reply related

* [PATCHv3 net-next-2.6 3/5] XFRM,IPv6: Add IRO src/dst address remapping XFRM types and i/o handlers
From: Arnaud Ebalard @ 2010-09-29  9:05 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Herbert Xu, Hideaki YOSHIFUJI; +Cc: netdev
In-Reply-To: <cover.1285749610.git.arno@natisbad.org>


Add IRO source and destination remapping XFRM types and associated
input/output handlers. This allows userland to install such states
in order to support remapping of source or destination address
of packet. They basically work like existing RH2 and HAO ones; the
main difference is that output handlers do not expand the packet by
adding an extension header: they simply change the source or
destination in place. Input handlers are almost the same as RH2/HAO
version in their behavior, but they are triggered differently. RH2
and HAO handlers are triggered based on structures found in the
packet. On input, IRO states (and associated handlers) are looked
up when processing an IPsec-protected packet, when there is an
address mismatch.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
---
 include/net/xfrm.h       |    2 +
 net/ipv6/mip6.c          |  153 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/xfrm6_mode_ro.c |   11 +++-
 net/xfrm/xfrm_user.c     |    4 +
 4 files changed, 169 insertions(+), 1 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e6a753c..05b2b1f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -35,6 +35,8 @@
 #define XFRM_PROTO_IPV6		41
 #define XFRM_PROTO_ROUTING	IPPROTO_ROUTING
 #define XFRM_PROTO_DSTOPTS	IPPROTO_DSTOPTS
+#define XFRM_PROTO_IRO_SRC      127
+#define XFRM_PROTO_IRO_DST      128
 
 #define XFRM_ALIGN8(len)	(((len) + 7) & ~7)
 #define MODULE_ALIAS_XFRM_MODE(family, encap) \
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index d6e9599..04b9e1d 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -477,6 +477,131 @@ static const struct xfrm_type mip6_rthdr_type =
 	.hdr_offset	= mip6_rthdr_offset,
 };
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+/* IRO equivalent of mip6_destopt_input(): handles incoming packet with a
+ * source address different from the one expected in the SA: check that
+ * received source address is indeed the CoA we expected (or any address
+ * if the state references the unspecified address '::') */
+static int mip6_iro_src_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	int err = 1;
+
+	spin_lock(&x->lock);
+	if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		err = -ENOENT;
+	spin_unlock(&x->lock);
+
+	return err;
+}
+
+/* IRO equivalent of mip6_destopt_output(): replaces current source address
+ * of outgoing packet by state's CoA. */
+static int mip6_iro_src_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+
+	spin_lock_bh(&x->lock);
+	memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr));
+	spin_unlock_bh(&x->lock);
+
+	return 0;
+}
+
+static int mip6_iro_src_reject(struct xfrm_state *x, struct sk_buff *skb, struct flowi *fl)
+{
+	int err = 0;
+
+	/* XXX We may need some reject handler at some point but it is not
+	 * critical yet: see xfrm_secpath_reject() in net/xfrm/xfrm_policy.c
+	 * and also what mip6_destopt_reject() implements */
+
+	printk("XXX FIXME: mip6_iro_src_reject() called\n");
+
+	return err;
+}
+
+/* This is the IRO equivalent of mip6_rthdr_input(): handles incoming packet
+ * with a destination address different from the one expected in the SA:
+ * check that received destination address is indeed the CoA we expected
+ * (or any address if the state references the unspecified address '::') */
+static int mip6_iro_dst_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	int err = 1;
+
+	spin_lock(&x->lock);
+	if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		err = -ENOENT;
+	spin_unlock(&x->lock);
+
+	return err;
+}
+
+/* IRO equivalent of mip6_rthdr_output(): replaces current destination
+ * address of outgoing packet with state's CoA */
+static int mip6_iro_dst_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+
+	spin_lock_bh(&x->lock);
+	memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr));
+	spin_unlock_bh(&x->lock);
+
+	return 0;
+}
+
+/* Common to iro src and dst remapping states. */
+static int mip6_iro_init_state(struct xfrm_state *x)
+{
+	if (x->id.spi) {
+		printk(KERN_INFO "%s: spi is not 0: %u\n", __func__,
+		       x->id.spi);
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+		printk(KERN_INFO "%s: state's mode is not %u: %u\n",
+		       __func__, XFRM_MODE_ROUTEOPTIMIZATION,
+		       x->props.mode);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Unlike common IPsec protocols, nothing to do when destroying */
+static void mip6_iro_destroy(struct xfrm_state *x)
+{
+}
+
+static const struct xfrm_type mip6_iro_src_type =
+{
+	.description	= "MIP6_IRO_SRC",
+	.owner		= THIS_MODULE,
+	.proto	     	= XFRM_PROTO_IRO_SRC,
+	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
+	.init_state	= mip6_iro_init_state,
+	.destructor	= mip6_iro_destroy,
+	.input		= mip6_iro_src_input,
+	.output		= mip6_iro_src_output,
+	.reject         = mip6_iro_src_reject,
+};
+
+static const struct xfrm_type mip6_iro_dst_type =
+{
+	.description	= "MIP6_IRO_DST",
+	.owner		= THIS_MODULE,
+	.proto	     	= XFRM_PROTO_IRO_DST,
+	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
+	.init_state	= mip6_iro_init_state,
+	.destructor	= mip6_iro_destroy,
+	.input		= mip6_iro_dst_input,
+	.output		= mip6_iro_dst_output,
+};
+#endif /* CONFIG_XFRM_SUB_POLICY */
+
 static int __init mip6_init(void)
 {
 	printk(KERN_INFO "Mobile IPv6\n");
@@ -489,6 +614,20 @@ static int __init mip6_init(void)
 		printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __func__);
 		goto mip6_rthdr_xfrm_fail;
 	}
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (xfrm_register_type(&mip6_iro_src_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(IRO src remap)\n",
+		       __func__);
+		goto mip6_iro_src_remap_xfrm_fail;
+	}
+	if (xfrm_register_type(&mip6_iro_dst_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(IRO dst remap)\n",
+		       __func__);
+		goto mip6_iro_dst_remap_xfrm_fail;
+	}
+#endif
+
 	if (rawv6_mh_filter_register(mip6_mh_filter) < 0) {
 		printk(KERN_INFO "%s: can't add rawv6 mh filter\n", __func__);
 		goto mip6_rawv6_mh_fail;
@@ -498,6 +637,12 @@ static int __init mip6_init(void)
 	return 0;
 
  mip6_rawv6_mh_fail:
+#ifdef CONFIG_XFRM_SUB_POLICY
+	xfrm_unregister_type(&mip6_iro_dst_type, AF_INET6);
+ mip6_iro_dst_remap_xfrm_fail:
+	xfrm_unregister_type(&mip6_iro_src_type, AF_INET6);
+ mip6_iro_src_remap_xfrm_fail:
+#endif
 	xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
  mip6_rthdr_xfrm_fail:
 	xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
@@ -509,6 +654,14 @@ static void __exit mip6_fini(void)
 {
 	if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
 		printk(KERN_INFO "%s: can't remove rawv6 mh filter\n", __func__);
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (xfrm_unregister_type(&mip6_iro_dst_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(IRO dst remap)\n",
+		       __func__);
+	if (xfrm_unregister_type(&mip6_iro_src_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(IRO src remap)\n",
+		       __func__);
+#endif
 	if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
 		printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __func__);
 	if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
index 63d5d49..ea33178 100644
--- a/net/ipv6/xfrm6_mode_ro.c
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -45,6 +45,15 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
 	u8 *prevhdr;
 	int hdr_len;
 
+	/* Unlike RH2 (IPPROTO_ROUTING) and HAO in DstOpt
+	 * (IPPROTO_DSTOPTS), IRO remapping states do not
+	 * add extension header to the packet. Source
+	 * and/or destination addresses are simply modified
+	 * in place. */
+	if (x->id.proto == XFRM_PROTO_IRO_SRC ||
+	    x->id.proto == XFRM_PROTO_IRO_DST)
+		goto out;
+
 	iph = ipv6_hdr(skb);
 
 	hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
@@ -54,8 +63,8 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
 	__skb_pull(skb, hdr_len);
 	memmove(ipv6_hdr(skb), iph, hdr_len);
 
+ out:
 	x->lastused = get_seconds();
-
 	return 0;
 }
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 8bae6b2..2aecd40 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -179,6 +179,10 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	case IPPROTO_DSTOPTS:
 	case IPPROTO_ROUTING:
+#ifdef CONFIG_XFRM_SUB_POLICY
+	case XFRM_PROTO_IRO_SRC:
+	case XFRM_PROTO_IRO_DST:
+#endif
 		if (attrs[XFRMA_ALG_COMP]	||
 		    attrs[XFRMA_ALG_AUTH]	||
 		    attrs[XFRMA_ALG_AUTH_TRUNC]	||
-- 
1.7.1



^ permalink raw reply related

* [PATCHv3 net-next-2.6 4/5] XFRM,IPv6: Add IRO remapping hook in xfrm_input()
From: Arnaud Ebalard @ 2010-09-29  9:05 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Herbert Xu, Hideaki YOSHIFUJI; +Cc: netdev
In-Reply-To: <cover.1285749610.git.arno@natisbad.org>


Add a hook in xfrm_input() to allow IRO remapping to occur when
an incoming packet matching an existing SA (based on SPI) with
an unexpected destination or source address is received.
Because IRO does not consume additional bits in a packet (that's
the point), there is no way to demultiplex based on something
like nh or spi. Instead, IRO input handlers (for source and
destination address remapping) are called upon address mismatch
during IPsec processing.
For that to work, we rely on the fact that SPI values generated
locally are no longer linked to the destination address (first patch
of the set) and we slightly postpone the expected address check in
xfrm_input() (inside xfrm_state_lookup() against daddr param) by
introducing a call to the input_addr_check() handler from the
struct xfrm_state_afinfo associated with the address family.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
---
 include/net/xfrm.h     |    5 +++
 net/ipv4/xfrm4_input.c |   12 ++++++++
 net/ipv4/xfrm4_state.c |    1 +
 net/ipv6/xfrm6_input.c |   70 +++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/xfrm6_state.c |    1 +
 net/xfrm/xfrm_input.c  |    5 ++-
 net/xfrm/xfrm_state.c  |    2 +-
 7 files changed, 92 insertions(+), 4 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 05b2b1f..5b84c19 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -312,6 +312,8 @@ struct xfrm_state_afinfo {
 						  struct sk_buff *skb);
 	int			(*transport_finish)(struct sk_buff *skb,
 						    int async);
+	int			(*input_addr_check)(struct sk_buff *skb,
+						    struct xfrm_state *x);
 };
 
 extern int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
@@ -623,6 +625,7 @@ struct xfrm_spi_skb_cb {
 		struct inet6_skb_parm h6;
 	} header;
 
+	unsigned int saddroff;
 	unsigned int daddroff;
 	unsigned int family;
 };
@@ -1405,6 +1408,7 @@ extern int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
 			   int encap_type);
 extern int xfrm4_transport_finish(struct sk_buff *skb, int async);
 extern int xfrm4_rcv(struct sk_buff *skb);
+extern int xfrm4_input_addr_check(struct sk_buff *skb, struct xfrm_state *x);
 
 static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 {
@@ -1423,6 +1427,7 @@ extern int xfrm6_transport_finish(struct sk_buff *skb, int async);
 extern int xfrm6_rcv(struct sk_buff *skb);
 extern int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 			    xfrm_address_t *saddr, u8 proto);
+extern int xfrm6_input_addr_check(struct sk_buff *skb, struct xfrm_state *x);
 extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family);
 extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family);
 extern __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6..82e23ec 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -41,6 +41,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
 		    int encap_type)
 {
 	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->saddroff = offsetof(struct iphdr, saddr);
 	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
 	return xfrm_input(skb, nexthdr, spi, encap_type);
 }
@@ -164,3 +165,14 @@ int xfrm4_rcv(struct sk_buff *skb)
 	return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
 }
 EXPORT_SYMBOL(xfrm4_rcv);
+
+int xfrm4_input_addr_check(struct sk_buff *skb, struct xfrm_state *x)
+{
+	xfrm_address_t *daddr;
+
+	daddr = (xfrm_address_t *)(skb_network_header(skb) +
+				   XFRM_SPI_SKB_CB(skb)->daddroff);
+
+	return xfrm_addr_cmp(&x->id.daddr, daddr, AF_INET);
+}
+EXPORT_SYMBOL(xfrm4_input_addr_check);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 4794762..c6b038a 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -79,6 +79,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.extract_input		= xfrm4_extract_input,
 	.extract_output		= xfrm4_extract_output,
 	.transport_finish	= xfrm4_transport_finish,
+	.input_addr_check	= xfrm4_input_addr_check,
 };
 
 void __init xfrm4_state_init(void)
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index f8c3cf8..754ecf7 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -15,6 +15,7 @@
 #include <linux/netfilter_ipv6.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
+#include <net/ip6_route.h> /* XXX for ip6_route_input() */
 
 int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
 {
@@ -24,6 +25,7 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
 int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 {
 	XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+	XFRM_SPI_SKB_CB(skb)->saddroff = offsetof(struct ipv6hdr, saddr);
 	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
 	return xfrm_input(skb, nexthdr, spi, 0);
 }
@@ -142,5 +144,71 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 drop:
 	return -1;
 }
-
 EXPORT_SYMBOL(xfrm6_input_addr);
+
+#if defined(CONFIG_XFRM_SUB_POLICY)
+/* Perform check on source and destination addresses and possibly IRO
+ * address remapping upon mismatch and if matching IRO state exists. */
+int xfrm6_input_addr_check(struct sk_buff *skb, struct xfrm_state *x)
+{
+	xfrm_address_t *saddr, *exp_saddr, *daddr, *exp_daddr;
+
+	saddr = (xfrm_address_t *)(skb_network_header(skb) +
+				   XFRM_SPI_SKB_CB(skb)->saddroff);
+	daddr = (xfrm_address_t *)(skb_network_header(skb) +
+				   XFRM_SPI_SKB_CB(skb)->daddroff);
+
+	exp_daddr = &x->id.daddr;
+	if (xfrm_addr_cmp(exp_daddr, daddr, AF_INET6)) {
+		/* Destination address mismatch: check if we have an IRO
+		 * destination remapping state to explain that.
+		 *
+		 * Note: saddr is provided as a hint. If source address
+		 * is also a remapped one, xfrm6_input_addr() will manage
+		 * to find IRO destination remapping state */
+		if (xfrm6_input_addr(skb, exp_daddr, saddr,
+				     XFRM_PROTO_IRO_DST) < 0)
+			return -1;
+
+		/* Copy destination address to sec_path for sock opts and
+		 * replace packet destination address with expected HoA */
+		ipv6_addr_copy(&skb->sp->irodst, (struct in6_addr *)daddr);
+		ipv6_addr_copy((struct in6_addr *)daddr,
+			       (struct in6_addr *)exp_daddr);
+
+		skb_dst_drop(skb);
+		ip6_route_input(skb);
+		if (skb_dst(skb)->error)
+			return -1;
+	}
+
+	exp_saddr = &x->props.saddr;
+	if (xfrm_addr_cmp(exp_saddr, saddr, AF_INET6)) {
+		/* Source address mismatch: check if we have an IRO
+		 * source remapping state to explain that.
+		 *
+		 * Note: unlike for destination addresses above, a
+		 * source mismatch is not considered fatal */
+		if (xfrm6_input_addr(skb, daddr, exp_saddr,
+				     XFRM_PROTO_IRO_SRC) < 0)
+			return 0;
+
+		/* Copy source address to sec_path for sock opts and
+		 * then replace source address with expected peer's HoA */
+		ipv6_addr_copy(&skb->sp->irosrc, (struct in6_addr *)saddr);
+		ipv6_addr_copy((struct in6_addr *)saddr,
+			       (struct in6_addr *)exp_saddr);
+	}
+
+	return 0;
+}
+#else
+int xfrm6_input_addr_check(struct sk_buff *skb, struct xfrm_state *x)
+{
+	xfrm_address_t *daddr;
+	daddr = (xfrm_address_t *)(skb_network_header(skb) +
+				   XFRM_SPI_SKB_CB(skb)->daddroff);
+	return xfrm_addr_cmp(&x->id.daddr, daddr, AF_INET6);
+}
+#endif
+EXPORT_SYMBOL(xfrm6_input_addr_check);
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index a67575d..aeb4688 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -179,6 +179,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.extract_input		= xfrm6_extract_input,
 	.extract_output		= xfrm6_extract_output,
 	.transport_finish	= xfrm6_transport_finish,
+	.input_addr_check	= xfrm6_input_addr_check,
 };
 
 int __init xfrm6_state_init(void)
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 45f1c98..9ff65f6 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -152,8 +152,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 			goto drop;
 		}
 
-		x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family);
-		if (x == NULL) {
+		x = xfrm_state_lookup(net, skb->mark, NULL, spi, nexthdr, family);
+		if (x == NULL ||
+		    x->outer_mode->afinfo->input_addr_check(skb, x)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
 			xfrm_audit_state_notfound(skb, family, spi, seq);
 			goto drop;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index b6a4d8d..b8f7c08 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -685,7 +685,7 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, xfrm_ad
 		if (x->props.family != family ||
 		    x->id.spi       != spi ||
 		    x->id.proto     != proto ||
-		    xfrm_addr_cmp(&x->id.daddr, daddr, family))
+		    (daddr && xfrm_addr_cmp(&x->id.daddr, daddr, family)))
 			continue;
 
 		if ((mark & x->mark.m) != x->mark.v)
-- 
1.7.1



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox