[PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-20 16:31 [PATCH 00/13]: Netfilter IPsec support Patrick McHardy
@ 2005-11-20 16:31 ` Patrick McHardy
  2005-11-21  4:42   ` Yasuyuki KOZAKAI
                     ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Patrick McHardy @ 2005-11-20 16:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, netfilter-devel, Patrick McHardy

[IPV4/6]: Netfilter IPsec input hooks

When the innermost transform uses transport mode the decapsulated packet
is not visible to netfilter. Pass the packet through the PRE_ROUTING and
LOCAL_IN hooks again before handing it to upper layer protocols to make
netfilter-visibility symetrical to the output path.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 08cf39d5d7d8b942431a6529daa3ab69ecfb34b5
tree 6f2a1a85f915b1b6f89ad50cf3d8855f21a561b6
parent b847425c693f43a63137d18e36e5c8cf0187c175
author Patrick McHardy <kaber@trash.net> Sat, 19 Nov 2005 21:50:22 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 19 Nov 2005 21:50:22 +0100

 include/linux/netfilter_ipv4.h |    2 +-
 include/net/ipv6.h             |    2 ++
 net/ipv4/netfilter.c           |   20 ++++++++++++++++++++
 net/ipv4/xfrm4_input.c         |   14 ++++++++++++++
 net/ipv6/ip6_input.c           |    2 +-
 net/ipv6/xfrm6_input.c         |   13 +++++++++++++
 6 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index fdc4a95..e9103fe 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -79,7 +79,7 @@ enum nf_ip_hook_priorities {
 
 #ifdef __KERNEL__
 extern int ip_route_me_harder(struct sk_buff **pskb);
-
+extern int ip_xfrm_transport_hook(struct sk_buff *skb);
 #endif /*__KERNEL__*/
 
 #endif /*__LINUX_IP_NETFILTER_H*/
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 6addb4d..4fbfe43 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -414,6 +414,8 @@ extern int			ipv6_rcv(struct sk_buff *sk
 					 struct packet_type *pt,
 					 struct net_device *orig_dev);
 
+extern int			ip6_rcv_finish(struct sk_buff *skb);
+
 /*
  *	upper-layer output functions
  */
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index b93e7cd..3c39296 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -105,6 +105,26 @@ int ip_dst_output(struct sk_buff *skb)
 	return dst_output(skb);
 }
 EXPORT_SYMBOL(ip_dst_output);
+
+/*
+ * okfn for transport mode xfrm_input.c hook. Basically a copy of
+ * ip_rcv_finish without statistics and option parsing.
+ */
+int ip_xfrm_transport_hook(struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+
+	if (likely(skb->dst == NULL)) {
+		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
+		                         skb->dev);
+		if (unlikely(err))
+			goto drop;
+	}
+	return dst_input(skb);
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
 #endif /* CONFIG_XFRM */
 
 /*
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 2d3849c..d90cd93 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -11,6 +11,8 @@
 
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
 #include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
@@ -137,6 +139,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb,
 	memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
 	skb->sp->len += xfrm_nr;
 
+	nf_reset(skb);
+
 	if (decaps) {
 		if (!(skb->dev->flags&IFF_LOOPBACK)) {
 			dst_release(skb->dst);
@@ -145,7 +149,17 @@ int xfrm4_rcv_encap(struct sk_buff *skb,
 		netif_rx(skb);
 		return 0;
 	} else {
+#ifdef CONFIG_NETFILTER
+		__skb_push(skb, skb->data - skb->nh.raw);
+		skb->nh.iph->tot_len = htons(skb->len);
+		ip_send_check(skb->nh.iph);
+
+		NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
+		        ip_xfrm_transport_hook);
+		return 0;
+#else
 		return -skb->nh.iph->protocol;
+#endif
 	}
 
 drop_unlock:
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 33d3b0e..e84b3cd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -48,7 +48,7 @@
 
 
 
-static inline int ip6_rcv_finish( struct sk_buff *skb) 
+inline int ip6_rcv_finish(struct sk_buff *skb) 
 {
 	if (skb->dst == NULL)
 		ip6_route_input(skb);
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 28c29d7..9987416 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -11,6 +11,8 @@
 
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
 #include <net/dsfield.h>
 #include <net/inet_ecn.h>
 #include <net/ip.h>
@@ -121,6 +123,8 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
 	skb->sp->len += xfrm_nr;
 	skb->ip_summed = CHECKSUM_NONE;
 
+	nf_reset(skb);
+
 	if (decaps) {
 		if (!(skb->dev->flags&IFF_LOOPBACK)) {
 			dst_release(skb->dst);
@@ -129,7 +133,16 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
 		netif_rx(skb);
 		return -1;
 	} else {
+#ifdef CONFIG_NETFILTER
+		skb->nh.ipv6h->payload_len = htons(skb->len);
+		__skb_push(skb, skb->data - skb->nh.raw);
+
+		NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL,
+		        ip6_rcv_finish);
+		return -1;
+#else
 		return 1;
+#endif
 	}
 
 drop_unlock:

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-20 16:31 ` [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks Patrick McHardy
@ 2005-11-21  4:42   ` Yasuyuki KOZAKAI
       [not found]   ` <200511210442.jAL4gPoO001846@toshiba.co.jp>
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 19+ messages in thread
From: Yasuyuki KOZAKAI @ 2005-11-21  4:42 UTC (permalink / raw)
  To: kaber; +Cc: netdev, netfilter-devel, davem

Hi, Patrick,

From: Patrick McHardy <kaber@trash.net>
Date: Sun, 20 Nov 2005 17:31:36 +0100

> [IPV4/6]: Netfilter IPsec input hooks
> 
> When the innermost transform uses transport mode the decapsulated packet
> is not visible to netfilter. Pass the packet through the PRE_ROUTING and
> LOCAL_IN hooks again before handing it to upper layer protocols to make
> netfilter-visibility symetrical to the output path.
> 
> Signed-off-by: Patrick McHardy <kaber@trash.net>

At first, now I could agree to use same name for hooks before/after xfrm
processing, if it's important to keep compatibility than to avoid difficulty
to use. Even now I think it's confusing to pass packets before/after xfrm to
same hook, and believe it's ideal to use different name for them.
But people can configure correctly according to you, then my concern might be
needless fear.

Next is about re-visiting stack, I'm beginning to understand your patch.
It looks natural to re-route after DNAT on input path of IPv4. That's really
needed if packets have been mangled.

But is there any reason that we have to allow packets to re-visit
ip_local_deliver_finish() in the case that they have not been mangled
at PRE_ROUTING ? I think no. This situation is like ip_nat_out().

And this can be said about IPv6 input path. If packets have not been mangled
(this is ordinary case because ip6tables doesn't have neither NAT nor
target module to mangle addresses directly), they don't have to re-route
and don't have to re-visit ip6_input_finish().

In the other way, if their addresses have been mangled, it's necessary to
re-route. I agree re-visiting ip6_input_finish() in this case.

Then, why don't we make xfrm{4,6}_rcv_spi() return 0 (1 if IPv6)
so that ip_local_deliver_finish()/ip6_input_finish() can continue to process
headers if packets have not been mangled ? Is this difficult or impossible to
implement ?

This can solve the issue about twice processing of statistics and IPv6
extension headers. Because ip_local_deliver_finish()/ip6_input_finish() can
continue to process extension headers after ESP/AH in ordinary case.

In special case, if some codes mangle IPv6 addresses, that's the codes
to take care of the possibility of re-visiting ip6_input_finish().

What do you think ?

# Please note that these are just my opinions and other USAGI guys might have
# other opinions.

Regards,

-- Yasuyuki Kozakai

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
       [not found]   ` <200511210442.jAL4gPoO001846@toshiba.co.jp>
@ 2005-11-21  6:52     ` Patrick McHardy
  2005-11-21  7:00       ` David S. Miller
                         ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Patrick McHardy @ 2005-11-21  6:52 UTC (permalink / raw)
  To: Yasuyuki KOZAKAI; +Cc: netdev, netfilter-devel, davem

Yasuyuki KOZAKAI wrote:
> At first, now I could agree to use same name for hooks before/after xfrm
> processing, if it's important to keep compatibility than to avoid difficulty
> to use. Even now I think it's confusing to pass packets before/after xfrm to
> same hook, and believe it's ideal to use different name for them.
> But people can configure correctly according to you, then my concern might be
> needless fear.

I don't see why it is confusing. Plain text packets are visible before
encapsulation (and they have to be because we don't necessarily know
if packets will be encapsulated at the time the hooks are called in
case the policy lookup after NAT returns a policy), plain text packets
are visible after decapsulation. With different hooks we can't have
symetrical behaviour because of the case I mentioned above, and that
would be confusing IMO.

> Next is about re-visiting stack, I'm beginning to understand your patch.
> It looks natural to re-route after DNAT on input path of IPv4. That's really
> needed if packets have been mangled.
> 
> But is there any reason that we have to allow packets to re-visit
> ip_local_deliver_finish() in the case that they have not been mangled
> at PRE_ROUTING ? I think no. This situation is like ip_nat_out().

My patches don't change when and if packets will reach
ip_local_deliver_finish(), they just add a possibility for rerouting.
Currently the transforms are called from ip_local_deliver_finish() and
in transport mode the decapsulated packet continues its path in
ip_local_deliver_finish(), with my patches they will go through two
netfilter hooks and continue the exact same codepath, given that
they are not NATed to a foreign destination IP on their way.

> And this can be said about IPv6 input path. If packets have not been mangled
> (this is ordinary case because ip6tables doesn't have neither NAT nor
> target module to mangle addresses directly), they don't have to re-route
> and don't have to re-visit ip6_input_finish().
> 
> In the other way, if their addresses have been mangled, it's necessary to
> re-route. I agree re-visiting ip6_input_finish() in this case.

Same goes for ip6_input_finish as for ip_local_deliver_finish(),
the packet would continue its path there anyway. Do you actually
mean ip6_rcv_finish()?

> Then, why don't we make xfrm{4,6}_rcv_spi() return 0 (1 if IPv6)
> so that ip_local_deliver_finish()/ip6_input_finish() can continue to process
> headers if packets have not been mangled ? Is this difficult or impossible to
> implement ?

I'm not sure I understand. Do you propose to check if the packet was
mangled after the PRE_ROUTING hook (if it was not stolen or queued)
and if not return directly to ip6_input_finish()? Where would the
LOCAL_IN hook be called?

> This can solve the issue about twice processing of statistics and IPv6
> extension headers. Because ip_local_deliver_finish()/ip6_input_finish() can
> continue to process extension headers after ESP/AH in ordinary case.

AFAICT statistics are not affected by my patches, except for the
iptables counters. The double parsing of extension headers is indeed
a problem I forgot about, it looks like we need to carry nhoff around
if it can't be derived from the packet.

Regards
Patrick

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-21  6:52     ` Patrick McHardy
@ 2005-11-21  7:00       ` David S. Miller
  2005-11-21  7:47         ` Herbert Xu
  2005-11-21 16:52         ` Patrick McHardy
  2005-11-21 10:53       ` Yasuyuki KOZAKAI
       [not found]       ` <200511211053.jALAro04019574@toshiba.co.jp>
  2 siblings, 2 replies; 19+ messages in thread
From: David S. Miller @ 2005-11-21  7:00 UTC (permalink / raw)
  To: kaber; +Cc: netdev, netfilter-devel, yasuyuki.kozakai

From: Patrick McHardy <kaber@trash.net>
Date: Mon, 21 Nov 2005 07:52:36 +0100

> I don't see why it is confusing. Plain text packets are visible before
> encapsulation (and they have to be because we don't necessarily know
> if packets will be encapsulated at the time the hooks are called in
> case the policy lookup after NAT returns a policy), plain text packets
> are visible after decapsulation. With different hooks we can't have
> symetrical behaviour because of the case I mentioned above, and that
> would be confusing IMO.

I think this is a very important point.

I can see no serious argument against this behavior, especially for
output.  On input, there is an argument of paranoia about seeing
plaintext packets, but administrator could do this anyways with
tcpdump or custom kernel module if this system is the decapsulation
point.

I've read over Patrick's two most recent postings of these patches
and I think they are generally sane and I cannot find any holes in
them.  Herbert brought up the legitimate concern about defragmentation,
but I think that's a detail and does not take away from the structural
soundness of Patrick's approach.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-21  7:00       ` David S. Miller
@ 2005-11-21  7:47         ` Herbert Xu
  2005-11-21 16:52         ` Patrick McHardy
  1 sibling, 0 replies; 19+ messages in thread
From: Herbert Xu @ 2005-11-21  7:47 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, kaber, yasuyuki.kozakai

David S. Miller <davem@davemloft.net> wrote:
> 
> I've read over Patrick's two most recent postings of these patches
> and I think they are generally sane and I cannot find any holes in
> them.  Herbert brought up the legitimate concern about defragmentation,
> but I think that's a detail and does not take away from the structural
> soundness of Patrick's approach.

Yes I agree completely.  The new IPsec stack has been around for three
years and it's about time that we have proper netfilter support for it.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
       [not found]   ` <438185ED.3050005@miyazawa.org>
@ 2005-11-21  8:50     ` YOSHIFUJI Hideaki / 吉藤英明
  2005-11-21 16:29       ` Patrick McHardy
  0 siblings, 1 reply; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-11-21  8:50 UTC (permalink / raw)
  To: kazunori, kaber; +Cc: netdev, netfilter-devel, davem

Hello.

In article <438185ED.3050005@miyazawa.org> (at Mon, 21 Nov 2005 17:31:41 +0900), Kazunori Miyazawa <kazunori@miyazawa.org> says:

> Your ip_xfrm_transport_hook is a good idea, I think.
> 
> We could call ip6_rcv_finish if the netfilter changed the addresses
> or otherwise we can continue the loop to avoid the cost in a similar
> way because we can know the change with checking skb->dst.

Well, I agree.

In article <20051120163135.16666.76993.sendpatchset@localhost.localdomain> (at Sun, 20 Nov 2005 17:31:36 +0100), Patrick McHardy <kaber@trash.net> says:

> diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
> index b93e7cd..3c39296 100644
> --- a/net/ipv4/netfilter.c
> +++ b/net/ipv4/netfilter.c
> @@ -105,6 +105,26 @@ int ip_dst_output(struct sk_buff *skb)
>  	return dst_output(skb);
>  }
>  EXPORT_SYMBOL(ip_dst_output);
> +
> +/*
> + * okfn for transport mode xfrm_input.c hook. Basically a copy of
> + * ip_rcv_finish without statistics and option parsing.
> + */
> +int ip_xfrm_transport_hook(struct sk_buff *skb)
> +{
> +	struct iphdr *iph = skb->nh.iph;
> +
> +	if (likely(skb->dst == NULL)) {
> +		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
> +		                         skb->dev);
> +		if (unlikely(err))
> +			goto drop;
> +	}
> +	return dst_input(skb);
> +drop:
> +	kfree_skb(skb);
> +	return NET_RX_DROP;
> +}
>  #endif /* CONFIG_XFRM */
>  
:
> @@ -129,7 +133,16 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
>  		netif_rx(skb);
>  		return -1;
>  	} else {
> +#ifdef CONFIG_NETFILTER
> +		skb->nh.ipv6h->payload_len = htons(skb->len);
> +		__skb_push(skb, skb->data - skb->nh.raw);
> +
> +		NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL,
> +		        ip6_rcv_finish);
> +		return -1;
> +#else
>  		return 1;
> +#endif
>  	}
>  

Probably, we can do similarly for ipv6; e.g.:

int ip6_xfrm_transport_hook(struct sk_buff *skb)
{
#if 0 /* We NEVER support NAT. :-) */
     if (likely(skb->dst == NULL)) {
            int err = ip6_route_input()
            if (unlikely(err))
                     goto drop;
     }
#endif
     __skb_pull(skb, skb->h.raw - skb->nh.raw);
     return NET_RX_SUCCESS;
drop:
     kfree_skb(skb);
     return NET_RX_DROP;
}

:

      } else {
#ifdef CONFIG_NETFILTER
             skb->nh.ipv6h->payload_len = htons(skb->len);
	     skb->h.raw = skb->data;
             __skb_push(skb, skb->data - skb->nh.raw);

             if (NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL,
                         ip6_xfrm_transport_hook) == NET_RX_DROP)
                 return -1;
#endif
             return 1;
      }

Then, we can continue parsing extension headers, I think.

-- 
YOSHIFUJI Hideaki @ USAGI Project  <yoshfuji@linux-ipv6.org>
GPG-FP  : 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-21  6:52     ` Patrick McHardy
  2005-11-21  7:00       ` David S. Miller
@ 2005-11-21 10:53       ` Yasuyuki KOZAKAI
       [not found]       ` <200511211053.jALAro04019574@toshiba.co.jp>
  2 siblings, 0 replies; 19+ messages in thread
From: Yasuyuki KOZAKAI @ 2005-11-21 10:53 UTC (permalink / raw)
  To: kaber; +Cc: netdev, netfilter-devel, davem, yasuyuki.kozakai


Hi,

From: Patrick McHardy <kaber@trash.net>
Date: Mon, 21 Nov 2005 07:52:36 +0100

> I don't see why it is confusing. Plain text packets are visible before
> encapsulation (and they have to be because we don't necessarily know
> if packets will be encapsulated at the time the hooks are called in
> case the policy lookup after NAT returns a policy), plain text packets
> are visible after decapsulation. With different hooks we can't have
> symetrical behaviour because of the case I mentioned above, and that
> would be confusing IMO.

Well, what I worried about was just ease to use, not internal processing.
I suspected that users can correctly configure IPsec and packet filtering.
Just doing "iptables -P INPUT -j DROP; iptables -A INPUT -p esp -j ACCEPT"
will drop all input packets and this is different behavior with current
kernel, for example. So I just imagined many people would say "Why ?".

But as you said in other mail, probably this is my needles fear, isn't this ?
As you know, I'm basically worrier. :)

> > And this can be said about IPv6 input path. If packets have not been mangled
> > (this is ordinary case because ip6tables doesn't have neither NAT nor
> > target module to mangle addresses directly), they don't have to re-route
> > and don't have to re-visit ip6_input_finish().
> > 
> > In the other way, if their addresses have been mangled, it's necessary to
> > re-route. I agree re-visiting ip6_input_finish() in this case.
> 
> Same goes for ip6_input_finish as for ip_local_deliver_finish(),
> the packet would continue its path there anyway. Do you actually
> mean ip6_rcv_finish()?

No. I mean ip6_input_finish(). calling ip6_input_finish() twice causes
problem at processing of IPv6 extension headers. This is different point
between IPv4 and IPv6.

Please note that I don't mean IPv4 processing in your patch has problem.
I think it will work. What I wanted to do was just avoiding double processing
of extension headers and synchronizing IPv4/IPv6 behavior as possible.

> > Then, why don't we make xfrm{4,6}_rcv_spi() return 0 (1 if IPv6)
> > so that ip_local_deliver_finish()/ip6_input_finish() can continue to process
> > headers if packets have not been mangled ? Is this difficult or impossible to
> > implement ?
> 
> I'm not sure I understand. Do you propose to check if the packet was
> mangled after the PRE_ROUTING hook (if it was not stolen or queued)
> and if not return directly to ip6_input_finish()?

Yes.

>                                                    Where would the
> LOCAL_IN hook be called?

Ah, indeed. But we can add it just before return directly to
ip6_input_finish().

> > This can solve the issue about twice processing of statistics and IPv6
> > extension headers. Because ip_local_deliver_finish()/ip6_input_finish() can
> > continue to process extension headers after ESP/AH in ordinary case.
> 
> AFAICT statistics are not affected by my patches, except for the
> iptables counters. The double parsing of extension headers is indeed
> a problem I forgot about, it looks like we need to carry nhoff around
> if it can't be derived from the packet.

Of cause that is one solution.

-- Yasuyuki Kozakai

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-21  8:50     ` YOSHIFUJI Hideaki / 吉藤英明
@ 2005-11-21 16:29       ` Patrick McHardy
  0 siblings, 0 replies; 19+ messages in thread
From: Patrick McHardy @ 2005-11-21 16:29 UTC (permalink / raw)
  To: yoshfuji; +Cc: netdev, netfilter-devel, kazunori, davem

YOSHIFUJI Hideaki / ^[$B5HF#1QL@^[ wrote:
> Hello.
> 
> In article <438185ED.3050005@miyazawa.org> (at Mon, 21 Nov 2005 17:31:41 +0900), Kazunori Miyazawa <kazunori@miyazawa.org> says:
> 
> 
>>Your ip_xfrm_transport_hook is a good idea, I think.
>>
>>We could call ip6_rcv_finish if the netfilter changed the addresses
>>or otherwise we can continue the loop to avoid the cost in a similar
>>way because we can know the change with checking skb->dst.
> 
> 
> Well, I agree.
> 
> Probably, we can do similarly for ipv6; e.g.:
> 
> int ip6_xfrm_transport_hook(struct sk_buff *skb)
> {
> #if 0 /* We NEVER support NAT. :-) */
>      if (likely(skb->dst == NULL)) {
>             int err = ip6_route_input()
>             if (unlikely(err))
>                      goto drop;
>      }
> #endif
>      __skb_pull(skb, skb->h.raw - skb->nh.raw);
>      return NET_RX_SUCCESS;
> drop:
>      kfree_skb(skb);
>      return NET_RX_DROP;
> }
> 
> :
> 
>       } else {
> #ifdef CONFIG_NETFILTER
>              skb->nh.ipv6h->payload_len = htons(skb->len);
> 	     skb->h.raw = skb->data;
>              __skb_push(skb, skb->data - skb->nh.raw);
> 
>              if (NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL,
>                          ip6_xfrm_transport_hook) == NET_RX_DROP)
>                  return -1;
> #endif
>              return 1;
>       }
> 
> Then, we can continue parsing extension headers, I think.

Is it the rerouting you're concerned about? It will usually not
happen because skb->dst is not NULL. Its needed for NFQUEUE,
packets can be changes in userspace and need rerouting afterwards.
In any case there would still be one path on which extension
headers would be parsed twice, so I'm going to look into different
ways to fix that.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
       [not found]       ` <200511211053.jALAro04019574@toshiba.co.jp>
@ 2005-11-21 16:34         ` Patrick McHardy
  0 siblings, 0 replies; 19+ messages in thread
From: Patrick McHardy @ 2005-11-21 16:34 UTC (permalink / raw)
  To: Yasuyuki KOZAKAI; +Cc: netdev, netfilter-devel, davem

Yasuyuki KOZAKAI wrote:
>>I don't see why it is confusing. Plain text packets are visible before
>>encapsulation (and they have to be because we don't necessarily know
>>if packets will be encapsulated at the time the hooks are called in
>>case the policy lookup after NAT returns a policy), plain text packets
>>are visible after decapsulation. With different hooks we can't have
>>symetrical behaviour because of the case I mentioned above, and that
>>would be confusing IMO.
> 
> Well, what I worried about was just ease to use, not internal processing.
> I suspected that users can correctly configure IPsec and packet filtering.
> Just doing "iptables -P INPUT -j DROP; iptables -A INPUT -p esp -j ACCEPT"
> will drop all input packets and this is different behavior with current
> kernel, for example. So I just imagined many people would say "Why ?".
> 
> But as you said in other mail, probably this is my needles fear, isn't this ?
> As you know, I'm basically worrier. :)

:) I think its OK since in tunnel mode the decapsulated packets already
need to be allowed by the ruleset, so I think its not too confusing to
expect the same in transport mode.

>>>Then, why don't we make xfrm{4,6}_rcv_spi() return 0 (1 if IPv6)
>>>so that ip_local_deliver_finish()/ip6_input_finish() can continue to process
>>>headers if packets have not been mangled ? Is this difficult or impossible to
>>>implement ?
>>
>>I'm not sure I understand. Do you propose to check if the packet was
>>mangled after the PRE_ROUTING hook (if it was not stolen or queued)
>>and if not return directly to ip6_input_finish()?
> 
> Yes.
> 
>> Where would the LOCAL_IN hook be called?
> 
> Ah, indeed. But we can add it just before return directly to
> ip6_input_finish().

Please see my mail to Kazunori. The hooks take ownership of the
skb, changing this would become pretty ugly because of queued
or stolen packets. And it would still leave one path where packets
are not directly returned, so I think the double-parsing should
be handled otherwise.

>>AFAICT statistics are not affected by my patches, except for the
>>iptables counters. The double parsing of extension headers is indeed
>>a problem I forgot about, it looks like we need to carry nhoff around
>>if it can't be derived from the packet.
> 
> Of cause that is one solution.

I'm going to try that if I can't come up with something better.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-21  7:00       ` David S. Miller
  2005-11-21  7:47         ` Herbert Xu
@ 2005-11-21 16:52         ` Patrick McHardy
  1 sibling, 0 replies; 19+ messages in thread
From: Patrick McHardy @ 2005-11-21 16:52 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, yasuyuki.kozakai

David S. Miller wrote:
> I've read over Patrick's two most recent postings of these patches
> and I think they are generally sane and I cannot find any holes in
> them.  Herbert brought up the legitimate concern about defragmentation,
> but I think that's a detail and does not take away from the structural
> soundness of Patrick's approach.

I think we implicitly agreed on moving the POST_ROUTING hook before
fragmentation and change the user-visible behaviour of the mangle
POSTROUTING chain. At least neither Harald not Rusty objected to
the patch :)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
       [not found]   ` <438270F2.3000603@trash.net>
@ 2005-11-23 10:38     ` YOSHIFUJI Hideaki / 吉藤英明
  2005-12-18 14:27       ` Patrick McHardy
  0 siblings, 1 reply; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-11-23 10:38 UTC (permalink / raw)
  To: kaber; +Cc: netdev, netfilter-devel, davem, kozakai, kazunori

Hello.

In article <438270F2.3000603@trash.net> (at Tue, 22 Nov 2005 02:14:26 +0100), Patrick McHardy <kaber@trash.net> says:

> The easiest way would be to store nhoff somewhere in the skb and
> use it to continue at the next header. But I still hope there is
> a way without keeping data in the skb.

We've coded up this.

Though we have still another issue (call chain issue) to resolve,
we're getting closer to the goal.
i.e. we should continue the loop for common case.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Yasuyuki Kozawai <yasuyuki.kozakai@toshiba.co.jp>

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c5d600..1101851 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -272,6 +272,9 @@ struct sk_buff {
 	void			(*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_NETFILTER
 	__u32			nfmark;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	unsigned int		nf_nhoff;
+#endif
 	struct nf_conntrack	*nfct;
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct sk_buff		*nfct_reasm;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 44b979a..0531d0a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -463,6 +463,8 @@ extern int			ip6_dst_lookup(struct sock 
 extern int			ip6_output(struct sk_buff *skb);
 extern int			ip6_forward(struct sk_buff *skb);
 extern int			ip6_input(struct sk_buff *skb);
+extern int			ip6_input_finish2(struct sk_buff *skb,
+						  unsigned int nhoff);
 extern int			ip6_mc_input(struct sk_buff *skb);
 
 /*
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index e84b3cd..cd0606a 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -134,31 +134,13 @@ out:
  *	Deliver the packet to the host
  */
 
-
-static inline int ip6_input_finish(struct sk_buff *skb)
+int ip6_input_finish2(struct sk_buff *skb, unsigned int nhoff)
 {
 	struct inet6_protocol *ipprot;
 	struct sock *raw_sk;
-	unsigned int nhoff;
-	int nexthdr;
+	unsigned int nexthdr;
 	u8 hash;
 
-	skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
-
-	/*
-	 *	Parse extension headers
-	 */
-
-	nexthdr = skb->nh.ipv6h->nexthdr;
-	nhoff = offsetof(struct ipv6hdr, nexthdr);
-
-	/* Skip hop-by-hop options, they are already parsed. */
-	if (nexthdr == NEXTHDR_HOP) {
-		nhoff = sizeof(struct ipv6hdr);
-		nexthdr = skb->h.raw[0];
-		skb->h.raw += (skb->h.raw[1]+1)<<3;
-	}
-
 	rcu_read_lock();
 resubmit:
 	if (!pskb_pull(skb, skb->h.raw - skb->data))
@@ -221,6 +203,26 @@ discard:
 	return 0;
 }
 
+static inline int ip6_input_finish(struct sk_buff *skb)
+{
+	unsigned int nhoff;
+
+	skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
+
+	/*
+	 *	Parse extension headers
+	 */
+
+	nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+	/* Skip hop-by-hop options, they are already parsed. */
+	if (skb->nh.ipv6h->nexthdr == NEXTHDR_HOP) {
+		nhoff = sizeof(struct ipv6hdr);
+		skb->h.raw += (skb->h.raw[1]+1)<<3;
+	}
+
+	return ip6_input_finish2(skb, nhoff);
+}
 
 int ip6_input(struct sk_buff *skb)
 {
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 1648278..6051783 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -15,6 +15,7 @@ EXPORT_SYMBOL(ndisc_mc_map);
 EXPORT_SYMBOL(register_inet6addr_notifier);
 EXPORT_SYMBOL(unregister_inet6addr_notifier);
 EXPORT_SYMBOL(ip6_route_output);
+EXPORT_SYMBOL(ip6_input_finish2);
 EXPORT_SYMBOL(addrconf_lock);
 EXPORT_SYMBOL(ipv6_setsockopt);
 EXPORT_SYMBOL(ipv6_getsockopt);
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 9987416..2e3b28d 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -9,6 +9,7 @@
  *		IPv6 support
  */
 
+#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/netfilter.h>
@@ -17,6 +18,7 @@
 #include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/ip6_route.h>
 #include <net/xfrm.h>
 
 static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
@@ -28,6 +30,25 @@ static inline void ipip6_ecn_decapsulate
 		IP6_ECN_set_ce(inner_iph);
 }
 
+#ifdef CONFIG_NETFILTER
+static inline int xfrm6_rcv_spi_finish2(struct sk_buff *skb)
+{
+	__skb_pull(skb, skb->h.raw - skb->nh.raw);
+	return ip6_input_finish2(skb, skb->nf_nhoff);
+}
+
+static inline int xfrm6_rcv_spi_finish(struct sk_buff *skb)
+{
+	if (skb->dst == NULL) {
+		ip6_route_input(skb);
+		return dst_input(skb);
+	}
+
+	return NF_HOOK(PF_INET6, NF_IP6_LOCAL_IN, skb, skb->dev, NULL,
+		       xfrm6_rcv_spi_finish2);
+}
+#endif
+
 int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
 {
 	struct sk_buff *skb = *pskb;
@@ -136,9 +157,10 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
 #ifdef CONFIG_NETFILTER
 		skb->nh.ipv6h->payload_len = htons(skb->len);
 		__skb_push(skb, skb->data - skb->nh.raw);
+		skb->nf_nhoff = nhoff;
 
 		NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL,
-		        ip6_rcv_finish);
+		        xfrm6_rcv_spi_finish);
 		return -1;
 #else
 		return 1;

-- 
YOSHIFUJI Hideaki @ USAGI Project  <yoshfuji@linux-ipv6.org>
GPG-FP  : 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-20 16:31 ` [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks Patrick McHardy
                     ` (2 preceding siblings ...)
       [not found]   ` <438185ED.3050005@miyazawa.org>
@ 2005-12-01  1:27   ` Herbert Xu
  2005-12-04 22:06     ` Patrick McHardy
  3 siblings, 1 reply; 19+ messages in thread
From: Herbert Xu @ 2005-12-01  1:27 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netdev, netfilter-devel, davem

On Sun, Nov 20, 2005 at 04:31:36PM +0000, Patrick McHardy wrote:
>
> @@ -145,7 +149,17 @@ int xfrm4_rcv_encap(struct sk_buff *skb,
>  		netif_rx(skb);
>  		return 0;
>  	} else {
> +#ifdef CONFIG_NETFILTER
> +		__skb_push(skb, skb->data - skb->nh.raw);
> +		skb->nh.iph->tot_len = htons(skb->len);
> +		ip_send_check(skb->nh.iph);
> +
> +		NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
> +		        ip_xfrm_transport_hook);
> +		return 0;
> +#else
>  		return -skb->nh.iph->protocol;
> +#endif

I'm worried about this bit.  This looks like it'll go back to the top
of the IP stack with the existing call chain.  So could grow as the
number of transforms increase.

Perhaps we need to play a dst_input/netif_rx trick here.

Actually, was there a problem with your original netif_rx approach
apart from the issue with double counting?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-12-01  1:27   ` Herbert Xu
@ 2005-12-04 22:06     ` Patrick McHardy
  2005-12-04 22:10       ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Patrick McHardy @ 2005-12-04 22:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, netfilter-devel, davem

Herbert Xu wrote:
> On Sun, Nov 20, 2005 at 04:31:36PM +0000, Patrick McHardy wrote:
> 
>>@@ -145,7 +149,17 @@ int xfrm4_rcv_encap(struct sk_buff *skb,
>> 		netif_rx(skb);
>> 		return 0;
>> 	} else {
>>+#ifdef CONFIG_NETFILTER
>>+		__skb_push(skb, skb->data - skb->nh.raw);
>>+		skb->nh.iph->tot_len = htons(skb->len);
>>+		ip_send_check(skb->nh.iph);
>>+
>>+		NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
>>+		        ip_xfrm_transport_hook);
>>+		return 0;
>>+#else
>> 		return -skb->nh.iph->protocol;
>>+#endif
> 
> 
> I'm worried about this bit.  This looks like it'll go back to the top
> of the IP stack with the existing call chain.  So could grow as the
> number of transforms increase.

Its not so bad. It adds ip_xfrm_transport_hook and
ip_local_deliver_finish to the call stack, but since two subsequent
transport mode SAs are always processed at once it can't take this
path again without calling netif_rx in between.

> Perhaps we need to play a dst_input/netif_rx trick here.
> 
> Actually, was there a problem with your original netif_rx approach
> apart from the issue with double counting?

Besides the double counting, packets also appear on the packet sockets
after transport mode decapsulation with the original approach. For
IPv6 there's also the double-parsing of extension header issue.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-12-04 22:06     ` Patrick McHardy
@ 2005-12-04 22:10       ` Herbert Xu
  2005-12-04 22:49         ` Patrick McHardy
  0 siblings, 1 reply; 19+ messages in thread
From: Herbert Xu @ 2005-12-04 22:10 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netdev, netfilter-devel, davem

On Sun, Dec 04, 2005 at 11:06:02PM +0100, Patrick McHardy wrote:
>
> >I'm worried about this bit.  This looks like it'll go back to the top
> >of the IP stack with the existing call chain.  So could grow as the
> >number of transforms increase.
> 
> Its not so bad. It adds ip_xfrm_transport_hook and
> ip_local_deliver_finish to the call stack, but since two subsequent
> transport mode SAs are always processed at once it can't take this
> path again without calling netif_rx in between.

If there is a DNAT in the way, this will jump to the very start of
the stack.  So if we have a hostile IPsec peer, and the DNAT rules
are such that this can occur, then we could be in trouble (especially
because policy/selector verification does not occur until all IPsec
has been done so we can't check inner address validitiy at this point).
 
> Besides the double counting, packets also appear on the packet sockets
> after transport mode decapsulation with the original approach. For
> IPv6 there's also the double-parsing of extension header issue.

Having the packets appear twice on AF_PACKET is probably desirable :)

I'll need to think about the double-parsing though.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-12-04 22:10       ` Herbert Xu
@ 2005-12-04 22:49         ` Patrick McHardy
  0 siblings, 0 replies; 19+ messages in thread
From: Patrick McHardy @ 2005-12-04 22:49 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, netfilter-devel, davem

Herbert Xu wrote:
> On Sun, Dec 04, 2005 at 11:06:02PM +0100, Patrick McHardy wrote:
> 
> If there is a DNAT in the way, this will jump to the very start of
> the stack.  So if we have a hostile IPsec peer, and the DNAT rules
> are such that this can occur, then we could be in trouble (especially
> because policy/selector verification does not occur until all IPsec
> has been done so we can't check inner address validitiy at this point).

We could return NET_XMIT_BYPASS from ip_xfrm_transport_hook(), although
it looks a bit ugly to use NET_XMIT* on the input path.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-11-23 10:38     ` [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks YOSHIFUJI Hideaki / 吉藤英明
@ 2005-12-18 14:27       ` Patrick McHardy
  2005-12-18 15:15         ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 1 reply; 19+ messages in thread
From: Patrick McHardy @ 2005-12-18 14:27 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki / ?$B5HF#1QL@
  Cc: netdev, netfilter-devel, davem, kozakai, kazunori

[-- Attachment #1: Type: text/plain, Size: 814 bytes --]

YOSHIFUJI Hideaki / ^[$B5HF#1QL@^[ wrote:
> In article <438270F2.3000603@trash.net> (at Tue, 22 Nov 2005 02:14:26 +0100), Patrick McHardy <kaber@trash.net> says:
> 
> 
>>The easiest way would be to store nhoff somewhere in the skb and
>>use it to continue at the next header. But I still hope there is
>>a way without keeping data in the skb.
> 
> 
> We've coded up this.

How about this patch instead? It eliminates the nhoffp argument
to IPv6 protocol handlers by storing it in the IP6CB, which allows
to call ip6_input_finish a second time and have it skip already
parsed headers and also gets rid of the manual hopopts skipping.

> Though we have still another issue (call chain issue) to resolve,
> we're getting closer to the goal.
> i.e. we should continue the loop for common case.

I'll look into this now.

[-- Attachment #2: 04.diff --]
[-- Type: text/x-patch, Size: 12821 bytes --]

[IPv6]: Move nextheader offset to the IP6CB

Move nextheader offset to the IP6CB to make it possible to pass a
packet to ip6_input_finish multiple times and have it skip already
parsed headers. As a nice side-effect this gets rid of the manual
hopopts skipping.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 6678dcab480a4c410fe7984c43a9d1830c9b9912
tree 5eb3a8f6f5379c5f2b91af6d37dab9ca880681ae
parent 4cd76cb0703ffcb89d6c49e4d9171d7b6a116cd9
author Patrick McHardy <kaber@trash.net> Sun, 18 Dec 2005 06:52:02 +0100
committer Patrick McHardy <kaber@trash.net> Sun, 18 Dec 2005 06:52:02 +0100

 include/linux/ipv6.h    |    1 +
 include/net/protocol.h  |    2 +-
 include/net/xfrm.h      |    6 +++---
 net/dccp/ipv6.c         |    2 +-
 net/ipv6/exthdrs.c      |   19 ++++++++++++-------
 net/ipv6/icmp.c         |    4 ++--
 net/ipv6/ip6_input.c    |   21 ++++++---------------
 net/ipv6/ip6_tunnel.c   |    2 +-
 net/ipv6/reassembly.c   |   11 +++++------
 net/ipv6/tcp_ipv6.c     |    2 +-
 net/ipv6/udp.c          |    2 +-
 net/ipv6/xfrm6_input.c  |    8 ++++----
 net/ipv6/xfrm6_tunnel.c |    6 +++---
 net/sctp/ipv6.c         |    2 +-
 14 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index a0d0489..04bd248 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -190,6 +190,7 @@ struct inet6_skb_parm {
 	__u16			srcrt;
 	__u16			dst1;
 	__u16			lastopt;
+	__u32			nhoff;
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/include/net/protocol.h b/include/net/protocol.h
index a29cb29..7006a25 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -43,7 +43,7 @@ struct net_protocol {
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 struct inet6_protocol 
 {
-	int	(*handler)(struct sk_buff **skb, unsigned int *nhoffp);
+	int	(*handler)(struct sk_buff **skb);
 
 	void	(*err_handler)(struct sk_buff *skb,
 			       struct inet6_skb_parm *opt,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index f327cb3..1b2876c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -830,7 +830,7 @@ struct xfrm_tunnel {
 };
 
 struct xfrm6_tunnel {
-	int (*handler)(struct sk_buff **pskb, unsigned int *nhoffp);
+	int (*handler)(struct sk_buff **pskb);
 	void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			    int type, int code, int offset, __u32 info);
 };
@@ -867,8 +867,8 @@ extern int xfrm4_rcv(struct sk_buff *skb
 extern int xfrm4_output(struct sk_buff *skb);
 extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
 extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler);
-extern int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi);
-extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi);
+extern int xfrm6_rcv(struct sk_buff **pskb);
 extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler);
 extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler);
 extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 599b0be..67518c5 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1027,7 +1027,7 @@ discard:
 	return 0;
 }
 
-static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int dccp_v6_rcv(struct sk_buff **pskb)
 {
 	const struct dccp_hdr *dh;
 	struct sk_buff *skb = *pskb;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 113374d..2a1e7e4 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -152,7 +152,7 @@ static struct tlvtype_proc tlvprocdestop
 	{-1,			NULL}
 };
 
-static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_destopt_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -169,7 +169,7 @@ static int ipv6_destopt_rcv(struct sk_bu
 
 	if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
 		skb->h.raw += ((skb->h.raw[1]+1)<<3);
-		*nhoffp = opt->dst1;
+		opt->nhoff = opt->dst1;
 		return 1;
 	}
 
@@ -192,7 +192,7 @@ void __init ipv6_destopt_init(void)
   NONE header. No data in packet.
  ********************************/
 
-static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_nodata_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 
@@ -215,7 +215,7 @@ void __init ipv6_nodata_init(void)
   Routing header.
  ********************************/
 
-static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_rthdr_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -249,7 +249,7 @@ looped_back:
 		skb->h.raw += (hdr->hdrlen + 1) << 3;
 		opt->dst0 = opt->dst1;
 		opt->dst1 = 0;
-		*nhoffp = (&hdr->nexthdr) - skb->nh.raw;
+		opt->nhoff = (&hdr->nexthdr) - skb->nh.raw;
 		return 1;
 	}
 
@@ -487,9 +487,14 @@ static struct tlvtype_proc tlvprochopopt
 
 int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff)
 {
-	IP6CB(skb)->hop = sizeof(struct ipv6hdr);
-	if (ip6_parse_tlv(tlvprochopopt_lst, skb))
+	struct inet6_skb_parm *opt = IP6CB(skb);
+
+	opt->hop = sizeof(struct ipv6hdr);
+	if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+		skb->h.raw += (skb->h.raw[1]+1)<<3;
+		opt->nhoff = sizeof(struct ipv6hdr);
 		return sizeof(struct ipv6hdr);
+	}
 	return -1;
 }
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 34a3322..d415c00 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -79,7 +79,7 @@ DEFINE_SNMP_STAT(struct icmpv6_mib, icmp
 static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL;
 #define icmpv6_socket	__get_cpu_var(__icmpv6_socket)
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+static int icmpv6_rcv(struct sk_buff **pskb);
 
 static struct inet6_protocol icmpv6_protocol = {
 	.handler	=	icmpv6_rcv,
@@ -569,7 +569,7 @@ static void icmpv6_notify(struct sk_buff
  *	Handle icmp messages
  */
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int icmpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct net_device *dev = skb->dev;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index a6026d2..13d7241 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -97,6 +97,9 @@ int ipv6_rcv(struct sk_buff *skb, struct
 	if (hdr->version != 6)
 		goto err;
 
+	skb->h.raw = (u8 *)(hdr + 1);
+	IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
 	pkt_len = ntohs(hdr->payload_len);
 
 	/* pkt_len may be zero if Jumbo payload option is present */
@@ -111,8 +114,7 @@ int ipv6_rcv(struct sk_buff *skb, struct
 	}
 
 	if (hdr->nexthdr == NEXTHDR_HOP) {
-		skb->h.raw = (u8*)(hdr+1);
-		if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) {
+		if (ipv6_parse_hopopts(skb, IP6CB(skb)->nhoff) < 0) {
 			IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 			return 0;
 		}
@@ -143,26 +145,15 @@ static inline int ip6_input_finish(struc
 	int nexthdr;
 	u8 hash;
 
-	skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
-
 	/*
 	 *	Parse extension headers
 	 */
 
-	nexthdr = skb->nh.ipv6h->nexthdr;
-	nhoff = offsetof(struct ipv6hdr, nexthdr);
-
-	/* Skip hop-by-hop options, they are already parsed. */
-	if (nexthdr == NEXTHDR_HOP) {
-		nhoff = sizeof(struct ipv6hdr);
-		nexthdr = skb->h.raw[0];
-		skb->h.raw += (skb->h.raw[1]+1)<<3;
-	}
-
 	rcu_read_lock();
 resubmit:
 	if (!pskb_pull(skb, skb->h.raw - skb->data))
 		goto discard;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
@@ -194,7 +185,7 @@ resubmit:
 		    !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) 
 			goto discard;
 		
-		ret = ipprot->handler(&skb, &nhoff);
+		ret = ipprot->handler(&skb);
 		if (ret > 0)
 			goto resubmit;
 		else if (ret == 0)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e315d0f..f079621 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -510,7 +510,7 @@ static inline void ip6ip6_ecn_decapsulat
  **/
 
 static int 
-ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+ip6ip6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct ipv6hdr *ipv6h;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5d316cb..15e1456 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -581,7 +581,6 @@ err:
  *	the last and the first frames arrived and all the bits are here.
  */
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
-			  unsigned int *nhoffp,
 			  struct net_device *dev)
 {
 	struct sk_buff *fp, *head = fq->fragments;
@@ -654,6 +653,7 @@ static int ip6_frag_reasm(struct frag_qu
 	head->dev = dev;
 	skb_set_timestamp(head, &fq->stamp);
 	head->nh.ipv6h->payload_len = htons(payload_len);
+	IP6CB(head)->nhoff = nhoff;
 
 	*skb_in = head;
 
@@ -663,7 +663,6 @@ static int ip6_frag_reasm(struct frag_qu
 
 	IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 	fq->fragments = NULL;
-	*nhoffp = nhoff;
 	return 1;
 
 out_oversize:
@@ -678,7 +677,7 @@ out_fail:
 	return -1;
 }
 
-static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_frag_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp; 
 	struct net_device *dev = skb->dev;
@@ -710,7 +709,7 @@ static int ipv6_frag_rcv(struct sk_buff 
 		skb->h.raw += sizeof(struct frag_hdr);
 		IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 
-		*nhoffp = (u8*)fhdr - skb->nh.raw;
+		IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw;
 		return 1;
 	}
 
@@ -722,11 +721,11 @@ static int ipv6_frag_rcv(struct sk_buff 
 
 		spin_lock(&fq->lock);
 
-		ip6_frag_queue(fq, skb, fhdr, *nhoffp);
+		ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		if (fq->last_in == (FIRST_IN|LAST_IN) &&
 		    fq->meat == fq->len)
-			ret = ip6_frag_reasm(fq, skbp, nhoffp, dev);
+			ret = ip6_frag_reasm(fq, skbp, dev);
 
 		spin_unlock(&fq->lock);
 		fq_put(fq, NULL);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2947bc5..a25f4e8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1153,7 +1153,7 @@ ipv6_pktoptions:
 	return 0;
 }
 
-static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int tcp_v6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct tcphdr *th;	
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d8538dc..c476488 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -435,7 +435,7 @@ out:
 	read_unlock(&udp_hash_lock);
 }
 
-static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int udpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct sock *sk;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 28c29d7..1079e47 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -26,7 +26,7 @@ static inline void ipip6_ecn_decapsulate
 		IP6_ECN_set_ce(inner_iph);
 }
 
-int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
+int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi)
 {
 	struct sk_buff *skb = *pskb;
 	int err;
@@ -38,7 +38,7 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
 	int nexthdr;
 	unsigned int nhoff;
 
-	nhoff = *nhoffp;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	seq = 0;
@@ -144,7 +144,7 @@ drop:
 
 EXPORT_SYMBOL(xfrm6_rcv_spi);
 
-int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+int xfrm6_rcv(struct sk_buff **pskb)
 {
-	return xfrm6_rcv_spi(pskb, nhoffp, 0);
+	return xfrm6_rcv_spi(pskb, 0);
 }
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index fbef782..da09ff2 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -397,7 +397,7 @@ int xfrm6_tunnel_deregister(struct xfrm6
 
 EXPORT_SYMBOL(xfrm6_tunnel_deregister);
 
-static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int xfrm6_tunnel_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct xfrm6_tunnel *handler = xfrm6_tunnel_handler;
@@ -405,11 +405,11 @@ static int xfrm6_tunnel_rcv(struct sk_bu
 	u32 spi;
 
 	/* device-like_ip6ip6_handler() */
-	if (handler && handler->handler(pskb, nhoffp) == 0)
+	if (handler && handler->handler(pskb) == 0)
 		return 0;
 
 	spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr);
-	return xfrm6_rcv_spi(pskb, nhoffp, spi);
+	return xfrm6_rcv_spi(pskb, spi);
 }
 
 static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fa3be2b..bb8f8cf 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -905,7 +905,7 @@ static struct inet_protosw sctpv6_stream
 	.flags         = SCTP_PROTOSW_FLAG,
 };
 
-static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int sctp6_rcv(struct sk_buff **pskb)
 {
 	return sctp_rcv(*pskb) ? -1 : 0;
 }

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-12-18 14:27       ` Patrick McHardy
@ 2005-12-18 15:15         ` YOSHIFUJI Hideaki / 吉藤英明
  2005-12-18 22:59           ` Patrick McHardy
  0 siblings, 1 reply; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-12-18 15:15 UTC (permalink / raw)
  To: kaber, davem; +Cc: kozakai, netdev, netfilter-devel, kazunori

In article <43A571B5.205@trash.net> (at Sun, 18 Dec 2005 15:27:01 +0100), Patrick McHardy <kaber@trash.net> says:

> YOSHIFUJI Hideaki wrote:
> > In article <438270F2.3000603@trash.net> (at Tue, 22 Nov 2005 02:14:26 +0100), Patrick McHardy <kaber@trash.net> says:
> > 
> > 
> >>The easiest way would be to store nhoff somewhere in the skb and
> >>use it to continue at the next header. But I still hope there is
> >>a way without keeping data in the skb.
> > 
> > 
> > We've coded up this.
> 
> How about this patch instead? It eliminates the nhoffp argument
> to IPv6 protocol handlers by storing it in the IP6CB, which allows
> to call ip6_input_finish a second time and have it skip already
> parsed headers and also gets rid of the manual hopopts skipping.

The idea to store IP6CB itself seems sane to me.

BTW, we're now using full of skb->cb
(and we are even exceeding it w/ mobile-ipv6 extensions)...

--yoshfuji

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-12-18 15:15         ` YOSHIFUJI Hideaki / 吉藤英明
@ 2005-12-18 22:59           ` Patrick McHardy
  2005-12-19  3:46             ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 1 reply; 19+ messages in thread
From: Patrick McHardy @ 2005-12-18 22:59 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki / ?$B5HF#1QL@
  Cc: netdev, netfilter-devel, kazunori, kozakai, davem

YOSHIFUJI Hideaki / ^[$B5HF#1QL@^[ wrote:
> In article <43A571B5.205@trash.net> (at Sun, 18 Dec 2005 15:27:01 +0100), Patrick McHardy <kaber@trash.net> says:
> 
>>How about this patch instead? It eliminates the nhoffp argument
>>to IPv6 protocol handlers by storing it in the IP6CB, which allows
>>to call ip6_input_finish a second time and have it skip already
>>parsed headers and also gets rid of the manual hopopts skipping.
> 
> 
> The idea to store IP6CB itself seems sane to me.
> 
> BTW, we're now using full of skb->cb
> (and we are even exceeding it w/ mobile-ipv6 extensions)...

Not in mainline so far, so maybe we can fit your extensions
and my patches without the mobile extensions, that apparently
exceed the CB anyway, in there for now. Can I look at those
patches somewhere? BTW, other fields in the IP6CB seem to
store offsets in u16 fields, is this OK for nhoff too? I
thought with jumbo options I need to use a u32 field.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks
  2005-12-18 22:59           ` Patrick McHardy
@ 2005-12-19  3:46             ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 0 replies; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-12-19  3:46 UTC (permalink / raw)
  To: kaber; +Cc: netdev, netfilter-devel, kazunori, kozakai, davem

In article <43A5E9D7.8050705@trash.net> (at Sun, 18 Dec 2005 23:59:35 +0100), Patrick McHardy <kaber@trash.net> says:

> YOSHIFUJI Hideaki wrote:
:
> >  BTW, we're now using full of skb->cb
> > (and we are even exceeding it w/ mobile-ipv6 extensions)...
> 
> Not in mainline so far, so maybe we can fit your extensions
> and my patches without the mobile extensions, that apparently
> exceed the CB anyway, in there for now. Can I look at those
> patches somewhere? BTW, other fields in the IP6CB seem to
> store offsets in u16 fields, is this OK for nhoff too? I
> thought with jumbo options I need to use a u32 field.

Well, don't mind too much.
I just wanted to note that we're about to exceed skb->cb.
I will probably enlarge skb->cb if needed.

And, yes, good point.
I think nhoff should be u32 because it is critical.
In theory, u32 is definitely needed for others as well...

--yoshfuji

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2005-12-19  3:46 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <4381F4C7.9070903@trash.net>
     [not found] ` <43826F77.7040502@miyazawa.org>
     [not found]   ` <438270F2.3000603@trash.net>
2005-11-23 10:38     ` [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks YOSHIFUJI Hideaki / 吉藤英明
2005-12-18 14:27       ` Patrick McHardy
2005-12-18 15:15         ` YOSHIFUJI Hideaki / 吉藤英明
2005-12-18 22:59           ` Patrick McHardy
2005-12-19  3:46             ` YOSHIFUJI Hideaki / 吉藤英明
2005-11-20 16:31 [PATCH 00/13]: Netfilter IPsec support Patrick McHardy
2005-11-20 16:31 ` [PATCH 06/13]: [IPV4/6]: Netfilter IPsec input hooks Patrick McHardy
2005-11-21  4:42   ` Yasuyuki KOZAKAI
     [not found]   ` <200511210442.jAL4gPoO001846@toshiba.co.jp>
2005-11-21  6:52     ` Patrick McHardy
2005-11-21  7:00       ` David S. Miller
2005-11-21  7:47         ` Herbert Xu
2005-11-21 16:52         ` Patrick McHardy
2005-11-21 10:53       ` Yasuyuki KOZAKAI
     [not found]       ` <200511211053.jALAro04019574@toshiba.co.jp>
2005-11-21 16:34         ` Patrick McHardy
     [not found]   ` <438185ED.3050005@miyazawa.org>
2005-11-21  8:50     ` YOSHIFUJI Hideaki / 吉藤英明
2005-11-21 16:29       ` Patrick McHardy
2005-12-01  1:27   ` Herbert Xu
2005-12-04 22:06     ` Patrick McHardy
2005-12-04 22:10       ` Herbert Xu
2005-12-04 22:49         ` Patrick McHardy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).