netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [1/4] [IPSEC] Merge xfrm[46]_bundle/stale_bundle
@ 2005-02-14 22:10 Herbert Xu
  2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-02-14 22:10 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 591 bytes --]

Hi:

This series of patches adds MTU tracking in each xfrm_dst.

This patch merges xfrm4_bundle_ok/xfrm6_bundle_ok/stale_bundle
so that later additions for MTU calculation only need to be done once.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

BTW, the previous version of this patch contained a critical
error in stale_bundle.  So please disregard that one.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-1 --]
[-- Type: text/plain, Size: 4208 bytes --]

diff -Nru a/include/net/xfrm.h b/include/net/xfrm.h
--- a/include/net/xfrm.h	2005-02-14 14:13:05 +11:00
+++ b/include/net/xfrm.h	2005-02-14 14:13:05 +11:00
@@ -857,6 +857,7 @@
 extern void xfrm_policy_flush(void);
 extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
 extern int xfrm_flush_bundles(void);
+extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family);
 
 extern wait_queue_head_t km_waitq;
 extern int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
diff -Nru a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
--- a/net/ipv4/xfrm4_policy.c	2005-02-14 14:13:05 +11:00
+++ b/net/ipv4/xfrm4_policy.c	2005-02-14 14:13:05 +11:00
@@ -22,26 +22,6 @@
 	return __ip_route_output_key((struct rtable**)dst, fl);
 }
 
-/* Check that the bundle accepts the flow and its components are
- * still valid.
- */
-
-static int __xfrm4_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl)
-{
-	do {
-		if (xdst->u.dst.ops != &xfrm4_dst_ops)
-			return 1;
-
-		if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET))
-			return 0;
-		if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID ||
-		    xdst->u.dst.path->obsolete > 0)
-			return 0;
-		xdst = (struct xfrm_dst*)xdst->u.dst.child;
-	} while (xdst);
-	return 0;
-}
-
 static struct dst_entry *
 __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 {
@@ -53,7 +33,7 @@
 		if (xdst->u.rt.fl.oif == fl->oif &&	/*XXX*/
 		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
 	    	    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
-		    __xfrm4_bundle_ok(xdst, fl)) {
+		    xfrm_bundle_ok(xdst, fl, AF_INET)) {
 			dst_clone(dst);
 			break;
 		}
diff -Nru a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
--- a/net/ipv6/xfrm6_policy.c	2005-02-14 14:13:05 +11:00
+++ b/net/ipv6/xfrm6_policy.c	2005-02-14 14:13:05 +11:00
@@ -31,26 +31,6 @@
 	return err;
 }
 
-/* Check that the bundle accepts the flow and its components are
- * still valid.
- */
-
-static int __xfrm6_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl)
-{
-	do {
-		if (xdst->u.dst.ops != &xfrm6_dst_ops)
-			return 1;
-
-		if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET6))
-			return 0;
-		if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID ||
-		    xdst->u.dst.path->obsolete > 0)
-			return 0;
-		xdst = (struct xfrm_dst*)xdst->u.dst.child;
-	} while (xdst);
-	return 0;
-}
-
 static struct dst_entry *
 __xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 {
@@ -70,7 +50,7 @@
 				 xdst->u.rt6.rt6i_src.plen);
 		if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
 		    ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
-		    __xfrm6_bundle_ok(xdst, fl)) {
+		    xfrm_bundle_ok(xdst, fl, AF_INET6)) {
 			dst_clone(dst);
 			break;
 		}
diff -Nru a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
--- a/net/xfrm/xfrm_policy.c	2005-02-14 14:13:05 +11:00
+++ b/net/xfrm/xfrm_policy.c	2005-02-14 14:13:05 +11:00
@@ -1021,18 +1021,7 @@
 
 static int stale_bundle(struct dst_entry *dst)
 {
-	struct dst_entry *child = dst;
-
-	while (child) {
-		if (child->obsolete > 0 ||
-		    (child->dev && !netif_running(child->dev)) ||
-		    (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) {
-			return 1;
-		}
-		child = child->child;
-	}
-
-	return 0;
+	return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC);
 }
 
 static void xfrm_dst_destroy(struct dst_entry *dst)
@@ -1108,6 +1097,31 @@
 	return 0;
 }
 
+/* Check that the bundle accepts the flow and its components are
+ * still valid.
+ */
+
+int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family)
+{
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (dst->path->obsolete > 0 ||
+	    (dst->dev && !netif_running(dst->dev)))
+		return 0;
+
+	do {
+		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
+			return 0;
+		if (dst->xfrm->km.state != XFRM_STATE_VALID)
+			return 0;
+		dst = dst->child;
+	} while (dst->xfrm);
+
+	return 1;
+}
+
+EXPORT_SYMBOL(xfrm_bundle_ok);
+ 
 /* Well... that's _TASK_. We need to scan through transformation
  * list and figure out what mss tcp should generate in order to
  * final datagram fit to mtu. Mama mia... :-)

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [2/4] [IPSEC] Add xfrm_state_mtu
  2005-02-14 22:10 [1/4] [IPSEC] Merge xfrm[46]_bundle/stale_bundle Herbert Xu
@ 2005-02-14 22:12 ` Herbert Xu
  2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
  2005-03-07  5:23   ` [2/4] [IPSEC] Add xfrm_state_mtu David S. Miller
  0 siblings, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-02-14 22:12 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 511 bytes --]

This patch introduces the function xfrm_state_mtu which calculates
the MTU under a specific SA.  This function can be simplified once
we get rid all other uses of get_max_size as we can move the calculation
into the invidual SA type.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 
Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-2 --]
[-- Type: text/plain, Size: 1367 bytes --]

===== include/net/xfrm.h 1.75 vs edited =====
--- 1.75/include/net/xfrm.h	2005-02-14 14:02:13 +11:00
+++ edited/include/net/xfrm.h	2005-02-14 14:11:38 +11:00
@@ -807,6 +807,7 @@
 extern int xfrm_replay_check(struct xfrm_state *x, u32 seq);
 extern void xfrm_replay_advance(struct xfrm_state *x, u32 seq);
 extern int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb);
+extern int xfrm_state_mtu(struct xfrm_state *x, int mtu);
 extern int xfrm4_rcv(struct sk_buff *skb);
 extern int xfrm4_output(struct sk_buff *skb);
 extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
===== net/xfrm/xfrm_state.c 1.54 vs edited =====
--- 1.54/net/xfrm/xfrm_state.c	2005-02-09 11:17:47 +11:00
+++ edited/net/xfrm/xfrm_state.c	2005-02-14 14:06:17 +11:00
@@ -966,6 +966,36 @@
 }
 EXPORT_SYMBOL(xfrm_state_delete_tunnel);
 
+int xfrm_state_mtu(struct xfrm_state *x, int mtu)
+{
+	int res = mtu;
+
+	res -= x->props.header_len;
+
+	for (;;) {
+		int m = res;
+
+		if (m < 68)
+			return 68;
+
+		spin_lock_bh(&x->lock);
+		if (x->km.state == XFRM_STATE_VALID &&
+		    x->type && x->type->get_max_size)
+			m = x->type->get_max_size(x, m);
+		else
+			m += x->props.header_len;
+		spin_unlock_bh(&x->lock);
+
+		if (m <= mtu)
+			break;
+		res -= (m - mtu);
+	}
+
+	return res;
+}
+
+EXPORT_SYMBOL(xfrm_state_mtu);
+ 
 void __init xfrm_state_init(void)
 {
 	int i;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [3/4] [IPSEC] Add route element to xfrm_dst
  2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu
@ 2005-02-14 22:14   ` Herbert Xu
  2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
                       ` (4 more replies)
  2005-03-07  5:23   ` [2/4] [IPSEC] Add xfrm_state_mtu David S. Miller
  1 sibling, 5 replies; 114+ messages in thread
From: Herbert Xu @ 2005-02-14 22:14 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 524 bytes --]

This patch adds a pointer to the route corresponding to the specific
flow over the SA of an xfrm_dst that's being used.

It also sets the next pointer of each xfrm_dst to the one above it.
This allows to traverse the list upwards from the bottom.
 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-3 --]
[-- Type: text/plain, Size: 5915 bytes --]

===== include/net/xfrm.h 1.76 vs edited =====
--- 1.76/include/net/xfrm.h	2005-02-14 14:14:25 +11:00
+++ edited/include/net/xfrm.h	2005-02-14 14:24:04 +11:00
@@ -511,6 +511,7 @@
 		struct rtable		rt;
 		struct rt6_info		rt6;
 	} u;
+	struct dst_entry *route;
 };
 
 /* Decapsulation state, used by the input to store data during
===== net/ipv4/xfrm4_policy.c 1.14 vs edited =====
--- 1.14/net/ipv4/xfrm4_policy.c	2005-02-14 14:02:13 +11:00
+++ edited/net/ipv4/xfrm4_policy.c	2005-02-14 14:29:35 +11:00
@@ -55,18 +55,29 @@
 	struct rtable *rt = rt0;
 	u32 remote = fl->fl4_dst;
 	u32 local  = fl->fl4_src;
+	struct flowi fl_tunnel = {
+		.nl_u = {
+			.ip4_u = {
+				.saddr = local,
+				.daddr = remote
+			}
+		}
+	};
 	int i;
 	int err;
 	int header_len = 0;
 	int trailer_len = 0;
 
 	dst = dst_prev = NULL;
+	dst_hold(&rt->u.dst);
 
 	for (i = 0; i < nx; i++) {
 		struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
+		struct xfrm_dst *xdst;
 
 		if (unlikely(dst1 == NULL)) {
 			err = -ENOBUFS;
+			dst_release(&rt->u.dst);
 			goto error;
 		}
 
@@ -77,6 +88,11 @@
 			dst1->flags |= DST_NOHASH;
 			dst_clone(dst1);
 		}
+
+		xdst = (struct xfrm_dst *)dst1;
+		xdst->route = &rt->u.dst;
+
+		dst1->next = dst_prev;
 		dst_prev = dst1;
 		if (xfrm[i]->props.mode) {
 			remote = xfrm[i]->id.daddr.a4;
@@ -84,23 +100,27 @@
 		}
 		header_len += xfrm[i]->props.header_len;
 		trailer_len += xfrm[i]->props.trailer_len;
-	}
 
-	if (remote != fl->fl4_dst) {
-		struct flowi fl_tunnel = { .nl_u = { .ip4_u =
-						     { .daddr = remote,
-						       .saddr = local }
-					           }
-				         };
-		err = xfrm_dst_lookup((struct xfrm_dst**)&rt, &fl_tunnel, AF_INET);
-		if (err)
-			goto error;
-	} else {
-		dst_hold(&rt->u.dst);
+		if (remote != fl_tunnel.fl4_dst) {
+			fl_tunnel.fl4_src = local;
+			fl_tunnel.fl4_dst = remote;
+			err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
+					      &fl_tunnel, AF_INET);
+			if (err)
+				goto error;
+		} else
+			dst_hold(&rt->u.dst);
 	}
+
 	dst_prev->child = &rt->u.dst;
+	dst->path = &rt->u.dst;
+
+	*dst_p = dst;
+	dst = dst_prev;
+
+	dst_prev = *dst_p;
 	i = 0;
-	for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
+	for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
 		struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
 		x->u.rt.fl = *fl;
 
@@ -114,7 +134,6 @@
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics));
-		dst_prev->path		= &rt->u.dst;
 
 		/* Copy neighbout for reachability confirmation */
 		dst_prev->neighbour	= neigh_clone(rt->u.dst.neighbour);
@@ -134,7 +153,6 @@
 		header_len -= x->u.dst.xfrm->props.header_len;
 		trailer_len -= x->u.dst.xfrm->props.trailer_len;
 	}
-	*dst_p = dst;
 	return 0;
 
 error:
===== net/ipv6/xfrm6_policy.c 1.25 vs edited =====
--- 1.25/net/ipv6/xfrm6_policy.c	2005-02-14 14:02:13 +11:00
+++ edited/net/ipv6/xfrm6_policy.c	2005-02-14 14:50:04 +11:00
@@ -72,18 +72,29 @@
 	struct rt6_info *rt  = rt0;
 	struct in6_addr *remote = &fl->fl6_dst;
 	struct in6_addr *local  = &fl->fl6_src;
+	struct flowi fl_tunnel = {
+		.nl_u = {
+			.ip6_u = {
+				.saddr = *local,
+				.daddr = *remote
+			}
+		}
+	};
 	int i;
 	int err = 0;
 	int header_len = 0;
 	int trailer_len = 0;
 
 	dst = dst_prev = NULL;
+	dst_hold(&rt->u.dst);
 
 	for (i = 0; i < nx; i++) {
 		struct dst_entry *dst1 = dst_alloc(&xfrm6_dst_ops);
+		struct xfrm_dst *xdst;
 
 		if (unlikely(dst1 == NULL)) {
 			err = -ENOBUFS;
+			dst_release(&rt->u.dst);
 			goto error;
 		}
 
@@ -94,6 +105,11 @@
 			dst1->flags |= DST_NOHASH;
 			dst_clone(dst1);
 		}
+
+		xdst = (struct xfrm_dst *)dst1;
+		xdst->route = &rt->u.dst;
+
+		dst1->next = dst_prev;
 		dst_prev = dst1;
 		if (xfrm[i]->props.mode) {
 			remote = (struct in6_addr*)&xfrm[i]->id.daddr;
@@ -101,25 +117,27 @@
 		}
 		header_len += xfrm[i]->props.header_len;
 		trailer_len += xfrm[i]->props.trailer_len;
-	}
-
-	if (!ipv6_addr_equal(remote, &fl->fl6_dst)) {
-		struct flowi fl_tunnel;
 
-		memset(&fl_tunnel, 0, sizeof(fl_tunnel));
-		ipv6_addr_copy(&fl_tunnel.fl6_dst, remote);
-		ipv6_addr_copy(&fl_tunnel.fl6_src, local);
-
-		err = xfrm_dst_lookup((struct xfrm_dst **) &rt,
-				      &fl_tunnel, AF_INET6);
-		if (err)
-			goto error;
-	} else {
-		dst_hold(&rt->u.dst);
+		if (!ipv6_addr_equal(remote, &fl_tunnel.fl6_dst)) {
+			ipv6_addr_copy(&fl_tunnel.fl6_dst, remote);
+			ipv6_addr_copy(&fl_tunnel.fl6_src, local);
+			err = xfrm_dst_lookup((struct xfrm_dst **) &rt,
+					      &fl_tunnel, AF_INET6);
+			if (err)
+				goto error;
+		} else
+			dst_hold(&rt->u.dst);
 	}
+
 	dst_prev->child = &rt->u.dst;
+	dst->path = &rt->u.dst;
+
+	*dst_p = dst;
+	dst = dst_prev;
+
+	dst_prev = *dst_p;
 	i = 0;
-	for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
+	for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
 		struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
 
 		dst_prev->xfrm = xfrm[i++];
@@ -132,7 +150,6 @@
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics));
-		dst_prev->path		= &rt->u.dst;
 
 		/* Copy neighbour for reachability confirmation */
 		dst_prev->neighbour	= neigh_clone(rt->u.dst.neighbour);
@@ -150,7 +167,6 @@
 		header_len -= x->u.dst.xfrm->props.header_len;
 		trailer_len -= x->u.dst.xfrm->props.trailer_len;
 	}
-	*dst_p = dst;
 	return 0;
 
 error:
===== net/xfrm/xfrm_policy.c 1.65 vs edited =====
--- 1.65/net/xfrm/xfrm_policy.c	2005-02-14 14:02:13 +11:00
+++ edited/net/xfrm/xfrm_policy.c	2005-02-14 14:39:52 +11:00
@@ -1026,6 +1026,11 @@
 
 static void xfrm_dst_destroy(struct dst_entry *dst)
 {
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+	if (xdst->route)
+		dst_release(xdst->route);
+
 	if (!dst->xfrm)
 		return;
 	xfrm_state_put(dst->xfrm);

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [4/4] [IPSEC] Store MTU at each xfrm_dst
  2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
@ 2005-02-14 22:16     ` Herbert Xu
  2005-02-15 15:53       ` James Morris
                         ` (3 more replies)
  2005-02-15  8:10     ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä
                       ` (3 subsequent siblings)
  4 siblings, 4 replies; 114+ messages in thread
From: Herbert Xu @ 2005-02-14 22:16 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 514 bytes --]

Finally this is the patch that sets the MTU values of each xfrm_dst
and keeps it up-to-date.
  
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

To recap, at this point we've obtained accurate MTU values at each
xfrm_dst.  The next step would be to start using it as
dst_pmtu(xfrm_dst).

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-4 --]
[-- Type: text/plain, Size: 3516 bytes --]

===== include/net/xfrm.h 1.77 vs edited =====
--- 1.77/include/net/xfrm.h	2005-02-14 14:51:26 +11:00
+++ edited/include/net/xfrm.h	2005-02-14 14:23:51 +11:00
@@ -512,6 +512,8 @@
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+	u32 route_mtu_cached;
+	u32 child_mtu_cached;
 };
 
 /* Decapsulation state, used by the input to store data during
@@ -860,6 +862,7 @@
 extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
 extern int xfrm_flush_bundles(void);
 extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family);
+extern void xfrm_init_pmtu(struct dst_entry *dst);
 
 extern wait_queue_head_t km_waitq;
 extern int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
===== net/ipv4/xfrm4_policy.c 1.15 vs edited =====
--- 1.15/net/ipv4/xfrm4_policy.c	2005-02-14 14:51:26 +11:00
+++ edited/net/ipv4/xfrm4_policy.c	2005-02-14 14:53:06 +11:00
@@ -153,6 +153,8 @@
 		header_len -= x->u.dst.xfrm->props.header_len;
 		trailer_len -= x->u.dst.xfrm->props.trailer_len;
 	}
+
+	xfrm_init_pmtu(dst);
 	return 0;
 
 error:
===== net/ipv6/xfrm6_policy.c 1.26 vs edited =====
--- 1.26/net/ipv6/xfrm6_policy.c	2005-02-14 14:51:26 +11:00
+++ edited/net/ipv6/xfrm6_policy.c	2005-02-14 14:53:25 +11:00
@@ -167,6 +167,8 @@
 		header_len -= x->u.dst.xfrm->props.header_len;
 		trailer_len -= x->u.dst.xfrm->props.trailer_len;
 	}
+
+	xfrm_init_pmtu(dst);
 	return 0;
 
 error:
===== net/xfrm/xfrm_policy.c 1.66 vs edited =====
--- 1.66/net/xfrm/xfrm_policy.c	2005-02-14 14:51:26 +11:00
+++ edited/net/xfrm/xfrm_policy.c	2005-02-14 21:29:56 +11:00
@@ -1102,25 +1102,86 @@
 	return 0;
 }
 
+void xfrm_init_pmtu(struct dst_entry *dst)
+{
+	do {
+		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+		u32 pmtu, route_mtu_cached;
+
+		pmtu = dst_pmtu(dst->child);
+		xdst->child_mtu_cached = pmtu;
+
+		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
+
+		route_mtu_cached = dst_pmtu(xdst->route);
+		xdst->route_mtu_cached = route_mtu_cached;
+
+		if (pmtu > route_mtu_cached)
+			pmtu = route_mtu_cached;
+
+		dst->metrics[RTAX_MTU-1] = pmtu;
+	} while ((dst = dst->next));
+}
+
+EXPORT_SYMBOL(xfrm_init_pmtu);
+
 /* Check that the bundle accepts the flow and its components are
  * still valid.
  */
 
-int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family)
+int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
 {
-	struct dst_entry *dst = &xdst->u.dst;
+	struct dst_entry *dst = &first->u.dst;
+	struct xfrm_dst *last;
+	u32 mtu;
 
 	if (dst->path->obsolete > 0 ||
 	    (dst->dev && !netif_running(dst->dev)))
 		return 0;
 
+	last = NULL;
+
 	do {
+		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
 		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
 			return 0;
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
+
+		mtu = dst_pmtu(dst->child);
+		if (xdst->child_mtu_cached != mtu) {
+			last = xdst;
+			xdst->child_mtu_cached = mtu;
+		}
+
+		mtu = dst_pmtu(xdst->route);
+		if (xdst->child_mtu_cached != mtu) {
+			last = xdst;
+			xdst->route_mtu_cached = mtu;
+		}
+
 		dst = dst->child;
 	} while (dst->xfrm);
+
+	if (likely(!last))
+		return 1;
+
+	mtu = last->child_mtu_cached;
+	for (;;) {
+		dst = &last->u.dst;
+
+		mtu = xfrm_state_mtu(dst->xfrm, mtu);
+		if (mtu > last->route_mtu_cached)
+			mtu = last->route_mtu_cached;
+		dst->metrics[RTAX_MTU-1] = mtu;
+
+		if (last == first)
+			break;
+
+		last = last->u.next;
+		last->child_mtu_cached = mtu;
+	}
 
 	return 1;
 }

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
  2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
@ 2005-02-15  8:10     ` Mika Penttilä
  2005-02-15  9:53       ` Herbert Xu
  2005-03-07  5:28     ` David S. Miller
                       ` (2 subsequent siblings)
  4 siblings, 1 reply; 114+ messages in thread
From: Mika Penttilä @ 2005-02-15  8:10 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:

>This patch adds a pointer to the route corresponding to the specific
>flow over the SA of an xfrm_dst that's being used.
>
>It also sets the next pointer of each xfrm_dst to the one above it.
>This allows to traverse the list upwards from the bottom.
> 
>Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
>Cheers,
>  
>
>------------------------------------------------------------------------
> /* Decapsulation state, used by the input to store data during
>===== net/ipv4/xfrm4_policy.c 1.14 vs edited =====
>--- 1.14/net/ipv4/xfrm4_policy.c	2005-02-14 14:02:13 +11:00
>+++ edited/net/ipv4/xfrm4_policy.c	2005-02-14 14:29:35 +11:00
>@@ -55,18 +55,29 @@
> 	struct rtable *rt = rt0;
> 	u32 remote = fl->fl4_dst;
> 	u32 local  = fl->fl4_src;
>+	struct flowi fl_tunnel = {
>+		.nl_u = {
>+			.ip4_u = {
>+				.saddr = local,
>+				.daddr = remote
>+			}
>+		}
>+	};
> 	int i;
> 	int err;
> 	int header_len = 0;
> 	int trailer_len = 0;
> 
> 	dst = dst_prev = NULL;
>  
>

Shouldn't this pinning happen inside the loop for eaxh xdst, to be 
balanced with  the dst_release(xdst->route) in xfrm_dst_destroy ?

>+	dst_hold(&rt->u.dst);
> 
> 	for (i = 0; i < nx; i++) {
> 		struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
>+		struct xfrm_dst *xdst;
> 
> 		if (unlikely(dst1 == NULL)) {
> 			err = -ENOBUFS;
>  
>

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-02-15  8:10     ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä
@ 2005-02-15  9:53       ` Herbert Xu
  2005-02-15 10:22         ` Mika Penttilä
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-02-15  9:53 UTC (permalink / raw)
  To: Mika Penttil?; +Cc: netdev

On Tue, Feb 15, 2005 at 10:10:06AM +0200, Mika Penttil? wrote:
> 
> Shouldn't this pinning happen inside the loop for eaxh xdst, to be 
> balanced with  the dst_release(xdst->route) in xfrm_dst_destroy ?
> 
> >+	dst_hold(&rt->u.dst);
> >
> >	for (i = 0; i < nx; i++) {
> >		struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
> >+		struct xfrm_dst *xdst;
> >
> >		if (unlikely(dst1 == NULL)) {
> >			err = -ENOBUFS;

There is a dst_release here.

The reference is either released here if the allocation fails, or it
is held by dst1 which is either returned or freed as part of the bundle.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-02-15  9:53       ` Herbert Xu
@ 2005-02-15 10:22         ` Mika Penttilä
  0 siblings, 0 replies; 114+ messages in thread
From: Mika Penttilä @ 2005-02-15 10:22 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:

>On Tue, Feb 15, 2005 at 10:10:06AM +0200, Mika Penttil? wrote:
>  
>
>>Shouldn't this pinning happen inside the loop for eaxh xdst, to be 
>>balanced with  the dst_release(xdst->route) in xfrm_dst_destroy ?
>>
>>    
>>
>>>+	dst_hold(&rt->u.dst);
>>>
>>>	for (i = 0; i < nx; i++) {
>>>		struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
>>>+		struct xfrm_dst *xdst;
>>>
>>>		if (unlikely(dst1 == NULL)) {
>>>			err = -ENOBUFS;
>>>      
>>>
>
>There is a dst_release here.
>
>The reference is either released here if the allocation fails, or it
>is held by dst1 which is either returned or freed as part of the bundle.
>
>Cheers,
>  
>
ok I see now, thanks

Mika

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst
  2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
@ 2005-02-15 15:53       ` James Morris
  2005-02-15 20:31         ` Herbert Xu
  2005-02-16 10:37       ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 114+ messages in thread
From: James Morris @ 2005-02-15 15:53 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, netdev

On Tue, 15 Feb 2005, Herbert Xu wrote:

> Finally this is the patch that sets the MTU values of each xfrm_dst
> and keeps it up-to-date.
>   
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> 
> To recap, at this point we've obtained accurate MTU values at each
> xfrm_dst.  The next step would be to start using it as
> dst_pmtu(xfrm_dst).

Nice work.

The only thing I'd suggest is to consider renaming the new route element
of xfrm_dst to xfrm_dst_route (or xd_route), to make it easier to navigate
the code.  (But the other elements are not distinctive either so perhaps 
some future cleanup could do it).


- James
-- 
James Morris
<jmorris@redhat.com>

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst
  2005-02-15 15:53       ` James Morris
@ 2005-02-15 20:31         ` Herbert Xu
  0 siblings, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-02-15 20:31 UTC (permalink / raw)
  To: James Morris; +Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, netdev

On Tue, Feb 15, 2005 at 10:53:11AM -0500, James Morris wrote:
> 
> The only thing I'd suggest is to consider renaming the new route element
> of xfrm_dst to xfrm_dst_route (or xd_route), to make it easier to navigate
> the code.  (But the other elements are not distinctive either so perhaps 
> some future cleanup could do it).

Thanks for your comment.

I would say that not only are the other elements of xfrm_dst missing
the prefixes, but the same thing applies to the rest of xfrm.h :)

So if we decide to go that way, then we should do it to the entire
xfrm subsystem.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output
  2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
  2005-02-15 15:53       ` James Morris
@ 2005-02-16 10:37       ` Herbert Xu
  2005-02-16 11:08         ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu
                           ` (2 more replies)
  2005-03-07  5:32       ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller
  2005-03-28 20:10       ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy
  3 siblings, 3 replies; 114+ messages in thread
From: Herbert Xu @ 2005-02-16 10:37 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 673 bytes --]

There was a lot of fun in laying the foundations.  Now it's time
to start using it.

This patch fixes the calculations in xfrm[46]_output so that we don't
reject properly sized packets from the TCP stack.  The previous
calculation is wrong because the xfrm overhead is not constant.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

After this I'll start cleaning stuff up.  For example, tcp's
ext2_header_len and dst's path can both be removed.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-5 --]
[-- Type: text/plain, Size: 1256 bytes --]

===== include/net/dst.h 1.25 vs edited =====
--- 1.25/include/net/dst.h	2005-02-06 14:23:59 +11:00
+++ edited/include/net/dst.h	2005-02-16 21:29:56 +11:00
@@ -123,6 +123,16 @@
 	return mtu;
 }
 
+static inline u32 dst_mtu(const struct dst_entry *dst)
+{
+	u32 mtu = dst_metric(dst, RTAX_MTU);
+	/*
+	 * Alexey put it here, so ask him about it :)
+	 */
+	barrier();
+	return mtu;
+}
+
 static inline int
 dst_metric_locked(struct dst_entry *dst, int metric)
 {
===== net/ipv4/xfrm4_output.c 1.5 vs edited =====
--- 1.5/net/ipv4/xfrm4_output.c	2004-10-26 09:10:25 +10:00
+++ edited/net/ipv4/xfrm4_output.c	2005-02-16 21:31:32 +11:00
@@ -82,7 +82,7 @@
 		goto out;
 
 	dst = skb->dst;
-	mtu = dst_pmtu(dst) - dst->header_len - dst->trailer_len;
+	mtu = dst_mtu(dst);
 	if (skb->len > mtu) {
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 		ret = -EMSGSIZE;
===== net/ipv6/xfrm6_output.c 1.7 vs edited =====
--- 1.7/net/ipv6/xfrm6_output.c	2004-11-16 13:52:34 +11:00
+++ edited/net/ipv6/xfrm6_output.c	2005-02-16 21:31:46 +11:00
@@ -79,7 +79,7 @@
 	int mtu, ret = 0;
 	struct dst_entry *dst = skb->dst;
 
-	mtu = dst_pmtu(dst) - dst->header_len - dst->trailer_len;
+	mtu = dst_mtu(dst);
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
 

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst
  2005-02-16 10:37       ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu
@ 2005-02-16 11:08         ` Herbert Xu
  2005-02-16 11:38           ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu
  2005-03-07  5:35           ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller
  2005-03-07  5:33         ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output David S. Miller
  2005-03-07 11:45         ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu
  2 siblings, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-02-16 11:08 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 655 bytes --]

Let's also fix IPsec PMTU storage.  When we get an MTU update for an
xfrm_dst, it should be done to the top dst, not the bottom dst.

For example, when we get a need-to-frag message for host C behind
a our IPsec peer B, we should be updating the dst entry for C and
not B as we do now.

I've removed the boundary checks since the same checks are done
in ipv[46]/route.c already.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-6 --]
[-- Type: text/plain, Size: 1056 bytes --]

===== net/ipv4/xfrm4_policy.c 1.16 vs edited =====
--- 1.16/net/ipv4/xfrm4_policy.c	2005-02-14 14:59:25 +11:00
+++ edited/net/ipv4/xfrm4_policy.c	2005-02-16 22:01:56 +11:00
@@ -235,10 +235,8 @@
 
 static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-	struct dst_entry *path = dst->path;
-
-	if (mtu < 68 + dst->header_len)
-		return;
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	struct dst_entry *path = xdst->route;
 
 	path->ops->update_pmtu(path, mtu);
 }
===== net/ipv6/xfrm6_policy.c 1.27 vs edited =====
--- 1.27/net/ipv6/xfrm6_policy.c	2005-02-14 14:59:25 +11:00
+++ edited/net/ipv6/xfrm6_policy.c	2005-02-16 22:03:06 +11:00
@@ -243,12 +243,10 @@
 
 static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-	struct dst_entry *path = dst->path;
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	struct dst_entry *path = xdst->route;
 
-	if (mtu >= IPV6_MIN_MTU && mtu < dst_pmtu(dst))
-		path->ops->update_pmtu(path, mtu);
-	
-	return;
+	path->ops->update_pmtu(path, mtu);
 }
 
 static struct dst_ops xfrm6_dst_ops = {

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [7/*] [IPSEC] Get metrics for xfrm_dst from top dst
  2005-02-16 11:08         ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu
@ 2005-02-16 11:38           ` Herbert Xu
  2005-03-07  5:47             ` David S. Miller
  2005-03-07  5:35           ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller
  1 sibling, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-02-16 11:38 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 509 bytes --]

For the metrics other than MTU, we should also be getting it from
the top dst as opposed to the bottom dst.  With this change, the
user is able to manipulate values such as advmss for individual
hosts behind a remote IPsec gateway.
 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-7 --]
[-- Type: text/plain, Size: 1148 bytes --]

===== net/ipv4/xfrm4_policy.c 1.17 vs edited =====
--- 1.17/net/ipv4/xfrm4_policy.c	2005-02-16 22:09:18 +11:00
+++ edited/net/ipv4/xfrm4_policy.c	2005-02-16 22:35:09 +11:00
@@ -133,7 +133,7 @@
 		dst_prev->lastuse	= jiffies;
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
-		memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics));
+		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
 
 		/* Copy neighbout for reachability confirmation */
 		dst_prev->neighbour	= neigh_clone(rt->u.dst.neighbour);
===== net/ipv6/xfrm6_policy.c 1.28 vs edited =====
--- 1.28/net/ipv6/xfrm6_policy.c	2005-02-16 22:09:18 +11:00
+++ edited/net/ipv6/xfrm6_policy.c	2005-02-16 22:35:15 +11:00
@@ -149,7 +149,7 @@
 		dst_prev->lastuse	= jiffies;
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
-		memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics));
+		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
 
 		/* Copy neighbour for reachability confirmation */
 		dst_prev->neighbour	= neigh_clone(rt->u.dst.neighbour);

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [2/4] [IPSEC] Add xfrm_state_mtu
  2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu
  2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
@ 2005-03-07  5:23   ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-07  5:23 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Tue, 15 Feb 2005 09:12:00 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This patch introduces the function xfrm_state_mtu which calculates
> the MTU under a specific SA.  This function can be simplified once
> we get rid all other uses of get_max_size as we can move the calculation
> into the invidual SA type.

Applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
  2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
  2005-02-15  8:10     ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä
@ 2005-03-07  5:28     ` David S. Miller
  2005-03-07 10:02       ` Herbert Xu
  2005-03-07 10:16     ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu
  2005-03-14 11:52     ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy
  4 siblings, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-07  5:28 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Tue, 15 Feb 2005 09:14:33 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This patch adds a pointer to the route corresponding to the specific
> flow over the SA of an xfrm_dst that's being used.
> 
> It also sets the next pointer of each xfrm_dst to the one above it.
> This allows to traverse the list upwards from the bottom.

Applied, thanks Herbert.

I guess this back traversal list makes it impossible for us
to share xfrm_dst's for multiple unique flows even if that
could be possible, right?

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst
  2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
  2005-02-15 15:53       ` James Morris
  2005-02-16 10:37       ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu
@ 2005-03-07  5:32       ` David S. Miller
  2005-03-07 10:35         ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu
  2005-03-28 20:10       ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy
  3 siblings, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-07  5:32 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Tue, 15 Feb 2005 09:16:07 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Finally this is the patch that sets the MTU values of each xfrm_dst
> and keeps it up-to-date.
>   
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> 
> To recap, at this point we've obtained accurate MTU values at each
> xfrm_dst.  The next step would be to start using it as
> dst_pmtu(xfrm_dst).

Applied, but with a bug fix:

+		mtu = dst_pmtu(xdst->route);
+		if (xdst->child_mtu_cached != mtu) {
+			last = xdst;
+			xdst->route_mtu_cached = mtu;
+		}

You obviously meant "route_mtu_cached" in the test,
not child_mtu_cached.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output
  2005-02-16 10:37       ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu
  2005-02-16 11:08         ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu
@ 2005-03-07  5:33         ` David S. Miller
  2005-03-07 11:45         ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu
  2 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-07  5:33 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Wed, 16 Feb 2005 21:37:44 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> There was a lot of fun in laying the foundations.  Now it's time
> to start using it.
> 
> This patch fixes the calculations in xfrm[46]_output so that we don't
> reject properly sized packets from the TCP stack.  The previous
> calculation is wrong because the xfrm overhead is not constant.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> 
> After this I'll start cleaning stuff up.  For example, tcp's
> ext2_header_len and dst's path can both be removed.

Looks good, applied.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst
  2005-02-16 11:08         ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu
  2005-02-16 11:38           ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu
@ 2005-03-07  5:35           ` David S. Miller
  2005-03-07 10:39             ` Herbert Xu
  1 sibling, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-07  5:35 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Wed, 16 Feb 2005 22:08:42 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Let's also fix IPsec PMTU storage.  When we get an MTU update for an
> xfrm_dst, it should be done to the top dst, not the bottom dst.
> 
> For example, when we get a need-to-frag message for host C behind
> a our IPsec peer B, we should be updating the dst entry for C and
> not B as we do now.
> 
> I've removed the boundary checks since the same checks are done
> in ipv[46]/route.c already.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

Note that sometimes it is better to replace an "unnecessary as
determined by me" boundary check with a BUG() instead of
outright removal.  That way you get to test your assertion :)

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [7/*] [IPSEC] Get metrics for xfrm_dst from top dst
  2005-02-16 11:38           ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu
@ 2005-03-07  5:47             ` David S. Miller
  2005-03-07 10:41               ` Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-07  5:47 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Wed, 16 Feb 2005 22:38:36 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> For the metrics other than MTU, we should also be getting it from
> the top dst as opposed to the bottom dst.  With this change, the
> user is able to manipulate values such as advmss for individual
> hosts behind a remote IPsec gateway.
>  
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, great work Herbert.

This is the last patch I saw in this series.  Any more stuff
you have ready to apply in this area?

My net-2.6 tree at:

	bk://kernel.bkbits.net/davem/net-2.6

has all of the patches I applied today in it.  I'll push that
to Linus tomorrow most likely.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-03-07  5:28     ` David S. Miller
@ 2005-03-07 10:02       ` Herbert Xu
  0 siblings, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-07 10:02 UTC (permalink / raw)
  To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev

On Sun, Mar 06, 2005 at 09:28:00PM -0800, David S. Miller wrote:
> 
> I guess this back traversal list makes it impossible for us
> to share xfrm_dst's for multiple unique flows even if that
> could be possible, right?

I used the back traversal only because it was convenient.  There is
nothing fundamental about it.

So if we ever implemented xdst sharing, we could easily modify
the couple of functions involved to walk the bundle forward and
then in reverse.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy
  2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
                       ` (2 preceding siblings ...)
  2005-03-07  5:28     ` David S. Miller
@ 2005-03-07 10:16     ` Herbert Xu
  2005-03-07 17:35       ` David S. Miller
  2005-03-14 11:52     ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy
  4 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-07 10:16 UTC (permalink / raw)
  To: David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 413 bytes --]

Hi Dave:

Here's a trivial patch to get rid of a redundant check that I added
in patch 3/4.  dst_release already checks for dst == NULL.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-8 --]
[-- Type: text/plain, Size: 354 bytes --]

===== net/xfrm/xfrm_policy.c 1.69 vs edited =====
--- 1.69/net/xfrm/xfrm_policy.c	2005-03-07 16:31:30 +11:00
+++ edited/net/xfrm/xfrm_policy.c	2005-03-07 21:12:23 +11:00
@@ -1028,8 +1028,7 @@
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
-	if (xdst->route)
-		dst_release(xdst->route);
+	dst_release(xdst->route);
 
 	if (!dst->xfrm)
 		return;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok
  2005-03-07  5:32       ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller
@ 2005-03-07 10:35         ` Herbert Xu
  2005-03-07 17:32           ` David S. Miller
  2005-03-08 10:27           ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu
  0 siblings, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-07 10:35 UTC (permalink / raw)
  To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 1154 bytes --]

On Sun, Mar 06, 2005 at 09:32:14PM -0800, David S. Miller wrote:
> 
> Applied, but with a bug fix:
> 
> +		mtu = dst_pmtu(xdst->route);
> +		if (xdst->child_mtu_cached != mtu) {
> +			last = xdst;
> +			xdst->route_mtu_cached = mtu;
> +		}
> 
> You obviously meant "route_mtu_cached" in the test,
> not child_mtu_cached.

Thanks for catching this.

There is another bug in xfrm_bundle_ok where I forgot to
check the validity of xdst->route.  In fact, the check
on dst->path isn't strong enough either.  For IPv6 entries,
dst->path->obsolete is always negative until you call
ipv6_dst_check.  So we really need to do that here.

Here's the patch to fix those two problems.  Yes I know
my dst_check implementation is lame.  I'll come back and
fix up all the dst_check functions by moving their dst_release
calls out.  It proves that you were right in that IPv6 dst
leak thread :)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-9 --]
[-- Type: text/plain, Size: 988 bytes --]

===== include/net/dst.h 1.27 vs edited =====
--- 1.27/include/net/dst.h	2005-03-07 16:33:59 +11:00
+++ edited/include/net/dst.h	2005-03-07 21:06:13 +11:00
@@ -259,6 +259,15 @@
 	}
 }
 
+static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
+{
+	dst_hold(dst);
+	if (dst->obsolete)
+		dst = dst->ops->check(dst, cookie);
+	dst_release(dst);
+	return dst;
+}
+
 extern void		dst_init(void);
 
 struct flowi;
===== net/xfrm/xfrm_policy.c 1.70 vs edited =====
--- 1.70/net/xfrm/xfrm_policy.c	2005-03-07 21:17:35 +11:00
+++ edited/net/xfrm/xfrm_policy.c	2005-03-07 21:26:28 +11:00
@@ -1147,7 +1147,7 @@
 	struct xfrm_dst *last;
 	u32 mtu;
 
-	if (dst->path->obsolete > 0 ||
+	if (!dst_check(dst->path, 0) ||
 	    (dst->dev && !netif_running(dst->dev)))
 		return 0;
 
@@ -1167,6 +1167,8 @@
 			xdst->child_mtu_cached = mtu;
 		}
 
+		if (!dst_check(xdst->route, 0))
+			return 0;
 		mtu = dst_pmtu(xdst->route);
 		if (xdst->route_mtu_cached != mtu) {
 			last = xdst;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst
  2005-03-07  5:35           ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller
@ 2005-03-07 10:39             ` Herbert Xu
  0 siblings, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-07 10:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev

On Sun, Mar 06, 2005 at 09:35:02PM -0800, David S. Miller wrote:
> On Wed, 16 Feb 2005 22:08:42 +1100
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > Let's also fix IPsec PMTU storage.  When we get an MTU update for an
> > xfrm_dst, it should be done to the top dst, not the bottom dst.
> > 
> > For example, when we get a need-to-frag message for host C behind
> > a our IPsec peer B, we should be updating the dst entry for C and
> > not B as we do now.
> > 
> > I've removed the boundary checks since the same checks are done
> > in ipv[46]/route.c already.
> 
> Note that sometimes it is better to replace an "unnecessary as
> determined by me" boundary check with a BUG() instead of
> outright removal.  That way you get to test your assertion :)

Point taken.  However, I must say that using BUG() in these
two particular places would be fatal :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [7/*] [IPSEC] Get metrics for xfrm_dst from top dst
  2005-03-07  5:47             ` David S. Miller
@ 2005-03-07 10:41               ` Herbert Xu
  0 siblings, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-07 10:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev

On Sun, Mar 06, 2005 at 09:47:54PM -0800, David S. Miller wrote:
> 
> This is the last patch I saw in this series.  Any more stuff
> you have ready to apply in this area?

I've just sent you two fix-up's.  I'll work on the clean-up's
that I talked about next.  Once that's done we can consider
the first step in the MTU rework complete.

The next step would be to relay ICMP information back to the
sender.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len
  2005-02-16 10:37       ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu
  2005-02-16 11:08         ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu
  2005-03-07  5:33         ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output David S. Miller
@ 2005-03-07 11:45         ` Herbert Xu
  2005-03-07 17:33           ` David S. Miller
  2 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-07 11:45 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 550 bytes --]

Hi Dave:

Here is a patch that replaces all occurrences of dst_pmtu in the TCP
stack.  As a result we no longer need ext2_header_len.

This has a nice synergetic effect with Arnaldo's latest change to
linux/tcp.h :)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

I'll be removing other users of dst->path/dst_pmtu next.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-10 --]
[-- Type: text/plain, Size: 6221 bytes --]

===== include/linux/tcp.h 1.35 vs edited =====
--- 1.35/include/linux/tcp.h	2005-02-23 05:45:31 +11:00
+++ edited/include/linux/tcp.h	2005-03-07 22:41:00 +11:00
@@ -284,10 +284,13 @@
 	__u32	mss_cache;	/* Cached effective mss, not including SACKS */
 	__u16	mss_cache_std;	/* Like mss_cache, but without TSO */
 	__u16	ext_header_len;	/* Network protocol overhead (IP/IPv6 options) */
-	__u16	ext2_header_len;/* Options depending on route */
 	__u8	ca_state;	/* State of fast-retransmit machine 	*/
 	__u8	retransmits;	/* Number of unrecovered RTO timeouts.	*/
 
+	__u16	advmss;		/* Advertised MSS			*/
+	__u32	window_clamp;	/* Maximal window to advertise		*/
+	__u32	rcv_ssthresh;	/* Current window clamp			*/
+
 	__u32	frto_highmark;	/* snd_nxt when RTO occurred */
 	__u8	reordering;	/* Packet reordering metric.		*/
 	__u8	frto_counter;	/* Number of new acks after RTO */
@@ -345,14 +348,9 @@
 	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
 	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
-	__u32	window_clamp;	/* Maximal window to advertise		*/
-	__u32	rcv_ssthresh;	/* Current window clamp			*/
-	__u16	advmss;		/* Advertised MSS			*/
-
 	__u8	syn_retries;	/* num of allowed syn retries */
 	__u8	ecn_flags;	/* ECN status bits.			*/
 	__u16	prior_ssthresh; /* ssthresh saved at recovery start	*/
-	__u16	__pad1;
 	__u32	lost_out;	/* Lost packets			*/
 	__u32	sacked_out;	/* SACK'd packets			*/
 	__u32	fackets_out;	/* FACK'd packets			*/
===== net/ipv4/tcp_ipv4.c 1.110 vs edited =====
--- 1.110/net/ipv4/tcp_ipv4.c	2005-02-23 14:23:09 +11:00
+++ edited/net/ipv4/tcp_ipv4.c	2005-03-07 22:02:37 +11:00
@@ -831,7 +831,6 @@
 	/* OK, now commit destination to socket.  */
 	__sk_dst_set(sk, &rt->u.dst);
 	tcp_v4_setup_caps(sk, &rt->u.dst);
-	tp->ext2_header_len = rt->u.dst.header_len;
 
 	if (!tp->write_seq)
 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -941,10 +940,10 @@
 	/* Something is about to be wrong... Remember soft error
 	 * for the case, if this connection will not able to recover.
 	 */
-	if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
+	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 		sk->sk_err_soft = EMSGSIZE;
 
-	mtu = dst_pmtu(dst);
+	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 	    tp->pmtu_cookie > mtu) {
@@ -1578,10 +1577,9 @@
 	newtp->ext_header_len = 0;
 	if (newinet->opt)
 		newtp->ext_header_len = newinet->opt->optlen;
-	newtp->ext2_header_len = dst->header_len;
 	newinet->id = newtp->write_seq ^ jiffies;
 
-	tcp_sync_mss(newsk, dst_pmtu(dst));
+	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
 
@@ -1877,7 +1875,6 @@
 
 	__sk_dst_set(sk, &rt->u.dst);
 	tcp_v4_setup_caps(sk, &rt->u.dst);
-	tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
 
 	new_saddr = rt->rt_src;
 
@@ -1937,7 +1934,6 @@
 	if (!err) {
 		__sk_dst_set(sk, &rt->u.dst);
 		tcp_v4_setup_caps(sk, &rt->u.dst);
-		tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
 		return 0;
 	}
 
===== net/ipv4/tcp_output.c 1.80 vs edited =====
--- 1.80/net/ipv4/tcp_output.c	2005-02-24 14:16:59 +11:00
+++ edited/net/ipv4/tcp_output.c	2005-03-07 22:11:07 +11:00
@@ -632,12 +632,8 @@
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct dst_entry *dst = __sk_dst_get(sk);
 	int mss_now;
 
-	if (dst && dst->ops->get_mss)
-		pmtu = dst->ops->get_mss(dst, pmtu);
-
 	/* Calculate base mss without TCP options:
 	   It is MMS_S - sizeof(tcphdr) of rfc1122
 	 */
@@ -648,7 +644,7 @@
 		mss_now = tp->rx_opt.mss_clamp;
 
 	/* Now subtract optional transport overhead */
-	mss_now -= tp->ext_header_len + tp->ext2_header_len;
+	mss_now -= tp->ext_header_len;
 
 	/* Then reserve room for full set of TCP options and 8 bytes of data */
 	if (mss_now < 48)
@@ -684,9 +680,8 @@
 
 	mss_now = tp->mss_cache_std;
 	if (dst) {
-		u32 mtu = dst_pmtu(dst);
-		if (mtu != tp->pmtu_cookie ||
-		    tp->ext2_header_len != dst->header_len)
+		u32 mtu = dst_mtu(dst);
+		if (mtu != tp->pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
@@ -698,8 +693,7 @@
 		unsigned int large_mss, factor, limit;
 
 		large_mss = 65535 - tp->af_specific->net_header_len -
-			tp->ext_header_len - tp->ext2_header_len -
-			tp->tcp_header_len;
+			tp->ext_header_len - tp->tcp_header_len;
 
 		if (tp->max_window && large_mss > (tp->max_window>>1))
 			large_mss = max((tp->max_window>>1),
@@ -1444,7 +1438,7 @@
 	if (tp->rx_opt.user_mss)
 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
 	tp->max_window = 0;
-	tcp_sync_mss(sk, dst_pmtu(dst));
+	tcp_sync_mss(sk, dst_mtu(dst));
 
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
===== net/ipv6/tcp_ipv6.c 1.106 vs edited =====
--- 1.106/net/ipv6/tcp_ipv6.c	2005-02-23 14:23:09 +11:00
+++ edited/net/ipv6/tcp_ipv6.c	2005-03-07 22:02:38 +11:00
@@ -782,7 +782,6 @@
 	tp->ext_header_len = 0;
 	if (np->opt)
 		tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
-	tp->ext2_header_len = dst->header_len;
 
 	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 
@@ -894,8 +893,8 @@
 		} else
 			dst_hold(dst);
 
-		if (tp->pmtu_cookie > dst_pmtu(dst)) {
-			tcp_sync_mss(sk, dst_pmtu(dst));
+		if (tp->pmtu_cookie > dst_mtu(dst)) {
+			tcp_sync_mss(sk, dst_mtu(dst));
 			tcp_simple_retransmit(sk);
 		} /* else let the usual retransmit timer handle it */
 		dst_release(dst);
@@ -1524,9 +1523,8 @@
 	if (newnp->opt)
 		newtp->ext_header_len = newnp->opt->opt_nflen +
 					newnp->opt->opt_flen;
-	newtp->ext2_header_len = dst->header_len;
 
-	tcp_sync_mss(newsk, dst_pmtu(dst));
+	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
 
@@ -1873,7 +1871,6 @@
 		ip6_dst_store(sk, dst, NULL);
 		sk->sk_route_caps = dst->dev->features &
 			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
-		tcp_sk(sk)->ext2_header_len = dst->header_len;
 	}
 
 	return 0;
@@ -1927,7 +1924,6 @@
 		ip6_dst_store(sk, dst, NULL);
 		sk->sk_route_caps = dst->dev->features &
 			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
-		tcp_sk(sk)->ext2_header_len = dst->header_len;
 	}
 
 	skb->dst = dst_clone(dst);

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok
  2005-03-07 10:35         ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu
@ 2005-03-07 17:32           ` David S. Miller
  2005-03-08 10:27           ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-07 17:32 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Mon, 7 Mar 2005 21:35:36 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> There is another bug in xfrm_bundle_ok where I forgot to
> check the validity of xdst->route.  In fact, the check
> on dst->path isn't strong enough either.  For IPv6 entries,
> dst->path->obsolete is always negative until you call
> ipv6_dst_check.  So we really need to do that here.
> 
> Here's the patch to fix those two problems.  Yes I know
> my dst_check implementation is lame.  I'll come back and
> fix up all the dst_check functions by moving their dst_release
> calls out.  It proves that you were right in that IPv6 dst
> leak thread :)
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len
  2005-03-07 11:45         ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu
@ 2005-03-07 17:33           ` David S. Miller
  0 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-07 17:33 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, kaber, netdev

On Mon, 7 Mar 2005 22:45:33 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Here is a patch that replaces all occurrences of dst_pmtu in the TCP
> stack.  As a result we no longer need ext2_header_len.
> 
> This has a nice synergetic effect with Arnaldo's latest change to
> linux/tcp.h :)
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> 
> I'll be removing other users of dst->path/dst_pmtu next.

Also applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy
  2005-03-07 10:16     ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu
@ 2005-03-07 17:35       ` David S. Miller
  0 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-07 17:35 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

On Mon, 7 Mar 2005 21:16:57 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Here's a trivial patch to get rid of a redundant check that I added
> in patch 3/4.  dst_release already checks for dst == NULL.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [11/*] [NET] Move dst_release out of dst->ops->check
  2005-03-07 10:35         ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu
  2005-03-07 17:32           ` David S. Miller
@ 2005-03-08 10:27           ` Herbert Xu
  2005-03-08 12:50             ` YOSHIFUJI Hideaki / 吉藤英明
                               ` (2 more replies)
  1 sibling, 3 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-08 10:27 UTC (permalink / raw)
  To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 840 bytes --]

On Mon, Mar 07, 2005 at 09:35:36PM +1100, herbert wrote:
> 
> Here's the patch to fix those two problems.  Yes I know
> my dst_check implementation is lame.  I'll come back and
> fix up all the dst_check functions by moving their dst_release
> calls out.  It proves that you were right in that IPv6 dst
> leak thread :)

As promised here is the patch that moves dst_release out of
dst->ops->check.  It bloats sk_dst_check/__sk_dst_check slightly
but they're only used in a handful of places so it isn't too bad.
I actually counted, it's about a few hundred bytes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-11 --]
[-- Type: text/plain, Size: 2751 bytes --]

===== include/net/dst.h 1.28 vs edited =====
--- 1.28/include/net/dst.h	2005-03-07 22:03:36 +11:00
+++ edited/include/net/dst.h	2005-03-08 21:04:46 +11:00
@@ -261,10 +261,8 @@
 
 static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
 {
-	dst_hold(dst);
 	if (dst->obsolete)
 		dst = dst->ops->check(dst, cookie);
-	dst_release(dst);
 	return dst;
 }
 
===== include/net/sock.h 1.90 vs edited =====
--- 1.90/include/net/sock.h	2005-01-11 07:23:55 +11:00
+++ edited/include/net/sock.h	2005-03-08 21:05:42 +11:00
@@ -983,6 +983,7 @@
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk->sk_dst_cache = NULL;
+		dst_release(dst);
 		return NULL;
 	}
 
@@ -996,6 +997,7 @@
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk_dst_reset(sk);
+		dst_release(dst);
 		return NULL;
 	}
 
===== net/decnet/dn_route.c 1.31 vs edited =====
--- 1.31/net/decnet/dn_route.c	2005-01-14 16:03:01 +11:00
+++ edited/net/decnet/dn_route.c	2005-03-08 21:04:46 +11:00
@@ -253,7 +253,6 @@
  */
 static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie)
 {
-	dst_release(dst);
 	return NULL;
 }
 
===== net/ipv4/route.c 1.104 vs edited =====
--- 1.104/net/ipv4/route.c	2005-03-06 15:43:44 +11:00
+++ edited/net/ipv4/route.c	2005-03-08 21:04:46 +11:00
@@ -1326,7 +1326,6 @@
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
-	dst_release(dst);
 	return NULL;
 }
 
===== net/ipv4/ipvs/ip_vs_xmit.c 1.11 vs edited =====
--- 1.11/net/ipv4/ipvs/ip_vs_xmit.c	2004-09-13 09:25:50 +10:00
+++ edited/net/ipv4/ipvs/ip_vs_xmit.c	2005-03-08 21:04:46 +11:00
@@ -52,6 +52,7 @@
 	if ((dst->obsolete || rtos != dest->dst_rtos) &&
 	    dst->ops->check(dst, cookie) == NULL) {
 		dest->dst_cache = NULL;
+		dst_release(dst);
 		return NULL;
 	}
 	dst_hold(dst);
===== net/ipv6/ip6_tunnel.c 1.28 vs edited =====
--- 1.28/net/ipv6/ip6_tunnel.c	2005-02-07 16:40:51 +11:00
+++ edited/net/ipv6/ip6_tunnel.c	2005-03-08 21:04:47 +11:00
@@ -94,6 +94,7 @@
 	if (dst && dst->obsolete && 
 	    dst->ops->check(dst, t->dst_cookie) == NULL) {
 		t->dst_cache = NULL;
+		dst_release(dst);
 		return NULL;
 	}
 
===== net/ipv6/route.c 1.106 vs edited =====
--- 1.106/net/ipv6/route.c	2005-02-16 09:23:11 +11:00
+++ edited/net/ipv6/route.c	2005-03-08 21:04:47 +11:00
@@ -589,7 +589,6 @@
 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
 		return dst;
 
-	dst_release(dst);
 	return NULL;
 }
 
===== net/xfrm/xfrm_policy.c 1.71 vs edited =====
--- 1.71/net/xfrm/xfrm_policy.c	2005-03-07 22:03:36 +11:00
+++ edited/net/xfrm/xfrm_policy.c	2005-03-08 21:04:47 +11:00
@@ -1015,7 +1015,6 @@
 	if (!stale_bundle(dst))
 		return dst;
 
-	dst_release(dst);
 	return NULL;
 }
 

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [11/*] [NET] Move dst_release out of dst->ops->check
  2005-03-08 10:27           ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu
@ 2005-03-08 12:50             ` YOSHIFUJI Hideaki / 吉藤英明
  2005-03-11  2:17             ` David S. Miller
  2005-03-14 10:26             ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu
  2 siblings, 0 replies; 114+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-08 12:50 UTC (permalink / raw)
  To: herbert; +Cc: davem, kuznet, jmorris, netdev, yoshfuji

In article <20050308102741.GA23468@gondor.apana.org.au> (at Tue, 8 Mar 2005 21:27:41 +1100), Herbert Xu <herbert@gondor.apana.org.au> says:

> On Mon, Mar 07, 2005 at 09:35:36PM +1100, herbert wrote:
> > 
> > Here's the patch to fix those two problems.  Yes I know
> > my dst_check implementation is lame.  I'll come back and
> > fix up all the dst_check functions by moving their dst_release
> > calls out.  It proves that you were right in that IPv6 dst
> > leak thread :)
> 
> As promised here is the patch that moves dst_release out of
> dst->ops->check.  It bloats sk_dst_check/__sk_dst_check slightly

I agree.

--yoshfuji

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [11/*] [NET] Move dst_release out of dst->ops->check
  2005-03-08 10:27           ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu
  2005-03-08 12:50             ` YOSHIFUJI Hideaki / 吉藤英明
@ 2005-03-11  2:17             ` David S. Miller
  2005-03-14 10:26             ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu
  2 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-11  2:17 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev

On Tue, 8 Mar 2005 21:27:41 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Mon, Mar 07, 2005 at 09:35:36PM +1100, herbert wrote:
> > 
> > Here's the patch to fix those two problems.  Yes I know
> > my dst_check implementation is lame.  I'll come back and
> > fix up all the dst_check functions by moving their dst_release
> > calls out.  It proves that you were right in that IPv6 dst
> > leak thread :)
> 
> As promised here is the patch that moves dst_release out of
> dst->ops->check.  It bloats sk_dst_check/__sk_dst_check slightly
> but they're only used in a handful of places so it isn't too bad.
> I actually counted, it's about a few hundred bytes.

Applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [12/*] [IPSEC] Handle local_df in IPv4
  2005-03-08 10:27           ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu
  2005-03-08 12:50             ` YOSHIFUJI Hideaki / 吉藤英明
  2005-03-11  2:17             ` David S. Miller
@ 2005-03-14 10:26             ` Herbert Xu
  2005-03-14 10:53               ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu
  2005-03-15  5:25               ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller
  2 siblings, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-14 10:26 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 701 bytes --]

Hi Dave:

When cleaning up the remaining users of dst_pmtu I noticed that
local_df wasn't being treated correctly in IPsec.  In fact, if
you socket's dst went over IPsec, local_df is essentailly ignored.

This patch fixes that.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

I was going to do the same thing to IPv6.  Unfortunately it seems
that we don't have any local_df support over there.  That is, we
always fragment outbound UDP/raw packets.  Did I miss something?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-12 --]
[-- Type: text/plain, Size: 368 bytes --]

===== net/ipv4/xfrm4_output.c 1.7 vs edited =====
--- 1.7/net/ipv4/xfrm4_output.c	2005-03-07 16:33:59 +11:00
+++ edited/net/ipv4/xfrm4_output.c	2005-03-14 13:55:35 +11:00
@@ -78,7 +78,7 @@
 
 	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
 	
-	if (!(iph->frag_off & htons(IP_DF)))
+	if (!(iph->frag_off & htons(IP_DF)) || skb->local_df)
 		goto out;
 
 	dst = skb->dst;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [13/*] [IPV4] Fix room calculation in icmp_send
  2005-03-14 10:26             ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu
@ 2005-03-14 10:53               ` Herbert Xu
  2005-03-14 11:10                 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu
  2005-03-15  5:26                 ` [13/*] [IPV4] Fix room calculation in icmp_send David S. Miller
  2005-03-15  5:25               ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-14 10:53 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 702 bytes --]

Hi Dave:

I'm now cleaning up all users of dst_pmtu with the aim of replacing
dst_pmtu with dst_mtu.  I'm going to start with the ones that actually
fix bugs.

This patch fixes the length calculation in icmp_send.  As it is
we're overestimating the space available by including the space
that would be used up by IPsec encapsulation.

IPv6 doesn't have this problem since its calculation is based
on 1280 instead of the PMTU.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-13 --]
[-- Type: text/plain, Size: 400 bytes --]

===== net/ipv4/icmp.c 1.49 vs edited =====
--- 1.49/net/ipv4/icmp.c	2004-12-28 16:30:43 +11:00
+++ edited/net/ipv4/icmp.c	2005-03-14 21:34:20 +11:00
@@ -560,7 +560,7 @@
 
 	/* RFC says return as much as we can without exceeding 576 bytes. */
 
-	room = dst_pmtu(&rt->u.dst);
+	room = dst_mtu(&rt->u.dst);
 	if (room > 576)
 		room = 576;
 	room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward
  2005-03-14 10:53               ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu
@ 2005-03-14 11:10                 ` Herbert Xu
  2005-03-15  5:27                   ` David S. Miller
  2005-03-15  9:19                   ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu
  2005-03-15  5:26                 ` [13/*] [IPV4] Fix room calculation in icmp_send David S. Miller
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-14 11:10 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 561 bytes --]

Hi Dave:

While replacing dst_pmtu in ip6_output I found this little gem.  In
ip6_forward we're not reloading the dst pointer after calling
xfrm6_route_forward.  So all subsequent dereferences of dst will
refer to its pre-IPsec value.

The solution is of course to refresh its value.
 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-14 --]
[-- Type: text/plain, Size: 381 bytes --]

===== net/ipv6/ip6_output.c 1.85 vs edited =====
--- 1.85/net/ipv6/ip6_output.c	2005-03-03 16:12:38 +11:00
+++ edited/net/ipv6/ip6_output.c	2005-03-14 22:05:31 +11:00
@@ -397,6 +397,7 @@
 		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
 		goto drop;
 	}
+	dst = skb->dst;
 
 	/* IPv6 specs say nothing about it, but it is clear that we cannot
 	   send redirects to source routed frames.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
                       ` (3 preceding siblings ...)
  2005-03-07 10:16     ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu
@ 2005-03-14 11:52     ` Patrick McHardy
  2005-03-14 20:32       ` Herbert Xu
  4 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-14 11:52 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

Herbert Xu wrote:
> This patch adds a pointer to the route corresponding to the specific
> flow over the SA of an xfrm_dst that's being used.
> 
> It also sets the next pointer of each xfrm_dst to the one above it.
> This allows to traverse the list upwards from the bottom.

Looking at this again, I noticed a problem:

> +		if (remote != fl_tunnel.fl4_dst) {
> +			fl_tunnel.fl4_src = local;
> +			fl_tunnel.fl4_dst = remote;
> +			err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
> +					      &fl_tunnel, AF_INET);
> +			if (err)
> +				goto error;
> +		} else
> +			dst_hold(&rt->u.dst);
>  	}
> +
>  	dst_prev->child = &rt->u.dst;
> +	dst->path = &rt->u.dst;
> +
> +	*dst_p = dst;
> +	dst = dst_prev;
> +
> +	dst_prev = *dst_p;
>  	i = 0;
> -	for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
> +	for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {

Since the tunnel dst is not necessarily the last in the bundle anymore,
we might miss to initialize some dsts, for example with ipcomp/tunnel +
esp/transport. If we have nested tunnels we'll fiddle with entries in
the routing cache.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-03-14 11:52     ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy
@ 2005-03-14 20:32       ` Herbert Xu
  2005-03-15 19:05         ` Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-14 20:32 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

On Mon, Mar 14, 2005 at 12:52:16PM +0100, Patrick McHardy wrote:
> 
> >+		if (remote != fl_tunnel.fl4_dst) {
> >+			fl_tunnel.fl4_src = local;
> >+			fl_tunnel.fl4_dst = remote;
> >+			err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
> >+					      &fl_tunnel, AF_INET);
> >+			if (err)
> >+				goto error;
> >+		} else
> >+			dst_hold(&rt->u.dst);
> > 	}
> >+
> > 	dst_prev->child = &rt->u.dst;
> >+	dst->path = &rt->u.dst;
> >+
> >+	*dst_p = dst;
> >+	dst = dst_prev;
> >+
> >+	dst_prev = *dst_p;
> > 	i = 0;
> >-	for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = 
> >dst_prev->child) {
> >+	for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
> 
> Since the tunnel dst is not necessarily the last in the bundle anymore,
> we might miss to initialize some dsts, for example with ipcomp/tunnel +
> esp/transport. If we have nested tunnels we'll fiddle with entries in
> the routing cache.

Sorry, but I don't get it :) First of all what do you mean by the
tunnel dst?

If you mean &rt->u.dst then as far as I can see it's still the last
child in the bundle.  It may also appear in ->route elements earlier
on but that does not come into play in this loop.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [12/*] [IPSEC] Handle local_df in IPv4
  2005-03-14 10:26             ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu
  2005-03-14 10:53               ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu
@ 2005-03-15  5:25               ` David S. Miller
  2005-03-15 18:25                 ` YOSHIFUJI Hideaki / 吉藤英明
  1 sibling, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-15  5:25 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Mon, 14 Mar 2005 21:26:14 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> When cleaning up the remaining users of dst_pmtu I noticed that
> local_df wasn't being treated correctly in IPsec.  In fact, if
> you socket's dst went over IPsec, local_df is essentailly ignored.
> 
> This patch fixes that.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

> I was going to do the same thing to IPv6.  Unfortunately it seems
> that we don't have any local_df support over there.  That is, we
> always fragment outbound UDP/raw packets.  Did I miss something?

I have no idea offhand.  Perhaps Yoshifuji can figure it out?

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [13/*] [IPV4] Fix room calculation in icmp_send
  2005-03-14 10:53               ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu
  2005-03-14 11:10                 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu
@ 2005-03-15  5:26                 ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-15  5:26 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Mon, 14 Mar 2005 21:53:13 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> I'm now cleaning up all users of dst_pmtu with the aim of replacing
> dst_pmtu with dst_mtu.  I'm going to start with the ones that actually
> fix bugs.
> 
> This patch fixes the length calculation in icmp_send.  As it is
> we're overestimating the space available by including the space
> that would be used up by IPsec encapsulation.
> 
> IPv6 doesn't have this problem since its calculation is based
> on 1280 instead of the PMTU.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward
  2005-03-14 11:10                 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu
@ 2005-03-15  5:27                   ` David S. Miller
  2005-03-15  9:19                   ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-15  5:27 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Mon, 14 Mar 2005 22:10:02 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> While replacing dst_pmtu in ip6_output I found this little gem.  In
> ip6_forward we're not reloading the dst pointer after calling
> xfrm6_route_forward.  So all subsequent dereferences of dst will
> refer to its pre-IPsec value.
> 
> The solution is of course to refresh its value.
>  
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Good catch Herbert.

Applied, thanks.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-14 11:10                 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu
  2005-03-15  5:27                   ` David S. Miller
@ 2005-03-15  9:19                   ` Herbert Xu
  2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
                                       ` (2 more replies)
  1 sibling, 3 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-15  9:19 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 746 bytes --]

Hi Dave:

This patch fixes the IPsec overhead handling in ip_append_data and
ip6_append_data.  As it is they assume that the IPsec overhead is
constant.  This is not true as with ESP the IPsec overhead will vary
as the MTU varies.

The result is that they may produce packets that will exceed the MTU
when ESP is used.  Had it taken the trailer_len into account, it would
have produced packets less than the real MTU.

By switching to dst_mtu we get the optimal result.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-15 --]
[-- Type: text/plain, Size: 2640 bytes --]

===== net/ipv4/ip_output.c 1.76 vs edited =====
--- 1.76/net/ipv4/ip_output.c	2005-02-01 02:43:53 +11:00
+++ edited/net/ipv4/ip_output.c	2005-03-15 20:13:13 +11:00
@@ -719,7 +719,6 @@
 
 	struct ip_options *opt = NULL;
 	int hh_len;
-	int exthdrlen;
 	int mtu;
 	int copy;
 	int err;
@@ -746,22 +745,17 @@
 			inet->cork.addr = ipc->addr;
 		}
 		dst_hold(&rt->u.dst);
-		inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
+		inet->cork.fragsize = mtu = dst_mtu(&rt->u.dst);
 		inet->cork.rt = rt;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
 		sk->sk_sndmsg_off = 0;
-		if ((exthdrlen = rt->u.dst.header_len) != 0) {
-			length += exthdrlen;
-			transhdrlen += exthdrlen;
-		}
 	} else {
 		rt = inet->cork.rt;
 		if (inet->cork.flags & IPCORK_OPT)
 			opt = inet->cork.opt;
 
 		transhdrlen = 0;
-		exthdrlen = 0;
 		mtu = inet->cork.fragsize;
 	}
 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
@@ -770,7 +764,7 @@
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
 		return -EMSGSIZE;
 	}
 
@@ -781,7 +775,7 @@
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
 	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
-	    !exthdrlen)
+	    !rt->u.dst.header_len)
 		csummode = CHECKSUM_HW;
 
 	inet->cork.length += length;
@@ -866,9 +860,9 @@
 			 *	Find where to start putting bytes.
 			 */
 			data = skb_put(skb, fraglen);
-			skb->nh.raw = data + exthdrlen;
+			skb->nh.raw = data;
 			data += fragheaderlen;
-			skb->h.raw = data + exthdrlen;
+			skb->h.raw = data;
 
 			if (fraggap) {
 				skb->csum = skb_copy_and_csum_bits(
@@ -890,7 +884,6 @@
 			offset += copy;
 			length -= datalen - fraggap;
 			transhdrlen = 0;
-			exthdrlen = 0;
 			csummode = CHECKSUM_NONE;
 
 			/*
===== net/ipv6/ip6_output.c 1.85 vs edited =====
--- 1.85/net/ipv6/ip6_output.c	2005-03-03 16:12:38 +11:00
+++ edited/net/ipv6/ip6_output.c	2005-03-15 20:12:15 +11:00
@@ -849,13 +849,13 @@
 		np->cork.rt = rt;
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
-		inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
+		inet->cork.fragsize = mtu = dst_mtu(&rt->u.dst);
 		if (dst_allfrag(&rt->u.dst))
 			inet->cork.flags |= IPCORK_ALLFRAG;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
 		sk->sk_sndmsg_off = 0;
-		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
+		exthdrlen = opt ? opt->opt_flen : 0;
 		length += exthdrlen;
 		transhdrlen += exthdrlen;
 	} else {

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [16/*] [INET] Take IPsec overhead into account in tunnels
  2005-03-15  9:19                   ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu
@ 2005-03-15  9:58                     ` Herbert Xu
  2005-03-15 10:05                       ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
                                         ` (3 more replies)
  2005-03-15 18:18                     ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data David S. Miller
  2005-03-16 11:31                     ` Herbert Xu
  2 siblings, 4 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-15  9:58 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 912 bytes --]

Hi Dave:

This patch uses dst_mtu instead of dst_pmtu in the various tunnel
implementations.  As it is they simply ignore the IPsec overhead.
This leads to bogus MTU values inside the tunnels.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

BTW, we're doing lazy MTU updates in the tunnel xmit functions.
When a packet with DF set hits us and exceeds the updated MTU,
we will send an ICMP packet back which is good.

Unfortunately when a packet with DF clear hits us as we update
the MTU downwards, the packet will be silently discarded instead
of fragmented (well we will send an ICMP back to ourselves but
we already knew that MTU value :).

I presume we want to fix this, right?
 
Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-16 --]
[-- Type: text/plain, Size: 2924 bytes --]

===== net/ipv4/ip_gre.c 1.47 vs edited =====
--- 1.47/net/ipv4/ip_gre.c	2005-03-10 16:12:11 +11:00
+++ edited/net/ipv4/ip_gre.c	2005-03-15 20:19:26 +11:00
@@ -511,7 +511,7 @@
 
 	/* change mtu on this route */
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		if (rel_info > dst_pmtu(skb2->dst)) {
+		if (rel_info > dst_mtu(skb2->dst)) {
 			kfree_skb(skb2);
 			return;
 		}
@@ -764,9 +764,9 @@
 
 	df = tiph->frag_off;
 	if (df)
-		mtu = dst_pmtu(&rt->u.dst) - tunnel->hlen;
+		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
 	else
-		mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu;
+		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
 
 	if (skb->dst)
 		skb->dst->ops->update_pmtu(skb->dst, mtu);
@@ -785,7 +785,7 @@
 	else if (skb->protocol == htons(ETH_P_IPV6)) {
 		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
 
-		if (rt6 && mtu < dst_pmtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
+		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
 			if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
 			    rt6->rt6i_dst.plen == 128) {
 				rt6->rt6i_flags |= RTF_MODIFIED;
===== net/ipv4/ipip.c 1.46 vs edited =====
--- 1.46/net/ipv4/ipip.c	2005-01-14 15:41:05 +11:00
+++ edited/net/ipv4/ipip.c	2005-03-15 20:19:26 +11:00
@@ -436,7 +436,7 @@
 
 	/* change mtu on this route */
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
-		if (rel_info > dst_pmtu(skb2->dst)) {
+		if (rel_info > dst_mtu(skb2->dst)) {
 			kfree_skb(skb2);
 			return;
 		}
@@ -569,9 +569,9 @@
 	}
 
 	if (tiph->frag_off)
-		mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
+		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
 	else
-		mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu;
+		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
 
 	if (mtu < 68) {
 		tunnel->stat.collisions++;
===== net/ipv6/ip6_tunnel.c 1.29 vs edited =====
--- 1.29/net/ipv6/ip6_tunnel.c	2005-03-11 13:17:32 +11:00
+++ edited/net/ipv6/ip6_tunnel.c	2005-03-15 20:19:26 +11:00
@@ -689,14 +689,14 @@
 			       t->parms.name);
 		goto tx_err_dst_release;
 	}
-	mtu = dst_pmtu(dst) - sizeof (*ipv6h);
+	mtu = dst_mtu(dst) - sizeof (*ipv6h);
 	if (opt) {
 		max_headroom += 8;
 		mtu -= 8;
 	}
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
-	if (skb->dst && mtu < dst_pmtu(skb->dst)) {
+	if (skb->dst && mtu < dst_mtu(skb->dst)) {
 		struct rt6_info *rt = (struct rt6_info *) skb->dst;
 		rt->rt6i_flags |= RTF_MODIFIED;
 		rt->u.dst.metrics[RTAX_MTU-1] = mtu;
===== net/ipv6/sit.c 1.45 vs edited =====
--- 1.45/net/ipv6/sit.c	2005-01-14 15:41:06 +11:00
+++ edited/net/ipv6/sit.c	2005-03-15 20:19:27 +11:00
@@ -500,9 +500,9 @@
 	}
 
 	if (tiph->frag_off)
-		mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
+		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
 	else
-		mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu;
+		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
 
 	if (mtu < 68) {
 		tunnel->stat.collisions++;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [17/*] [NET] Replace dst_pmtu with dst_mtu
  2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
@ 2005-03-15 10:05                       ` Herbert Xu
  2005-03-15 18:24                         ` David S. Miller
  2005-03-15 10:20                       ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek
                                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-15 10:05 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 819 bytes --]

Hi Dave:

This patch replaces most of the other uses of dst_pmtu with dst_mtu.
As far as I can tell these are either identical because dst->path == dst,
or they're a straightforward replacement of (the slightly incorrect)
dst_pmtu(dst) - dst->header_Len with dst_mtu(dst).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

At this point the only remaining user of dst_pmtu is ipt_REJECT which
will go away either when we use icmp_send in there or when I replace
it with dst_mtu.

We can now remove the other users of dst->path as well with the removal
of that attribute itself as the goal.
  
Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-17 --]
[-- Type: text/plain, Size: 8783 bytes --]

===== net/decnet/af_decnet.c 1.49 vs edited =====
--- 1.49/net/decnet/af_decnet.c	2005-01-21 09:35:54 +11:00
+++ edited/net/decnet/af_decnet.c	2005-03-15 20:59:05 +11:00
@@ -1868,7 +1868,7 @@
 
 	/* This works out the maximum size of segment we can send out */
 	if (dst) {
-		u32 mtu = dst_pmtu(dst);
+		u32 mtu = dst_mtu(dst);
 		mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now);
 	}
 
===== net/decnet/dn_route.c 1.32 vs edited =====
--- 1.32/net/decnet/dn_route.c	2005-03-11 13:17:32 +11:00
+++ edited/net/decnet/dn_route.c	2005-03-15 20:59:05 +11:00
@@ -817,7 +817,7 @@
 	if (rt->u.dst.metrics[RTAX_MTU-1] == 0 || 
             rt->u.dst.metrics[RTAX_MTU-1] > rt->u.dst.dev->mtu)
 		rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
-	mss = dn_mss_from_pmtu(dev, dst_pmtu(&rt->u.dst));
+	mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst));
 	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0 ||
 	    rt->u.dst.metrics[RTAX_ADVMSS-1] > mss)
 		rt->u.dst.metrics[RTAX_ADVMSS-1] = mss;
===== net/ipv4/ip_output.c 1.76 vs edited =====
--- 1.76/net/ipv4/ip_output.c	2005-02-01 02:43:53 +11:00
+++ edited/net/ipv4/ip_output.c	2005-03-15 20:59:05 +11:00
@@ -278,7 +278,7 @@
 				newskb->dev, ip_dev_loopback_xmit);
 	}
 
-	if (skb->len > dst_pmtu(&rt->u.dst))
+	if (skb->len > dst_mtu(&rt->u.dst))
 		return ip_fragment(skb, ip_finish_output);
 	else
 		return ip_finish_output(skb);
@@ -288,7 +288,7 @@
 {
 	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 
-	if (skb->len > dst_pmtu(skb->dst) && !skb_shinfo(skb)->tso_size)
+	if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
 		return ip_fragment(skb, ip_finish_output);
 	else
 		return ip_finish_output(skb);
@@ -448,7 +448,7 @@
 
 	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(dst_pmtu(&rt->u.dst)));
+			  htonl(dst_mtu(&rt->u.dst)));
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -458,7 +458,7 @@
 	 */
 
 	hlen = iph->ihl * 4;
-	mtu = dst_pmtu(&rt->u.dst) - hlen;	/* Size of data space */
+	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
 
 	/* When frag_list is given, use it. First, check its validity:
 	 * some transformers could create wrong frag_list or break existing
===== net/ipv4/ip_sockglue.c 1.31 vs edited =====
--- 1.31/net/ipv4/ip_sockglue.c	2005-01-14 15:41:05 +11:00
+++ edited/net/ipv4/ip_sockglue.c	2005-03-15 20:59:05 +11:00
@@ -955,7 +955,7 @@
 			val = 0;
 			dst = sk_dst_get(sk);
 			if (dst) {
-				val = dst_pmtu(dst) - dst->header_len;
+				val = dst_mtu(dst);
 				dst_release(dst);
 			}
 			if (!val) {
===== net/ipv6/ip6_output.c 1.85 vs edited =====
--- 1.85/net/ipv6/ip6_output.c	2005-03-03 16:12:38 +11:00
+++ edited/net/ipv6/ip6_output.c	2005-03-15 20:59:05 +11:00
@@ -147,7 +147,7 @@
 
 int ip6_output(struct sk_buff *skb)
 {
-	if (skb->len > dst_pmtu(skb->dst) || dst_allfrag(skb->dst))
+	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
 		return ip6_fragment(skb, ip6_output2);
 	else
 		return ip6_output2(skb);
@@ -263,7 +263,7 @@
 	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
 	ipv6_addr_copy(&hdr->daddr, first_hop);
 
-	mtu = dst_pmtu(dst);
+	mtu = dst_mtu(dst);
 	if ((skb->len <= mtu) || ipfragok) {
 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
@@ -428,10 +428,10 @@
 		goto error;
 	}
 
-	if (skb->len > dst_pmtu(dst)) {
+	if (skb->len > dst_mtu(dst)) {
 		/* Again, force OUTPUT device used as source address */
 		skb->dev = dst->dev;
-		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
 		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
 		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
 		kfree_skb(skb);
@@ -534,7 +534,7 @@
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
 
-	mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
+	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
 
 	if (skb_shinfo(skb)->frag_list) {
 		int first_len = skb_pagelen(skb);
===== net/ipv6/ipv6_sockglue.c 1.34 vs edited =====
--- 1.34/net/ipv6/ipv6_sockglue.c	2005-01-14 15:41:06 +11:00
+++ edited/net/ipv6/ipv6_sockglue.c	2005-03-15 20:59:05 +11:00
@@ -607,7 +607,7 @@
 		lock_sock(sk);
 		dst = sk_dst_get(sk);
 		if (dst) {
-			val = dst_pmtu(dst) - dst->header_len;
+			val = dst_mtu(dst);
 			dst_release(dst);
 		}
 		release_sock(sk);
===== net/ipv6/route.c 1.111 vs edited =====
--- 1.111/net/ipv6/route.c	2005-03-11 13:17:32 +11:00
+++ edited/net/ipv6/route.c	2005-03-15 20:59:05 +11:00
@@ -625,7 +625,7 @@
 {
 	struct rt6_info *rt6 = (struct rt6_info*)dst;
 
-	if (mtu < dst_pmtu(dst) && rt6->rt6i_dst.plen == 128) {
+	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
 		rt6->rt6i_flags |= RTF_MODIFIED;
 		if (mtu < IPV6_MIN_MTU) {
 			mtu = IPV6_MIN_MTU;
@@ -686,7 +686,7 @@
 	atomic_set(&rt->u.dst.__refcnt, 1);
 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
-	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
+	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
 	rt->u.dst.output  = output;
 
 #if 0	/* there's no chance to use these for ndisc */
@@ -971,7 +971,7 @@
 	if (!rt->u.dst.metrics[RTAX_MTU-1])
 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
+		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
 	rt->u.dst.dev = dev;
 	rt->rt6i_idev = idev;
 	return ip6_ins_rt(rt, nlh, _rtattr);
@@ -1134,7 +1134,7 @@
 	nrt->rt6i_nexthop = neigh_clone(neigh);
 	/* Reset pmtu, it may be better */
 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
-	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&nrt->u.dst));
+	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
 
 	if (ip6_ins_rt(nrt, NULL, NULL))
 		goto out;
@@ -1164,7 +1164,7 @@
 	if (rt == NULL)
 		return;
 
-	if (pmtu >= dst_pmtu(&rt->u.dst))
+	if (pmtu >= dst_mtu(&rt->u.dst))
 		goto out;
 
 	if (pmtu < IPV6_MIN_MTU) {
@@ -1405,7 +1405,7 @@
 	rt->rt6i_dev = &loopback_dev;
 	rt->rt6i_idev = idev;
 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
-	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
+	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
 	rt->u.dst.obsolete = -1;
 
@@ -1480,9 +1480,9 @@
 	 */
 	if (rt->rt6i_dev == arg->dev &&
 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
-            (dst_pmtu(&rt->u.dst) > arg->mtu ||
-             (dst_pmtu(&rt->u.dst) < arg->mtu &&
-	      dst_pmtu(&rt->u.dst) == idev->cnf.mtu6)))
+            (dst_mtu(&rt->u.dst) > arg->mtu ||
+             (dst_mtu(&rt->u.dst) < arg->mtu &&
+	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
 	return 0;
===== net/sctp/transport.c 1.31 vs edited =====
--- 1.31/net/sctp/transport.c	2005-01-15 15:32:41 +11:00
+++ edited/net/sctp/transport.c	2005-03-15 20:59:05 +11:00
@@ -227,7 +227,7 @@
 	dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL);
 
 	if (dst) {
-		transport->pmtu = dst_pmtu(dst);
+		transport->pmtu = dst_mtu(dst);
 		dst_release(dst);
 	} else
 		transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
@@ -253,7 +253,7 @@
 
 	transport->dst = dst;
 	if (dst) {
-		transport->pmtu = dst_pmtu(dst);
+		transport->pmtu = dst_mtu(dst);
 
 		/* Initialize sk->sk_rcv_saddr, if the transport is the
 		 * association's active path for getsockname().
===== net/xfrm/xfrm_policy.c 1.72 vs edited =====
--- 1.72/net/xfrm/xfrm_policy.c	2005-03-11 13:17:32 +11:00
+++ edited/net/xfrm/xfrm_policy.c	2005-03-15 20:59:05 +11:00
@@ -1119,12 +1119,12 @@
 		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 		u32 pmtu, route_mtu_cached;
 
-		pmtu = dst_pmtu(dst->child);
+		pmtu = dst_mtu(dst->child);
 		xdst->child_mtu_cached = pmtu;
 
 		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
 
-		route_mtu_cached = dst_pmtu(xdst->route);
+		route_mtu_cached = dst_mtu(xdst->route);
 		xdst->route_mtu_cached = route_mtu_cached;
 
 		if (pmtu > route_mtu_cached)
@@ -1160,7 +1160,7 @@
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
 
-		mtu = dst_pmtu(dst->child);
+		mtu = dst_mtu(dst->child);
 		if (xdst->child_mtu_cached != mtu) {
 			last = xdst;
 			xdst->child_mtu_cached = mtu;
@@ -1168,7 +1168,7 @@
 
 		if (!dst_check(xdst->route, 0))
 			return 0;
-		mtu = dst_pmtu(xdst->route);
+		mtu = dst_mtu(xdst->route);
 		if (xdst->route_mtu_cached != mtu) {
 			last = xdst;
 			xdst->route_mtu_cached = mtu;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [16/*] [INET] Take IPsec overhead into account in tunnels
  2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
  2005-03-15 10:05                       ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
@ 2005-03-15 10:20                       ` Lennert Buytenhek
  2005-03-15 10:27                         ` Herbert Xu
  2005-03-15 18:20                       ` David S. Miller
  2005-03-18  9:03                       ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu
  3 siblings, 1 reply; 114+ messages in thread
From: Lennert Buytenhek @ 2005-03-15 10:20 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki,
	Patrick McHardy, netdev

On Tue, Mar 15, 2005 at 08:58:37PM +1100, Herbert Xu wrote:

> This patch uses dst_mtu instead of dst_pmtu in the various tunnel
> implementations.  As it is they simply ignore the IPsec overhead.
> This leads to bogus MTU values inside the tunnels.

I've not been able to get ipsec-encapsulated ipip/gre tunneling to
work properly (without MTU issues) so far, and never had the time to
properly dig into it.  If this is going to be fixed, that would be
great.


--L

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [16/*] [INET] Take IPsec overhead into account in tunnels
  2005-03-15 10:20                       ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek
@ 2005-03-15 10:27                         ` Herbert Xu
  0 siblings, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-15 10:27 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki,
	Patrick McHardy, netdev

On Tue, Mar 15, 2005 at 11:20:52AM +0100, Lennert Buytenhek wrote:
> 
> I've not been able to get ipsec-encapsulated ipip/gre tunneling to
> work properly (without MTU issues) so far, and never had the time to
> properly dig into it.  If this is going to be fixed, that would be
> great.

Well please give it a run and let me know if it's still broken :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-15  9:19                   ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu
  2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
@ 2005-03-15 18:18                     ` David S. Miller
  2005-03-16 11:31                     ` Herbert Xu
  2 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-15 18:18 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Tue, 15 Mar 2005 20:19:04 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This patch fixes the IPsec overhead handling in ip_append_data and
> ip6_append_data.  As it is they assume that the IPsec overhead is
> constant.  This is not true as with ESP the IPsec overhead will vary
> as the MTU varies.
> 
> The result is that they may produce packets that will exceed the MTU
> when ESP is used.  Had it taken the trailer_len into account, it would
> have produced packets less than the real MTU.
> 
> By switching to dst_mtu we get the optimal result.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Looks nice.  Applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [16/*] [INET] Take IPsec overhead into account in tunnels
  2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
  2005-03-15 10:05                       ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
  2005-03-15 10:20                       ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek
@ 2005-03-15 18:20                       ` David S. Miller
  2005-03-18  9:03                       ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu
  3 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-15 18:20 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Tue, 15 Mar 2005 20:58:37 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This patch uses dst_mtu instead of dst_pmtu in the various tunnel
> implementations.  As it is they simply ignore the IPsec overhead.
> This leads to bogus MTU values inside the tunnels.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

> BTW, we're doing lazy MTU updates in the tunnel xmit functions.
> When a packet with DF set hits us and exceeds the updated MTU,
> we will send an ICMP packet back which is good.
> 
> Unfortunately when a packet with DF clear hits us as we update
> the MTU downwards, the packet will be silently discarded instead
> of fragmented (well we will send an ICMP back to ourselves but
> we already knew that MTU value :).
> 
> I presume we want to fix this, right?

I think so, although it could be argued that in the end it really
doesn't matter.  The final argument though is that quality of
implementation says we shouldn't drop the frame for such a reason.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [17/*] [NET] Replace dst_pmtu with dst_mtu
  2005-03-15 10:05                       ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
@ 2005-03-15 18:24                         ` David S. Miller
  2005-03-15 19:02                           ` Patrick McHardy
  2005-03-15 20:31                           ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
  0 siblings, 2 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-15 18:24 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Tue, 15 Mar 2005 21:05:22 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This patch replaces most of the other uses of dst_pmtu with dst_mtu.
> As far as I can tell these are either identical because dst->path == dst,
> or they're a straightforward replacement of (the slightly incorrect)
> dst_pmtu(dst) - dst->header_Len with dst_mtu(dst).
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

> At this point the only remaining user of dst_pmtu is ipt_REJECT which
> will go away either when we use icmp_send in there or when I replace
> it with dst_mtu.

Yes, I saw that posting on netfilter-devel where you asked Rusty
why icmp_send() wasn't directly used even though it appears it
should be.

> We can now remove the other users of dst->path as well with the removal
> of that attribute itself as the goal.

Ok, sounds great.

So, at that point, I guess the next task is to handle PMTU events of already
encrypted packets properly?  We still have that problem right?  When the
ICMP payload is encrypted we have to cache some information on IPSEC output
so that a proto+SPI key can find us the encrypted inner IP header info, which
we'll need in order to process (and/or forward) the ICMP PMTU information
correctly.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [12/*] [IPSEC] Handle local_df in IPv4
  2005-03-15  5:25               ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller
@ 2005-03-15 18:25                 ` YOSHIFUJI Hideaki / 吉藤英明
  2005-03-15 18:28                   ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 1 reply; 114+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-15 18:25 UTC (permalink / raw)
  To: davem, herbert; +Cc: kuznet, kaber, netdev, yoshfuji

In article <20050314212557.23f0ab09.davem@davemloft.net> (at Mon, 14 Mar 2005 21:25:57 -0800), "David S. Miller" <davem@davemloft.net> says:

> > I was going to do the same thing to IPv6.  Unfortunately it seems
> > that we don't have any local_df support over there.  That is, we
> > always fragment outbound UDP/raw packets.  Did I miss something?
> 
> I have no idea offhand.  Perhaps Yoshifuji can figure it out?

Currently, yes.
And, I know this is one of places we may need to revisit...

--yoshfuji

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [12/*] [IPSEC] Handle local_df in IPv4
  2005-03-15 18:25                 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2005-03-15 18:28                   ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 0 replies; 114+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-15 18:28 UTC (permalink / raw)
  To: davem, herbert; +Cc: kuznet, kaber, netdev, yoshfuji

In article <20050316.032537.72822422.yoshfuji@linux-ipv6.org> (at Wed, 16 Mar 2005 03:25:37 +0900 (JST)), YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org> says:

> > > that we don't have any local_df support over there.  That is, we
> > > always fragment outbound UDP/raw packets.  Did I miss something?
> > 
> > I have no idea offhand.  Perhaps Yoshifuji can figure it out?
> 
> Currently, yes.
I mean: we always fragment outbount UDP/raw packets.

--yoshfuji

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [17/*] [NET] Replace dst_pmtu with dst_mtu
  2005-03-15 18:24                         ` David S. Miller
@ 2005-03-15 19:02                           ` Patrick McHardy
  2005-03-15 20:40                             ` Replace send_unreach with icmp_send Herbert Xu
  2005-03-15 20:31                           ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
  1 sibling, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-15 19:02 UTC (permalink / raw)
  To: David S. Miller
  Cc: kuznet, Netfilter Development Mailinglist, Herbert Xu, netdev

David S. Miller wrote:
> On Tue, 15 Mar 2005 21:05:22 +1100
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
>>At this point the only remaining user of dst_pmtu is ipt_REJECT which
>>will go away either when we use icmp_send in there or when I replace
>>it with dst_mtu.
> 
> 
> Yes, I saw that posting on netfilter-devel where you asked Rusty
> why icmp_send() wasn't directly used even though it appears it
> should be.

I would prefer to keep it seperately. I have planned to remove xrlim
from ipt_REJECT so it behaves similar for TCP and ICMP. Limits should
then be handled by the limit match. This can't be done if we switch to
icmp_send().

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [3/4] [IPSEC] Add route element to xfrm_dst
  2005-03-14 20:32       ` Herbert Xu
@ 2005-03-15 19:05         ` Patrick McHardy
  0 siblings, 0 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-03-15 19:05 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev

Herbert Xu wrote:
> On Mon, Mar 14, 2005 at 12:52:16PM +0100, Patrick McHardy wrote:
>
>>Since the tunnel dst is not necessarily the last in the bundle anymore,
>>we might miss to initialize some dsts, for example with ipcomp/tunnel +
>>esp/transport. If we have nested tunnels we'll fiddle with entries in
>>the routing cache.
> 
> 
> Sorry, but I don't get it :) First of all what do you mean by the
> tunnel dst?
> 
> If you mean &rt->u.dst then as far as I can see it's still the last
> child in the bundle.  It may also appear in ->route elements earlier
> on but that does not come into play in this loop.

You're right, I must have misread the code somehow. Sorry for the noise.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [17/*] [NET] Replace dst_pmtu with dst_mtu
  2005-03-15 18:24                         ` David S. Miller
  2005-03-15 19:02                           ` Patrick McHardy
@ 2005-03-15 20:31                           ` Herbert Xu
  1 sibling, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-15 20:31 UTC (permalink / raw)
  To: David S. Miller; +Cc: kuznet, yoshfuji, kaber, netdev

On Tue, Mar 15, 2005 at 10:24:50AM -0800, David S. Miller wrote:
> 
> So, at that point, I guess the next task is to handle PMTU events of already
> encrypted packets properly?  We still have that problem right?  When the
> ICMP payload is encrypted we have to cache some information on IPSEC output
> so that a proto+SPI key can find us the encrypted inner IP header info, which
> we'll need in order to process (and/or forward) the ICMP PMTU information
> correctly.

Yes, that's going to be the second part of the MTU work.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Replace send_unreach with icmp_send
  2005-03-15 19:02                           ` Patrick McHardy
@ 2005-03-15 20:40                             ` Herbert Xu
  2005-03-15 20:48                               ` Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-15 20:40 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, kuznet, yoshfuji, netdev

On Tue, Mar 15, 2005 at 08:02:26PM +0100, Patrick McHardy wrote:
> 
> I would prefer to keep it seperately. I have planned to remove xrlim
> from ipt_REJECT so it behaves similar for TCP and ICMP. Limits should
> then be handled by the limit match. This can't be done if we switch to
> icmp_send().

Well it isn't terribly difficult to create a new version of icmp_send
that does xrlim conditionally.  icmp_send/ipt_REJECT can then call that
function.

The main reason I'm looking at getting rid of send_unreach is because
having two implementations of the same code often leads to bugs.  In
fact, as it is there are multiple IPsec-related bugs in the ipt_REJECT
code.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Replace send_unreach with icmp_send
  2005-03-15 20:40                             ` Replace send_unreach with icmp_send Herbert Xu
@ 2005-03-15 20:48                               ` Patrick McHardy
  2005-03-16 10:51                                 ` [IPV4] Make ipt_REJECT use icmp_send again Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-15 20:48 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, kuznet, yoshfuji, netdev

Herbert Xu wrote:
> Well it isn't terribly difficult to create a new version of icmp_send
> that does xrlim conditionally.  icmp_send/ipt_REJECT can then call that
> function.
> 
> The main reason I'm looking at getting rid of send_unreach is because
> having two implementations of the same code often leads to bugs.  In
> fact, as it is there are multiple IPsec-related bugs in the ipt_REJECT
> code.

Ok. I can't see any different reason to keep it, so go ahead. I'll take
care of the xrlim stuff later.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [IPV4] Make ipt_REJECT use icmp_send again
  2005-03-15 20:48                               ` Patrick McHardy
@ 2005-03-16 10:51                                 ` Herbert Xu
  2005-03-16 19:00                                   ` Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-16 10:51 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, kuznet, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 1164 bytes --]

On Tue, Mar 15, 2005 at 09:48:53PM +0100, Patrick McHardy wrote:
> 
> Ok. I can't see any different reason to keep it, so go ahead. I'll take
> care of the xrlim stuff later.

Great.  Here is the patch that makes ipt_REJECT use icmp_send again.
We've gone full circle :)

As it is ipt_REJECT doesn't work at all with IPsec.  Despite my efforts
previously in making the policy lookups work there I neglected to change
the final call to dst_output so the policy lookup is useless.

ipt_REJECT also had a number of deviations from icmp_send which seems to
be unjustified.  For examples it ignored source routing IP options.

There was a bug in icmp_send too :) It didn't set the ICMP type/code
values for the policy lookup.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

I compared the two functions line-by-line to make sure that there
weren't any subtle differences.  Please double check this in case
I overlooked something important.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-18 --]
[-- Type: text/plain, Size: 5158 bytes --]

===== net/ipv4/icmp.c 1.50 vs edited =====
--- 1.50/net/ipv4/icmp.c	2005-03-15 16:26:11 +11:00
+++ edited/net/ipv4/icmp.c	2005-03-16 21:20:41 +11:00
@@ -518,14 +518,6 @@
 					   IPTOS_PREC_INTERNETCONTROL) :
 					  iph->tos;
 
-	{
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = iph->saddr,
-							 .saddr = saddr,
-							 .tos = RT_TOS(tos) } },
-				    .proto = IPPROTO_ICMP };
-		if (ip_route_output_key(&rt, &fl))
-		    goto out_unlock;
-	}
 	if (ip_options_echo(&icmp_param.replyopts, skb_in))
 		goto ende;
 
@@ -544,13 +536,26 @@
 	inet_sk(icmp_socket->sk)->tos = tos;
 	ipc.addr = iph->saddr;
 	ipc.opt = &icmp_param.replyopts;
-	if (icmp_param.replyopts.srr) {
-		struct flowi fl = { .nl_u = { .ip4_u =
-					      { .daddr = icmp_param.replyopts.faddr,
-						.saddr = saddr,
-						.tos = RT_TOS(tos) } },
-				    .proto = IPPROTO_ICMP };
-		ip_rt_put(rt);
+
+	{
+		struct flowi fl = {
+			.nl_u = {
+				.ip4_u = {
+					.daddr = icmp_param.replyopts.srr ?
+						icmp_param.replyopts.faddr :
+						iph->saddr,
+					.saddr = saddr,
+					.tos = RT_TOS(tos)
+				}
+			},
+			.proto = IPPROTO_ICMP,
+			.uli_u = {
+				.icmpt = {
+					.type = type,
+					.code = code
+				}
+			}
+		};
 		if (ip_route_output_key(&rt, &fl))
 			goto out_unlock;
 	}
===== net/ipv4/netfilter/ipt_REJECT.c 1.35 vs edited =====
--- 1.35/net/ipv4/netfilter/ipt_REJECT.c	2005-03-04 09:20:32 +11:00
+++ edited/net/ipv4/netfilter/ipt_REJECT.c	2005-03-16 21:19:20 +11:00
@@ -220,146 +220,9 @@
 	kfree_skb(nskb);
 }
 
-static void send_unreach(struct sk_buff *skb_in, int code)
+static inline void send_unreach(struct sk_buff *skb_in, int code)
 {
-	struct iphdr *iph;
-	struct icmphdr *icmph;
-	struct sk_buff *nskb;
-	u32 saddr;
-	u8 tos;
-	int hh_len, length;
-	struct rtable *rt = (struct rtable*)skb_in->dst;
-	unsigned char *data;
-
-	if (!rt)
-		return;
-
-	/* FIXME: Use sysctl number. --RR */
-	if (!xrlim_allow(&rt->u.dst, 1*HZ))
-		return;
-
-	iph = skb_in->nh.iph;
-
-	/* No replies to physical multicast/broadcast */
-	if (skb_in->pkt_type!=PACKET_HOST)
-		return;
-
-	/* Now check at the protocol level */
-	if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
-		return;
-
-	/* Only reply to fragment 0. */
-	if (iph->frag_off&htons(IP_OFFSET))
-		return;
-
-	/* If we send an ICMP error to an ICMP error a mess would result.. */
-	if (iph->protocol == IPPROTO_ICMP) {
-		struct icmphdr ihdr;
-
-		icmph = skb_header_pointer(skb_in, skb_in->nh.iph->ihl*4,
-					   sizeof(ihdr), &ihdr);
-		if (!icmph)
-			return;
-
-		/* Between echo-reply (0) and timestamp (13),
-		   everything except echo-request (8) is an error.
-		   Also, anything greater than NR_ICMP_TYPES is
-		   unknown, and hence should be treated as an error... */
-		if ((icmph->type < ICMP_TIMESTAMP
-		     && icmph->type != ICMP_ECHOREPLY
-		     && icmph->type != ICMP_ECHO)
-		    || icmph->type > NR_ICMP_TYPES)
-			return;
-	}
-
-	saddr = iph->daddr;
-	if (!(rt->rt_flags & RTCF_LOCAL))
-		saddr = 0;
-
-	tos = (iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL;
-
-	{
-		struct flowi fl = {
-			.nl_u = {
-				.ip4_u = {
-					.daddr = skb_in->nh.iph->saddr,
-					.saddr = saddr,
-					.tos = RT_TOS(tos)
-				}
-			},
-			.proto = IPPROTO_ICMP,
-			.uli_u = {
-				.icmpt = {
-					.type = ICMP_DEST_UNREACH,
-					.code = code
-				}
-			}
-		};
-
-		if (ip_route_output_key(&rt, &fl))
-			return;
-	}
-	/* RFC says return as much as we can without exceeding 576 bytes. */
-	length = skb_in->len + sizeof(struct iphdr) + sizeof(struct icmphdr);
-
-	if (length > dst_pmtu(&rt->u.dst))
-		length = dst_pmtu(&rt->u.dst);
-	if (length > 576)
-		length = 576;
-
-	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
-
-	nskb = alloc_skb(hh_len + length, GFP_ATOMIC);
-	if (!nskb) {
-		ip_rt_put(rt);
-		return;
-	}
-
-	nskb->priority = 0;
-	nskb->dst = &rt->u.dst;
-	skb_reserve(nskb, hh_len);
-
-	/* Set up IP header */
-	iph = nskb->nh.iph
-		= (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
-	iph->version=4;
-	iph->ihl=5;
-	iph->tos=tos;
-	iph->tot_len = htons(length);
-
-	/* PMTU discovery never applies to ICMP packets. */
-	iph->frag_off = 0;
-
-	iph->ttl = MAXTTL;
-	ip_select_ident(iph, &rt->u.dst, NULL);
-	iph->protocol=IPPROTO_ICMP;
-	iph->saddr=rt->rt_src;
-	iph->daddr=rt->rt_dst;
-	iph->check=0;
-	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-
-	/* Set up ICMP header. */
-	icmph = nskb->h.icmph
-		= (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
-	icmph->type = ICMP_DEST_UNREACH;
-	icmph->code = code;	
-	icmph->un.gateway = 0;
-	icmph->checksum = 0;
-	
-	/* Copy as much of original packet as will fit */
-	data = skb_put(nskb,
-		       length - sizeof(struct iphdr) - sizeof(struct icmphdr));
-
-	skb_copy_bits(skb_in, 0, data,
-		      length - sizeof(struct iphdr) - sizeof(struct icmphdr));
-
-	icmph->checksum = ip_compute_csum((unsigned char *)icmph,
-					  length - sizeof(struct iphdr));
-
-	nf_ct_attach(nskb, skb_in);
-
-	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
-		ip_finish_output);
+	icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
 }	
 
 static unsigned int reject(struct sk_buff **pskb,

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-15  9:19                   ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu
  2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
  2005-03-15 18:18                     ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data David S. Miller
@ 2005-03-16 11:31                     ` Herbert Xu
  2005-03-16 22:02                       ` David S. Miller
  2005-03-21 16:14                       ` Mika Penttilä
  2 siblings, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-16 11:31 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

Hi Dave:

On Tue, Mar 15, 2005 at 08:19:04PM +1100, herbert wrote:
> 
> This patch fixes the IPsec overhead handling in ip_append_data and
> ip6_append_data.  As it is they assume that the IPsec overhead is
> constant.  This is not true as with ESP the IPsec overhead will vary
> as the MTU varies.

This patch is wrong.  This is the *one* place where we do need to
use the path MTU.  The reason is that when the packet is fragmented
we only pay for the IPsec overhead once over all and not once for
each fragment.

Please revert it for now.

The trailer_len in ip_append_data is not quite right as the trailer's
length depends on the length of the entire packet.  However, it should
be harmless since ESP knows how to extend the packet when necessary.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPV4] Make ipt_REJECT use icmp_send again
  2005-03-16 10:51                                 ` [IPV4] Make ipt_REJECT use icmp_send again Herbert Xu
@ 2005-03-16 19:00                                   ` Patrick McHardy
  2005-03-16 22:44                                     ` David S. Miller
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-16 19:00 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, kuznet, yoshfuji, netdev

Herbert Xu wrote:
> I compared the two functions line-by-line to make sure that there
> weren't any subtle differences.  Please double check this in case
> I overlooked something important.

I double checked, all looks good to me.

Signed-off-by: Patrick McHardy <kaber@trash.net>

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-16 11:31                     ` Herbert Xu
@ 2005-03-16 22:02                       ` David S. Miller
  2005-03-21 16:14                       ` Mika Penttilä
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-16 22:02 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Wed, 16 Mar 2005 22:31:49 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This patch is wrong.  This is the *one* place where we do need to
> use the path MTU.  The reason is that when the packet is fragmented
> we only pay for the IPsec overhead once over all and not once for
> each fragment.
> 
> Please revert it for now.

Ok, done.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPV4] Make ipt_REJECT use icmp_send again
  2005-03-16 19:00                                   ` Patrick McHardy
@ 2005-03-16 22:44                                     ` David S. Miller
  2005-03-17 10:51                                       ` [IPV4] Send TCP reset through dst_output in ipt_REJECT Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-16 22:44 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: herbert, kuznet, yoshfuji, netdev

On Wed, 16 Mar 2005 20:00:51 +0100
Patrick McHardy <kaber@trash.net> wrote:

> Herbert Xu wrote:
> > I compared the two functions line-by-line to make sure that there
> > weren't any subtle differences.  Please double check this in case
> > I overlooked something important.
> 
> I double checked, all looks good to me.
> 
> Signed-off-by: Patrick McHardy <kaber@trash.net>

Applied, thanks guys.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [IPV4] Send TCP reset through dst_output in ipt_REJECT
  2005-03-16 22:44                                     ` David S. Miller
@ 2005-03-17 10:51                                       ` Herbert Xu
  2005-03-17 18:06                                         ` David S. Miller
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-17 10:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: Patrick McHardy, kuznet, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 462 bytes --]

Hi Dave:

I noticed that the TCP reset code in ipt_REJECT didn't call dst_output
either so it also bypasses IPsec processing.  Here is a patch to fix
that and use the correct MTU value.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-20 --]
[-- Type: text/plain, Size: 531 bytes --]

===== net/ipv4/netfilter/ipt_REJECT.c 1.36 vs edited =====
--- 1.36/net/ipv4/netfilter/ipt_REJECT.c	2005-03-17 09:44:14 +11:00
+++ edited/net/ipv4/netfilter/ipt_REJECT.c	2005-03-17 14:18:20 +11:00
@@ -207,13 +207,13 @@
 					   nskb->nh.iph->ihl);
 
 	/* "Never happens" */
-	if (nskb->len > dst_pmtu(nskb->dst))
+	if (nskb->len > dst_mtu(nskb->dst))
 		goto free_nskb;
 
 	nf_ct_attach(nskb, oldskb);
 
 	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
-		ip_finish_output);
+		dst_output);
 	return;
 
  free_nskb:

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPV4] Send TCP reset through dst_output in ipt_REJECT
  2005-03-17 10:51                                       ` [IPV4] Send TCP reset through dst_output in ipt_REJECT Herbert Xu
@ 2005-03-17 18:06                                         ` David S. Miller
  0 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-17 18:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kaber, kuznet, yoshfuji, netdev

On Thu, 17 Mar 2005 21:51:45 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> I noticed that the TCP reset code in ipt_REJECT didn't call dst_output
> either so it also bypasses IPsec processing.  Here is a patch to fix
> that and use the correct MTU value.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks a lot Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit
  2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
                                         ` (2 preceding siblings ...)
  2005-03-15 18:20                       ` David S. Miller
@ 2005-03-18  9:03                       ` Herbert Xu
  2005-03-18  9:11                         ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
  2005-03-18 18:39                         ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit David S. Miller
  3 siblings, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-18  9:03 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 706 bytes --]

Hi Dave:

Somehow I missed four files with dst_pmtu usages in them.  I'm going
to split them along the sames lines I did before: bug fixes and then
the trivial changes.

Here is a patch that replaces dst_pmtu with dst_pmtu in ipmr.c
since this is straight IPIP tunneling equivalent to what we have
in ipip.c.

As it is we may send ICMP packets when IPsec is present which is
exactly what the comment says that we shouldn't do.
 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
  
Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-21 --]
[-- Type: text/plain, Size: 490 bytes --]

===== net/ipv4/ipmr.c 1.46 vs edited =====
--- 1.46/net/ipv4/ipmr.c	2005-01-14 15:41:05 +11:00
+++ edited/net/ipv4/ipmr.c	2005-03-18 19:56:33 +11:00
@@ -1171,7 +1171,7 @@
 
 	dev = rt->u.dst.dev;
 
-	if (skb->len+encap > dst_pmtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
 		/* Do not fragment multicasts. Alas, IPv4 does not
 		   allow to send ICMP, so that packets will disappear
 		   to blackhole.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-18  9:03                       ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu
@ 2005-03-18  9:11                         ` Herbert Xu
  2005-03-18  9:19                           ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu
  2005-03-18 18:40                           ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
  2005-03-18 18:39                         ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit David S. Miller
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-18  9:11 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 858 bytes --]

Hi Dave:

This patch makes ipt_TCPMSS use the correct MTU value for clamping.
This is a bit tricky actually since TCPMSS can be used in FORWARD,
LOCAL_OUT as well as POST_ROUTING.

In the first two cases we haven't performed IPsec yet so dst_mtu
obviously does the right thing.  As it is, POST_ROUTING is performed
after xfrm_output so MSS clamping is useless there.

With Patrick's IPsec netfilter stuff, there will be a POST_ROUTING
processing before IPsec processing, in which case dst_mtu also returns
exactly what we want.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

BTW Patrick, how is the IPsec netfilter stuff going?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-22 --]
[-- Type: text/plain, Size: 924 bytes --]

===== net/ipv4/netfilter/ipt_TCPMSS.c 1.13 vs edited =====
--- 1.13/net/ipv4/netfilter/ipt_TCPMSS.c	2004-06-04 15:04:05 +10:00
+++ edited/net/ipv4/netfilter/ipt_TCPMSS.c	2005-03-18 19:56:34 +11:00
@@ -87,14 +87,14 @@
 			return NF_DROP; /* or IPT_CONTINUE ?? */
 		}
 
-		if(dst_pmtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
+		if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
 			if (net_ratelimit())
 				printk(KERN_ERR
-		       			"ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_pmtu((*pskb)->dst));
+		       			"ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst));
 			return NF_DROP; /* or IPT_CONTINUE ?? */
 		}
 
-		newmss = dst_pmtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
+		newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
 	} else
 		newmss = tcpmssinfo->mss;
 

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu
  2005-03-18  9:11                         ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
@ 2005-03-18  9:19                           ` Herbert Xu
  2005-03-18 10:07                             ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu
  2005-03-18 18:41                             ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu David S. Miller
  2005-03-18 18:40                           ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-18  9:19 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 1004 bytes --]

Hi Dave:

Once again here is a couple of trivial dst_pmtu/dst_mtu replacements.
In both locations, we can only have simple dst entries which means
that dst == dst->path.

BTW, this is the rule that we should apply in future for uses of
dst_mtu/dst_pmtu (or other metrics on dst).  If the only dst's that
can appear are simple dst's (dst == dst->path), then we should use
dst_mtu or dst_metric.  If dst != dst->path, then whoever is writing
the code will need to think about which of dst or dst->path is the
right one.

In most instances dst will be the one.  However, as we have seen in
ip_append_data, dst->path may be needed rarely.  In particular, if
we're doing fragmentation immediately after IPsec, then you may need
it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 
Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-23 --]
[-- Type: text/plain, Size: 2165 bytes --]

===== net/ipv4/ipvs/ip_vs_xmit.c 1.12 vs edited =====
--- 1.12/net/ipv4/ipvs/ip_vs_xmit.c	2005-03-11 13:17:32 +11:00
+++ edited/net/ipv4/ipvs/ip_vs_xmit.c	2005-03-18 19:56:34 +11:00
@@ -178,7 +178,7 @@
 	}
 
 	/* MTU checking */
-	mtu = dst_pmtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->u.dst);
 	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
 		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
@@ -245,7 +245,7 @@
 		goto tx_error_icmp;
 
 	/* MTU checking */
-	mtu = dst_pmtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->u.dst);
 	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
 		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
@@ -342,7 +342,7 @@
 
 	tdev = rt->u.dst.dev;
 
-	mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
+	mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
 	if (mtu < 68) {
 		ip_rt_put(rt);
 		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
@@ -445,7 +445,7 @@
 		goto tx_error_icmp;
 
 	/* MTU checking */
-	mtu = dst_pmtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->u.dst);
 	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 		ip_rt_put(rt);
@@ -520,7 +520,7 @@
 		goto tx_error_icmp;
 
 	/* MTU checking */
-	mtu = dst_pmtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->u.dst);
 	if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
 		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
===== net/ipv4/netfilter/ip_conntrack_standalone.c 1.66 vs edited =====
--- 1.66/net/ipv4/netfilter/ip_conntrack_standalone.c	2005-03-04 09:17:07 +11:00
+++ edited/net/ipv4/netfilter/ip_conntrack_standalone.c	2005-03-18 19:56:34 +11:00
@@ -457,7 +457,7 @@
 	/* Local packets are never produced too large for their
 	   interface.  We degfragment them at LOCAL_OUT, however,
 	   so we have to refragment them here. */
-	if ((*pskb)->len > dst_pmtu(&rt->u.dst) &&
+	if ((*pskb)->len > dst_mtu(&rt->u.dst) &&
 	    !skb_shinfo(*pskb)->tso_size) {
 		/* No hook can be after us, so this should be OK. */
 		ip_fragment(*pskb, okfn);

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [24/*] [IPSEC] Get ttl from child instead of path
  2005-03-18  9:19                           ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu
@ 2005-03-18 10:07                             ` Herbert Xu
  2005-03-18 10:11                               ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu
  2005-03-18 18:43                               ` [24/*] [IPSEC] Get ttl from child instead of path David S. Miller
  2005-03-18 18:41                             ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu David S. Miller
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-18 10:07 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 696 bytes --]

Hi Dave:

Now that dst_pmtu is almost gone let's do the same to dst_path_metric.
I've only found one legitimate use of it and that's the one that was
recently added in dst_allfrag.

This patch makes xfrm4_encap/xfrm6_encap use dst->child instead of
dst->path so that we choose the correct route to get the hoplimit
from when nested tunnels are present.

For simple tunnels dst->child == dst->path so there is no change.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-24 --]
[-- Type: text/plain, Size: 1006 bytes --]

===== net/ipv4/xfrm4_output.c 1.9 vs edited =====
--- 1.9/net/ipv4/xfrm4_output.c	2005-03-15 16:38:41 +11:00
+++ edited/net/ipv4/xfrm4_output.c	2005-03-18 21:02:33 +11:00
@@ -58,7 +58,7 @@
 	if (!top_iph->frag_off)
 		__ip_select_ident(top_iph, dst, 0);
 
-	top_iph->ttl = dst_path_metric(dst, RTAX_HOPLIMIT);
+	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
 
 	top_iph->saddr = x->props.saddr.a4;
 	top_iph->daddr = x->id.daddr.a4;
===== net/ipv6/xfrm6_output.c 1.10 vs edited =====
--- 1.10/net/ipv6/xfrm6_output.c	2005-03-15 16:38:41 +11:00
+++ edited/net/ipv6/xfrm6_output.c	2005-03-18 21:02:46 +11:00
@@ -69,7 +69,7 @@
 		dsfield &= ~INET_ECN_MASK;
 	ipv6_change_dsfield(top_iph, 0, dsfield);
 	top_iph->nexthdr = IPPROTO_IPV6; 
-	top_iph->hop_limit = dst_path_metric(dst, RTAX_HOPLIMIT);
+	top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
 	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
 	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
 }

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [25/*] [NET] Kill unnecessary uses of dst_path_metric
  2005-03-18 10:07                             ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu
@ 2005-03-18 10:11                               ` Herbert Xu
  2005-03-18 11:06                                 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu
  2005-03-18 18:44                                 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric David S. Miller
  2005-03-18 18:43                               ` [24/*] [IPSEC] Get ttl from child instead of path David S. Miller
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-18 10:11 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 394 bytes --]

Hi Dave:

This gets rid of the last unnecessary use of dst_path_metric.  In
decnet dst->path is always equal to dst.
 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-25 --]
[-- Type: text/plain, Size: 786 bytes --]

===== net/decnet/af_decnet.c 1.51 vs edited =====
--- 1.51/net/decnet/af_decnet.c	2005-03-16 09:50:25 +11:00
+++ edited/net/decnet/af_decnet.c	2005-03-18 21:01:24 +11:00
@@ -811,7 +811,7 @@
 		return -EINVAL;
 
 	scp->state = DN_CC;
-	scp->segsize_loc = dst_path_metric(__sk_dst_get(sk), RTAX_ADVMSS);
+	scp->segsize_loc = dst_metric(__sk_dst_get(sk), RTAX_ADVMSS);
 	dn_send_conn_conf(sk, allocation);
 
 	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
@@ -940,7 +940,7 @@
 	sk->sk_route_caps = sk->sk_dst_cache->dev->features;
 	sock->state = SS_CONNECTING;
 	scp->state = DN_CI;
-	scp->segsize_loc = dst_path_metric(sk->sk_dst_cache, RTAX_ADVMSS);
+	scp->segsize_loc = dst_metric(sk->sk_dst_cache, RTAX_ADVMSS);
 
 	dn_nsp_send_conninit(sk, NSP_CI);
 	err = -EINPROGRESS;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [26/*] [NET] Kill dst_pmtu/dst_path_metric
  2005-03-18 10:11                               ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu
@ 2005-03-18 11:06                                 ` Herbert Xu
  2005-03-18 11:28                                   ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu
  2005-03-18 18:46                                   ` [26/*] [NET] Kill dst_pmtu/dst_path_metric David S. Miller
  2005-03-18 18:44                                 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric David S. Miller
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-18 11:06 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 623 bytes --]

Hi Dave:

This would have been the patch that removed dst->path.  But
ip_append_data got in the way :)

However, using dst->path is only needed rarely and should always
require a bit of deliberation.  So let's get rid of dst_pmtu
and dst_path_metric and use dst_mtu and dst_metric directly.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

BTW, shouldn't dst_allfrag be called dst_path_allfrag?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-26 --]
[-- Type: text/plain, Size: 1819 bytes --]

===== include/net/dst.h 1.29 vs edited =====
--- 1.29/include/net/dst.h	2005-03-11 13:17:32 +11:00
+++ edited/include/net/dst.h	2005-03-18 21:08:56 +11:00
@@ -109,21 +109,6 @@
 	return dst->metrics[metric-1];
 }
 
-static inline u32
-dst_path_metric(const struct dst_entry *dst, int metric)
-{
-	return dst->path->metrics[metric-1];
-}
-
-static inline u32
-dst_pmtu(const struct dst_entry *dst)
-{
-	u32 mtu = dst_path_metric(dst, RTAX_MTU);
-	/* Yes, _exactly_. This is paranoia. */
-	barrier();
-	return mtu;
-}
-
 static inline u32 dst_mtu(const struct dst_entry *dst)
 {
 	u32 mtu = dst_metric(dst, RTAX_MTU);
@@ -137,7 +122,7 @@
 static inline u32
 dst_allfrag(const struct dst_entry *dst)
 {
-	int ret = dst_path_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
+	int ret = dst_metric(dst->path, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
 	/* Yes, _exactly_. This is paranoia. */
 	barrier();
 	return ret;
===== net/ipv4/ip_output.c 1.79 vs edited =====
--- 1.79/net/ipv4/ip_output.c	2005-03-17 09:02:36 +11:00
+++ edited/net/ipv4/ip_output.c	2005-03-18 21:00:08 +11:00
@@ -746,7 +746,7 @@
 			inet->cork.addr = ipc->addr;
 		}
 		dst_hold(&rt->u.dst);
-		inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
+		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 		inet->cork.rt = rt;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
===== net/ipv6/ip6_output.c 1.89 vs edited =====
--- 1.89/net/ipv6/ip6_output.c	2005-03-17 09:02:36 +11:00
+++ edited/net/ipv6/ip6_output.c	2005-03-18 21:00:08 +11:00
@@ -850,7 +850,7 @@
 		np->cork.rt = rt;
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
-		inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
+		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
 		if (dst_allfrag(&rt->u.dst))
 			inet->cork.flags |= IPCORK_ALLFRAG;
 		inet->cork.length = 0;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [27/*] [NET] Make dst_allfrag use dst instead of dst->path
  2005-03-18 11:06                                 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu
@ 2005-03-18 11:28                                   ` Herbert Xu
  2005-03-18 18:47                                     ` David S. Miller
  2005-03-18 18:46                                   ` [26/*] [NET] Kill dst_pmtu/dst_path_metric David S. Miller
  1 sibling, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-18 11:28 UTC (permalink / raw)
  To: David S. Miller
  Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev

[-- Attachment #1: Type: text/plain, Size: 617 bytes --]

On Fri, Mar 18, 2005 at 10:06:11PM +1100, herbert wrote:
> 
> BTW, shouldn't dst_allfrag be called dst_path_allfrag?

Rather than doing that, let's make the path usage explicit in
the one place that it's needed (the same place where we use
dst_mtu(dst->path)).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Hmm, it still doesn't feel right to have an IPv6-specific helper
in net/dst.h.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: xfrm-27 --]
[-- Type: text/plain, Size: 879 bytes --]

--- linux-2.6/include/net/dst.h.orig	2005-03-18 22:20:39.000000000 +1100
+++ linux-2.6/include/net/dst.h	2005-03-18 22:20:46.000000000 +1100
@@ -122,7 +122,7 @@
 static inline u32
 dst_allfrag(const struct dst_entry *dst)
 {
-	int ret = dst_metric(dst->path, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
+	int ret = dst_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
 	/* Yes, _exactly_. This is paranoia. */
 	barrier();
 	return ret;
--- linux-2.6/net/ipv6/ip6_output.c.orig	2005-03-18 22:21:01.000000000 +1100
+++ linux-2.6/net/ipv6/ip6_output.c	2005-03-18 22:23:52.000000000 +1100
@@ -851,7 +851,7 @@
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
-		if (dst_allfrag(&rt->u.dst))
+		if (dst_allfrag(rt->u.dst.path))
 			inet->cork.flags |= IPCORK_ALLFRAG;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit
  2005-03-18  9:03                       ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu
  2005-03-18  9:11                         ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
@ 2005-03-18 18:39                         ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-18 18:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Fri, 18 Mar 2005 20:03:10 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Somehow I missed four files with dst_pmtu usages in them.  I'm going
> to split them along the sames lines I did before: bug fixes and then
> the trivial changes.

Must be some bug in your grep binary... kidding ;-)

> Here is a patch that replaces dst_pmtu with dst_pmtu in ipmr.c
> since this is straight IPIP tunneling equivalent to what we have
> in ipip.c.
> 
> As it is we may send ICMP packets when IPsec is present which is
> exactly what the comment says that we shouldn't do.
>  
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-18  9:11                         ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
  2005-03-18  9:19                           ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu
@ 2005-03-18 18:40                           ` David S. Miller
  2005-03-20 15:46                             ` Patrick McHardy
  1 sibling, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-18 18:40 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Fri, 18 Mar 2005 20:11:29 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This patch makes ipt_TCPMSS use the correct MTU value for clamping.
> This is a bit tricky actually since TCPMSS can be used in FORWARD,
> LOCAL_OUT as well as POST_ROUTING.
> 
> In the first two cases we haven't performed IPsec yet so dst_mtu
> obviously does the right thing.  As it is, POST_ROUTING is performed
> after xfrm_output so MSS clamping is useless there.
> 
> With Patrick's IPsec netfilter stuff, there will be a POST_ROUTING
> processing before IPsec processing, in which case dst_mtu also returns
> exactly what we want.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks Herbert.

> BTW Patrick, how is the IPsec netfilter stuff going?

That boy is seriously backlogged, so I'm not sure how much time
he's gotten to work on that yet.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu
  2005-03-18  9:19                           ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu
  2005-03-18 10:07                             ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu
@ 2005-03-18 18:41                             ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-18 18:41 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Fri, 18 Mar 2005 20:19:18 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Once again here is a couple of trivial dst_pmtu/dst_mtu replacements.
> In both locations, we can only have simple dst entries which means
> that dst == dst->path.
> 
> BTW, this is the rule that we should apply in future for uses of
> dst_mtu/dst_pmtu (or other metrics on dst).  If the only dst's that
> can appear are simple dst's (dst == dst->path), then we should use
> dst_mtu or dst_metric.  If dst != dst->path, then whoever is writing
> the code will need to think about which of dst or dst->path is the
> right one.
> 
> In most instances dst will be the one.  However, as we have seen in
> ip_append_data, dst->path may be needed rarely.  In particular, if
> we're doing fragmentation immediately after IPsec, then you may need
> it.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, and thanks for the nice succinct version of the rule :-)

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [24/*] [IPSEC] Get ttl from child instead of path
  2005-03-18 10:07                             ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu
  2005-03-18 10:11                               ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu
@ 2005-03-18 18:43                               ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-18 18:43 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Fri, 18 Mar 2005 21:07:57 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Now that dst_pmtu is almost gone let's do the same to dst_path_metric.
> I've only found one legitimate use of it and that's the one that was
> recently added in dst_allfrag.
> 
> This patch makes xfrm4_encap/xfrm6_encap use dst->child instead of
> dst->path so that we choose the correct route to get the hoplimit
> from when nested tunnels are present.
> 
> For simple tunnels dst->child == dst->path so there is no change.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Looks good, applied.

Thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [25/*] [NET] Kill unnecessary uses of dst_path_metric
  2005-03-18 10:11                               ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu
  2005-03-18 11:06                                 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu
@ 2005-03-18 18:44                                 ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-18 18:44 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Fri, 18 Mar 2005 21:11:26 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This gets rid of the last unnecessary use of dst_path_metric.  In
> decnet dst->path is always equal to dst.
>  
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [26/*] [NET] Kill dst_pmtu/dst_path_metric
  2005-03-18 11:06                                 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu
  2005-03-18 11:28                                   ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu
@ 2005-03-18 18:46                                   ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-18 18:46 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Fri, 18 Mar 2005 22:06:11 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> This would have been the patch that removed dst->path.  But
> ip_append_data got in the way :)
> 
> However, using dst->path is only needed rarely and should always
> require a bit of deliberation.  So let's get rid of dst_pmtu
> and dst_path_metric and use dst_mtu and dst_metric directly.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Applied, thanks.

> BTW, shouldn't dst_allfrag be called dst_path_allfrag?

If you think that makes it's functionality clearer, sure.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [27/*] [NET] Make dst_allfrag use dst instead of dst->path
  2005-03-18 11:28                                   ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu
@ 2005-03-18 18:47                                     ` David S. Miller
  0 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-03-18 18:47 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev

On Fri, 18 Mar 2005 22:28:04 +1100
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Rather than doing that, let's make the path usage explicit in
> the one place that it's needed (the same place where we use
> dst_mtu(dst->path)).
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Ok, applied.

> Hmm, it still doesn't feel right to have an IPv6-specific helper
> in net/dst.h.

We have tons of stuff in the neighbour layer that really is
only taken advantage of by ipv6, it's a similar situation.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-18 18:40                           ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
@ 2005-03-20 15:46                             ` Patrick McHardy
  2005-03-20 16:32                               ` Ludo Stellingwerff
                                                 ` (2 more replies)
  0 siblings, 3 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-03-20 15:46 UTC (permalink / raw)
  To: David S. Miller; +Cc: Herbert Xu, kuznet, yoshfuji, netdev

David S. Miller wrote:
> On Fri, 18 Mar 2005 20:11:29 +1100
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
>>BTW Patrick, how is the IPsec netfilter stuff going?
> 
> That boy is seriously backlogged, so I'm not sure how much time
> he's gotten to work on that yet.

Indeed, but in case of the netfilter patches that's not the problem.
They are basically working fine, but I have doubts about submitting
them. First, and most importantly, the input patch is incredible ugly.
To recap, we want to pass the encapsulated packets to the netfilter
hooks, then again the decapsulated packets after all decapsulation has
been done. The current input patch makes packets that have been
handled by IPsec skip the netfilter hooks until we know no further
IPsec processing will be done (route is non-local or protocol handler
is not marked as xfrm_prot). The packet is then marked as completely
decapsulated and passed through the stack again and the plain packets
go through netfilter again. There are a couple of problems with this
approach:

- decapsulated tunnel-mode packets go through the stack twice
- netfilter only sees them once, everything else multiple times
   (statistics, packet sockets, ...)
- racy, xfrm protocol could be registered after we determined
   decapsulation is done.
- inefficient

The second reason is that I'm not sure at all wether this is the way
to go. With KLIPS-like IPsec-devices you can sniff the plain packets
before they are handled by IPsec and you can perform traffic shaping
on them. These two points are completely unhandled, and people seem
to want them.

So what's holding back these patches is getting some consensus on what
exactly we want to do and finding a better method for determining when
decapsulation is done. One possibility would be stealing packets
in xfrm_policy_check(), but I haven't thought much about this yet.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 15:46                             ` Patrick McHardy
@ 2005-03-20 16:32                               ` Ludo Stellingwerff
  2005-03-20 17:17                                 ` Lennert Buytenhek
  2005-03-23  3:49                               ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
  2005-03-23  9:24                               ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
  2 siblings, 1 reply; 114+ messages in thread
From: Ludo Stellingwerff @ 2005-03-20 16:32 UTC (permalink / raw)
  To: netdev

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

As an daily user of Patricks patches, I like to agree with Patricks
analysis about the loosing of flexibility with the choice not to use
virtual devices. In a real-life situation I had to revert to using
plain, unsave GRE-tunnels because of the complexity of using 2.6-ipsec
for the same purpose. I'm routing one of multiple defaultroutes over
this tunnel:) It felt like a relieve having a device to work with.

Just my 2 cents,
Greetings,
Ludo.

Patrick McHardy wrote:

| David S. Miller wrote:
|
|> On Fri, 18 Mar 2005 20:11:29 +1100 Herbert Xu
|> <herbert@gondor.apana.org.au> wrote:
|>
|>> BTW Patrick, how is the IPsec netfilter stuff going?
|>
|>
|> That boy is seriously backlogged, so I'm not sure how much time
|> he's gotten to work on that yet.
|
|
| Indeed, but in case of the netfilter patches that's not the
| problem. They are basically working fine, but I have doubts about
| submitting them. First, and most importantly, the input patch is
| incredible ugly. To recap, we want to pass the encapsulated packets
| to the netfilter hooks, then again the decapsulated packets after
| all decapsulation has been done. The current input patch makes
| packets that have been handled by IPsec skip the netfilter hooks
| until we know no further IPsec processing will be done (route is
| non-local or protocol handler is not marked as xfrm_prot). The
| packet is then marked as completely decapsulated and passed through
| the stack again and the plain packets go through netfilter again.
| There are a couple of problems with this approach:
|
| - decapsulated tunnel-mode packets go through the stack twice -
| netfilter only sees them once, everything else multiple times
| (statistics, packet sockets, ...) - racy, xfrm protocol could be
| registered after we determined decapsulation is done. - inefficient
|
|
| The second reason is that I'm not sure at all wether this is the
| way to go. With KLIPS-like IPsec-devices you can sniff the plain
| packets before they are handled by IPsec and you can perform
| traffic shaping on them. These two points are completely unhandled,
| and people seem to want them.
|
| So what's holding back these patches is getting some consensus on
| what exactly we want to do and finding a better method for
| determining when decapsulation is done. One possibility would be
| stealing packets in xfrm_policy_check(), but I haven't thought much
| about this yet.
|
| Regards Patrick
|
|

- --
Ludo Stellingwerff

V&S B.V. The Netherlands
ProTactive firewall solution.
Tel: +31 172 416116
Fax: +31 172 416124

site: www.protactive.nl
demo: http://www.protactive.nl:81/netview.html
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.0 (GNU/Linux)

iD8DBQFCPaWMOF3sCpZ+AJgRAiofAJ9VlmQ3GF+D3q5FLfsyj3vcRJUbogCgzy86
pS4CRtw1sGe3K3vUVgIqi8E=
=5IAn
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 16:32                               ` Ludo Stellingwerff
@ 2005-03-20 17:17                                 ` Lennert Buytenhek
  2005-03-20 17:49                                   ` Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Lennert Buytenhek @ 2005-03-20 17:17 UTC (permalink / raw)
  To: Ludo Stellingwerff, Herbert Xu; +Cc: netdev

On Sun, Mar 20, 2005 at 05:32:13PM +0100, Ludo Stellingwerff wrote:

> As an daily user of Patricks patches, I like to agree with Patricks
> analysis about the loosing of flexibility with the choice not to use
> virtual devices. In a real-life situation I had to revert to using
> plain, unsave GRE-tunnels because of the complexity of using 2.6-ipsec
> for the same purpose. I'm routing one of multiple defaultroutes over
> this tunnel:) It felt like a relieve having a device to work with.

I'll add my 2 cents to this.

In my situation, there are three different sites in the same city
(Amsterdam), interconnected using a shared L2 vlan, and six routers
(A1, A2, B1, B2, C1, C2) on that vlan, two per site for redundancy
reasons.  Each router runs ospf.

The vlan is provided to us by a telco that we do not necessarily trust.
So ideally, we'd like all traffic that goes over the vlan (modulo ARPs
and STP and stuff) to be encrypted.  ("-o eth3 -j ENCRYPT")

The problem I kept running into with tunnel mode is that tunnel
mode SPD rules appear to dictate routing policy in a way that's not
compatible with dynamic routing.

I.e., a line like:

	spdadd 10.10.1.0/24 10.0.1.0/24 any -P out ipsec esp/tunnel/1.2.3.4-5.6.7.8/require;

effectively says "All traffic from 10.10.1.0/24 to 10.0.1.0/24 will
be sent over a tunnel with local endpoint 1.2.3.4 and remote endpoint
5.6.7.8", but:
- I have no idea beforehand what address ranges are going to be routed
  over this vlan.  (Customers might send traffic with source addresses
  in address ranges that they don't announce to us (asymmetric routing),
  and I don't want those packets to remain unencrypted just because they
  don't match the SPD rule.)  A 0.0.0.0/0 0.0.0.0/0 rule would not be
  appropriate either since that'd suck _all_ traffic into this tunnel.
- I have no idea beforehand what the remote nexthop is going to be.  A1
  might ordinarily send its traffic for site B to B1, but if B1 fails
  it'll want to start using B2 instead, which would be prevented by the
  SPD rule hardcoding the remote tunnel endpoint to B1.

The workaround we tried at first was to create GRE tunnels between each
pair of routers on the vlan, and to run ospf over the tunnels instead of
directly over the vlan interface.  That gave MTU problems, though, which
made us just forget about ipsec altogether and use vtund instead, hardly
better than nothing.  (Now that Herbert has submitted a number of fixes
for ipsec MTU issues with tunnels, I guess I should go and give the
GRE-over-ipsec setup a go again.)

Then again, maybe I totally misunderstood the way this is supposed to
work.


--L

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 17:17                                 ` Lennert Buytenhek
@ 2005-03-20 17:49                                   ` Patrick McHardy
  2005-03-20 18:11                                     ` Ludo Stellingwerff
  2005-03-30  9:49                                     ` Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS) Herbert Xu
  0 siblings, 2 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-03-20 17:49 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Ludo Stellingwerff, Herbert Xu, netdev

Lennert Buytenhek wrote:
> In my situation, there are three different sites in the same city
> (Amsterdam), interconnected using a shared L2 vlan, and six routers
> (A1, A2, B1, B2, C1, C2) on that vlan, two per site for redundancy
> reasons.  Each router runs ospf.
> 
> The vlan is provided to us by a telco that we do not necessarily trust.
> So ideally, we'd like all traffic that goes over the vlan (modulo ARPs
> and STP and stuff) to be encrypted.  ("-o eth3 -j ENCRYPT")
> 
> The problem I kept running into with tunnel mode is that tunnel
> mode SPD rules appear to dictate routing policy in a way that's not
> compatible with dynamic routing.
> 
> I.e., a line like:
> 
> 	spdadd 10.10.1.0/24 10.0.1.0/24 any -P out ipsec esp/tunnel/1.2.3.4-5.6.7.8/require;
> 
> effectively says "All traffic from 10.10.1.0/24 to 10.0.1.0/24 will
> be sent over a tunnel with local endpoint 1.2.3.4 and remote endpoint
> 5.6.7.8", but:
> - I have no idea beforehand what address ranges are going to be routed
>   over this vlan.  (Customers might send traffic with source addresses
>   in address ranges that they don't announce to us (asymmetric routing),
>   and I don't want those packets to remain unencrypted just because they
>   don't match the SPD rule.)  A 0.0.0.0/0 0.0.0.0/0 rule would not be
>   appropriate either since that'd suck _all_ traffic into this tunnel

You can specify an ifindex for oif in the selector, but you need
to use the xfrm_user interface.

> - I have no idea beforehand what the remote nexthop is going to be.  A1
>   might ordinarily send its traffic for site B to B1, but if B1 fails
>   it'll want to start using B2 instead, which would be prevented by the
>   SPD rule hardcoding the remote tunnel endpoint to B1.
> 
> The workaround we tried at first was to create GRE tunnels between each
> pair of routers on the vlan, and to run ospf over the tunnels instead of
> directly over the vlan interface.  That gave MTU problems, though, which
> made us just forget about ipsec altogether and use vtund instead, hardly
> better than nothing.  (Now that Herbert has submitted a number of fixes
> for ipsec MTU issues with tunnels, I guess I should go and give the
> GRE-over-ipsec setup a go again.)

Hmm .. sounds like using the routing realm in the selector would
solve this while avoiding the GRE overhead.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 17:49                                   ` Patrick McHardy
@ 2005-03-20 18:11                                     ` Ludo Stellingwerff
  2005-03-20 18:22                                       ` Patrick McHardy
  2005-03-30  9:49                                     ` Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS) Herbert Xu
  1 sibling, 1 reply; 114+ messages in thread
From: Ludo Stellingwerff @ 2005-03-20 18:11 UTC (permalink / raw)
  To: netdev

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1


Patrick McHardy wrote:

| Lennert Buytenhek wrote:

|> - I have no idea beforehand what the remote nexthop is going to
|> be.  A1 might ordinarily send its traffic for site B to B1, but
|> if B1 fails it'll want to start using B2 instead, which would be
|> prevented by the SPD rule hardcoding the remote tunnel endpoint
|> to B1.
|>
|
| Hmm .. sounds like using the routing realm in the selector would
| solve this while avoiding the GRE overhead.
|
| Regards Patrick
|
I'm hoping that using the fwmark as a selector can provide a workable
solution for both mine and Lennert's problem, any many more related
situations. Netfilter has a (almost) complete range of selectors.
e.g. Lennerts problem could be solved using a combination of the
"realm" match of iptables, in combination with a fwmark for SPD matching.

Greetings,
Ludo.

PS. On a side note: Wouldn't it be possible to have a netfilter target
stating that an transformation should be done?

- --
Ludo Stellingwerff

V&S B.V. The Netherlands
ProTactive firewall solution.
Tel: +31 172 416116
Fax: +31 172 416124

site: www.protactive.nl
demo: http://www.protactive.nl:81/netview.html
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.0 (GNU/Linux)

iD8DBQFCPbzNOF3sCpZ+AJgRApxBAJ9akLfP1onp+WKRgmJ1YDImkrXLHwCgkPS4
GvwO1PoUwkJnVTOjeaf/ZEw=
=OebA
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 18:11                                     ` Ludo Stellingwerff
@ 2005-03-20 18:22                                       ` Patrick McHardy
  2005-03-20 18:43                                         ` jamal
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-20 18:22 UTC (permalink / raw)
  To: Ludo Stellingwerff; +Cc: netdev

Ludo Stellingwerff wrote:
> I'm hoping that using the fwmark as a selector can provide a workable
> solution for both mine and Lennert's problem, any many more related
> situations. Netfilter has a (almost) complete range of selectors.
> e.g. Lennerts problem could be solved using a combination of the
> "realm" match of iptables, in combination with a fwmark for SPD matching.

Routing of local packets is done before the first netfilter hook
is called, but I forgot about ip_route_me_harder(). So you're right,
the realm can be translated to nfmark values using iptables.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 18:22                                       ` Patrick McHardy
@ 2005-03-20 18:43                                         ` jamal
  2005-03-20 19:10                                           ` Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: jamal @ 2005-03-20 18:43 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Ludo Stellingwerff, netdev

On Sun, 2005-03-20 at 13:22, Patrick McHardy wrote:
> Ludo Stellingwerff wrote:
> > I'm hoping that using the fwmark as a selector can provide a workable
> > solution for both mine and Lennert's problem, any many more related
> > situations. Netfilter has a (almost) complete range of selectors.
> > e.g. Lennerts problem could be solved using a combination of the
> > "realm" match of iptables, in combination with a fwmark for SPD matching.
> 
> Routing of local packets is done before the first netfilter hook
> is called, but I forgot about ip_route_me_harder(). So you're right,
> the realm can be translated to nfmark values using iptables.

BTW, is there any reason the SPD couldnt have been implemented from day
one using netfilter classification ? Why did we need another speacilized
classifier? the actions are clearly implementable as targets.

cheers,
jamal

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 18:43                                         ` jamal
@ 2005-03-20 19:10                                           ` Patrick McHardy
  0 siblings, 0 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-03-20 19:10 UTC (permalink / raw)
  To: hadi; +Cc: Ludo Stellingwerff, netdev

jamal wrote:
> BTW, is there any reason the SPD couldnt have been implemented from day
> one using netfilter classification ? Why did we need another speacilized
> classifier? the actions are clearly implementable as targets.

IMO iptables isn't so great that one would actually want to do this.
The entire ruleset needs to be one continous area in memory, so it can
not be changed, only replaced. To make it useable over pfkey would mean
many things that are currently done by iptables in userspace need to be
done in the kernel. There are multiple other reasons, but I don't think
its even worth discussing this. This of course doesn't mean I'm against
reducing the number of different classification engines.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-16 11:31                     ` Herbert Xu
  2005-03-16 22:02                       ` David S. Miller
@ 2005-03-21 16:14                       ` Mika Penttilä
  2005-03-21 20:28                         ` Herbert Xu
  1 sibling, 1 reply; 114+ messages in thread
From: Mika Penttilä @ 2005-03-21 16:14 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki,
	Patrick McHardy, netdev

Herbert Xu wrote:

>Hi Dave:
>
>On Tue, Mar 15, 2005 at 08:19:04PM +1100, herbert wrote:
>  
>
>>This patch fixes the IPsec overhead handling in ip_append_data and
>>ip6_append_data.  As it is they assume that the IPsec overhead is
>>constant.  This is not true as with ESP the IPsec overhead will vary
>>as the MTU varies.
>>    
>>
>
>This patch is wrong.  This is the *one* place where we do need to
>use the path MTU.  The reason is that when the packet is fragmented
>we only pay for the IPsec overhead once over all and not once for
>each fragment.
>
>Please revert it for now.
>
>The trailer_len in ip_append_data is not quite right as the trailer's
>length depends on the length of the entire packet.  However, it should
>be harmless since ESP knows how to extend the packet when necessary.
>
>Thanks,
>  
>
Shouldn't ip_output also use the path variant, dst_mtu(skb->dst->path), 
it's surely after ipsec- processing?

--Mika

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-21 16:14                       ` Mika Penttilä
@ 2005-03-21 20:28                         ` Herbert Xu
  2005-03-21 21:29                           ` Mika Penttilä
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-21 20:28 UTC (permalink / raw)
  To: Mika Penttil?
  Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki,
	Patrick McHardy, netdev

On Mon, Mar 21, 2005 at 06:14:07PM +0200, Mika Penttil? wrote:
>
> Shouldn't ip_output also use the path variant, dst_mtu(skb->dst->path), 
> it's surely after ipsec- processing?

That's the reason why it shouldn't use dst->path.  The only time you
should use dst_mtu(dst->path) is when dst may contain IPsec AND you
need the MTU outside IPsec.

In this case dst cannot contain IPsec so dst_mtu(dst) is correct.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-21 20:28                         ` Herbert Xu
@ 2005-03-21 21:29                           ` Mika Penttilä
  2005-03-21 22:04                             ` Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: Mika Penttilä @ 2005-03-21 21:29 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki,
	Patrick McHardy, netdev

Herbert Xu wrote:

>On Mon, Mar 21, 2005 at 06:14:07PM +0200, Mika Penttil? wrote:
>  
>
>>Shouldn't ip_output also use the path variant, dst_mtu(skb->dst->path), 
>>it's surely after ipsec- processing?
>>    
>>
>
>That's the reason why it shouldn't use dst->path.  The only time you
>should use dst_mtu(dst->path) is when dst may contain IPsec AND you
>need the MTU outside IPsec.
>
>In this case dst cannot contain IPsec so dst_mtu(dst) is correct.
>
>Cheers,
>  
>
ok I see it now, but this is really easy to get wrong...

Thanks,
Mika

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data
  2005-03-21 21:29                           ` Mika Penttilä
@ 2005-03-21 22:04                             ` Herbert Xu
  0 siblings, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-21 22:04 UTC (permalink / raw)
  To: Mika Penttil?
  Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki,
	Patrick McHardy, netdev

On Mon, Mar 21, 2005 at 11:29:58PM +0200, Mika Penttil? wrote:
>
> ok I see it now, but this is really easy to get wrong...

Well the rule is actually quite simple.  If IPsec can't be present,
then you should always use dst_mtu(dst).  If IPsec may be present,
then you should also use dst_mtu(dst) *unless* you're performing an
operation such as fragmentation that has to be done after IPsec.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 15:46                             ` Patrick McHardy
  2005-03-20 16:32                               ` Ludo Stellingwerff
@ 2005-03-23  3:49                               ` David S. Miller
  2005-03-23  4:03                                 ` Patrick McHardy
  2005-03-23  9:24                               ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
  2 siblings, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-23  3:49 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: herbert, kuznet, yoshfuji, netdev

On Sun, 20 Mar 2005 16:46:34 +0100
Patrick McHardy <kaber@trash.net> wrote:

> So what's holding back these patches is getting some consensus on what
> exactly we want to do and finding a better method for determining when
> decapsulation is done. One possibility would be stealing packets
> in xfrm_policy_check(), but I haven't thought much about this yet.

That latter idea sounds pursuable.  I guess you'd do a netfilter
hook in xfrm_policy_check() right?

So then you'd need to pass struct sk_buff ** instead of a direct
pointer.  And that looks fine too, as nobody seems to cache
skb->XXX state across xfrm_policy_check() calls.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-23  3:49                               ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
@ 2005-03-23  4:03                                 ` Patrick McHardy
  2005-03-24  5:05                                   ` Netfilter+IPsec Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-23  4:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: herbert, kuznet, yoshfuji, netdev

David S. Miller wrote:
> On Sun, 20 Mar 2005 16:46:34 +0100
> Patrick McHardy <kaber@trash.net> wrote:
> 
> 
>>So what's holding back these patches is getting some consensus on what
>>exactly we want to do and finding a better method for determining when
>>decapsulation is done. One possibility would be stealing packets
>>in xfrm_policy_check(), but I haven't thought much about this yet.
> 
> 
> That latter idea sounds pursuable.  I guess you'd do a netfilter
> hook in xfrm_policy_check() right?

It would call netif_rx(). The packet should pass all hooks as usual,
so everything works as expected. It is cleaner than my current
approach, but has the same problems wrt. statistics and AF_PACKET/raw
sockets. I'll post a patch (probably tomorrow, its late here) so we
have something concrete to talk about.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS
  2005-03-20 15:46                             ` Patrick McHardy
  2005-03-20 16:32                               ` Ludo Stellingwerff
  2005-03-23  3:49                               ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
@ 2005-03-23  9:24                               ` Herbert Xu
  2 siblings, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-23  9:24 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, kuznet, yoshfuji, netdev

On Sun, Mar 20, 2005 at 04:46:34PM +0100, Patrick McHardy wrote:
>
> been done. The current input patch makes packets that have been
> handled by IPsec skip the netfilter hooks until we know no further
> IPsec processing will be done (route is non-local or protocol handler
> is not marked as xfrm_prot). The packet is then marked as completely

Could you send me (or the list) a copy of the current input patch?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Netfilter+IPsec
  2005-03-23  4:03                                 ` Patrick McHardy
@ 2005-03-24  5:05                                   ` Patrick McHardy
  2005-03-24  5:43                                     ` Netfilter+IPsec David S. Miller
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-24  5:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: herbert, kuznet, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 1101 bytes --]

Patrick McHardy wrote:
> It would call netif_rx(). The packet should pass all hooks as usual,
> so everything works as expected. It is cleaner than my current
> approach, but has the same problems wrt. statistics and AF_PACKET/raw
> sockets. I'll post a patch (probably tomorrow, its late here) so we
> have something concrete to talk about.

Unfortunately I have to delay again. This patch (not entirely
reviewed myself yet) contains the parts necessary for hooking
output IPsec packets for netfilter. dst_output() in ipv4/ and
ipv6/ are replaced by ip_dst_output() and ip6_dst_output(), which
pass the packets through POST_ROUTING before IPsec. All replaced
calls should happen directly after NF_HOOK(LOCAL_OUT, ...). The
packet is then marked as transformed in xfrm{4,6}_output() and
passed through LOCAL_OUT in ip_output() again. This resembles
the behaviour of tunnel-devices, a packet is first visible
in plain on OUTPUT/FORWARD -> POST_ROUTING, then encapsulated
on OUTPUT -> POST_ROUTING again. This part doesn't have any
known problems, the input patch will follow tomorrow.

Regards
Patrick

[-- Attachment #2: x --]
[-- Type: text/plain, Size: 13535 bytes --]

===== include/linux/ipv6.h 1.29 vs edited =====
--- 1.29/include/linux/ipv6.h	2005-03-16 00:27:17 +01:00
+++ edited/include/linux/ipv6.h	2005-03-24 04:41:37 +01:00
@@ -177,19 +177,20 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 
-/* 
-   This structure contains results of exthdrs parsing
-   as offsets from skb->nh.
- */
 
 struct inet6_skb_parm {
+	/* results of exthdrs parsing as offsets from skb->nh. */
 	int			iif;
 	__u16			ra;
 	__u16			hop;
 	__u16			dst0;
 	__u16			srcrt;
 	__u16			dst1;
+	/* flags */
+	__u16			flags;
 };
+
+#define IP6SKB_XFRM_TRANSFORMED	0x1
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
 
===== include/linux/netfilter.h 1.18 vs edited =====
--- 1.18/include/linux/netfilter.h	2005-03-12 04:12:50 +01:00
+++ edited/include/linux/netfilter.h	2005-03-23 06:19:51 +01:00
@@ -139,9 +139,10 @@
 /* This is gross, but inline doesn't cut it for avoiding the function
    call in fast path: gcc doesn't inline (needs value tracking?). --RR */
 #ifdef CONFIG_NETFILTER_DEBUG
-#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)			       \
+#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond)		       \
 ({int __ret;								       \
-if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \
+if (!(cond) ||								       \
+    (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \
 	__ret = (okfn)(skb);						       \
 __ret;})
 #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh)	       \
@@ -150,9 +151,9 @@
 	__ret = (okfn)(skb);						       \
 __ret;})
 #else
-#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)			       \
+#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond)		       \
 ({int __ret;								       \
-if (list_empty(&nf_hooks[pf][hook]) ||					       \
+if (!(cond) || list_empty(&nf_hooks[pf][hook]) ||					       \
     (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \
 	__ret = (okfn)(skb);						       \
 __ret;})
@@ -163,6 +164,8 @@
 	__ret = (okfn)(skb);						       \
 __ret;})
 #endif
+#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)			\
+ NF_HOOK_COND((pf), (hook), (skb), (indev), (outdev), (okfn), 1)
 
 int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
 		 struct net_device *indev, struct net_device *outdev,
@@ -192,6 +195,7 @@
 
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
+#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb)
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif /*CONFIG_NETFILTER*/
 
===== include/net/ip.h 1.38 vs edited =====
--- 1.38/include/net/ip.h	2005-01-27 07:03:17 +01:00
+++ edited/include/net/ip.h	2005-03-23 06:20:11 +01:00
@@ -30,6 +30,8 @@
 #include <linux/netdevice.h>
 #include <linux/inetdevice.h>
 #include <linux/in_route.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
 #include <net/route.h>
 #include <net/arp.h>
 #include <net/snmp.h>
@@ -45,6 +47,7 @@
 #define IPSKB_TRANSLATED	2
 #define IPSKB_FORWARDED		4
 #define IPSKB_XFRM_TUNNEL_SIZE	8
+#define IPSKB_XFRM_TRANSFORMED	16
 };
 
 struct ipcm_cookie
@@ -210,6 +213,12 @@
 			iph->id = 0;
 	} else
 		__ip_select_ident(iph, dst, more);
+}
+
+static inline int ip_dst_output(struct sk_buff *skb)
+{
+	return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL,
+	                    skb->dst->dev, dst_output, skb->dst->xfrm != NULL);
 }
 
 /*
===== include/net/ipv6.h 1.44 vs edited =====
--- 1.44/include/net/ipv6.h	2005-03-03 06:12:44 +01:00
+++ edited/include/net/ipv6.h	2005-03-23 06:14:52 +01:00
@@ -17,6 +17,8 @@
 
 #include <linux/ipv6.h>
 #include <linux/hardirq.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
 #include <net/ndisc.h>
 #include <net/flow.h>
 #include <net/snmp.h>
@@ -335,6 +337,12 @@
 {
 	return ((a->s6_addr32[0] | a->s6_addr32[1] | 
 		 a->s6_addr32[2] | a->s6_addr32[3] ) == 0); 
+}
+
+static inline int ip6_dst_output(struct sk_buff *skb)
+{
+	return NF_HOOK_COND(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL,
+	                    skb->dst->dev, dst_output, skb->dst->xfrm != NULL);
 }
 
 /*
===== net/ipv4/igmp.c 1.61 vs edited =====
--- 1.61/net/ipv4/igmp.c	2004-12-28 06:30:43 +01:00
+++ edited/net/ipv4/igmp.c	2005-03-23 05:53:14 +01:00
@@ -343,7 +343,7 @@
 	pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
 
 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
-		       dst_output);
+	               ip_dst_output);
 }
 
 static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -674,7 +674,7 @@
 	ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
 
 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		       dst_output);
+	               ip_dst_output);
 }
 
 static void igmp_gq_timer_expire(unsigned long data)
===== net/ipv4/ip_forward.c 1.11 vs edited =====
--- 1.11/net/ipv4/ip_forward.c	2004-07-08 00:17:28 +02:00
+++ edited/net/ipv4/ip_forward.c	2005-03-23 05:53:14 +01:00
@@ -51,7 +51,7 @@
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
-	return dst_output(skb);
+	return ip_dst_output(skb);
 }
 
 int ip_forward(struct sk_buff *skb)
===== net/ipv4/ip_output.c 1.80 vs edited =====
--- 1.80/net/ipv4/ip_output.c	2005-03-18 19:43:26 +01:00
+++ edited/net/ipv4/ip_output.c	2005-03-23 06:20:28 +01:00
@@ -166,7 +166,7 @@
 
 	/* Send it out. */
 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		       dst_output);
+		       ip_dst_output);
 }
 
 static inline int ip_finish_output2(struct sk_buff *skb)
@@ -284,7 +284,7 @@
 		return ip_finish_output(skb);
 }
 
-int ip_output(struct sk_buff *skb)
+static inline int ip_output2(struct sk_buff *skb)
 {
 	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
 
@@ -294,6 +294,16 @@
 		return ip_finish_output(skb);
 }
 
+int ip_output(struct sk_buff *skb)
+{
+	int transformed = IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED;
+
+	if (transformed)
+		nf_reset(skb);
+	return NF_HOOK_COND(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
+	                    skb->dst->dev, ip_output2, transformed);
+}
+
 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 {
 	struct sock *sk = skb->sk;
@@ -374,7 +384,7 @@
 	skb->priority = sk->sk_priority;
 
 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		       dst_output);
+		       ip_dst_output);
 
 no_route:
 	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
@@ -1189,7 +1199,7 @@
 
 	/* Netfilter gets whole the not fragmented skb. */
 	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
-		      skb->dst->dev, dst_output);
+		      skb->dst->dev, ip_dst_output);
 	if (err) {
 		if (err > 0)
 			err = inet->recverr ? net_xmit_errno(err) : 0;
===== net/ipv4/ipmr.c 1.47 vs edited =====
--- 1.47/net/ipv4/ipmr.c	2005-03-18 19:36:11 +01:00
+++ edited/net/ipv4/ipmr.c	2005-03-23 05:53:13 +01:00
@@ -1119,7 +1119,7 @@
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
-	return dst_output(skb);
+	return ip_dst_output(skb);
 }
 
 /*
===== net/ipv4/raw.c 1.63 vs edited =====
--- 1.63/net/ipv4/raw.c	2005-03-16 00:20:37 +01:00
+++ edited/net/ipv4/raw.c	2005-03-23 05:53:13 +01:00
@@ -310,7 +310,7 @@
 	}
 
 	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		      dst_output);
+	              ip_dst_output);
 	if (err > 0)
 		err = inet->recverr ? net_xmit_errno(err) : 0;
 	if (err)
===== net/ipv4/xfrm4_output.c 1.10 vs edited =====
--- 1.10/net/ipv4/xfrm4_output.c	2005-03-18 19:41:26 +01:00
+++ edited/net/ipv4/xfrm4_output.c	2005-03-23 05:53:13 +01:00
@@ -129,6 +129,7 @@
 		err = -EHOSTUNREACH;
 		goto error_nolock;
 	}
+	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
 	err = NET_XMIT_BYPASS;
 
 out_exit:
===== net/ipv4/ipvs/ip_vs_xmit.c 1.13 vs edited =====
--- 1.13/net/ipv4/ipvs/ip_vs_xmit.c	2005-03-18 19:38:59 +01:00
+++ edited/net/ipv4/ipvs/ip_vs_xmit.c	2005-03-24 05:05:57 +01:00
@@ -131,7 +131,7 @@
 	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
 	(skb)->ip_summed = CHECKSUM_NONE;		\
 	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
-		(rt)->u.dst.dev, dst_output);		\
+		(rt)->u.dst.dev, ip_dst_output);	\
 } while (0)
 
 
===== net/ipv4/netfilter/ipt_REJECT.c 1.37 vs edited =====
--- 1.37/net/ipv4/netfilter/ipt_REJECT.c	2005-03-17 19:05:37 +01:00
+++ edited/net/ipv4/netfilter/ipt_REJECT.c	2005-03-23 06:05:51 +01:00
@@ -213,7 +213,7 @@
 	nf_ct_attach(nskb, oldskb);
 
 	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
-		dst_output);
+		ip_dst_output);
 	return;
 
  free_nskb:
===== net/ipv6/ip6_input.c 1.23 vs edited =====
--- 1.23/net/ipv6/ip6_input.c	2005-03-10 06:12:11 +01:00
+++ edited/net/ipv6/ip6_input.c	2005-03-24 05:06:19 +01:00
@@ -241,9 +241,9 @@
 			
 			if (deliver) {
 				skb2 = skb_clone(skb, GFP_ATOMIC);
-				dst_output(skb2);
+				ip6_dst_output(skb2);
 			} else {
-				dst_output(skb);
+				ip6_dst_output(skb);
 				return 0;
 			}
 		}
===== net/ipv6/ip6_output.c 1.91 vs edited =====
--- 1.91/net/ipv6/ip6_output.c	2005-03-18 19:44:52 +01:00
+++ edited/net/ipv6/ip6_output.c	2005-03-24 04:52:01 +01:00
@@ -108,7 +108,7 @@
 }
 
 
-static int ip6_output2(struct sk_buff *skb)
+static int ip6_output3(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
 	struct net_device *dev = dst->dev;
@@ -145,12 +145,22 @@
 	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
 }
 
-int ip6_output(struct sk_buff *skb)
+static inline int ip6_output2(struct sk_buff *skb)
 {
 	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
-		return ip6_fragment(skb, ip6_output2);
+		return ip6_fragment(skb, ip6_output3);
 	else
-		return ip6_output2(skb);
+		return ip6_output3(skb);
+}
+
+int ip6_output(struct sk_buff *skb)
+{
+	int transformed = IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED;
+
+	if (transformed)
+		nf_reset(skb);
+	return NF_HOOK_COND(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL,
+	                    skb->dst->dev, ip6_output2, transformed);
 }
 
 #ifdef CONFIG_NETFILTER
@@ -195,7 +205,7 @@
 		}
 	}
 #endif /* CONFIG_NETFILTER */
-	return dst_output(skb);
+	return ip6_dst_output(skb);
 }
 
 /*
@@ -342,7 +352,7 @@
 
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
-	return dst_output(skb);
+	return ip6_dst_output(skb);
 }
 
 int ip6_forward(struct sk_buff *skb)
@@ -1146,7 +1156,7 @@
 
 	skb->dst = dst_clone(&rt->u.dst);
 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);	
-	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
+	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, ip6_dst_output);
 	if (err) {
 		if (err > 0)
 			err = inet->recverr ? net_xmit_errno(err) : 0;
===== net/ipv6/ip6_tunnel.c 1.30 vs edited =====
--- 1.30/net/ipv6/ip6_tunnel.c	2005-03-15 19:19:23 +01:00
+++ edited/net/ipv6/ip6_tunnel.c	2005-03-23 06:08:09 +01:00
@@ -744,7 +744,7 @@
 	nf_reset(skb);
 	pkt_len = skb->len;
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, 
-		      skb->dst->dev, dst_output);
+		      skb->dst->dev, ip6_dst_output);
 
 	if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) {
 		stats->tx_bytes += pkt_len;
===== net/ipv6/ndisc.c 1.124 vs edited =====
--- 1.124/net/ipv6/ndisc.c	2005-03-16 23:52:27 +01:00
+++ edited/net/ipv6/ndisc.c	2005-03-23 06:08:42 +01:00
@@ -501,7 +501,7 @@
 	skb->dst = dst;
 	idev = in6_dev_get(dst->dev);
 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
-	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output);
+	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_dst_output);
 	if (!err) {
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
@@ -586,7 +586,7 @@
 	skb->dst = dst;
 	idev = in6_dev_get(dst->dev);
 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
-	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output);
+	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_dst_output);
 	if (!err) {
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORSOLICITS);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
@@ -660,7 +660,7 @@
 	skb->dst = dst;
 	idev = in6_dev_get(dst->dev);
 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);	
-	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output);
+	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_dst_output);
 	if (!err) {
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTROUTERSOLICITS);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
@@ -1446,7 +1446,7 @@
 	buff->dst = dst;
 	idev = in6_dev_get(dst->dev);
 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
-	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, dst_output);
+	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, ip6_dst_output);
 	if (!err) {
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTREDIRECTS);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
===== net/ipv6/raw.c 1.79 vs edited =====
--- 1.79/net/ipv6/raw.c	2005-03-03 06:12:38 +01:00
+++ edited/net/ipv6/raw.c	2005-03-23 06:08:54 +01:00
@@ -541,7 +541,7 @@
 
 	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);		
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		      dst_output);
+		      ip6_dst_output);
 	if (err > 0)
 		err = inet->recverr ? net_xmit_errno(err) : 0;
 	if (err)
===== net/ipv6/xfrm6_output.c 1.11 vs edited =====
--- 1.11/net/ipv6/xfrm6_output.c	2005-03-18 19:41:26 +01:00
+++ edited/net/ipv6/xfrm6_output.c	2005-03-24 04:46:59 +01:00
@@ -131,6 +131,7 @@
 		err = -EHOSTUNREACH;
 		goto error_nolock;
 	}
+	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
 	err = NET_XMIT_BYPASS;
 
 out_exit:

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Netfilter+IPsec
  2005-03-24  5:05                                   ` Netfilter+IPsec Patrick McHardy
@ 2005-03-24  5:43                                     ` David S. Miller
  2005-03-25  2:53                                       ` Netfilter+IPsec Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-03-24  5:43 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: herbert, kuznet, yoshfuji, netdev

On Thu, 24 Mar 2005 06:05:50 +0100
Patrick McHardy <kaber@trash.net> wrote:

> This patch (not entirely reviewed myself yet) contains the parts
> necessary for hooking output IPsec packets for netfilter.

This is actually much cleaner than I had ever anticipated.
I like it.

I suppose the input side will be quite a bit more involved?

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Netfilter+IPsec
  2005-03-24  5:43                                     ` Netfilter+IPsec David S. Miller
@ 2005-03-25  2:53                                       ` Herbert Xu
  2005-03-25  5:10                                         ` Netfilter+IPsec Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-25  2:53 UTC (permalink / raw)
  To: David S. Miller; +Cc: Patrick McHardy, kuznet, yoshfuji, netdev

On Wed, Mar 23, 2005 at 09:43:40PM -0800, David S. Miller wrote:
> On Thu, 24 Mar 2005 06:05:50 +0100
> Patrick McHardy <kaber@trash.net> wrote:
> 
> > This patch (not entirely reviewed myself yet) contains the parts
> > necessary for hooking output IPsec packets for netfilter.
> 
> This is actually much cleaner than I had ever anticipated.
> I like it.

I completely agree.  The output patch is an elegant piece of work.

> I suppose the input side will be quite a bit more involved?

Maybe it won't be that bad when we actually see it :)

BTW Patrick, what about the other bits in your original patch set?
In particular, have you still got the bit that does policy lookups
after SNAT?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Netfilter+IPsec
  2005-03-25  2:53                                       ` Netfilter+IPsec Herbert Xu
@ 2005-03-25  5:10                                         ` Patrick McHardy
  0 siblings, 0 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-03-25  5:10 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, kuznet, yoshfuji, netdev

Herbert Xu wrote:
> On Wed, Mar 23, 2005 at 09:43:40PM -0800, David S. Miller wrote:
>>
>>>This patch (not entirely reviewed myself yet) contains the parts
>>>necessary for hooking output IPsec packets for netfilter.
>>
>>This is actually much cleaner than I had ever anticipated.
>>I like it.
> 
> I completely agree.  The output patch is an elegant piece of work.

Thanks. Unfortunantely it might need to be replaced because of
issues with the input side.

>>I suppose the input side will be quite a bit more involved?
> 
> Maybe it won't be that bad when we actually see it :)

Stealing the packets in xfrm_policy_check() didn't work out, a packet
can be checked multiple times, and before all IPsec processing is
done, because of raw sockets. Even worse, a raw socket can have its
own policy and accept packets that will be further processed by
IPsec. This suggests that the whole idea of skipping netfilter hooks
before all IPsec processing is done was wrong and we need to call
them on each pass through the stack as usual to be able to filter
before raw sockets. For symetry in the output path we would need to
pass the packet through POST_ROUTING and OUTPUT for each tunnel mode
transform. I wanted to avoid this so far because I can't think of
anything useful netfilter could do between two transforms on output,
but the good part is that it shouldn't require any changes in the
input path. I'm trying it now ..

> BTW Patrick, what about the other bits in your original patch set?
> In particular, have you still got the bit that does policy lookups
> after SNAT?

I haven't got up-to-date patches, but Christophe Saoute has ported
them to 2.6.12-rc1 and published on his page:
http://www.saout.de/misc/linux-2.6.12-rc1-ipsec-nat/
There are two patches that will probably be required either way,
the policy lookup after SNAT patch you mentioned, and a patch that
adds a function to restore struct flowi as it would have looked
without NAT for policy checks. Both are small and should be
painless.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst
  2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
                         ` (2 preceding siblings ...)
  2005-03-07  5:32       ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller
@ 2005-03-28 20:10       ` Patrick McHardy
  2005-03-28 23:30         ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu
  2005-03-28 23:39         ` Checking SPI in xfrm_state_find Herbert Xu
  3 siblings, 2 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-03-28 20:10 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

Hi Herbert,

Herbert Xu wrote:
> Finally this is the patch that sets the MTU values of each xfrm_dst
> and keeps it up-to-date.

while looking for ways to exchange the xfrm pointer of a live
dst_entry I noticed what looks like an ABBA deadlock introduced
by the call to xfrm_state_mtu() in xfrm_bundle_ok().

spin_lock x->lock			(xfrm_state_delete)
	read_lock xfrm_policy_lock	(xfrm_prune_bundles)
		write_lock policy->lock	(xfrm_prune_bundles)

read/write_lock policy->lock		(__xfrm46_find_acq/xfrm_lookup)
	spin_lock x->lock		(xfrm_state_mtu)

I haven't though of a way to avoid this yet. It would be nice though
if we could keep the rule that xfrm_policy_lock and policy->lock nest
with x->lock.

Something unrelated I was also wondering about, from xfrm_find_state():

         list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
                 if (x->props.family == family &&
                     x->props.reqid == tmpl->reqid &&
                     xfrm_state_addr_check(x, daddr, saddr, family) &&
                     tmpl->mode == x->props.mode &&
                     tmpl->id.proto == x->id.proto) {

Shouldn't we check for (tmpl->id.spi == x->id.spi || !tmpl->id.spi) ?

Regards
Patrick

>   
> -int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family)
> +int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
>  {
> -	struct dst_entry *dst = &xdst->u.dst;
> +	struct dst_entry *dst = &first->u.dst;
> +	struct xfrm_dst *last;
> +	u32 mtu;
>  
>  	if (dst->path->obsolete > 0 ||
>  	    (dst->dev && !netif_running(dst->dev)))
>  		return 0;
>  
> +	last = NULL;
> +
>  	do {
> +		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
> +
>  		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
>  			return 0;
>  		if (dst->xfrm->km.state != XFRM_STATE_VALID)
>  			return 0;
> +
> +		mtu = dst_pmtu(dst->child);
> +		if (xdst->child_mtu_cached != mtu) {
> +			last = xdst;
> +			xdst->child_mtu_cached = mtu;
> +		}
> +
> +		mtu = dst_pmtu(xdst->route);
> +		if (xdst->child_mtu_cached != mtu) {
> +			last = xdst;
> +			xdst->route_mtu_cached = mtu;
> +		}
> +
>  		dst = dst->child;
>  	} while (dst->xfrm);
> +
> +	if (likely(!last))
> +		return 1;
> +
> +	mtu = last->child_mtu_cached;
> +	for (;;) {
> +		dst = &last->u.dst;
> +
> +		mtu = xfrm_state_mtu(dst->xfrm, mtu);
> +		if (mtu > last->route_mtu_cached)
> +			mtu = last->route_mtu_cached;
> +		dst->metrics[RTAX_MTU-1] = mtu;
> +
> +		if (last == first)
> +			break;
> +
> +		last = last->u.next;
> +		last->child_mtu_cached = mtu;
> +	}
>  
>  	return 1;
>  }

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [IPSEC] Move xfrm_flush_bundles into xfrm_state GC
  2005-03-28 20:10       ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy
@ 2005-03-28 23:30         ` Herbert Xu
  2005-03-31  0:10           ` Patrick McHardy
  2005-04-01  5:21           ` David S. Miller
  2005-03-28 23:39         ` Checking SPI in xfrm_state_find Herbert Xu
  1 sibling, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-28 23:30 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 1149 bytes --]

Hi Patrick:

On Mon, Mar 28, 2005 at 10:10:54PM +0200, Patrick McHardy wrote:
> 
> I haven't though of a way to avoid this yet. It would be nice though
> if we could keep the rule that xfrm_policy_lock and policy->lock nest
> with x->lock.

Thanks for catching this.

The locking in xfrm_state/xfrm_policy has always struck me as being
an overkill.  A lot of the locks should be replaced by rules that
ensure the validity of most operations while a ref count is held.
Now I have an excuse to do just that :)

For 2.6.12 let's go for a simpler fix that breaks the dead lock.

__xfrm_state_delete does not need to flush the bundles immediately.
In fact, it is more efficient if we delay the flush to the GC worker
since the flush is not dependent on any particular xfrm state.  By
delaying it we can do one single flush even when you're deleteing
the entire xfrm state list.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 1269 bytes --]

===== net/xfrm/xfrm_state.c 1.56 vs edited =====
--- 1.56/net/xfrm/xfrm_state.c	2005-03-11 13:39:54 +11:00
+++ edited/net/xfrm/xfrm_state.c	2005-03-29 09:16:13 +10:00
@@ -48,6 +48,8 @@
 static struct list_head xfrm_state_gc_list = LIST_HEAD_INIT(xfrm_state_gc_list);
 static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
+static int xfrm_state_gc_flush_bundles;
+
 static void __xfrm_state_delete(struct xfrm_state *x);
 
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
@@ -81,6 +83,11 @@
 	struct list_head *entry, *tmp;
 	struct list_head gc_list = LIST_HEAD_INIT(gc_list);
 
+	if (xfrm_state_gc_flush_bundles) {
+		xfrm_state_gc_flush_bundles = 0;
+		xfrm_flush_bundles();
+	}
+
 	spin_lock_bh(&xfrm_state_gc_lock);
 	list_splice_init(&xfrm_state_gc_list, &gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
@@ -228,8 +235,10 @@
 		 * our caller holds.  A larger value means that
 		 * there are DSTs attached to this xfrm_state.
 		 */
-		if (atomic_read(&x->refcnt) > 2)
-			xfrm_flush_bundles();
+		if (atomic_read(&x->refcnt) > 2) {
+			xfrm_state_gc_flush_bundles = 1;
+			schedule_work(&xfrm_state_gc_work);
+		}
 
 		/* All xfrm_state objects are created by xfrm_state_alloc.
 		 * The xfrm_state_alloc call gives a reference, and that

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Checking SPI in xfrm_state_find
  2005-03-28 20:10       ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy
  2005-03-28 23:30         ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu
@ 2005-03-28 23:39         ` Herbert Xu
  2005-03-31  0:13           ` Patrick McHardy
  1 sibling, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-03-28 23:39 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

On Mon, Mar 28, 2005 at 10:10:54PM +0200, Patrick McHardy wrote:
> 
> Something unrelated I was also wondering about, from xfrm_find_state():
> 
>         list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
>                 if (x->props.family == family &&
>                     x->props.reqid == tmpl->reqid &&
>                     xfrm_state_addr_check(x, daddr, saddr, family) &&
>                     tmpl->mode == x->props.mode &&
>                     tmpl->id.proto == x->id.proto) {
> 
> Shouldn't we check for (tmpl->id.spi == x->id.spi || !tmpl->id.spi) ?

Absolutely.  We should also fix the larval state generation in that
same function to fail the operation if that SPI already exists.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS)
  2005-03-20 17:49                                   ` Patrick McHardy
  2005-03-20 18:11                                     ` Ludo Stellingwerff
@ 2005-03-30  9:49                                     ` Herbert Xu
  1 sibling, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-30  9:49 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Lennert Buytenhek, Ludo Stellingwerff, netdev

On Sun, Mar 20, 2005 at 06:49:43PM +0100, Patrick McHardy wrote:
>
> >- I have no idea beforehand what address ranges are going to be routed
> >  over this vlan.  (Customers might send traffic with source addresses
> >  in address ranges that they don't announce to us (asymmetric routing),
> >  and I don't want those packets to remain unencrypted just because they
> >  don't match the SPD rule.)  A 0.0.0.0/0 0.0.0.0/0 rule would not be
> >  appropriate either since that'd suck _all_ traffic into this tunnel
> 
> You can specify an ifindex for oif in the selector, but you need
> to use the xfrm_user interface.

Unfortunately that doesn't work currently because we don't fill in
the oif field in the flow.  More importantly, having an interface
specific outbound policy isn't very useful when you can't have an
interface specific inbound/forward policy to match.

This actually leads to another issue.  The xfrm_selector structure
only contains one iface field.  So we can't specify both iif and oif
for forwarded packets.

Extending xfrm_selector is not trivial.  Ludo Stellingwerff also
wanted to do that in order use fwmark in the selector.

Here is one possible solution that allows xfrm_selector to be
extended.

We store new selector fields in a separate netlink payload.  This
works because all the current messages that carry xfrm_selector carry
exactly one selector.

When the kernel receives a message containing an xfrm_selector it'll
initialise the kernel xfrm_selector to contain that plus whatever
is in the correspond netlink payload.  If the payload is absent the
kernel should fill in sane default values for compatibility.

For messages sent by the kernel we also attach a netlink payload to
cover the new values.  This should be backwards compatible as appliations
should ignore unknown payloads.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC] Move xfrm_flush_bundles into xfrm_state GC
  2005-03-28 23:30         ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu
@ 2005-03-31  0:10           ` Patrick McHardy
  2005-04-01  5:21           ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-03-31  0:10 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

Herbert Xu wrote:
> The locking in xfrm_state/xfrm_policy has always struck me as being
> an overkill.  A lot of the locks should be replaced by rules that
> ensure the validity of most operations while a ref count is held.
> Now I have an excuse to do just that :)

I agree, it really looks like slight overkill.

> For 2.6.12 let's go for a simpler fix that breaks the dead lock.
> 
> __xfrm_state_delete does not need to flush the bundles immediately.
> In fact, it is more efficient if we delay the flush to the GC worker
> since the flush is not dependent on any particular xfrm state.  By
> delaying it we can do one single flush even when you're deleteing
> the entire xfrm state list.

Looks good to me.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Checking SPI in xfrm_state_find
  2005-03-28 23:39         ` Checking SPI in xfrm_state_find Herbert Xu
@ 2005-03-31  0:13           ` Patrick McHardy
  2005-03-31  0:46             ` Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-03-31  0:13 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

[-- Attachment #1: Type: text/plain, Size: 788 bytes --]

Herbert Xu wrote:
> On Mon, Mar 28, 2005 at 10:10:54PM +0200, Patrick McHardy wrote:
> 
>>Something unrelated I was also wondering about, from xfrm_find_state():
>>
>>        list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
>>                if (x->props.family == family &&
>>                    x->props.reqid == tmpl->reqid &&
>>                    xfrm_state_addr_check(x, daddr, saddr, family) &&
>>                    tmpl->mode == x->props.mode &&
>>                    tmpl->id.proto == x->id.proto) {
>>
>>Shouldn't we check for (tmpl->id.spi == x->id.spi || !tmpl->id.spi) ?
> 
> 
> Absolutely.  We should also fix the larval state generation in that
> same function to fail the operation if that SPI already exists.

Thanks, both done by these two patches.

Regards
Patrick

[-- Attachment #2: x1 --]
[-- Type: text/plain, Size: 964 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/03/30 06:02:45+02:00 kaber@coreworks.de 
#   [IPSEC]: Check SPI in xfrm_state_find()
#   
#   Signed-off-by: Patrick McHardy <kaber@trash.net>
# 
# net/xfrm/xfrm_state.c
#   2005/03/30 06:02:36+02:00 kaber@coreworks.de +2 -1
#   [IPSEC]: Check SPI in xfrm_state_find()
#   
#   Signed-off-by: Patrick McHardy <kaber@trash.net>
# 
diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
--- a/net/xfrm/xfrm_state.c	2005-03-31 02:12:12 +02:00
+++ b/net/xfrm/xfrm_state.c	2005-03-31 02:12:12 +02:00
@@ -306,7 +306,8 @@
 		    x->props.reqid == tmpl->reqid &&
 		    xfrm_state_addr_check(x, daddr, saddr, family) &&
 		    tmpl->mode == x->props.mode &&
-		    tmpl->id.proto == x->id.proto) {
+		    tmpl->id.proto == x->id.proto &&
+		    (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) {
 			/* Resolution logic:
 			   1. There is a valid state with matching selector.
 			      Done.

[-- Attachment #3: x2 --]
[-- Type: text/plain, Size: 2235 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/03/31 02:07:54+02:00 kaber@coreworks.de 
#   [IPSEC]: Check if SPI exists before creating acquire state
#   
#   Signed-off-by: Patrick McHardy <kaber@trash.net>
# 
# net/xfrm/xfrm_state.c
#   2005/03/31 02:07:42+02:00 kaber@coreworks.de +25 -7
#   [IPSEC]: Check if SPI exists before creating acquire state
#   
#   Signed-off-by: Patrick McHardy <kaber@trash.net>
# 
diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
--- a/net/xfrm/xfrm_state.c	2005-03-31 02:12:57 +02:00
+++ b/net/xfrm/xfrm_state.c	2005-03-31 02:12:57 +02:00
@@ -295,10 +295,17 @@
 		unsigned short family)
 {
 	unsigned h = xfrm_dst_hash(daddr, family);
-	struct xfrm_state *x;
+	struct xfrm_state *x, *x0;
 	int acquire_in_progress = 0;
 	int error = 0;
 	struct xfrm_state *best = NULL;
+	struct xfrm_state_afinfo *afinfo;
+	
+	afinfo = xfrm_state_get_afinfo(family);
+	if (afinfo == NULL) {
+		*err = -EAFNOSUPPORT;
+		return NULL;
+	}
 
 	spin_lock_bh(&xfrm_state_lock);
 	list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
@@ -334,14 +341,24 @@
 			} else if (x->km.state == XFRM_STATE_ERROR ||
 				   x->km.state == XFRM_STATE_EXPIRED) {
 				if (xfrm_selector_match(&x->sel, fl, family))
-					error = 1;
+					error = -ESRCH;
 			}
 		}
 	}
 
 	x = best;
-	if (!x && !error && !acquire_in_progress &&
-	    ((x = xfrm_state_alloc()) != NULL)) {
+	if (!x && !error && !acquire_in_progress) {
+		x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto);
+		if (x0 != NULL) {
+			xfrm_state_put(x0);
+			error = -EEXIST;
+			goto out;
+		}
+		x = xfrm_state_alloc();
+		if (x == NULL) {
+			error = -ENOMEM;
+			goto out;
+		}
 		/* Initialize temporary selector matching only
 		 * to current session. */
 		xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
@@ -363,15 +380,16 @@
 			x->km.state = XFRM_STATE_DEAD;
 			xfrm_state_put(x);
 			x = NULL;
-			error = 1;
+			error = -ESRCH;
 		}
 	}
+out:
 	if (x)
 		xfrm_state_hold(x);
 	else
-		*err = acquire_in_progress ? -EAGAIN :
-			(error ? -ESRCH : -ENOMEM);
+		*err = acquire_in_progress ? -EAGAIN : error;
 	spin_unlock_bh(&xfrm_state_lock);
+	xfrm_state_put_afinfo(afinfo);
 	return x;
 }
 

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Checking SPI in xfrm_state_find
  2005-03-31  0:13           ` Patrick McHardy
@ 2005-03-31  0:46             ` Herbert Xu
  2005-04-01  5:23               ` David S. Miller
  2005-04-03 17:00               ` Checking SPI in xfrm_state_find Patrick McHardy
  0 siblings, 2 replies; 114+ messages in thread
From: Herbert Xu @ 2005-03-31  0:46 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

On Thu, Mar 31, 2005 at 02:13:54AM +0200, Patrick McHardy wrote:
>
> Thanks, both done by these two patches.

Great.

> # This is a BitKeeper generated diff -Nru style patch.
> #
> # ChangeSet
> #   2005/03/30 06:02:45+02:00 kaber@coreworks.de 
> #   [IPSEC]: Check SPI in xfrm_state_find()
> #   
> #   Signed-off-by: Patrick McHardy <kaber@trash.net>

Looks good.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

> # This is a BitKeeper generated diff -Nru style patch.
> #
> # ChangeSet
> #   2005/03/31 02:07:54+02:00 kaber@coreworks.de 
> #   [IPSEC]: Check if SPI exists before creating acquire state
> #   
> #   Signed-off-by: Patrick McHardy <kaber@trash.net>
>  
>  	x = best;
> -	if (!x && !error && !acquire_in_progress &&
> -	    ((x = xfrm_state_alloc()) != NULL)) {
> +	if (!x && !error && !acquire_in_progress) {
> +		x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto);

It just occured to me that it would be much simpler if you did the
existence check in the first loop.

So something like

		if (x->props.family != family ||
		    !xfrm_state_addr_check(x, daddr, saddr, family) ||
		    tmpl->id.proto == x->id.proto)
			continue;
		if (tmpl->id.spi) {
			if (tmpl->id.spi != x->id.spi)
				continue;
			error = -EEXIST;
		}
		if (x->props.reqid == tmpl->reqid &&
		    tmpl->mode == x->props.mode) {
		}

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC] Move xfrm_flush_bundles into xfrm_state GC
  2005-03-28 23:30         ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu
  2005-03-31  0:10           ` Patrick McHardy
@ 2005-04-01  5:21           ` David S. Miller
  1 sibling, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-04-01  5:21 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev

On Tue, 29 Mar 2005 09:30:32 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> The locking in xfrm_state/xfrm_policy has always struck me as being
> an overkill.  A lot of the locks should be replaced by rules that
> ensure the validity of most operations while a ref count is held.
> Now I have an excuse to do just that :)
> 
> For 2.6.12 let's go for a simpler fix that breaks the dead lock.
> 
> __xfrm_state_delete does not need to flush the bundles immediately.
> In fact, it is more efficient if we delay the flush to the GC worker
> since the flush is not dependent on any particular xfrm state.  By
> delaying it we can do one single flush even when you're deleteing
> the entire xfrm state list.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Looks good, patch applied, thanks Herbert.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Checking SPI in xfrm_state_find
  2005-03-31  0:46             ` Herbert Xu
@ 2005-04-01  5:23               ` David S. Miller
  2005-04-02  0:49                 ` [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel Herbert Xu
  2005-04-03 17:00               ` Checking SPI in xfrm_state_find Patrick McHardy
  1 sibling, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-04-01  5:23 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev

On Thu, 31 Mar 2005 10:46:58 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> > # This is a BitKeeper generated diff -Nru style patch.
> > #
> > # ChangeSet
> > #   2005/03/30 06:02:45+02:00 kaber@coreworks.de 
> > #   [IPSEC]: Check SPI in xfrm_state_find()
> > #   
> > #   Signed-off-by: Patrick McHardy <kaber@trash.net>
> 
> Looks good.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

To me too, both patches applied, thanks Patrick.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-01  5:23               ` David S. Miller
@ 2005-04-02  0:49                 ` Herbert Xu
  2005-04-02  1:20                   ` David S. Miller
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-04-02  0:49 UTC (permalink / raw)
  To: David S. Miller; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 1336 bytes --]

Hi Dave:

On Thu, Mar 31, 2005 at 09:23:25PM -0800, David S. Miller wrote:
> On Thu, 31 Mar 2005 10:46:58 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > > # This is a BitKeeper generated diff -Nru style patch.
> > > #
> > > # ChangeSet
> > > #   2005/03/30 06:02:45+02:00 kaber@coreworks.de 
> > > #   [IPSEC]: Check SPI in xfrm_state_find()
> > > #   
> > > #   Signed-off-by: Patrick McHardy <kaber@trash.net>
> > 
> > Looks good.
> > 
> > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> 
> To me too, both patches applied, thanks Patrick.

Actually I only signed off on the first patch :)

The second patch creates a dead lock since it does a nested read
lock.  The solution is simply to get rid of xfrm_init_tempsel
and call the afinfo version directly.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

BTW I'd like to start cleaning up the locking in net/xfrm.
I don't want these changes to go into 2.6.12.  However, I'd
like to have them sit in mm for a while so that they get some
testing coverage.

What's the best way to do this? Could you create a tree slated
for 2.6.13?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 1024 bytes --]

===== net/xfrm/xfrm_state.c 1.60 vs edited =====
--- 1.60/net/xfrm/xfrm_state.c	2005-04-01 15:19:54 +10:00
+++ edited/net/xfrm/xfrm_state.c	2005-04-02 10:35:06 +10:00
@@ -283,20 +283,6 @@
 }
 EXPORT_SYMBOL(xfrm_state_flush);
 
-static int
-xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
-		  struct xfrm_tmpl *tmpl,
-		  xfrm_address_t *daddr, xfrm_address_t *saddr,
-		  unsigned short family)
-{
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	if (!afinfo)
-		return -1;
-	afinfo->init_tempsel(x, fl, tmpl, daddr, saddr);
-	xfrm_state_put_afinfo(afinfo);
-	return 0;
-}
-
 struct xfrm_state *
 xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, 
 		struct flowi *fl, struct xfrm_tmpl *tmpl,
@@ -370,7 +356,7 @@
 		}
 		/* Initialize temporary selector matching only
 		 * to current session. */
-		xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
+		afinfo->init_tempsel(x, fl, tmpl, daddr, saddr);
 
 		if (km_query(x, tmpl, pol) == 0) {
 			x->km.state = XFRM_STATE_ACQ;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-02  0:49                 ` [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel Herbert Xu
@ 2005-04-02  1:20                   ` David S. Miller
  2005-04-02  2:09                     ` Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: David S. Miller @ 2005-04-02  1:20 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev

On Sat, 2 Apr 2005 10:49:56 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> The second patch creates a dead lock since it does a nested read
> lock.  The solution is simply to get rid of xfrm_init_tempsel
> and call the afinfo version directly.

read locks nest even in the presence of pending writers

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-02  1:20                   ` David S. Miller
@ 2005-04-02  2:09                     ` Herbert Xu
  2005-04-03 16:48                       ` Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-04-02  2:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev

On Fri, Apr 01, 2005 at 05:20:07PM -0800, David S. Miller wrote:
> On Sat, 2 Apr 2005 10:49:56 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > The second patch creates a dead lock since it does a nested read
> > lock.  The solution is simply to get rid of xfrm_init_tempsel
> > and call the afinfo version directly.
> 
> read locks nest even in the presence of pending writers

Doh! I should've read the code first :)

It's still a valid clean-up patch though.

There is another reason why it won't dead lock.  We don't actually
ever hold the write lock on afinfo :) Is there any reason why we
dont't just use xfrm_state_afinfo_lock instead of afinfo->lock?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-02  2:09                     ` Herbert Xu
@ 2005-04-03 16:48                       ` Patrick McHardy
  2005-04-05 10:39                         ` Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-04-03 16:48 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 779 bytes --]

Herbert Xu wrote:
> It's still a valid clean-up patch though.

Agreed. There is also a bug in my patch, tmpl->daddr can be 0 in which
case the daddr passed as an argument to xfrm_state_find() will be used.
My patch only checked tmpl->daddr, this patch fixes it. It also uses
afinfo->init_tempsel directly, but I didn't kill xfrm_init_tempsel() yet
because I need it for xfrm resolution.

> There is another reason why it won't dead lock.  We don't actually
> ever hold the write lock on afinfo :) Is there any reason why we
> dont't just use xfrm_state_afinfo_lock instead of afinfo->lock?

I don't think so. I also don't see a reason why the lock needs to be
held between xfrm_state_get_afinfo() and xfrm_state_put_afinfo(),
a reference count should be enough.

Regards
Patrick

[-- Attachment #2: x --]
[-- Type: text/plain, Size: 1698 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/04/03 18:41:22+02:00 kaber@coreworks.de 
#   [IPSEC]: Use correct daddr for duplicate state check
#   
#   Signed-off-by: Patrick McHardy <kaber@trash.net>
# 
# net/xfrm/xfrm_state.c
#   2005/04/03 18:41:14+02:00 kaber@coreworks.de +9 -9
#   [IPSEC]: Use correct daddr for duplicate state check
#   
#   Signed-off-by: Patrick McHardy <kaber@trash.net>
# 
diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
--- a/net/xfrm/xfrm_state.c	2005-04-03 18:41:41 +02:00
+++ b/net/xfrm/xfrm_state.c	2005-04-03 18:41:41 +02:00
@@ -357,12 +357,6 @@
 
 	x = best;
 	if (!x && !error && !acquire_in_progress) {
-		x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto);
-		if (x0 != NULL) {
-			xfrm_state_put(x0);
-			error = -EEXIST;
-			goto out;
-		}
 		x = xfrm_state_alloc();
 		if (x == NULL) {
 			error = -ENOMEM;
@@ -370,9 +364,11 @@
 		}
 		/* Initialize temporary selector matching only
 		 * to current session. */
-		xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
+		afinfo->init_tempsel(x, fl, tmpl, daddr, saddr);
+
+		x0 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto);
 
-		if (km_query(x, tmpl, pol) == 0) {
+		if (!x0 && km_query(x, tmpl, pol) == 0) {
 			x->km.state = XFRM_STATE_ACQ;
 			list_add_tail(&x->bydst, xfrm_state_bydst+h);
 			xfrm_state_hold(x);
@@ -386,10 +382,14 @@
 			x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
 			add_timer(&x->timer);
 		} else {
+			error = -ESRCH;
+			if (x0) {
+				xfrm_state_put(x0);
+				error = -EEXIST;
+			}
 			x->km.state = XFRM_STATE_DEAD;
 			xfrm_state_put(x);
 			x = NULL;
-			error = -ESRCH;
 		}
 	}
 out:

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: Checking SPI in xfrm_state_find
  2005-03-31  0:46             ` Herbert Xu
  2005-04-01  5:23               ` David S. Miller
@ 2005-04-03 17:00               ` Patrick McHardy
  1 sibling, 0 replies; 114+ messages in thread
From: Patrick McHardy @ 2005-04-03 17:00 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	YOSHIFUJI Hideaki, netdev

Herbert Xu wrote:
> It just occured to me that it would be much simpler if you did the
> existence check in the first loop.
> 
> So something like
> 
> 		if (x->props.family != family ||
> 		    !xfrm_state_addr_check(x, daddr, saddr, family) ||
> 		    tmpl->id.proto == x->id.proto)
> 			continue;
> 		if (tmpl->id.spi) {
> 			if (tmpl->id.spi != x->id.spi)
> 				continue;
> 			error = -EEXIST;
> 		}
> 		if (x->props.reqid == tmpl->reqid &&
> 		    tmpl->mode == x->props.mode) {
> 		}

You're right, sorry for getting back to you so late. But since its
already in now and not very important, I'm going to leave it until
I have a better reason to touch that code, if you're ok with that.

Regards
Patrick

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-03 16:48                       ` Patrick McHardy
@ 2005-04-05 10:39                         ` Herbert Xu
  2005-04-05 20:01                           ` Patrick McHardy
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-04-05 10:39 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev

On Sun, Apr 03, 2005 at 06:48:17PM +0200, Patrick McHardy wrote:
> 
> Agreed. There is also a bug in my patch, tmpl->daddr can be 0 in which
> case the daddr passed as an argument to xfrm_state_find() will be used.
> My patch only checked tmpl->daddr, this patch fixes it. It also uses

Why not just use daddr? It's always guaranteed to be correct.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-05 10:39                         ` Herbert Xu
@ 2005-04-05 20:01                           ` Patrick McHardy
  2005-04-06  2:21                             ` Herbert Xu
  0 siblings, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-04-05 20:01 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 581 bytes --]

Herbert Xu wrote:
> On Sun, Apr 03, 2005 at 06:48:17PM +0200, Patrick McHardy wrote:
> 
>>Agreed. There is also a bug in my patch, tmpl->daddr can be 0 in which
>>case the daddr passed as an argument to xfrm_state_find() will be used.
>>My patch only checked tmpl->daddr, this patch fixes it. It also uses
> 
> 
> Why not just use daddr? It's always guaranteed to be correct.

Good idea, I didn't think of this. This patch is without replacement of
xfrm_init_tempsel(), it caused an "unused function" warning and, as I
mentioned before, I still need the function.

Regards
Patrick

[-- Attachment #2: x --]
[-- Type: text/plain, Size: 457 bytes --]

===== net/xfrm/xfrm_state.c 1.60 vs edited =====
--- 1.60/net/xfrm/xfrm_state.c	2005-04-01 07:19:54 +02:00
+++ edited/net/xfrm/xfrm_state.c	2005-04-05 21:58:08 +02:00
@@ -357,7 +357,7 @@
 
 	x = best;
 	if (!x && !error && !acquire_in_progress) {
-		x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto);
+		x0 = afinfo->state_lookup(daddr, tmpl->id.spi, tmpl->id.proto);
 		if (x0 != NULL) {
 			xfrm_state_put(x0);
 			error = -EEXIST;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-05 20:01                           ` Patrick McHardy
@ 2005-04-06  2:21                             ` Herbert Xu
  2005-04-21 23:35                               ` David S. Miller
  0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-04-06  2:21 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev

On Tue, Apr 05, 2005 at 10:01:38PM +0200, Patrick McHardy wrote:
> 
> Good idea, I didn't think of this. This patch is without replacement of
> xfrm_init_tempsel(), it caused an "unused function" warning and, as I
> mentioned before, I still need the function.

Thanks.  Just one more issue that I can think of, the check should
only be done when tmpl->id.spi != 0.  Otherwise the presence of
valid states with differing state selectors will prevent new
sessions from starting up.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-06  2:21                             ` Herbert Xu
@ 2005-04-21 23:35                               ` David S. Miller
  2005-04-21 23:52                                 ` Herbert Xu
  2005-04-21 23:53                                 ` Patrick McHardy
  0 siblings, 2 replies; 114+ messages in thread
From: David S. Miller @ 2005-04-21 23:35 UTC (permalink / raw)
  To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev

On Wed, 6 Apr 2005 12:21:55 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Tue, Apr 05, 2005 at 10:01:38PM +0200, Patrick McHardy wrote:
> > 
> > Good idea, I didn't think of this. This patch is without replacement of
> > xfrm_init_tempsel(), it caused an "unused function" warning and, as I
> > mentioned before, I still need the function.
> 
> Thanks.  Just one more issue that I can think of, the check should
> only be done when tmpl->id.spi != 0.  Otherwise the presence of
> valid states with differing state selectors will prevent new
> sessions from starting up.

Is it really worthwhile, right now, to change that tmpl->id.daddr to
daddr?  That seems to be all that Patrick's most recent patch does.

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-21 23:35                               ` David S. Miller
@ 2005-04-21 23:52                                 ` Herbert Xu
  2005-04-21 23:53                                 ` Patrick McHardy
  1 sibling, 0 replies; 114+ messages in thread
From: Herbert Xu @ 2005-04-21 23:52 UTC (permalink / raw)
  To: David S. Miller; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev

On Thu, Apr 21, 2005 at 04:35:26PM -0700, David S. Miller wrote:
>
> > Thanks.  Just one more issue that I can think of, the check should
> > only be done when tmpl->id.spi != 0.  Otherwise the presence of
> > valid states with differing state selectors will prevent new
> > sessions from starting up.
> 
> Is it really worthwhile, right now, to change that tmpl->id.daddr to
> daddr?  That seems to be all that Patrick's most recent patch does.

Yes, because tmpl->id.daddr can be zero when daddr is not zero.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-21 23:35                               ` David S. Miller
  2005-04-21 23:52                                 ` Herbert Xu
@ 2005-04-21 23:53                                 ` Patrick McHardy
  2005-04-22  3:13                                   ` David S. Miller
  1 sibling, 1 reply; 114+ messages in thread
From: Patrick McHardy @ 2005-04-21 23:53 UTC (permalink / raw)
  To: David S. Miller; +Cc: Herbert Xu, kuznet, jmorris, yoshfuji, netdev

[-- Attachment #1: Type: text/plain, Size: 844 bytes --]

David S. Miller wrote:
> On Wed, 6 Apr 2005 12:21:55 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
>>Thanks.  Just one more issue that I can think of, the check should
>>only be done when tmpl->id.spi != 0.  Otherwise the presence of
>>valid states with differing state selectors will prevent new
>>sessions from starting up.
> 
> Is it really worthwhile, right now, to change that tmpl->id.daddr to
> daddr?  That seems to be all that Patrick's most recent patch does.

Yes, tmpl->id.daddr might be 0, in which case the destination
of the packet or previous tunnel mode transforms is used. daddr
always contains the correct adress, so we should use it to check
for duplicate SPIs. But as Herbert noted, we shouldn't perform
the check if tmpl->id.spi == 0, so here is a new patch.

Signed-off-by: Patrick McHardy <kaber@trash.net>

[-- Attachment #2: x --]
[-- Type: text/plain, Size: 544 bytes --]

===== net/xfrm/xfrm_state.c 1.60 vs edited =====
--- 1.60/net/xfrm/xfrm_state.c	2005-04-01 07:19:54 +02:00
+++ edited/net/xfrm/xfrm_state.c	2005-04-22 01:51:37 +02:00
@@ -357,8 +357,9 @@
 
 	x = best;
 	if (!x && !error && !acquire_in_progress) {
-		x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto);
-		if (x0 != NULL) {
+		if (tmpl->id.spi &&
+		    (x0 = afinfo->state_lookup(daddr, tmpl->id.spi,
+		                               tmpl->id.proto)) != NULL) {
 			xfrm_state_put(x0);
 			error = -EEXIST;
 			goto out;

^ permalink raw reply	[flat|nested] 114+ messages in thread

* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel
  2005-04-21 23:53                                 ` Patrick McHardy
@ 2005-04-22  3:13                                   ` David S. Miller
  0 siblings, 0 replies; 114+ messages in thread
From: David S. Miller @ 2005-04-22  3:13 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: herbert, kuznet, jmorris, yoshfuji, netdev

On Fri, 22 Apr 2005 01:53:43 +0200
Patrick McHardy <kaber@trash.net> wrote:

> Yes, tmpl->id.daddr might be 0, in which case the destination
> of the packet or previous tunnel mode transforms is used. daddr
> always contains the correct adress, so we should use it to check
> for duplicate SPIs. But as Herbert noted, we shouldn't perform
> the check if tmpl->id.spi == 0, so here is a new patch.
> 
> Signed-off-by: Patrick McHardy <kaber@trash.net>

Ok, that makes sense.  Patch applied, thanks guys.

^ permalink raw reply	[flat|nested] 114+ messages in thread

end of thread, other threads:[~2005-04-22  3:13 UTC | newest]

Thread overview: 114+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-02-14 22:10 [1/4] [IPSEC] Merge xfrm[46]_bundle/stale_bundle Herbert Xu
2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu
2005-02-14 22:14   ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu
2005-02-14 22:16     ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu
2005-02-15 15:53       ` James Morris
2005-02-15 20:31         ` Herbert Xu
2005-02-16 10:37       ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu
2005-02-16 11:08         ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu
2005-02-16 11:38           ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu
2005-03-07  5:47             ` David S. Miller
2005-03-07 10:41               ` Herbert Xu
2005-03-07  5:35           ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller
2005-03-07 10:39             ` Herbert Xu
2005-03-07  5:33         ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output David S. Miller
2005-03-07 11:45         ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu
2005-03-07 17:33           ` David S. Miller
2005-03-07  5:32       ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller
2005-03-07 10:35         ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu
2005-03-07 17:32           ` David S. Miller
2005-03-08 10:27           ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu
2005-03-08 12:50             ` YOSHIFUJI Hideaki / 吉藤英明
2005-03-11  2:17             ` David S. Miller
2005-03-14 10:26             ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu
2005-03-14 10:53               ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu
2005-03-14 11:10                 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu
2005-03-15  5:27                   ` David S. Miller
2005-03-15  9:19                   ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu
2005-03-15  9:58                     ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu
2005-03-15 10:05                       ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
2005-03-15 18:24                         ` David S. Miller
2005-03-15 19:02                           ` Patrick McHardy
2005-03-15 20:40                             ` Replace send_unreach with icmp_send Herbert Xu
2005-03-15 20:48                               ` Patrick McHardy
2005-03-16 10:51                                 ` [IPV4] Make ipt_REJECT use icmp_send again Herbert Xu
2005-03-16 19:00                                   ` Patrick McHardy
2005-03-16 22:44                                     ` David S. Miller
2005-03-17 10:51                                       ` [IPV4] Send TCP reset through dst_output in ipt_REJECT Herbert Xu
2005-03-17 18:06                                         ` David S. Miller
2005-03-15 20:31                           ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu
2005-03-15 10:20                       ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek
2005-03-15 10:27                         ` Herbert Xu
2005-03-15 18:20                       ` David S. Miller
2005-03-18  9:03                       ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu
2005-03-18  9:11                         ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
2005-03-18  9:19                           ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu
2005-03-18 10:07                             ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu
2005-03-18 10:11                               ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu
2005-03-18 11:06                                 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu
2005-03-18 11:28                                   ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu
2005-03-18 18:47                                     ` David S. Miller
2005-03-18 18:46                                   ` [26/*] [NET] Kill dst_pmtu/dst_path_metric David S. Miller
2005-03-18 18:44                                 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric David S. Miller
2005-03-18 18:43                               ` [24/*] [IPSEC] Get ttl from child instead of path David S. Miller
2005-03-18 18:41                             ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu David S. Miller
2005-03-18 18:40                           ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
2005-03-20 15:46                             ` Patrick McHardy
2005-03-20 16:32                               ` Ludo Stellingwerff
2005-03-20 17:17                                 ` Lennert Buytenhek
2005-03-20 17:49                                   ` Patrick McHardy
2005-03-20 18:11                                     ` Ludo Stellingwerff
2005-03-20 18:22                                       ` Patrick McHardy
2005-03-20 18:43                                         ` jamal
2005-03-20 19:10                                           ` Patrick McHardy
2005-03-30  9:49                                     ` Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS) Herbert Xu
2005-03-23  3:49                               ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller
2005-03-23  4:03                                 ` Patrick McHardy
2005-03-24  5:05                                   ` Netfilter+IPsec Patrick McHardy
2005-03-24  5:43                                     ` Netfilter+IPsec David S. Miller
2005-03-25  2:53                                       ` Netfilter+IPsec Herbert Xu
2005-03-25  5:10                                         ` Netfilter+IPsec Patrick McHardy
2005-03-23  9:24                               ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu
2005-03-18 18:39                         ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit David S. Miller
2005-03-15 18:18                     ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data David S. Miller
2005-03-16 11:31                     ` Herbert Xu
2005-03-16 22:02                       ` David S. Miller
2005-03-21 16:14                       ` Mika Penttilä
2005-03-21 20:28                         ` Herbert Xu
2005-03-21 21:29                           ` Mika Penttilä
2005-03-21 22:04                             ` Herbert Xu
2005-03-15  5:26                 ` [13/*] [IPV4] Fix room calculation in icmp_send David S. Miller
2005-03-15  5:25               ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller
2005-03-15 18:25                 ` YOSHIFUJI Hideaki / 吉藤英明
2005-03-15 18:28                   ` YOSHIFUJI Hideaki / 吉藤英明
2005-03-28 20:10       ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy
2005-03-28 23:30         ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu
2005-03-31  0:10           ` Patrick McHardy
2005-04-01  5:21           ` David S. Miller
2005-03-28 23:39         ` Checking SPI in xfrm_state_find Herbert Xu
2005-03-31  0:13           ` Patrick McHardy
2005-03-31  0:46             ` Herbert Xu
2005-04-01  5:23               ` David S. Miller
2005-04-02  0:49                 ` [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel Herbert Xu
2005-04-02  1:20                   ` David S. Miller
2005-04-02  2:09                     ` Herbert Xu
2005-04-03 16:48                       ` Patrick McHardy
2005-04-05 10:39                         ` Herbert Xu
2005-04-05 20:01                           ` Patrick McHardy
2005-04-06  2:21                             ` Herbert Xu
2005-04-21 23:35                               ` David S. Miller
2005-04-21 23:52                                 ` Herbert Xu
2005-04-21 23:53                                 ` Patrick McHardy
2005-04-22  3:13                                   ` David S. Miller
2005-04-03 17:00               ` Checking SPI in xfrm_state_find Patrick McHardy
2005-02-15  8:10     ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä
2005-02-15  9:53       ` Herbert Xu
2005-02-15 10:22         ` Mika Penttilä
2005-03-07  5:28     ` David S. Miller
2005-03-07 10:02       ` Herbert Xu
2005-03-07 10:16     ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu
2005-03-07 17:35       ` David S. Miller
2005-03-14 11:52     ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy
2005-03-14 20:32       ` Herbert Xu
2005-03-15 19:05         ` Patrick McHardy
2005-03-07  5:23   ` [2/4] [IPSEC] Add xfrm_state_mtu David S. Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).