* [1/4] [IPSEC] Merge xfrm[46]_bundle/stale_bundle
@ 2005-02-14 22:10 Herbert Xu
2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu
0 siblings, 1 reply; 114+ messages in thread
From: Herbert Xu @ 2005-02-14 22:10 UTC (permalink / raw)
To: David S. Miller, Alexey Kuznetsov, James Morris,
YOSHIFUJI Hideaki, netdev
[-- Attachment #1: Type: text/plain, Size: 591 bytes --]
Hi:
This series of patches adds MTU tracking in each xfrm_dst.
This patch merges xfrm4_bundle_ok/xfrm6_bundle_ok/stale_bundle
so that later additions for MTU calculation only need to be done once.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
BTW, the previous version of this patch contained a critical
error in stale_bundle. So please disregard that one.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
[-- Attachment #2: xfrm-1 --]
[-- Type: text/plain, Size: 4208 bytes --]
diff -Nru a/include/net/xfrm.h b/include/net/xfrm.h
--- a/include/net/xfrm.h 2005-02-14 14:13:05 +11:00
+++ b/include/net/xfrm.h 2005-02-14 14:13:05 +11:00
@@ -857,6 +857,7 @@
extern void xfrm_policy_flush(void);
extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
extern int xfrm_flush_bundles(void);
+extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family);
extern wait_queue_head_t km_waitq;
extern int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
diff -Nru a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
--- a/net/ipv4/xfrm4_policy.c 2005-02-14 14:13:05 +11:00
+++ b/net/ipv4/xfrm4_policy.c 2005-02-14 14:13:05 +11:00
@@ -22,26 +22,6 @@
return __ip_route_output_key((struct rtable**)dst, fl);
}
-/* Check that the bundle accepts the flow and its components are
- * still valid.
- */
-
-static int __xfrm4_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl)
-{
- do {
- if (xdst->u.dst.ops != &xfrm4_dst_ops)
- return 1;
-
- if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET))
- return 0;
- if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID ||
- xdst->u.dst.path->obsolete > 0)
- return 0;
- xdst = (struct xfrm_dst*)xdst->u.dst.child;
- } while (xdst);
- return 0;
-}
-
static struct dst_entry *
__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
{
@@ -53,7 +33,7 @@
if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
xdst->u.rt.fl.fl4_src == fl->fl4_src &&
- __xfrm4_bundle_ok(xdst, fl)) {
+ xfrm_bundle_ok(xdst, fl, AF_INET)) {
dst_clone(dst);
break;
}
diff -Nru a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
--- a/net/ipv6/xfrm6_policy.c 2005-02-14 14:13:05 +11:00
+++ b/net/ipv6/xfrm6_policy.c 2005-02-14 14:13:05 +11:00
@@ -31,26 +31,6 @@
return err;
}
-/* Check that the bundle accepts the flow and its components are
- * still valid.
- */
-
-static int __xfrm6_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl)
-{
- do {
- if (xdst->u.dst.ops != &xfrm6_dst_ops)
- return 1;
-
- if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET6))
- return 0;
- if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID ||
- xdst->u.dst.path->obsolete > 0)
- return 0;
- xdst = (struct xfrm_dst*)xdst->u.dst.child;
- } while (xdst);
- return 0;
-}
-
static struct dst_entry *
__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
{
@@ -70,7 +50,7 @@
xdst->u.rt6.rt6i_src.plen);
if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
- __xfrm6_bundle_ok(xdst, fl)) {
+ xfrm_bundle_ok(xdst, fl, AF_INET6)) {
dst_clone(dst);
break;
}
diff -Nru a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
--- a/net/xfrm/xfrm_policy.c 2005-02-14 14:13:05 +11:00
+++ b/net/xfrm/xfrm_policy.c 2005-02-14 14:13:05 +11:00
@@ -1021,18 +1021,7 @@
static int stale_bundle(struct dst_entry *dst)
{
- struct dst_entry *child = dst;
-
- while (child) {
- if (child->obsolete > 0 ||
- (child->dev && !netif_running(child->dev)) ||
- (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) {
- return 1;
- }
- child = child->child;
- }
-
- return 0;
+ return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC);
}
static void xfrm_dst_destroy(struct dst_entry *dst)
@@ -1108,6 +1097,31 @@
return 0;
}
+/* Check that the bundle accepts the flow and its components are
+ * still valid.
+ */
+
+int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family)
+{
+ struct dst_entry *dst = &xdst->u.dst;
+
+ if (dst->path->obsolete > 0 ||
+ (dst->dev && !netif_running(dst->dev)))
+ return 0;
+
+ do {
+ if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
+ return 0;
+ if (dst->xfrm->km.state != XFRM_STATE_VALID)
+ return 0;
+ dst = dst->child;
+ } while (dst->xfrm);
+
+ return 1;
+}
+
+EXPORT_SYMBOL(xfrm_bundle_ok);
+
/* Well... that's _TASK_. We need to scan through transformation
* list and figure out what mss tcp should generate in order to
* final datagram fit to mtu. Mama mia... :-)
^ permalink raw reply [flat|nested] 114+ messages in thread* [2/4] [IPSEC] Add xfrm_state_mtu 2005-02-14 22:10 [1/4] [IPSEC] Merge xfrm[46]_bundle/stale_bundle Herbert Xu @ 2005-02-14 22:12 ` Herbert Xu 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu 2005-03-07 5:23 ` [2/4] [IPSEC] Add xfrm_state_mtu David S. Miller 0 siblings, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-02-14 22:12 UTC (permalink / raw) To: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 511 bytes --] This patch introduces the function xfrm_state_mtu which calculates the MTU under a specific SA. This function can be simplified once we get rid all other uses of get_max_size as we can move the calculation into the invidual SA type. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-2 --] [-- Type: text/plain, Size: 1367 bytes --] ===== include/net/xfrm.h 1.75 vs edited ===== --- 1.75/include/net/xfrm.h 2005-02-14 14:02:13 +11:00 +++ edited/include/net/xfrm.h 2005-02-14 14:11:38 +11:00 @@ -807,6 +807,7 @@ extern int xfrm_replay_check(struct xfrm_state *x, u32 seq); extern void xfrm_replay_advance(struct xfrm_state *x, u32 seq); extern int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb); +extern int xfrm_state_mtu(struct xfrm_state *x, int mtu); extern int xfrm4_rcv(struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler); ===== net/xfrm/xfrm_state.c 1.54 vs edited ===== --- 1.54/net/xfrm/xfrm_state.c 2005-02-09 11:17:47 +11:00 +++ edited/net/xfrm/xfrm_state.c 2005-02-14 14:06:17 +11:00 @@ -966,6 +966,36 @@ } EXPORT_SYMBOL(xfrm_state_delete_tunnel); +int xfrm_state_mtu(struct xfrm_state *x, int mtu) +{ + int res = mtu; + + res -= x->props.header_len; + + for (;;) { + int m = res; + + if (m < 68) + return 68; + + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_VALID && + x->type && x->type->get_max_size) + m = x->type->get_max_size(x, m); + else + m += x->props.header_len; + spin_unlock_bh(&x->lock); + + if (m <= mtu) + break; + res -= (m - mtu); + } + + return res; +} + +EXPORT_SYMBOL(xfrm_state_mtu); + void __init xfrm_state_init(void) { int i; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [3/4] [IPSEC] Add route element to xfrm_dst 2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu @ 2005-02-14 22:14 ` Herbert Xu 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu ` (4 more replies) 2005-03-07 5:23 ` [2/4] [IPSEC] Add xfrm_state_mtu David S. Miller 1 sibling, 5 replies; 114+ messages in thread From: Herbert Xu @ 2005-02-14 22:14 UTC (permalink / raw) To: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 524 bytes --] This patch adds a pointer to the route corresponding to the specific flow over the SA of an xfrm_dst that's being used. It also sets the next pointer of each xfrm_dst to the one above it. This allows to traverse the list upwards from the bottom. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-3 --] [-- Type: text/plain, Size: 5915 bytes --] ===== include/net/xfrm.h 1.76 vs edited ===== --- 1.76/include/net/xfrm.h 2005-02-14 14:14:25 +11:00 +++ edited/include/net/xfrm.h 2005-02-14 14:24:04 +11:00 @@ -511,6 +511,7 @@ struct rtable rt; struct rt6_info rt6; } u; + struct dst_entry *route; }; /* Decapsulation state, used by the input to store data during ===== net/ipv4/xfrm4_policy.c 1.14 vs edited ===== --- 1.14/net/ipv4/xfrm4_policy.c 2005-02-14 14:02:13 +11:00 +++ edited/net/ipv4/xfrm4_policy.c 2005-02-14 14:29:35 +11:00 @@ -55,18 +55,29 @@ struct rtable *rt = rt0; u32 remote = fl->fl4_dst; u32 local = fl->fl4_src; + struct flowi fl_tunnel = { + .nl_u = { + .ip4_u = { + .saddr = local, + .daddr = remote + } + } + }; int i; int err; int header_len = 0; int trailer_len = 0; dst = dst_prev = NULL; + dst_hold(&rt->u.dst); for (i = 0; i < nx; i++) { struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); + struct xfrm_dst *xdst; if (unlikely(dst1 == NULL)) { err = -ENOBUFS; + dst_release(&rt->u.dst); goto error; } @@ -77,6 +88,11 @@ dst1->flags |= DST_NOHASH; dst_clone(dst1); } + + xdst = (struct xfrm_dst *)dst1; + xdst->route = &rt->u.dst; + + dst1->next = dst_prev; dst_prev = dst1; if (xfrm[i]->props.mode) { remote = xfrm[i]->id.daddr.a4; @@ -84,23 +100,27 @@ } header_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; - } - if (remote != fl->fl4_dst) { - struct flowi fl_tunnel = { .nl_u = { .ip4_u = - { .daddr = remote, - .saddr = local } - } - }; - err = xfrm_dst_lookup((struct xfrm_dst**)&rt, &fl_tunnel, AF_INET); - if (err) - goto error; - } else { - dst_hold(&rt->u.dst); + if (remote != fl_tunnel.fl4_dst) { + fl_tunnel.fl4_src = local; + fl_tunnel.fl4_dst = remote; + err = xfrm_dst_lookup((struct xfrm_dst **)&rt, + &fl_tunnel, AF_INET); + if (err) + goto error; + } else + dst_hold(&rt->u.dst); } + dst_prev->child = &rt->u.dst; + dst->path = &rt->u.dst; + + *dst_p = dst; + dst = dst_prev; + + dst_prev = *dst_p; i = 0; - for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; x->u.rt.fl = *fl; @@ -114,7 +134,6 @@ dst_prev->header_len = header_len; dst_prev->trailer_len = trailer_len; memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics)); - dst_prev->path = &rt->u.dst; /* Copy neighbout for reachability confirmation */ dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); @@ -134,7 +153,6 @@ header_len -= x->u.dst.xfrm->props.header_len; trailer_len -= x->u.dst.xfrm->props.trailer_len; } - *dst_p = dst; return 0; error: ===== net/ipv6/xfrm6_policy.c 1.25 vs edited ===== --- 1.25/net/ipv6/xfrm6_policy.c 2005-02-14 14:02:13 +11:00 +++ edited/net/ipv6/xfrm6_policy.c 2005-02-14 14:50:04 +11:00 @@ -72,18 +72,29 @@ struct rt6_info *rt = rt0; struct in6_addr *remote = &fl->fl6_dst; struct in6_addr *local = &fl->fl6_src; + struct flowi fl_tunnel = { + .nl_u = { + .ip6_u = { + .saddr = *local, + .daddr = *remote + } + } + }; int i; int err = 0; int header_len = 0; int trailer_len = 0; dst = dst_prev = NULL; + dst_hold(&rt->u.dst); for (i = 0; i < nx; i++) { struct dst_entry *dst1 = dst_alloc(&xfrm6_dst_ops); + struct xfrm_dst *xdst; if (unlikely(dst1 == NULL)) { err = -ENOBUFS; + dst_release(&rt->u.dst); goto error; } @@ -94,6 +105,11 @@ dst1->flags |= DST_NOHASH; dst_clone(dst1); } + + xdst = (struct xfrm_dst *)dst1; + xdst->route = &rt->u.dst; + + dst1->next = dst_prev; dst_prev = dst1; if (xfrm[i]->props.mode) { remote = (struct in6_addr*)&xfrm[i]->id.daddr; @@ -101,25 +117,27 @@ } header_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; - } - - if (!ipv6_addr_equal(remote, &fl->fl6_dst)) { - struct flowi fl_tunnel; - memset(&fl_tunnel, 0, sizeof(fl_tunnel)); - ipv6_addr_copy(&fl_tunnel.fl6_dst, remote); - ipv6_addr_copy(&fl_tunnel.fl6_src, local); - - err = xfrm_dst_lookup((struct xfrm_dst **) &rt, - &fl_tunnel, AF_INET6); - if (err) - goto error; - } else { - dst_hold(&rt->u.dst); + if (!ipv6_addr_equal(remote, &fl_tunnel.fl6_dst)) { + ipv6_addr_copy(&fl_tunnel.fl6_dst, remote); + ipv6_addr_copy(&fl_tunnel.fl6_src, local); + err = xfrm_dst_lookup((struct xfrm_dst **) &rt, + &fl_tunnel, AF_INET6); + if (err) + goto error; + } else + dst_hold(&rt->u.dst); } + dst_prev->child = &rt->u.dst; + dst->path = &rt->u.dst; + + *dst_p = dst; + dst = dst_prev; + + dst_prev = *dst_p; i = 0; - for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; dst_prev->xfrm = xfrm[i++]; @@ -132,7 +150,6 @@ dst_prev->header_len = header_len; dst_prev->trailer_len = trailer_len; memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics)); - dst_prev->path = &rt->u.dst; /* Copy neighbour for reachability confirmation */ dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); @@ -150,7 +167,6 @@ header_len -= x->u.dst.xfrm->props.header_len; trailer_len -= x->u.dst.xfrm->props.trailer_len; } - *dst_p = dst; return 0; error: ===== net/xfrm/xfrm_policy.c 1.65 vs edited ===== --- 1.65/net/xfrm/xfrm_policy.c 2005-02-14 14:02:13 +11:00 +++ edited/net/xfrm/xfrm_policy.c 2005-02-14 14:39:52 +11:00 @@ -1026,6 +1026,11 @@ static void xfrm_dst_destroy(struct dst_entry *dst) { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + if (xdst->route) + dst_release(xdst->route); + if (!dst->xfrm) return; xfrm_state_put(dst->xfrm); ^ permalink raw reply [flat|nested] 114+ messages in thread
* [4/4] [IPSEC] Store MTU at each xfrm_dst 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu @ 2005-02-14 22:16 ` Herbert Xu 2005-02-15 15:53 ` James Morris ` (3 more replies) 2005-02-15 8:10 ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä ` (3 subsequent siblings) 4 siblings, 4 replies; 114+ messages in thread From: Herbert Xu @ 2005-02-14 22:16 UTC (permalink / raw) To: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 514 bytes --] Finally this is the patch that sets the MTU values of each xfrm_dst and keeps it up-to-date. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> To recap, at this point we've obtained accurate MTU values at each xfrm_dst. The next step would be to start using it as dst_pmtu(xfrm_dst). Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-4 --] [-- Type: text/plain, Size: 3516 bytes --] ===== include/net/xfrm.h 1.77 vs edited ===== --- 1.77/include/net/xfrm.h 2005-02-14 14:51:26 +11:00 +++ edited/include/net/xfrm.h 2005-02-14 14:23:51 +11:00 @@ -512,6 +512,8 @@ struct rt6_info rt6; } u; struct dst_entry *route; + u32 route_mtu_cached; + u32 child_mtu_cached; }; /* Decapsulation state, used by the input to store data during @@ -860,6 +862,7 @@ extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); extern int xfrm_flush_bundles(void); extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family); +extern void xfrm_init_pmtu(struct dst_entry *dst); extern wait_queue_head_t km_waitq; extern int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport); ===== net/ipv4/xfrm4_policy.c 1.15 vs edited ===== --- 1.15/net/ipv4/xfrm4_policy.c 2005-02-14 14:51:26 +11:00 +++ edited/net/ipv4/xfrm4_policy.c 2005-02-14 14:53:06 +11:00 @@ -153,6 +153,8 @@ header_len -= x->u.dst.xfrm->props.header_len; trailer_len -= x->u.dst.xfrm->props.trailer_len; } + + xfrm_init_pmtu(dst); return 0; error: ===== net/ipv6/xfrm6_policy.c 1.26 vs edited ===== --- 1.26/net/ipv6/xfrm6_policy.c 2005-02-14 14:51:26 +11:00 +++ edited/net/ipv6/xfrm6_policy.c 2005-02-14 14:53:25 +11:00 @@ -167,6 +167,8 @@ header_len -= x->u.dst.xfrm->props.header_len; trailer_len -= x->u.dst.xfrm->props.trailer_len; } + + xfrm_init_pmtu(dst); return 0; error: ===== net/xfrm/xfrm_policy.c 1.66 vs edited ===== --- 1.66/net/xfrm/xfrm_policy.c 2005-02-14 14:51:26 +11:00 +++ edited/net/xfrm/xfrm_policy.c 2005-02-14 21:29:56 +11:00 @@ -1102,25 +1102,86 @@ return 0; } +void xfrm_init_pmtu(struct dst_entry *dst) +{ + do { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + u32 pmtu, route_mtu_cached; + + pmtu = dst_pmtu(dst->child); + xdst->child_mtu_cached = pmtu; + + pmtu = xfrm_state_mtu(dst->xfrm, pmtu); + + route_mtu_cached = dst_pmtu(xdst->route); + xdst->route_mtu_cached = route_mtu_cached; + + if (pmtu > route_mtu_cached) + pmtu = route_mtu_cached; + + dst->metrics[RTAX_MTU-1] = pmtu; + } while ((dst = dst->next)); +} + +EXPORT_SYMBOL(xfrm_init_pmtu); + /* Check that the bundle accepts the flow and its components are * still valid. */ -int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family) +int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family) { - struct dst_entry *dst = &xdst->u.dst; + struct dst_entry *dst = &first->u.dst; + struct xfrm_dst *last; + u32 mtu; if (dst->path->obsolete > 0 || (dst->dev && !netif_running(dst->dev))) return 0; + last = NULL; + do { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family)) return 0; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; + + mtu = dst_pmtu(dst->child); + if (xdst->child_mtu_cached != mtu) { + last = xdst; + xdst->child_mtu_cached = mtu; + } + + mtu = dst_pmtu(xdst->route); + if (xdst->child_mtu_cached != mtu) { + last = xdst; + xdst->route_mtu_cached = mtu; + } + dst = dst->child; } while (dst->xfrm); + + if (likely(!last)) + return 1; + + mtu = last->child_mtu_cached; + for (;;) { + dst = &last->u.dst; + + mtu = xfrm_state_mtu(dst->xfrm, mtu); + if (mtu > last->route_mtu_cached) + mtu = last->route_mtu_cached; + dst->metrics[RTAX_MTU-1] = mtu; + + if (last == first) + break; + + last = last->u.next; + last->child_mtu_cached = mtu; + } return 1; } ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu @ 2005-02-15 15:53 ` James Morris 2005-02-15 20:31 ` Herbert Xu 2005-02-16 10:37 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu ` (2 subsequent siblings) 3 siblings, 1 reply; 114+ messages in thread From: James Morris @ 2005-02-15 15:53 UTC (permalink / raw) To: Herbert Xu; +Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, netdev On Tue, 15 Feb 2005, Herbert Xu wrote: > Finally this is the patch that sets the MTU values of each xfrm_dst > and keeps it up-to-date. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > > To recap, at this point we've obtained accurate MTU values at each > xfrm_dst. The next step would be to start using it as > dst_pmtu(xfrm_dst). Nice work. The only thing I'd suggest is to consider renaming the new route element of xfrm_dst to xfrm_dst_route (or xd_route), to make it easier to navigate the code. (But the other elements are not distinctive either so perhaps some future cleanup could do it). - James -- James Morris <jmorris@redhat.com> ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst 2005-02-15 15:53 ` James Morris @ 2005-02-15 20:31 ` Herbert Xu 0 siblings, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-02-15 20:31 UTC (permalink / raw) To: James Morris; +Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, netdev On Tue, Feb 15, 2005 at 10:53:11AM -0500, James Morris wrote: > > The only thing I'd suggest is to consider renaming the new route element > of xfrm_dst to xfrm_dst_route (or xd_route), to make it easier to navigate > the code. (But the other elements are not distinctive either so perhaps > some future cleanup could do it). Thanks for your comment. I would say that not only are the other elements of xfrm_dst missing the prefixes, but the same thing applies to the rest of xfrm.h :) So if we decide to go that way, then we should do it to the entire xfrm subsystem. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu 2005-02-15 15:53 ` James Morris @ 2005-02-16 10:37 ` Herbert Xu 2005-02-16 11:08 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu ` (2 more replies) 2005-03-07 5:32 ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller 2005-03-28 20:10 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy 3 siblings, 3 replies; 114+ messages in thread From: Herbert Xu @ 2005-02-16 10:37 UTC (permalink / raw) To: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 673 bytes --] There was a lot of fun in laying the foundations. Now it's time to start using it. This patch fixes the calculations in xfrm[46]_output so that we don't reject properly sized packets from the TCP stack. The previous calculation is wrong because the xfrm overhead is not constant. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> After this I'll start cleaning stuff up. For example, tcp's ext2_header_len and dst's path can both be removed. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-5 --] [-- Type: text/plain, Size: 1256 bytes --] ===== include/net/dst.h 1.25 vs edited ===== --- 1.25/include/net/dst.h 2005-02-06 14:23:59 +11:00 +++ edited/include/net/dst.h 2005-02-16 21:29:56 +11:00 @@ -123,6 +123,16 @@ return mtu; } +static inline u32 dst_mtu(const struct dst_entry *dst) +{ + u32 mtu = dst_metric(dst, RTAX_MTU); + /* + * Alexey put it here, so ask him about it :) + */ + barrier(); + return mtu; +} + static inline int dst_metric_locked(struct dst_entry *dst, int metric) { ===== net/ipv4/xfrm4_output.c 1.5 vs edited ===== --- 1.5/net/ipv4/xfrm4_output.c 2004-10-26 09:10:25 +10:00 +++ edited/net/ipv4/xfrm4_output.c 2005-02-16 21:31:32 +11:00 @@ -82,7 +82,7 @@ goto out; dst = skb->dst; - mtu = dst_pmtu(dst) - dst->header_len - dst->trailer_len; + mtu = dst_mtu(dst); if (skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ret = -EMSGSIZE; ===== net/ipv6/xfrm6_output.c 1.7 vs edited ===== --- 1.7/net/ipv6/xfrm6_output.c 2004-11-16 13:52:34 +11:00 +++ edited/net/ipv6/xfrm6_output.c 2005-02-16 21:31:46 +11:00 @@ -79,7 +79,7 @@ int mtu, ret = 0; struct dst_entry *dst = skb->dst; - mtu = dst_pmtu(dst) - dst->header_len - dst->trailer_len; + mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst 2005-02-16 10:37 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu @ 2005-02-16 11:08 ` Herbert Xu 2005-02-16 11:38 ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu 2005-03-07 5:35 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller 2005-03-07 5:33 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output David S. Miller 2005-03-07 11:45 ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu 2 siblings, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-02-16 11:08 UTC (permalink / raw) To: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 655 bytes --] Let's also fix IPsec PMTU storage. When we get an MTU update for an xfrm_dst, it should be done to the top dst, not the bottom dst. For example, when we get a need-to-frag message for host C behind a our IPsec peer B, we should be updating the dst entry for C and not B as we do now. I've removed the boundary checks since the same checks are done in ipv[46]/route.c already. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-6 --] [-- Type: text/plain, Size: 1056 bytes --] ===== net/ipv4/xfrm4_policy.c 1.16 vs edited ===== --- 1.16/net/ipv4/xfrm4_policy.c 2005-02-14 14:59:25 +11:00 +++ edited/net/ipv4/xfrm4_policy.c 2005-02-16 22:01:56 +11:00 @@ -235,10 +235,8 @@ static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) { - struct dst_entry *path = dst->path; - - if (mtu < 68 + dst->header_len) - return; + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, mtu); } ===== net/ipv6/xfrm6_policy.c 1.27 vs edited ===== --- 1.27/net/ipv6/xfrm6_policy.c 2005-02-14 14:59:25 +11:00 +++ edited/net/ipv6/xfrm6_policy.c 2005-02-16 22:03:06 +11:00 @@ -243,12 +243,10 @@ static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) { - struct dst_entry *path = dst->path; + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; - if (mtu >= IPV6_MIN_MTU && mtu < dst_pmtu(dst)) - path->ops->update_pmtu(path, mtu); - - return; + path->ops->update_pmtu(path, mtu); } static struct dst_ops xfrm6_dst_ops = { ^ permalink raw reply [flat|nested] 114+ messages in thread
* [7/*] [IPSEC] Get metrics for xfrm_dst from top dst 2005-02-16 11:08 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu @ 2005-02-16 11:38 ` Herbert Xu 2005-03-07 5:47 ` David S. Miller 2005-03-07 5:35 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller 1 sibling, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-02-16 11:38 UTC (permalink / raw) To: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 509 bytes --] For the metrics other than MTU, we should also be getting it from the top dst as opposed to the bottom dst. With this change, the user is able to manipulate values such as advmss for individual hosts behind a remote IPsec gateway. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-7 --] [-- Type: text/plain, Size: 1148 bytes --] ===== net/ipv4/xfrm4_policy.c 1.17 vs edited ===== --- 1.17/net/ipv4/xfrm4_policy.c 2005-02-16 22:09:18 +11:00 +++ edited/net/ipv4/xfrm4_policy.c 2005-02-16 22:35:09 +11:00 @@ -133,7 +133,7 @@ dst_prev->lastuse = jiffies; dst_prev->header_len = header_len; dst_prev->trailer_len = trailer_len; - memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics)); + memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); /* Copy neighbout for reachability confirmation */ dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); ===== net/ipv6/xfrm6_policy.c 1.28 vs edited ===== --- 1.28/net/ipv6/xfrm6_policy.c 2005-02-16 22:09:18 +11:00 +++ edited/net/ipv6/xfrm6_policy.c 2005-02-16 22:35:15 +11:00 @@ -149,7 +149,7 @@ dst_prev->lastuse = jiffies; dst_prev->header_len = header_len; dst_prev->trailer_len = trailer_len; - memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics)); + memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); /* Copy neighbour for reachability confirmation */ dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [7/*] [IPSEC] Get metrics for xfrm_dst from top dst 2005-02-16 11:38 ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu @ 2005-03-07 5:47 ` David S. Miller 2005-03-07 10:41 ` Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-07 5:47 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Wed, 16 Feb 2005 22:38:36 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > For the metrics other than MTU, we should also be getting it from > the top dst as opposed to the bottom dst. With this change, the > user is able to manipulate values such as advmss for individual > hosts behind a remote IPsec gateway. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, great work Herbert. This is the last patch I saw in this series. Any more stuff you have ready to apply in this area? My net-2.6 tree at: bk://kernel.bkbits.net/davem/net-2.6 has all of the patches I applied today in it. I'll push that to Linus tomorrow most likely. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [7/*] [IPSEC] Get metrics for xfrm_dst from top dst 2005-03-07 5:47 ` David S. Miller @ 2005-03-07 10:41 ` Herbert Xu 0 siblings, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-07 10:41 UTC (permalink / raw) To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev On Sun, Mar 06, 2005 at 09:47:54PM -0800, David S. Miller wrote: > > This is the last patch I saw in this series. Any more stuff > you have ready to apply in this area? I've just sent you two fix-up's. I'll work on the clean-up's that I talked about next. Once that's done we can consider the first step in the MTU rework complete. The next step would be to relay ICMP information back to the sender. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst 2005-02-16 11:08 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu 2005-02-16 11:38 ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu @ 2005-03-07 5:35 ` David S. Miller 2005-03-07 10:39 ` Herbert Xu 1 sibling, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-07 5:35 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Wed, 16 Feb 2005 22:08:42 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Let's also fix IPsec PMTU storage. When we get an MTU update for an > xfrm_dst, it should be done to the top dst, not the bottom dst. > > For example, when we get a need-to-frag message for host C behind > a our IPsec peer B, we should be updating the dst entry for C and > not B as we do now. > > I've removed the boundary checks since the same checks are done > in ipv[46]/route.c already. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. Note that sometimes it is better to replace an "unnecessary as determined by me" boundary check with a BUG() instead of outright removal. That way you get to test your assertion :) ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst 2005-03-07 5:35 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller @ 2005-03-07 10:39 ` Herbert Xu 0 siblings, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-07 10:39 UTC (permalink / raw) To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev On Sun, Mar 06, 2005 at 09:35:02PM -0800, David S. Miller wrote: > On Wed, 16 Feb 2005 22:08:42 +1100 > Herbert Xu <herbert@gondor.apana.org.au> wrote: > > > Let's also fix IPsec PMTU storage. When we get an MTU update for an > > xfrm_dst, it should be done to the top dst, not the bottom dst. > > > > For example, when we get a need-to-frag message for host C behind > > a our IPsec peer B, we should be updating the dst entry for C and > > not B as we do now. > > > > I've removed the boundary checks since the same checks are done > > in ipv[46]/route.c already. > > Note that sometimes it is better to replace an "unnecessary as > determined by me" boundary check with a BUG() instead of > outright removal. That way you get to test your assertion :) Point taken. However, I must say that using BUG() in these two particular places would be fatal :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output 2005-02-16 10:37 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu 2005-02-16 11:08 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu @ 2005-03-07 5:33 ` David S. Miller 2005-03-07 11:45 ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu 2 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-07 5:33 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Wed, 16 Feb 2005 21:37:44 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > There was a lot of fun in laying the foundations. Now it's time > to start using it. > > This patch fixes the calculations in xfrm[46]_output so that we don't > reject properly sized packets from the TCP stack. The previous > calculation is wrong because the xfrm overhead is not constant. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > > After this I'll start cleaning stuff up. For example, tcp's > ext2_header_len and dst's path can both be removed. Looks good, applied. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len 2005-02-16 10:37 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu 2005-02-16 11:08 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu 2005-03-07 5:33 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output David S. Miller @ 2005-03-07 11:45 ` Herbert Xu 2005-03-07 17:33 ` David S. Miller 2 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-07 11:45 UTC (permalink / raw) To: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 550 bytes --] Hi Dave: Here is a patch that replaces all occurrences of dst_pmtu in the TCP stack. As a result we no longer need ext2_header_len. This has a nice synergetic effect with Arnaldo's latest change to linux/tcp.h :) Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> I'll be removing other users of dst->path/dst_pmtu next. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-10 --] [-- Type: text/plain, Size: 6221 bytes --] ===== include/linux/tcp.h 1.35 vs edited ===== --- 1.35/include/linux/tcp.h 2005-02-23 05:45:31 +11:00 +++ edited/include/linux/tcp.h 2005-03-07 22:41:00 +11:00 @@ -284,10 +284,13 @@ __u32 mss_cache; /* Cached effective mss, not including SACKS */ __u16 mss_cache_std; /* Like mss_cache, but without TSO */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ - __u16 ext2_header_len;/* Options depending on route */ __u8 ca_state; /* State of fast-retransmit machine */ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ + __u16 advmss; /* Advertised MSS */ + __u32 window_clamp; /* Maximal window to advertise */ + __u32 rcv_ssthresh; /* Current window clamp */ + __u32 frto_highmark; /* snd_nxt when RTO occurred */ __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ @@ -345,14 +348,9 @@ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ - __u32 window_clamp; /* Maximal window to advertise */ - __u32 rcv_ssthresh; /* Current window clamp */ - __u16 advmss; /* Advertised MSS */ - __u8 syn_retries; /* num of allowed syn retries */ __u8 ecn_flags; /* ECN status bits. */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ - __u16 __pad1; __u32 lost_out; /* Lost packets */ __u32 sacked_out; /* SACK'd packets */ __u32 fackets_out; /* FACK'd packets */ ===== net/ipv4/tcp_ipv4.c 1.110 vs edited ===== --- 1.110/net/ipv4/tcp_ipv4.c 2005-02-23 14:23:09 +11:00 +++ edited/net/ipv4/tcp_ipv4.c 2005-03-07 22:02:37 +11:00 @@ -831,7 +831,6 @@ /* OK, now commit destination to socket. */ __sk_dst_set(sk, &rt->u.dst); tcp_v4_setup_caps(sk, &rt->u.dst); - tp->ext2_header_len = rt->u.dst.header_len; if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, @@ -941,10 +940,10 @@ /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ - if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst)) + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) sk->sk_err_soft = EMSGSIZE; - mtu = dst_pmtu(dst); + mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && tp->pmtu_cookie > mtu) { @@ -1578,10 +1577,9 @@ newtp->ext_header_len = 0; if (newinet->opt) newtp->ext_header_len = newinet->opt->optlen; - newtp->ext2_header_len = dst->header_len; newinet->id = newtp->write_seq ^ jiffies; - tcp_sync_mss(newsk, dst_pmtu(dst)); + tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); @@ -1877,7 +1875,6 @@ __sk_dst_set(sk, &rt->u.dst); tcp_v4_setup_caps(sk, &rt->u.dst); - tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len; new_saddr = rt->rt_src; @@ -1937,7 +1934,6 @@ if (!err) { __sk_dst_set(sk, &rt->u.dst); tcp_v4_setup_caps(sk, &rt->u.dst); - tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len; return 0; } ===== net/ipv4/tcp_output.c 1.80 vs edited ===== --- 1.80/net/ipv4/tcp_output.c 2005-02-24 14:16:59 +11:00 +++ edited/net/ipv4/tcp_output.c 2005-03-07 22:11:07 +11:00 @@ -632,12 +632,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); int mss_now; - if (dst && dst->ops->get_mss) - pmtu = dst->ops->get_mss(dst, pmtu); - /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ @@ -648,7 +644,7 @@ mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len + tp->ext2_header_len; + mss_now -= tp->ext_header_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) @@ -684,9 +680,8 @@ mss_now = tp->mss_cache_std; if (dst) { - u32 mtu = dst_pmtu(dst); - if (mtu != tp->pmtu_cookie || - tp->ext2_header_len != dst->header_len) + u32 mtu = dst_mtu(dst); + if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } @@ -698,8 +693,7 @@ unsigned int large_mss, factor, limit; large_mss = 65535 - tp->af_specific->net_header_len - - tp->ext_header_len - tp->ext2_header_len - - tp->tcp_header_len; + tp->ext_header_len - tp->tcp_header_len; if (tp->max_window && large_mss > (tp->max_window>>1)) large_mss = max((tp->max_window>>1), @@ -1444,7 +1438,7 @@ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; - tcp_sync_mss(sk, dst_pmtu(dst)); + tcp_sync_mss(sk, dst_mtu(dst)); if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); ===== net/ipv6/tcp_ipv6.c 1.106 vs edited ===== --- 1.106/net/ipv6/tcp_ipv6.c 2005-02-23 14:23:09 +11:00 +++ edited/net/ipv6/tcp_ipv6.c 2005-03-07 22:02:38 +11:00 @@ -782,7 +782,6 @@ tp->ext_header_len = 0; if (np->opt) tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; - tp->ext2_header_len = dst->header_len; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -894,8 +893,8 @@ } else dst_hold(dst); - if (tp->pmtu_cookie > dst_pmtu(dst)) { - tcp_sync_mss(sk, dst_pmtu(dst)); + if (tp->pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ dst_release(dst); @@ -1524,9 +1523,8 @@ if (newnp->opt) newtp->ext_header_len = newnp->opt->opt_nflen + newnp->opt->opt_flen; - newtp->ext2_header_len = dst->header_len; - tcp_sync_mss(newsk, dst_pmtu(dst)); + tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); @@ -1873,7 +1871,6 @@ ip6_dst_store(sk, dst, NULL); sk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - tcp_sk(sk)->ext2_header_len = dst->header_len; } return 0; @@ -1927,7 +1924,6 @@ ip6_dst_store(sk, dst, NULL); sk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - tcp_sk(sk)->ext2_header_len = dst->header_len; } skb->dst = dst_clone(dst); ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len 2005-03-07 11:45 ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu @ 2005-03-07 17:33 ` David S. Miller 0 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-07 17:33 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, kaber, netdev On Mon, 7 Mar 2005 22:45:33 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Here is a patch that replaces all occurrences of dst_pmtu in the TCP > stack. As a result we no longer need ext2_header_len. > > This has a nice synergetic effect with Arnaldo's latest change to > linux/tcp.h :) > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > > I'll be removing other users of dst->path/dst_pmtu next. Also applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu 2005-02-15 15:53 ` James Morris 2005-02-16 10:37 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu @ 2005-03-07 5:32 ` David S. Miller 2005-03-07 10:35 ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu 2005-03-28 20:10 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy 3 siblings, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-07 5:32 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Tue, 15 Feb 2005 09:16:07 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Finally this is the patch that sets the MTU values of each xfrm_dst > and keeps it up-to-date. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > > To recap, at this point we've obtained accurate MTU values at each > xfrm_dst. The next step would be to start using it as > dst_pmtu(xfrm_dst). Applied, but with a bug fix: + mtu = dst_pmtu(xdst->route); + if (xdst->child_mtu_cached != mtu) { + last = xdst; + xdst->route_mtu_cached = mtu; + } You obviously meant "route_mtu_cached" in the test, not child_mtu_cached. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok 2005-03-07 5:32 ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller @ 2005-03-07 10:35 ` Herbert Xu 2005-03-07 17:32 ` David S. Miller 2005-03-08 10:27 ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu 0 siblings, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-07 10:35 UTC (permalink / raw) To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 1154 bytes --] On Sun, Mar 06, 2005 at 09:32:14PM -0800, David S. Miller wrote: > > Applied, but with a bug fix: > > + mtu = dst_pmtu(xdst->route); > + if (xdst->child_mtu_cached != mtu) { > + last = xdst; > + xdst->route_mtu_cached = mtu; > + } > > You obviously meant "route_mtu_cached" in the test, > not child_mtu_cached. Thanks for catching this. There is another bug in xfrm_bundle_ok where I forgot to check the validity of xdst->route. In fact, the check on dst->path isn't strong enough either. For IPv6 entries, dst->path->obsolete is always negative until you call ipv6_dst_check. So we really need to do that here. Here's the patch to fix those two problems. Yes I know my dst_check implementation is lame. I'll come back and fix up all the dst_check functions by moving their dst_release calls out. It proves that you were right in that IPv6 dst leak thread :) Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-9 --] [-- Type: text/plain, Size: 988 bytes --] ===== include/net/dst.h 1.27 vs edited ===== --- 1.27/include/net/dst.h 2005-03-07 16:33:59 +11:00 +++ edited/include/net/dst.h 2005-03-07 21:06:13 +11:00 @@ -259,6 +259,15 @@ } } +static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) +{ + dst_hold(dst); + if (dst->obsolete) + dst = dst->ops->check(dst, cookie); + dst_release(dst); + return dst; +} + extern void dst_init(void); struct flowi; ===== net/xfrm/xfrm_policy.c 1.70 vs edited ===== --- 1.70/net/xfrm/xfrm_policy.c 2005-03-07 21:17:35 +11:00 +++ edited/net/xfrm/xfrm_policy.c 2005-03-07 21:26:28 +11:00 @@ -1147,7 +1147,7 @@ struct xfrm_dst *last; u32 mtu; - if (dst->path->obsolete > 0 || + if (!dst_check(dst->path, 0) || (dst->dev && !netif_running(dst->dev))) return 0; @@ -1167,6 +1167,8 @@ xdst->child_mtu_cached = mtu; } + if (!dst_check(xdst->route, 0)) + return 0; mtu = dst_pmtu(xdst->route); if (xdst->route_mtu_cached != mtu) { last = xdst; ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok 2005-03-07 10:35 ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu @ 2005-03-07 17:32 ` David S. Miller 2005-03-08 10:27 ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-07 17:32 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Mon, 7 Mar 2005 21:35:36 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > There is another bug in xfrm_bundle_ok where I forgot to > check the validity of xdst->route. In fact, the check > on dst->path isn't strong enough either. For IPv6 entries, > dst->path->obsolete is always negative until you call > ipv6_dst_check. So we really need to do that here. > > Here's the patch to fix those two problems. Yes I know > my dst_check implementation is lame. I'll come back and > fix up all the dst_check functions by moving their dst_release > calls out. It proves that you were right in that IPv6 dst > leak thread :) > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [11/*] [NET] Move dst_release out of dst->ops->check 2005-03-07 10:35 ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu 2005-03-07 17:32 ` David S. Miller @ 2005-03-08 10:27 ` Herbert Xu 2005-03-08 12:50 ` YOSHIFUJI Hideaki / 吉藤英明 ` (2 more replies) 1 sibling, 3 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-08 10:27 UTC (permalink / raw) To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 840 bytes --] On Mon, Mar 07, 2005 at 09:35:36PM +1100, herbert wrote: > > Here's the patch to fix those two problems. Yes I know > my dst_check implementation is lame. I'll come back and > fix up all the dst_check functions by moving their dst_release > calls out. It proves that you were right in that IPv6 dst > leak thread :) As promised here is the patch that moves dst_release out of dst->ops->check. It bloats sk_dst_check/__sk_dst_check slightly but they're only used in a handful of places so it isn't too bad. I actually counted, it's about a few hundred bytes. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-11 --] [-- Type: text/plain, Size: 2751 bytes --] ===== include/net/dst.h 1.28 vs edited ===== --- 1.28/include/net/dst.h 2005-03-07 22:03:36 +11:00 +++ edited/include/net/dst.h 2005-03-08 21:04:46 +11:00 @@ -261,10 +261,8 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { - dst_hold(dst); if (dst->obsolete) dst = dst->ops->check(dst, cookie); - dst_release(dst); return dst; } ===== include/net/sock.h 1.90 vs edited ===== --- 1.90/include/net/sock.h 2005-01-11 07:23:55 +11:00 +++ edited/include/net/sock.h 2005-03-08 21:05:42 +11:00 @@ -983,6 +983,7 @@ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk->sk_dst_cache = NULL; + dst_release(dst); return NULL; } @@ -996,6 +997,7 @@ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_dst_reset(sk); + dst_release(dst); return NULL; } ===== net/decnet/dn_route.c 1.31 vs edited ===== --- 1.31/net/decnet/dn_route.c 2005-01-14 16:03:01 +11:00 +++ edited/net/decnet/dn_route.c 2005-03-08 21:04:46 +11:00 @@ -253,7 +253,6 @@ */ static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie) { - dst_release(dst); return NULL; } ===== net/ipv4/route.c 1.104 vs edited ===== --- 1.104/net/ipv4/route.c 2005-03-06 15:43:44 +11:00 +++ edited/net/ipv4/route.c 2005-03-08 21:04:46 +11:00 @@ -1326,7 +1326,6 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - dst_release(dst); return NULL; } ===== net/ipv4/ipvs/ip_vs_xmit.c 1.11 vs edited ===== --- 1.11/net/ipv4/ipvs/ip_vs_xmit.c 2004-09-13 09:25:50 +10:00 +++ edited/net/ipv4/ipvs/ip_vs_xmit.c 2005-03-08 21:04:46 +11:00 @@ -52,6 +52,7 @@ if ((dst->obsolete || rtos != dest->dst_rtos) && dst->ops->check(dst, cookie) == NULL) { dest->dst_cache = NULL; + dst_release(dst); return NULL; } dst_hold(dst); ===== net/ipv6/ip6_tunnel.c 1.28 vs edited ===== --- 1.28/net/ipv6/ip6_tunnel.c 2005-02-07 16:40:51 +11:00 +++ edited/net/ipv6/ip6_tunnel.c 2005-03-08 21:04:47 +11:00 @@ -94,6 +94,7 @@ if (dst && dst->obsolete && dst->ops->check(dst, t->dst_cookie) == NULL) { t->dst_cache = NULL; + dst_release(dst); return NULL; } ===== net/ipv6/route.c 1.106 vs edited ===== --- 1.106/net/ipv6/route.c 2005-02-16 09:23:11 +11:00 +++ edited/net/ipv6/route.c 2005-03-08 21:04:47 +11:00 @@ -589,7 +589,6 @@ if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) return dst; - dst_release(dst); return NULL; } ===== net/xfrm/xfrm_policy.c 1.71 vs edited ===== --- 1.71/net/xfrm/xfrm_policy.c 2005-03-07 22:03:36 +11:00 +++ edited/net/xfrm/xfrm_policy.c 2005-03-08 21:04:47 +11:00 @@ -1015,7 +1015,6 @@ if (!stale_bundle(dst)) return dst; - dst_release(dst); return NULL; } ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [11/*] [NET] Move dst_release out of dst->ops->check 2005-03-08 10:27 ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu @ 2005-03-08 12:50 ` YOSHIFUJI Hideaki / 吉藤英明 2005-03-11 2:17 ` David S. Miller 2005-03-14 10:26 ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu 2 siblings, 0 replies; 114+ messages in thread From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-08 12:50 UTC (permalink / raw) To: herbert; +Cc: davem, kuznet, jmorris, netdev, yoshfuji In article <20050308102741.GA23468@gondor.apana.org.au> (at Tue, 8 Mar 2005 21:27:41 +1100), Herbert Xu <herbert@gondor.apana.org.au> says: > On Mon, Mar 07, 2005 at 09:35:36PM +1100, herbert wrote: > > > > Here's the patch to fix those two problems. Yes I know > > my dst_check implementation is lame. I'll come back and > > fix up all the dst_check functions by moving their dst_release > > calls out. It proves that you were right in that IPv6 dst > > leak thread :) > > As promised here is the patch that moves dst_release out of > dst->ops->check. It bloats sk_dst_check/__sk_dst_check slightly I agree. --yoshfuji ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [11/*] [NET] Move dst_release out of dst->ops->check 2005-03-08 10:27 ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu 2005-03-08 12:50 ` YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-11 2:17 ` David S. Miller 2005-03-14 10:26 ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu 2 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-11 2:17 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Tue, 8 Mar 2005 21:27:41 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > On Mon, Mar 07, 2005 at 09:35:36PM +1100, herbert wrote: > > > > Here's the patch to fix those two problems. Yes I know > > my dst_check implementation is lame. I'll come back and > > fix up all the dst_check functions by moving their dst_release > > calls out. It proves that you were right in that IPv6 dst > > leak thread :) > > As promised here is the patch that moves dst_release out of > dst->ops->check. It bloats sk_dst_check/__sk_dst_check slightly > but they're only used in a handful of places so it isn't too bad. > I actually counted, it's about a few hundred bytes. Applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [12/*] [IPSEC] Handle local_df in IPv4 2005-03-08 10:27 ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu 2005-03-08 12:50 ` YOSHIFUJI Hideaki / 吉藤英明 2005-03-11 2:17 ` David S. Miller @ 2005-03-14 10:26 ` Herbert Xu 2005-03-14 10:53 ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu 2005-03-15 5:25 ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller 2 siblings, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-14 10:26 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 701 bytes --] Hi Dave: When cleaning up the remaining users of dst_pmtu I noticed that local_df wasn't being treated correctly in IPsec. In fact, if you socket's dst went over IPsec, local_df is essentailly ignored. This patch fixes that. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> I was going to do the same thing to IPv6. Unfortunately it seems that we don't have any local_df support over there. That is, we always fragment outbound UDP/raw packets. Did I miss something? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-12 --] [-- Type: text/plain, Size: 368 bytes --] ===== net/ipv4/xfrm4_output.c 1.7 vs edited ===== --- 1.7/net/ipv4/xfrm4_output.c 2005-03-07 16:33:59 +11:00 +++ edited/net/ipv4/xfrm4_output.c 2005-03-14 13:55:35 +11:00 @@ -78,7 +78,7 @@ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; - if (!(iph->frag_off & htons(IP_DF))) + if (!(iph->frag_off & htons(IP_DF)) || skb->local_df) goto out; dst = skb->dst; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [13/*] [IPV4] Fix room calculation in icmp_send 2005-03-14 10:26 ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu @ 2005-03-14 10:53 ` Herbert Xu 2005-03-14 11:10 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu 2005-03-15 5:26 ` [13/*] [IPV4] Fix room calculation in icmp_send David S. Miller 2005-03-15 5:25 ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-14 10:53 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 702 bytes --] Hi Dave: I'm now cleaning up all users of dst_pmtu with the aim of replacing dst_pmtu with dst_mtu. I'm going to start with the ones that actually fix bugs. This patch fixes the length calculation in icmp_send. As it is we're overestimating the space available by including the space that would be used up by IPsec encapsulation. IPv6 doesn't have this problem since its calculation is based on 1280 instead of the PMTU. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-13 --] [-- Type: text/plain, Size: 400 bytes --] ===== net/ipv4/icmp.c 1.49 vs edited ===== --- 1.49/net/ipv4/icmp.c 2004-12-28 16:30:43 +11:00 +++ edited/net/ipv4/icmp.c 2005-03-14 21:34:20 +11:00 @@ -560,7 +560,7 @@ /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_pmtu(&rt->u.dst); + room = dst_mtu(&rt->u.dst); if (room > 576) room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward 2005-03-14 10:53 ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu @ 2005-03-14 11:10 ` Herbert Xu 2005-03-15 5:27 ` David S. Miller 2005-03-15 9:19 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu 2005-03-15 5:26 ` [13/*] [IPV4] Fix room calculation in icmp_send David S. Miller 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-14 11:10 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 561 bytes --] Hi Dave: While replacing dst_pmtu in ip6_output I found this little gem. In ip6_forward we're not reloading the dst pointer after calling xfrm6_route_forward. So all subsequent dereferences of dst will refer to its pre-IPsec value. The solution is of course to refresh its value. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-14 --] [-- Type: text/plain, Size: 381 bytes --] ===== net/ipv6/ip6_output.c 1.85 vs edited ===== --- 1.85/net/ipv6/ip6_output.c 2005-03-03 16:12:38 +11:00 +++ edited/net/ipv6/ip6_output.c 2005-03-14 22:05:31 +11:00 @@ -397,6 +397,7 @@ IP6_INC_STATS(IPSTATS_MIB_INDISCARDS); goto drop; } + dst = skb->dst; /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward 2005-03-14 11:10 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu @ 2005-03-15 5:27 ` David S. Miller 2005-03-15 9:19 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-15 5:27 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Mon, 14 Mar 2005 22:10:02 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > While replacing dst_pmtu in ip6_output I found this little gem. In > ip6_forward we're not reloading the dst pointer after calling > xfrm6_route_forward. So all subsequent dereferences of dst will > refer to its pre-IPsec value. > > The solution is of course to refresh its value. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Good catch Herbert. Applied, thanks. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-14 11:10 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu 2005-03-15 5:27 ` David S. Miller @ 2005-03-15 9:19 ` Herbert Xu 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu ` (2 more replies) 1 sibling, 3 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-15 9:19 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 746 bytes --] Hi Dave: This patch fixes the IPsec overhead handling in ip_append_data and ip6_append_data. As it is they assume that the IPsec overhead is constant. This is not true as with ESP the IPsec overhead will vary as the MTU varies. The result is that they may produce packets that will exceed the MTU when ESP is used. Had it taken the trailer_len into account, it would have produced packets less than the real MTU. By switching to dst_mtu we get the optimal result. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-15 --] [-- Type: text/plain, Size: 2640 bytes --] ===== net/ipv4/ip_output.c 1.76 vs edited ===== --- 1.76/net/ipv4/ip_output.c 2005-02-01 02:43:53 +11:00 +++ edited/net/ipv4/ip_output.c 2005-03-15 20:13:13 +11:00 @@ -719,7 +719,6 @@ struct ip_options *opt = NULL; int hh_len; - int exthdrlen; int mtu; int copy; int err; @@ -746,22 +745,17 @@ inet->cork.addr = ipc->addr; } dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(&rt->u.dst); inet->cork.rt = rt; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->u.dst.header_len) != 0) { - length += exthdrlen; - transhdrlen += exthdrlen; - } } else { rt = inet->cork.rt; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; transhdrlen = 0; - exthdrlen = 0; mtu = inet->cork.fragsize; } hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); @@ -770,7 +764,7 @@ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); return -EMSGSIZE; } @@ -781,7 +775,7 @@ if (transhdrlen && length + fragheaderlen <= mtu && rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && - !exthdrlen) + !rt->u.dst.header_len) csummode = CHECKSUM_HW; inet->cork.length += length; @@ -866,9 +860,9 @@ * Find where to start putting bytes. */ data = skb_put(skb, fraglen); - skb->nh.raw = data + exthdrlen; + skb->nh.raw = data; data += fragheaderlen; - skb->h.raw = data + exthdrlen; + skb->h.raw = data; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -890,7 +884,6 @@ offset += copy; length -= datalen - fraggap; transhdrlen = 0; - exthdrlen = 0; csummode = CHECKSUM_NONE; /* ===== net/ipv6/ip6_output.c 1.85 vs edited ===== --- 1.85/net/ipv6/ip6_output.c 2005-03-03 16:12:38 +11:00 +++ edited/net/ipv6/ip6_output.c 2005-03-15 20:12:15 +11:00 @@ -849,13 +849,13 @@ np->cork.rt = rt; inet->cork.fl = *fl; np->cork.hop_limit = hlimit; - inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(&rt->u.dst); if (dst_allfrag(&rt->u.dst)) inet->cork.flags |= IPCORK_ALLFRAG; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0); + exthdrlen = opt ? opt->opt_flen : 0; length += exthdrlen; transhdrlen += exthdrlen; } else { ^ permalink raw reply [flat|nested] 114+ messages in thread
* [16/*] [INET] Take IPsec overhead into account in tunnels 2005-03-15 9:19 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu @ 2005-03-15 9:58 ` Herbert Xu 2005-03-15 10:05 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu ` (3 more replies) 2005-03-15 18:18 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data David S. Miller 2005-03-16 11:31 ` Herbert Xu 2 siblings, 4 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-15 9:58 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 912 bytes --] Hi Dave: This patch uses dst_mtu instead of dst_pmtu in the various tunnel implementations. As it is they simply ignore the IPsec overhead. This leads to bogus MTU values inside the tunnels. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> BTW, we're doing lazy MTU updates in the tunnel xmit functions. When a packet with DF set hits us and exceeds the updated MTU, we will send an ICMP packet back which is good. Unfortunately when a packet with DF clear hits us as we update the MTU downwards, the packet will be silently discarded instead of fragmented (well we will send an ICMP back to ourselves but we already knew that MTU value :). I presume we want to fix this, right? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-16 --] [-- Type: text/plain, Size: 2924 bytes --] ===== net/ipv4/ip_gre.c 1.47 vs edited ===== --- 1.47/net/ipv4/ip_gre.c 2005-03-10 16:12:11 +11:00 +++ edited/net/ipv4/ip_gre.c 2005-03-15 20:19:26 +11:00 @@ -511,7 +511,7 @@ /* change mtu on this route */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - if (rel_info > dst_pmtu(skb2->dst)) { + if (rel_info > dst_mtu(skb2->dst)) { kfree_skb(skb2); return; } @@ -764,9 +764,9 @@ df = tiph->frag_off; if (df) - mtu = dst_pmtu(&rt->u.dst) - tunnel->hlen; + mtu = dst_mtu(&rt->u.dst) - tunnel->hlen; else - mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu; + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; if (skb->dst) skb->dst->ops->update_pmtu(skb->dst, mtu); @@ -785,7 +785,7 @@ else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6 = (struct rt6_info*)skb->dst; - if (rt6 && mtu < dst_pmtu(skb->dst) && mtu >= IPV6_MIN_MTU) { + if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; ===== net/ipv4/ipip.c 1.46 vs edited ===== --- 1.46/net/ipv4/ipip.c 2005-01-14 15:41:05 +11:00 +++ edited/net/ipv4/ipip.c 2005-03-15 20:19:26 +11:00 @@ -436,7 +436,7 @@ /* change mtu on this route */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - if (rel_info > dst_pmtu(skb2->dst)) { + if (rel_info > dst_mtu(skb2->dst)) { kfree_skb(skb2); return; } @@ -569,9 +569,9 @@ } if (tiph->frag_off) - mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); else - mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu; + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; if (mtu < 68) { tunnel->stat.collisions++; ===== net/ipv6/ip6_tunnel.c 1.29 vs edited ===== --- 1.29/net/ipv6/ip6_tunnel.c 2005-03-11 13:17:32 +11:00 +++ edited/net/ipv6/ip6_tunnel.c 2005-03-15 20:19:26 +11:00 @@ -689,14 +689,14 @@ t->parms.name); goto tx_err_dst_release; } - mtu = dst_pmtu(dst) - sizeof (*ipv6h); + mtu = dst_mtu(dst) - sizeof (*ipv6h); if (opt) { max_headroom += 8; mtu -= 8; } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb->dst && mtu < dst_pmtu(skb->dst)) { + if (skb->dst && mtu < dst_mtu(skb->dst)) { struct rt6_info *rt = (struct rt6_info *) skb->dst; rt->rt6i_flags |= RTF_MODIFIED; rt->u.dst.metrics[RTAX_MTU-1] = mtu; ===== net/ipv6/sit.c 1.45 vs edited ===== --- 1.45/net/ipv6/sit.c 2005-01-14 15:41:06 +11:00 +++ edited/net/ipv6/sit.c 2005-03-15 20:19:27 +11:00 @@ -500,9 +500,9 @@ } if (tiph->frag_off) - mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); else - mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu; + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; if (mtu < 68) { tunnel->stat.collisions++; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [17/*] [NET] Replace dst_pmtu with dst_mtu 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu @ 2005-03-15 10:05 ` Herbert Xu 2005-03-15 18:24 ` David S. Miller 2005-03-15 10:20 ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek ` (2 subsequent siblings) 3 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-15 10:05 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 819 bytes --] Hi Dave: This patch replaces most of the other uses of dst_pmtu with dst_mtu. As far as I can tell these are either identical because dst->path == dst, or they're a straightforward replacement of (the slightly incorrect) dst_pmtu(dst) - dst->header_Len with dst_mtu(dst). Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> At this point the only remaining user of dst_pmtu is ipt_REJECT which will go away either when we use icmp_send in there or when I replace it with dst_mtu. We can now remove the other users of dst->path as well with the removal of that attribute itself as the goal. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-17 --] [-- Type: text/plain, Size: 8783 bytes --] ===== net/decnet/af_decnet.c 1.49 vs edited ===== --- 1.49/net/decnet/af_decnet.c 2005-01-21 09:35:54 +11:00 +++ edited/net/decnet/af_decnet.c 2005-03-15 20:59:05 +11:00 @@ -1868,7 +1868,7 @@ /* This works out the maximum size of segment we can send out */ if (dst) { - u32 mtu = dst_pmtu(dst); + u32 mtu = dst_mtu(dst); mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now); } ===== net/decnet/dn_route.c 1.32 vs edited ===== --- 1.32/net/decnet/dn_route.c 2005-03-11 13:17:32 +11:00 +++ edited/net/decnet/dn_route.c 2005-03-15 20:59:05 +11:00 @@ -817,7 +817,7 @@ if (rt->u.dst.metrics[RTAX_MTU-1] == 0 || rt->u.dst.metrics[RTAX_MTU-1] > rt->u.dst.dev->mtu) rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - mss = dn_mss_from_pmtu(dev, dst_pmtu(&rt->u.dst)); + mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst)); if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0 || rt->u.dst.metrics[RTAX_ADVMSS-1] > mss) rt->u.dst.metrics[RTAX_ADVMSS-1] = mss; ===== net/ipv4/ip_output.c 1.76 vs edited ===== --- 1.76/net/ipv4/ip_output.c 2005-02-01 02:43:53 +11:00 +++ edited/net/ipv4/ip_output.c 2005-03-15 20:59:05 +11:00 @@ -278,7 +278,7 @@ newskb->dev, ip_dev_loopback_xmit); } - if (skb->len > dst_pmtu(&rt->u.dst)) + if (skb->len > dst_mtu(&rt->u.dst)) return ip_fragment(skb, ip_finish_output); else return ip_finish_output(skb); @@ -288,7 +288,7 @@ { IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - if (skb->len > dst_pmtu(skb->dst) && !skb_shinfo(skb)->tso_size) + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size) return ip_fragment(skb, ip_finish_output); else return ip_finish_output(skb); @@ -448,7 +448,7 @@ if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_pmtu(&rt->u.dst))); + htonl(dst_mtu(&rt->u.dst))); kfree_skb(skb); return -EMSGSIZE; } @@ -458,7 +458,7 @@ */ hlen = iph->ihl * 4; - mtu = dst_pmtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing ===== net/ipv4/ip_sockglue.c 1.31 vs edited ===== --- 1.31/net/ipv4/ip_sockglue.c 2005-01-14 15:41:05 +11:00 +++ edited/net/ipv4/ip_sockglue.c 2005-03-15 20:59:05 +11:00 @@ -955,7 +955,7 @@ val = 0; dst = sk_dst_get(sk); if (dst) { - val = dst_pmtu(dst) - dst->header_len; + val = dst_mtu(dst); dst_release(dst); } if (!val) { ===== net/ipv6/ip6_output.c 1.85 vs edited ===== --- 1.85/net/ipv6/ip6_output.c 2005-03-03 16:12:38 +11:00 +++ edited/net/ipv6/ip6_output.c 2005-03-15 20:59:05 +11:00 @@ -147,7 +147,7 @@ int ip6_output(struct sk_buff *skb) { - if (skb->len > dst_pmtu(skb->dst) || dst_allfrag(skb->dst)) + if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst)) return ip6_fragment(skb, ip6_output2); else return ip6_output2(skb); @@ -263,7 +263,7 @@ ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); ipv6_addr_copy(&hdr->daddr, first_hop); - mtu = dst_pmtu(dst); + mtu = dst_mtu(dst); if ((skb->len <= mtu) || ipfragok) { IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); @@ -428,10 +428,10 @@ goto error; } - if (skb->len > dst_pmtu(dst)) { + if (skb->len > dst_mtu(dst)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev); IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS); IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); @@ -534,7 +534,7 @@ hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; - mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr); + mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr); if (skb_shinfo(skb)->frag_list) { int first_len = skb_pagelen(skb); ===== net/ipv6/ipv6_sockglue.c 1.34 vs edited ===== --- 1.34/net/ipv6/ipv6_sockglue.c 2005-01-14 15:41:06 +11:00 +++ edited/net/ipv6/ipv6_sockglue.c 2005-03-15 20:59:05 +11:00 @@ -607,7 +607,7 @@ lock_sock(sk); dst = sk_dst_get(sk); if (dst) { - val = dst_pmtu(dst) - dst->header_len; + val = dst_mtu(dst); dst_release(dst); } release_sock(sk); ===== net/ipv6/route.c 1.111 vs edited ===== --- 1.111/net/ipv6/route.c 2005-03-11 13:17:32 +11:00 +++ edited/net/ipv6/route.c 2005-03-15 20:59:05 +11:00 @@ -625,7 +625,7 @@ { struct rt6_info *rt6 = (struct rt6_info*)dst; - if (mtu < dst_pmtu(dst) && rt6->rt6i_dst.plen == 128) { + if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; if (mtu < IPV6_MIN_MTU) { mtu = IPV6_MIN_MTU; @@ -686,7 +686,7 @@ atomic_set(&rt->u.dst.__refcnt, 1); rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst)); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); rt->u.dst.output = output; #if 0 /* there's no chance to use these for ndisc */ @@ -971,7 +971,7 @@ if (!rt->u.dst.metrics[RTAX_MTU-1]) rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); if (!rt->u.dst.metrics[RTAX_ADVMSS-1]) - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst)); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); rt->u.dst.dev = dev; rt->rt6i_idev = idev; return ip6_ins_rt(rt, nlh, _rtattr); @@ -1134,7 +1134,7 @@ nrt->rt6i_nexthop = neigh_clone(neigh); /* Reset pmtu, it may be better */ nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); - nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&nrt->u.dst)); + nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); if (ip6_ins_rt(nrt, NULL, NULL)) goto out; @@ -1164,7 +1164,7 @@ if (rt == NULL) return; - if (pmtu >= dst_pmtu(&rt->u.dst)) + if (pmtu >= dst_mtu(&rt->u.dst)) goto out; if (pmtu < IPV6_MIN_MTU) { @@ -1405,7 +1405,7 @@ rt->rt6i_dev = &loopback_dev; rt->rt6i_idev = idev; rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst)); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; rt->u.dst.obsolete = -1; @@ -1480,9 +1480,9 @@ */ if (rt->rt6i_dev == arg->dev && !dst_metric_locked(&rt->u.dst, RTAX_MTU) && - (dst_pmtu(&rt->u.dst) > arg->mtu || - (dst_pmtu(&rt->u.dst) < arg->mtu && - dst_pmtu(&rt->u.dst) == idev->cnf.mtu6))) + (dst_mtu(&rt->u.dst) > arg->mtu || + (dst_mtu(&rt->u.dst) < arg->mtu && + dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu; rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu); return 0; ===== net/sctp/transport.c 1.31 vs edited ===== --- 1.31/net/sctp/transport.c 2005-01-15 15:32:41 +11:00 +++ edited/net/sctp/transport.c 2005-03-15 20:59:05 +11:00 @@ -227,7 +227,7 @@ dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); if (dst) { - transport->pmtu = dst_pmtu(dst); + transport->pmtu = dst_mtu(dst); dst_release(dst); } else transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; @@ -253,7 +253,7 @@ transport->dst = dst; if (dst) { - transport->pmtu = dst_pmtu(dst); + transport->pmtu = dst_mtu(dst); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). ===== net/xfrm/xfrm_policy.c 1.72 vs edited ===== --- 1.72/net/xfrm/xfrm_policy.c 2005-03-11 13:17:32 +11:00 +++ edited/net/xfrm/xfrm_policy.c 2005-03-15 20:59:05 +11:00 @@ -1119,12 +1119,12 @@ struct xfrm_dst *xdst = (struct xfrm_dst *)dst; u32 pmtu, route_mtu_cached; - pmtu = dst_pmtu(dst->child); + pmtu = dst_mtu(dst->child); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); - route_mtu_cached = dst_pmtu(xdst->route); + route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) @@ -1160,7 +1160,7 @@ if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; - mtu = dst_pmtu(dst->child); + mtu = dst_mtu(dst->child); if (xdst->child_mtu_cached != mtu) { last = xdst; xdst->child_mtu_cached = mtu; @@ -1168,7 +1168,7 @@ if (!dst_check(xdst->route, 0)) return 0; - mtu = dst_pmtu(xdst->route); + mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { last = xdst; xdst->route_mtu_cached = mtu; ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [17/*] [NET] Replace dst_pmtu with dst_mtu 2005-03-15 10:05 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu @ 2005-03-15 18:24 ` David S. Miller 2005-03-15 19:02 ` Patrick McHardy 2005-03-15 20:31 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu 0 siblings, 2 replies; 114+ messages in thread From: David S. Miller @ 2005-03-15 18:24 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Tue, 15 Mar 2005 21:05:22 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This patch replaces most of the other uses of dst_pmtu with dst_mtu. > As far as I can tell these are either identical because dst->path == dst, > or they're a straightforward replacement of (the slightly incorrect) > dst_pmtu(dst) - dst->header_Len with dst_mtu(dst). > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. > At this point the only remaining user of dst_pmtu is ipt_REJECT which > will go away either when we use icmp_send in there or when I replace > it with dst_mtu. Yes, I saw that posting on netfilter-devel where you asked Rusty why icmp_send() wasn't directly used even though it appears it should be. > We can now remove the other users of dst->path as well with the removal > of that attribute itself as the goal. Ok, sounds great. So, at that point, I guess the next task is to handle PMTU events of already encrypted packets properly? We still have that problem right? When the ICMP payload is encrypted we have to cache some information on IPSEC output so that a proto+SPI key can find us the encrypted inner IP header info, which we'll need in order to process (and/or forward) the ICMP PMTU information correctly. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [17/*] [NET] Replace dst_pmtu with dst_mtu 2005-03-15 18:24 ` David S. Miller @ 2005-03-15 19:02 ` Patrick McHardy 2005-03-15 20:40 ` Replace send_unreach with icmp_send Herbert Xu 2005-03-15 20:31 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu 1 sibling, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-15 19:02 UTC (permalink / raw) To: David S. Miller Cc: kuznet, Netfilter Development Mailinglist, Herbert Xu, netdev David S. Miller wrote: > On Tue, 15 Mar 2005 21:05:22 +1100 > Herbert Xu <herbert@gondor.apana.org.au> wrote: > >>At this point the only remaining user of dst_pmtu is ipt_REJECT which >>will go away either when we use icmp_send in there or when I replace >>it with dst_mtu. > > > Yes, I saw that posting on netfilter-devel where you asked Rusty > why icmp_send() wasn't directly used even though it appears it > should be. I would prefer to keep it seperately. I have planned to remove xrlim from ipt_REJECT so it behaves similar for TCP and ICMP. Limits should then be handled by the limit match. This can't be done if we switch to icmp_send(). Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Replace send_unreach with icmp_send 2005-03-15 19:02 ` Patrick McHardy @ 2005-03-15 20:40 ` Herbert Xu 2005-03-15 20:48 ` Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-15 20:40 UTC (permalink / raw) To: Patrick McHardy; +Cc: David S. Miller, kuznet, yoshfuji, netdev On Tue, Mar 15, 2005 at 08:02:26PM +0100, Patrick McHardy wrote: > > I would prefer to keep it seperately. I have planned to remove xrlim > from ipt_REJECT so it behaves similar for TCP and ICMP. Limits should > then be handled by the limit match. This can't be done if we switch to > icmp_send(). Well it isn't terribly difficult to create a new version of icmp_send that does xrlim conditionally. icmp_send/ipt_REJECT can then call that function. The main reason I'm looking at getting rid of send_unreach is because having two implementations of the same code often leads to bugs. In fact, as it is there are multiple IPsec-related bugs in the ipt_REJECT code. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Replace send_unreach with icmp_send 2005-03-15 20:40 ` Replace send_unreach with icmp_send Herbert Xu @ 2005-03-15 20:48 ` Patrick McHardy 2005-03-16 10:51 ` [IPV4] Make ipt_REJECT use icmp_send again Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-15 20:48 UTC (permalink / raw) To: Herbert Xu; +Cc: David S. Miller, kuznet, yoshfuji, netdev Herbert Xu wrote: > Well it isn't terribly difficult to create a new version of icmp_send > that does xrlim conditionally. icmp_send/ipt_REJECT can then call that > function. > > The main reason I'm looking at getting rid of send_unreach is because > having two implementations of the same code often leads to bugs. In > fact, as it is there are multiple IPsec-related bugs in the ipt_REJECT > code. Ok. I can't see any different reason to keep it, so go ahead. I'll take care of the xrlim stuff later. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* [IPV4] Make ipt_REJECT use icmp_send again 2005-03-15 20:48 ` Patrick McHardy @ 2005-03-16 10:51 ` Herbert Xu 2005-03-16 19:00 ` Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-16 10:51 UTC (permalink / raw) To: Patrick McHardy; +Cc: David S. Miller, kuznet, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 1164 bytes --] On Tue, Mar 15, 2005 at 09:48:53PM +0100, Patrick McHardy wrote: > > Ok. I can't see any different reason to keep it, so go ahead. I'll take > care of the xrlim stuff later. Great. Here is the patch that makes ipt_REJECT use icmp_send again. We've gone full circle :) As it is ipt_REJECT doesn't work at all with IPsec. Despite my efforts previously in making the policy lookups work there I neglected to change the final call to dst_output so the policy lookup is useless. ipt_REJECT also had a number of deviations from icmp_send which seems to be unjustified. For examples it ignored source routing IP options. There was a bug in icmp_send too :) It didn't set the ICMP type/code values for the policy lookup. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> I compared the two functions line-by-line to make sure that there weren't any subtle differences. Please double check this in case I overlooked something important. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-18 --] [-- Type: text/plain, Size: 5158 bytes --] ===== net/ipv4/icmp.c 1.50 vs edited ===== --- 1.50/net/ipv4/icmp.c 2005-03-15 16:26:11 +11:00 +++ edited/net/ipv4/icmp.c 2005-03-16 21:20:41 +11:00 @@ -518,14 +518,6 @@ IPTOS_PREC_INTERNETCONTROL) : iph->tos; - { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = iph->saddr, - .saddr = saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_ICMP }; - if (ip_route_output_key(&rt, &fl)) - goto out_unlock; - } if (ip_options_echo(&icmp_param.replyopts, skb_in)) goto ende; @@ -544,13 +536,26 @@ inet_sk(icmp_socket->sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; - if (icmp_param.replyopts.srr) { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = icmp_param.replyopts.faddr, - .saddr = saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_ICMP }; - ip_rt_put(rt); + + { + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = icmp_param.replyopts.srr ? + icmp_param.replyopts.faddr : + iph->saddr, + .saddr = saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_ICMP, + .uli_u = { + .icmpt = { + .type = type, + .code = code + } + } + }; if (ip_route_output_key(&rt, &fl)) goto out_unlock; } ===== net/ipv4/netfilter/ipt_REJECT.c 1.35 vs edited ===== --- 1.35/net/ipv4/netfilter/ipt_REJECT.c 2005-03-04 09:20:32 +11:00 +++ edited/net/ipv4/netfilter/ipt_REJECT.c 2005-03-16 21:19:20 +11:00 @@ -220,146 +220,9 @@ kfree_skb(nskb); } -static void send_unreach(struct sk_buff *skb_in, int code) +static inline void send_unreach(struct sk_buff *skb_in, int code) { - struct iphdr *iph; - struct icmphdr *icmph; - struct sk_buff *nskb; - u32 saddr; - u8 tos; - int hh_len, length; - struct rtable *rt = (struct rtable*)skb_in->dst; - unsigned char *data; - - if (!rt) - return; - - /* FIXME: Use sysctl number. --RR */ - if (!xrlim_allow(&rt->u.dst, 1*HZ)) - return; - - iph = skb_in->nh.iph; - - /* No replies to physical multicast/broadcast */ - if (skb_in->pkt_type!=PACKET_HOST) - return; - - /* Now check at the protocol level */ - if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) - return; - - /* Only reply to fragment 0. */ - if (iph->frag_off&htons(IP_OFFSET)) - return; - - /* If we send an ICMP error to an ICMP error a mess would result.. */ - if (iph->protocol == IPPROTO_ICMP) { - struct icmphdr ihdr; - - icmph = skb_header_pointer(skb_in, skb_in->nh.iph->ihl*4, - sizeof(ihdr), &ihdr); - if (!icmph) - return; - - /* Between echo-reply (0) and timestamp (13), - everything except echo-request (8) is an error. - Also, anything greater than NR_ICMP_TYPES is - unknown, and hence should be treated as an error... */ - if ((icmph->type < ICMP_TIMESTAMP - && icmph->type != ICMP_ECHOREPLY - && icmph->type != ICMP_ECHO) - || icmph->type > NR_ICMP_TYPES) - return; - } - - saddr = iph->daddr; - if (!(rt->rt_flags & RTCF_LOCAL)) - saddr = 0; - - tos = (iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL; - - { - struct flowi fl = { - .nl_u = { - .ip4_u = { - .daddr = skb_in->nh.iph->saddr, - .saddr = saddr, - .tos = RT_TOS(tos) - } - }, - .proto = IPPROTO_ICMP, - .uli_u = { - .icmpt = { - .type = ICMP_DEST_UNREACH, - .code = code - } - } - }; - - if (ip_route_output_key(&rt, &fl)) - return; - } - /* RFC says return as much as we can without exceeding 576 bytes. */ - length = skb_in->len + sizeof(struct iphdr) + sizeof(struct icmphdr); - - if (length > dst_pmtu(&rt->u.dst)) - length = dst_pmtu(&rt->u.dst); - if (length > 576) - length = 576; - - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); - - nskb = alloc_skb(hh_len + length, GFP_ATOMIC); - if (!nskb) { - ip_rt_put(rt); - return; - } - - nskb->priority = 0; - nskb->dst = &rt->u.dst; - skb_reserve(nskb, hh_len); - - /* Set up IP header */ - iph = nskb->nh.iph - = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - iph->version=4; - iph->ihl=5; - iph->tos=tos; - iph->tot_len = htons(length); - - /* PMTU discovery never applies to ICMP packets. */ - iph->frag_off = 0; - - iph->ttl = MAXTTL; - ip_select_ident(iph, &rt->u.dst, NULL); - iph->protocol=IPPROTO_ICMP; - iph->saddr=rt->rt_src; - iph->daddr=rt->rt_dst; - iph->check=0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - /* Set up ICMP header. */ - icmph = nskb->h.icmph - = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr)); - icmph->type = ICMP_DEST_UNREACH; - icmph->code = code; - icmph->un.gateway = 0; - icmph->checksum = 0; - - /* Copy as much of original packet as will fit */ - data = skb_put(nskb, - length - sizeof(struct iphdr) - sizeof(struct icmphdr)); - - skb_copy_bits(skb_in, 0, data, - length - sizeof(struct iphdr) - sizeof(struct icmphdr)); - - icmph->checksum = ip_compute_csum((unsigned char *)icmph, - length - sizeof(struct iphdr)); - - nf_ct_attach(nskb, skb_in); - - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, - ip_finish_output); + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); } static unsigned int reject(struct sk_buff **pskb, ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPV4] Make ipt_REJECT use icmp_send again 2005-03-16 10:51 ` [IPV4] Make ipt_REJECT use icmp_send again Herbert Xu @ 2005-03-16 19:00 ` Patrick McHardy 2005-03-16 22:44 ` David S. Miller 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-16 19:00 UTC (permalink / raw) To: Herbert Xu; +Cc: David S. Miller, kuznet, yoshfuji, netdev Herbert Xu wrote: > I compared the two functions line-by-line to make sure that there > weren't any subtle differences. Please double check this in case > I overlooked something important. I double checked, all looks good to me. Signed-off-by: Patrick McHardy <kaber@trash.net> Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPV4] Make ipt_REJECT use icmp_send again 2005-03-16 19:00 ` Patrick McHardy @ 2005-03-16 22:44 ` David S. Miller 2005-03-17 10:51 ` [IPV4] Send TCP reset through dst_output in ipt_REJECT Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-16 22:44 UTC (permalink / raw) To: Patrick McHardy; +Cc: herbert, kuznet, yoshfuji, netdev On Wed, 16 Mar 2005 20:00:51 +0100 Patrick McHardy <kaber@trash.net> wrote: > Herbert Xu wrote: > > I compared the two functions line-by-line to make sure that there > > weren't any subtle differences. Please double check this in case > > I overlooked something important. > > I double checked, all looks good to me. > > Signed-off-by: Patrick McHardy <kaber@trash.net> Applied, thanks guys. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [IPV4] Send TCP reset through dst_output in ipt_REJECT 2005-03-16 22:44 ` David S. Miller @ 2005-03-17 10:51 ` Herbert Xu 2005-03-17 18:06 ` David S. Miller 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-17 10:51 UTC (permalink / raw) To: David S. Miller; +Cc: Patrick McHardy, kuznet, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 462 bytes --] Hi Dave: I noticed that the TCP reset code in ipt_REJECT didn't call dst_output either so it also bypasses IPsec processing. Here is a patch to fix that and use the correct MTU value. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-20 --] [-- Type: text/plain, Size: 531 bytes --] ===== net/ipv4/netfilter/ipt_REJECT.c 1.36 vs edited ===== --- 1.36/net/ipv4/netfilter/ipt_REJECT.c 2005-03-17 09:44:14 +11:00 +++ edited/net/ipv4/netfilter/ipt_REJECT.c 2005-03-17 14:18:20 +11:00 @@ -207,13 +207,13 @@ nskb->nh.iph->ihl); /* "Never happens" */ - if (nskb->len > dst_pmtu(nskb->dst)) + if (nskb->len > dst_mtu(nskb->dst)) goto free_nskb; nf_ct_attach(nskb, oldskb); NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, - ip_finish_output); + dst_output); return; free_nskb: ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPV4] Send TCP reset through dst_output in ipt_REJECT 2005-03-17 10:51 ` [IPV4] Send TCP reset through dst_output in ipt_REJECT Herbert Xu @ 2005-03-17 18:06 ` David S. Miller 0 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-17 18:06 UTC (permalink / raw) To: Herbert Xu; +Cc: kaber, kuznet, yoshfuji, netdev On Thu, 17 Mar 2005 21:51:45 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > I noticed that the TCP reset code in ipt_REJECT didn't call dst_output > either so it also bypasses IPsec processing. Here is a patch to fix > that and use the correct MTU value. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks a lot Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [17/*] [NET] Replace dst_pmtu with dst_mtu 2005-03-15 18:24 ` David S. Miller 2005-03-15 19:02 ` Patrick McHardy @ 2005-03-15 20:31 ` Herbert Xu 1 sibling, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-15 20:31 UTC (permalink / raw) To: David S. Miller; +Cc: kuznet, yoshfuji, kaber, netdev On Tue, Mar 15, 2005 at 10:24:50AM -0800, David S. Miller wrote: > > So, at that point, I guess the next task is to handle PMTU events of already > encrypted packets properly? We still have that problem right? When the > ICMP payload is encrypted we have to cache some information on IPSEC output > so that a proto+SPI key can find us the encrypted inner IP header info, which > we'll need in order to process (and/or forward) the ICMP PMTU information > correctly. Yes, that's going to be the second part of the MTU work. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [16/*] [INET] Take IPsec overhead into account in tunnels 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu 2005-03-15 10:05 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu @ 2005-03-15 10:20 ` Lennert Buytenhek 2005-03-15 10:27 ` Herbert Xu 2005-03-15 18:20 ` David S. Miller 2005-03-18 9:03 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu 3 siblings, 1 reply; 114+ messages in thread From: Lennert Buytenhek @ 2005-03-15 10:20 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev On Tue, Mar 15, 2005 at 08:58:37PM +1100, Herbert Xu wrote: > This patch uses dst_mtu instead of dst_pmtu in the various tunnel > implementations. As it is they simply ignore the IPsec overhead. > This leads to bogus MTU values inside the tunnels. I've not been able to get ipsec-encapsulated ipip/gre tunneling to work properly (without MTU issues) so far, and never had the time to properly dig into it. If this is going to be fixed, that would be great. --L ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [16/*] [INET] Take IPsec overhead into account in tunnels 2005-03-15 10:20 ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek @ 2005-03-15 10:27 ` Herbert Xu 0 siblings, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-15 10:27 UTC (permalink / raw) To: Lennert Buytenhek Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev On Tue, Mar 15, 2005 at 11:20:52AM +0100, Lennert Buytenhek wrote: > > I've not been able to get ipsec-encapsulated ipip/gre tunneling to > work properly (without MTU issues) so far, and never had the time to > properly dig into it. If this is going to be fixed, that would be > great. Well please give it a run and let me know if it's still broken :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [16/*] [INET] Take IPsec overhead into account in tunnels 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu 2005-03-15 10:05 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu 2005-03-15 10:20 ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek @ 2005-03-15 18:20 ` David S. Miller 2005-03-18 9:03 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu 3 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-15 18:20 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Tue, 15 Mar 2005 20:58:37 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This patch uses dst_mtu instead of dst_pmtu in the various tunnel > implementations. As it is they simply ignore the IPsec overhead. > This leads to bogus MTU values inside the tunnels. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. > BTW, we're doing lazy MTU updates in the tunnel xmit functions. > When a packet with DF set hits us and exceeds the updated MTU, > we will send an ICMP packet back which is good. > > Unfortunately when a packet with DF clear hits us as we update > the MTU downwards, the packet will be silently discarded instead > of fragmented (well we will send an ICMP back to ourselves but > we already knew that MTU value :). > > I presume we want to fix this, right? I think so, although it could be argued that in the end it really doesn't matter. The final argument though is that quality of implementation says we shouldn't drop the frame for such a reason. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu ` (2 preceding siblings ...) 2005-03-15 18:20 ` David S. Miller @ 2005-03-18 9:03 ` Herbert Xu 2005-03-18 9:11 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu 2005-03-18 18:39 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit David S. Miller 3 siblings, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-18 9:03 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 706 bytes --] Hi Dave: Somehow I missed four files with dst_pmtu usages in them. I'm going to split them along the sames lines I did before: bug fixes and then the trivial changes. Here is a patch that replaces dst_pmtu with dst_pmtu in ipmr.c since this is straight IPIP tunneling equivalent to what we have in ipip.c. As it is we may send ICMP packets when IPsec is present which is exactly what the comment says that we shouldn't do. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-21 --] [-- Type: text/plain, Size: 490 bytes --] ===== net/ipv4/ipmr.c 1.46 vs edited ===== --- 1.46/net/ipv4/ipmr.c 2005-01-14 15:41:05 +11:00 +++ edited/net/ipv4/ipmr.c 2005-03-18 19:56:33 +11:00 @@ -1171,7 +1171,7 @@ dev = rt->u.dst.dev; - if (skb->len+encap > dst_pmtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not allow to send ICMP, so that packets will disappear to blackhole. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-18 9:03 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu @ 2005-03-18 9:11 ` Herbert Xu 2005-03-18 9:19 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu 2005-03-18 18:40 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller 2005-03-18 18:39 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit David S. Miller 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-18 9:11 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 858 bytes --] Hi Dave: This patch makes ipt_TCPMSS use the correct MTU value for clamping. This is a bit tricky actually since TCPMSS can be used in FORWARD, LOCAL_OUT as well as POST_ROUTING. In the first two cases we haven't performed IPsec yet so dst_mtu obviously does the right thing. As it is, POST_ROUTING is performed after xfrm_output so MSS clamping is useless there. With Patrick's IPsec netfilter stuff, there will be a POST_ROUTING processing before IPsec processing, in which case dst_mtu also returns exactly what we want. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> BTW Patrick, how is the IPsec netfilter stuff going? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-22 --] [-- Type: text/plain, Size: 924 bytes --] ===== net/ipv4/netfilter/ipt_TCPMSS.c 1.13 vs edited ===== --- 1.13/net/ipv4/netfilter/ipt_TCPMSS.c 2004-06-04 15:04:05 +10:00 +++ edited/net/ipv4/netfilter/ipt_TCPMSS.c 2005-03-18 19:56:34 +11:00 @@ -87,14 +87,14 @@ return NF_DROP; /* or IPT_CONTINUE ?? */ } - if(dst_pmtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) { + if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) { if (net_ratelimit()) printk(KERN_ERR - "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_pmtu((*pskb)->dst)); + "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst)); return NF_DROP; /* or IPT_CONTINUE ?? */ } - newmss = dst_pmtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr); + newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr); } else newmss = tcpmssinfo->mss; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu 2005-03-18 9:11 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu @ 2005-03-18 9:19 ` Herbert Xu 2005-03-18 10:07 ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu 2005-03-18 18:41 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu David S. Miller 2005-03-18 18:40 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-18 9:19 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 1004 bytes --] Hi Dave: Once again here is a couple of trivial dst_pmtu/dst_mtu replacements. In both locations, we can only have simple dst entries which means that dst == dst->path. BTW, this is the rule that we should apply in future for uses of dst_mtu/dst_pmtu (or other metrics on dst). If the only dst's that can appear are simple dst's (dst == dst->path), then we should use dst_mtu or dst_metric. If dst != dst->path, then whoever is writing the code will need to think about which of dst or dst->path is the right one. In most instances dst will be the one. However, as we have seen in ip_append_data, dst->path may be needed rarely. In particular, if we're doing fragmentation immediately after IPsec, then you may need it. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-23 --] [-- Type: text/plain, Size: 2165 bytes --] ===== net/ipv4/ipvs/ip_vs_xmit.c 1.12 vs edited ===== --- 1.12/net/ipv4/ipvs/ip_vs_xmit.c 2005-03-11 13:17:32 +11:00 +++ edited/net/ipv4/ipvs/ip_vs_xmit.c 2005-03-18 19:56:34 +11:00 @@ -178,7 +178,7 @@ } /* MTU checking */ - mtu = dst_pmtu(&rt->u.dst); + mtu = dst_mtu(&rt->u.dst); if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -245,7 +245,7 @@ goto tx_error_icmp; /* MTU checking */ - mtu = dst_pmtu(&rt->u.dst); + mtu = dst_mtu(&rt->u.dst); if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); @@ -342,7 +342,7 @@ tdev = rt->u.dst.dev; - mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); if (mtu < 68) { ip_rt_put(rt); IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); @@ -445,7 +445,7 @@ goto tx_error_icmp; /* MTU checking */ - mtu = dst_pmtu(&rt->u.dst); + mtu = dst_mtu(&rt->u.dst); if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); @@ -520,7 +520,7 @@ goto tx_error_icmp; /* MTU checking */ - mtu = dst_pmtu(&rt->u.dst); + mtu = dst_mtu(&rt->u.dst); if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ===== net/ipv4/netfilter/ip_conntrack_standalone.c 1.66 vs edited ===== --- 1.66/net/ipv4/netfilter/ip_conntrack_standalone.c 2005-03-04 09:17:07 +11:00 +++ edited/net/ipv4/netfilter/ip_conntrack_standalone.c 2005-03-18 19:56:34 +11:00 @@ -457,7 +457,7 @@ /* Local packets are never produced too large for their interface. We degfragment them at LOCAL_OUT, however, so we have to refragment them here. */ - if ((*pskb)->len > dst_pmtu(&rt->u.dst) && + if ((*pskb)->len > dst_mtu(&rt->u.dst) && !skb_shinfo(*pskb)->tso_size) { /* No hook can be after us, so this should be OK. */ ip_fragment(*pskb, okfn); ^ permalink raw reply [flat|nested] 114+ messages in thread
* [24/*] [IPSEC] Get ttl from child instead of path 2005-03-18 9:19 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu @ 2005-03-18 10:07 ` Herbert Xu 2005-03-18 10:11 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu 2005-03-18 18:43 ` [24/*] [IPSEC] Get ttl from child instead of path David S. Miller 2005-03-18 18:41 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu David S. Miller 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-18 10:07 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 696 bytes --] Hi Dave: Now that dst_pmtu is almost gone let's do the same to dst_path_metric. I've only found one legitimate use of it and that's the one that was recently added in dst_allfrag. This patch makes xfrm4_encap/xfrm6_encap use dst->child instead of dst->path so that we choose the correct route to get the hoplimit from when nested tunnels are present. For simple tunnels dst->child == dst->path so there is no change. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-24 --] [-- Type: text/plain, Size: 1006 bytes --] ===== net/ipv4/xfrm4_output.c 1.9 vs edited ===== --- 1.9/net/ipv4/xfrm4_output.c 2005-03-15 16:38:41 +11:00 +++ edited/net/ipv4/xfrm4_output.c 2005-03-18 21:02:33 +11:00 @@ -58,7 +58,7 @@ if (!top_iph->frag_off) __ip_select_ident(top_iph, dst, 0); - top_iph->ttl = dst_path_metric(dst, RTAX_HOPLIMIT); + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; ===== net/ipv6/xfrm6_output.c 1.10 vs edited ===== --- 1.10/net/ipv6/xfrm6_output.c 2005-03-15 16:38:41 +11:00 +++ edited/net/ipv6/xfrm6_output.c 2005-03-18 21:02:46 +11:00 @@ -69,7 +69,7 @@ dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); top_iph->nexthdr = IPPROTO_IPV6; - top_iph->hop_limit = dst_path_metric(dst, RTAX_HOPLIMIT); + top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); } ^ permalink raw reply [flat|nested] 114+ messages in thread
* [25/*] [NET] Kill unnecessary uses of dst_path_metric 2005-03-18 10:07 ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu @ 2005-03-18 10:11 ` Herbert Xu 2005-03-18 11:06 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu 2005-03-18 18:44 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric David S. Miller 2005-03-18 18:43 ` [24/*] [IPSEC] Get ttl from child instead of path David S. Miller 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-18 10:11 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 394 bytes --] Hi Dave: This gets rid of the last unnecessary use of dst_path_metric. In decnet dst->path is always equal to dst. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-25 --] [-- Type: text/plain, Size: 786 bytes --] ===== net/decnet/af_decnet.c 1.51 vs edited ===== --- 1.51/net/decnet/af_decnet.c 2005-03-16 09:50:25 +11:00 +++ edited/net/decnet/af_decnet.c 2005-03-18 21:01:24 +11:00 @@ -811,7 +811,7 @@ return -EINVAL; scp->state = DN_CC; - scp->segsize_loc = dst_path_metric(__sk_dst_get(sk), RTAX_ADVMSS); + scp->segsize_loc = dst_metric(__sk_dst_get(sk), RTAX_ADVMSS); dn_send_conn_conf(sk, allocation); prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); @@ -940,7 +940,7 @@ sk->sk_route_caps = sk->sk_dst_cache->dev->features; sock->state = SS_CONNECTING; scp->state = DN_CI; - scp->segsize_loc = dst_path_metric(sk->sk_dst_cache, RTAX_ADVMSS); + scp->segsize_loc = dst_metric(sk->sk_dst_cache, RTAX_ADVMSS); dn_nsp_send_conninit(sk, NSP_CI); err = -EINPROGRESS; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [26/*] [NET] Kill dst_pmtu/dst_path_metric 2005-03-18 10:11 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu @ 2005-03-18 11:06 ` Herbert Xu 2005-03-18 11:28 ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu 2005-03-18 18:46 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric David S. Miller 2005-03-18 18:44 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric David S. Miller 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-18 11:06 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 623 bytes --] Hi Dave: This would have been the patch that removed dst->path. But ip_append_data got in the way :) However, using dst->path is only needed rarely and should always require a bit of deliberation. So let's get rid of dst_pmtu and dst_path_metric and use dst_mtu and dst_metric directly. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> BTW, shouldn't dst_allfrag be called dst_path_allfrag? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-26 --] [-- Type: text/plain, Size: 1819 bytes --] ===== include/net/dst.h 1.29 vs edited ===== --- 1.29/include/net/dst.h 2005-03-11 13:17:32 +11:00 +++ edited/include/net/dst.h 2005-03-18 21:08:56 +11:00 @@ -109,21 +109,6 @@ return dst->metrics[metric-1]; } -static inline u32 -dst_path_metric(const struct dst_entry *dst, int metric) -{ - return dst->path->metrics[metric-1]; -} - -static inline u32 -dst_pmtu(const struct dst_entry *dst) -{ - u32 mtu = dst_path_metric(dst, RTAX_MTU); - /* Yes, _exactly_. This is paranoia. */ - barrier(); - return mtu; -} - static inline u32 dst_mtu(const struct dst_entry *dst) { u32 mtu = dst_metric(dst, RTAX_MTU); @@ -137,7 +122,7 @@ static inline u32 dst_allfrag(const struct dst_entry *dst) { - int ret = dst_path_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG; + int ret = dst_metric(dst->path, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG; /* Yes, _exactly_. This is paranoia. */ barrier(); return ret; ===== net/ipv4/ip_output.c 1.79 vs edited ===== --- 1.79/net/ipv4/ip_output.c 2005-03-17 09:02:36 +11:00 +++ edited/net/ipv4/ip_output.c 2005-03-18 21:00:08 +11:00 @@ -746,7 +746,7 @@ inet->cork.addr = ipc->addr; } dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); inet->cork.rt = rt; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; ===== net/ipv6/ip6_output.c 1.89 vs edited ===== --- 1.89/net/ipv6/ip6_output.c 2005-03-17 09:02:36 +11:00 +++ edited/net/ipv6/ip6_output.c 2005-03-18 21:00:08 +11:00 @@ -850,7 +850,7 @@ np->cork.rt = rt; inet->cork.fl = *fl; np->cork.hop_limit = hlimit; - inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); if (dst_allfrag(&rt->u.dst)) inet->cork.flags |= IPCORK_ALLFRAG; inet->cork.length = 0; ^ permalink raw reply [flat|nested] 114+ messages in thread
* [27/*] [NET] Make dst_allfrag use dst instead of dst->path 2005-03-18 11:06 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu @ 2005-03-18 11:28 ` Herbert Xu 2005-03-18 18:47 ` David S. Miller 2005-03-18 18:46 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric David S. Miller 1 sibling, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-18 11:28 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev [-- Attachment #1: Type: text/plain, Size: 617 bytes --] On Fri, Mar 18, 2005 at 10:06:11PM +1100, herbert wrote: > > BTW, shouldn't dst_allfrag be called dst_path_allfrag? Rather than doing that, let's make the path usage explicit in the one place that it's needed (the same place where we use dst_mtu(dst->path)). Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Hmm, it still doesn't feel right to have an IPv6-specific helper in net/dst.h. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-27 --] [-- Type: text/plain, Size: 879 bytes --] --- linux-2.6/include/net/dst.h.orig 2005-03-18 22:20:39.000000000 +1100 +++ linux-2.6/include/net/dst.h 2005-03-18 22:20:46.000000000 +1100 @@ -122,7 +122,7 @@ static inline u32 dst_allfrag(const struct dst_entry *dst) { - int ret = dst_metric(dst->path, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG; + int ret = dst_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG; /* Yes, _exactly_. This is paranoia. */ barrier(); return ret; --- linux-2.6/net/ipv6/ip6_output.c.orig 2005-03-18 22:21:01.000000000 +1100 +++ linux-2.6/net/ipv6/ip6_output.c 2005-03-18 22:23:52.000000000 +1100 @@ -851,7 +851,7 @@ inet->cork.fl = *fl; np->cork.hop_limit = hlimit; inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); - if (dst_allfrag(&rt->u.dst)) + if (dst_allfrag(rt->u.dst.path)) inet->cork.flags |= IPCORK_ALLFRAG; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [27/*] [NET] Make dst_allfrag use dst instead of dst->path 2005-03-18 11:28 ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu @ 2005-03-18 18:47 ` David S. Miller 0 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-18 18:47 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Fri, 18 Mar 2005 22:28:04 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Rather than doing that, let's make the path usage explicit in > the one place that it's needed (the same place where we use > dst_mtu(dst->path)). > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Ok, applied. > Hmm, it still doesn't feel right to have an IPv6-specific helper > in net/dst.h. We have tons of stuff in the neighbour layer that really is only taken advantage of by ipv6, it's a similar situation. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [26/*] [NET] Kill dst_pmtu/dst_path_metric 2005-03-18 11:06 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu 2005-03-18 11:28 ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu @ 2005-03-18 18:46 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-18 18:46 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Fri, 18 Mar 2005 22:06:11 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This would have been the patch that removed dst->path. But > ip_append_data got in the way :) > > However, using dst->path is only needed rarely and should always > require a bit of deliberation. So let's get rid of dst_pmtu > and dst_path_metric and use dst_mtu and dst_metric directly. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks. > BTW, shouldn't dst_allfrag be called dst_path_allfrag? If you think that makes it's functionality clearer, sure. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [25/*] [NET] Kill unnecessary uses of dst_path_metric 2005-03-18 10:11 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu 2005-03-18 11:06 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu @ 2005-03-18 18:44 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-18 18:44 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Fri, 18 Mar 2005 21:11:26 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This gets rid of the last unnecessary use of dst_path_metric. In > decnet dst->path is always equal to dst. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [24/*] [IPSEC] Get ttl from child instead of path 2005-03-18 10:07 ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu 2005-03-18 10:11 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu @ 2005-03-18 18:43 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-18 18:43 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Fri, 18 Mar 2005 21:07:57 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Now that dst_pmtu is almost gone let's do the same to dst_path_metric. > I've only found one legitimate use of it and that's the one that was > recently added in dst_allfrag. > > This patch makes xfrm4_encap/xfrm6_encap use dst->child instead of > dst->path so that we choose the correct route to get the hoplimit > from when nested tunnels are present. > > For simple tunnels dst->child == dst->path so there is no change. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Looks good, applied. Thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu 2005-03-18 9:19 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu 2005-03-18 10:07 ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu @ 2005-03-18 18:41 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-18 18:41 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Fri, 18 Mar 2005 20:19:18 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Once again here is a couple of trivial dst_pmtu/dst_mtu replacements. > In both locations, we can only have simple dst entries which means > that dst == dst->path. > > BTW, this is the rule that we should apply in future for uses of > dst_mtu/dst_pmtu (or other metrics on dst). If the only dst's that > can appear are simple dst's (dst == dst->path), then we should use > dst_mtu or dst_metric. If dst != dst->path, then whoever is writing > the code will need to think about which of dst or dst->path is the > right one. > > In most instances dst will be the one. However, as we have seen in > ip_append_data, dst->path may be needed rarely. In particular, if > we're doing fragmentation immediately after IPsec, then you may need > it. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, and thanks for the nice succinct version of the rule :-) ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-18 9:11 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu 2005-03-18 9:19 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu @ 2005-03-18 18:40 ` David S. Miller 2005-03-20 15:46 ` Patrick McHardy 1 sibling, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-18 18:40 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Fri, 18 Mar 2005 20:11:29 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This patch makes ipt_TCPMSS use the correct MTU value for clamping. > This is a bit tricky actually since TCPMSS can be used in FORWARD, > LOCAL_OUT as well as POST_ROUTING. > > In the first two cases we haven't performed IPsec yet so dst_mtu > obviously does the right thing. As it is, POST_ROUTING is performed > after xfrm_output so MSS clamping is useless there. > > With Patrick's IPsec netfilter stuff, there will be a POST_ROUTING > processing before IPsec processing, in which case dst_mtu also returns > exactly what we want. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. > BTW Patrick, how is the IPsec netfilter stuff going? That boy is seriously backlogged, so I'm not sure how much time he's gotten to work on that yet. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-18 18:40 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller @ 2005-03-20 15:46 ` Patrick McHardy 2005-03-20 16:32 ` Ludo Stellingwerff ` (2 more replies) 0 siblings, 3 replies; 114+ messages in thread From: Patrick McHardy @ 2005-03-20 15:46 UTC (permalink / raw) To: David S. Miller; +Cc: Herbert Xu, kuznet, yoshfuji, netdev David S. Miller wrote: > On Fri, 18 Mar 2005 20:11:29 +1100 > Herbert Xu <herbert@gondor.apana.org.au> wrote: > >>BTW Patrick, how is the IPsec netfilter stuff going? > > That boy is seriously backlogged, so I'm not sure how much time > he's gotten to work on that yet. Indeed, but in case of the netfilter patches that's not the problem. They are basically working fine, but I have doubts about submitting them. First, and most importantly, the input patch is incredible ugly. To recap, we want to pass the encapsulated packets to the netfilter hooks, then again the decapsulated packets after all decapsulation has been done. The current input patch makes packets that have been handled by IPsec skip the netfilter hooks until we know no further IPsec processing will be done (route is non-local or protocol handler is not marked as xfrm_prot). The packet is then marked as completely decapsulated and passed through the stack again and the plain packets go through netfilter again. There are a couple of problems with this approach: - decapsulated tunnel-mode packets go through the stack twice - netfilter only sees them once, everything else multiple times (statistics, packet sockets, ...) - racy, xfrm protocol could be registered after we determined decapsulation is done. - inefficient The second reason is that I'm not sure at all wether this is the way to go. With KLIPS-like IPsec-devices you can sniff the plain packets before they are handled by IPsec and you can perform traffic shaping on them. These two points are completely unhandled, and people seem to want them. So what's holding back these patches is getting some consensus on what exactly we want to do and finding a better method for determining when decapsulation is done. One possibility would be stealing packets in xfrm_policy_check(), but I haven't thought much about this yet. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 15:46 ` Patrick McHardy @ 2005-03-20 16:32 ` Ludo Stellingwerff 2005-03-20 17:17 ` Lennert Buytenhek 2005-03-23 3:49 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller 2005-03-23 9:24 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu 2 siblings, 1 reply; 114+ messages in thread From: Ludo Stellingwerff @ 2005-03-20 16:32 UTC (permalink / raw) To: netdev -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 As an daily user of Patricks patches, I like to agree with Patricks analysis about the loosing of flexibility with the choice not to use virtual devices. In a real-life situation I had to revert to using plain, unsave GRE-tunnels because of the complexity of using 2.6-ipsec for the same purpose. I'm routing one of multiple defaultroutes over this tunnel:) It felt like a relieve having a device to work with. Just my 2 cents, Greetings, Ludo. Patrick McHardy wrote: | David S. Miller wrote: | |> On Fri, 18 Mar 2005 20:11:29 +1100 Herbert Xu |> <herbert@gondor.apana.org.au> wrote: |> |>> BTW Patrick, how is the IPsec netfilter stuff going? |> |> |> That boy is seriously backlogged, so I'm not sure how much time |> he's gotten to work on that yet. | | | Indeed, but in case of the netfilter patches that's not the | problem. They are basically working fine, but I have doubts about | submitting them. First, and most importantly, the input patch is | incredible ugly. To recap, we want to pass the encapsulated packets | to the netfilter hooks, then again the decapsulated packets after | all decapsulation has been done. The current input patch makes | packets that have been handled by IPsec skip the netfilter hooks | until we know no further IPsec processing will be done (route is | non-local or protocol handler is not marked as xfrm_prot). The | packet is then marked as completely decapsulated and passed through | the stack again and the plain packets go through netfilter again. | There are a couple of problems with this approach: | | - decapsulated tunnel-mode packets go through the stack twice - | netfilter only sees them once, everything else multiple times | (statistics, packet sockets, ...) - racy, xfrm protocol could be | registered after we determined decapsulation is done. - inefficient | | | The second reason is that I'm not sure at all wether this is the | way to go. With KLIPS-like IPsec-devices you can sniff the plain | packets before they are handled by IPsec and you can perform | traffic shaping on them. These two points are completely unhandled, | and people seem to want them. | | So what's holding back these patches is getting some consensus on | what exactly we want to do and finding a better method for | determining when decapsulation is done. One possibility would be | stealing packets in xfrm_policy_check(), but I haven't thought much | about this yet. | | Regards Patrick | | - -- Ludo Stellingwerff V&S B.V. The Netherlands ProTactive firewall solution. Tel: +31 172 416116 Fax: +31 172 416124 site: www.protactive.nl demo: http://www.protactive.nl:81/netview.html -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.0 (GNU/Linux) iD8DBQFCPaWMOF3sCpZ+AJgRAiofAJ9VlmQ3GF+D3q5FLfsyj3vcRJUbogCgzy86 pS4CRtw1sGe3K3vUVgIqi8E= =5IAn -----END PGP SIGNATURE----- ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 16:32 ` Ludo Stellingwerff @ 2005-03-20 17:17 ` Lennert Buytenhek 2005-03-20 17:49 ` Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Lennert Buytenhek @ 2005-03-20 17:17 UTC (permalink / raw) To: Ludo Stellingwerff, Herbert Xu; +Cc: netdev On Sun, Mar 20, 2005 at 05:32:13PM +0100, Ludo Stellingwerff wrote: > As an daily user of Patricks patches, I like to agree with Patricks > analysis about the loosing of flexibility with the choice not to use > virtual devices. In a real-life situation I had to revert to using > plain, unsave GRE-tunnels because of the complexity of using 2.6-ipsec > for the same purpose. I'm routing one of multiple defaultroutes over > this tunnel:) It felt like a relieve having a device to work with. I'll add my 2 cents to this. In my situation, there are three different sites in the same city (Amsterdam), interconnected using a shared L2 vlan, and six routers (A1, A2, B1, B2, C1, C2) on that vlan, two per site for redundancy reasons. Each router runs ospf. The vlan is provided to us by a telco that we do not necessarily trust. So ideally, we'd like all traffic that goes over the vlan (modulo ARPs and STP and stuff) to be encrypted. ("-o eth3 -j ENCRYPT") The problem I kept running into with tunnel mode is that tunnel mode SPD rules appear to dictate routing policy in a way that's not compatible with dynamic routing. I.e., a line like: spdadd 10.10.1.0/24 10.0.1.0/24 any -P out ipsec esp/tunnel/1.2.3.4-5.6.7.8/require; effectively says "All traffic from 10.10.1.0/24 to 10.0.1.0/24 will be sent over a tunnel with local endpoint 1.2.3.4 and remote endpoint 5.6.7.8", but: - I have no idea beforehand what address ranges are going to be routed over this vlan. (Customers might send traffic with source addresses in address ranges that they don't announce to us (asymmetric routing), and I don't want those packets to remain unencrypted just because they don't match the SPD rule.) A 0.0.0.0/0 0.0.0.0/0 rule would not be appropriate either since that'd suck _all_ traffic into this tunnel. - I have no idea beforehand what the remote nexthop is going to be. A1 might ordinarily send its traffic for site B to B1, but if B1 fails it'll want to start using B2 instead, which would be prevented by the SPD rule hardcoding the remote tunnel endpoint to B1. The workaround we tried at first was to create GRE tunnels between each pair of routers on the vlan, and to run ospf over the tunnels instead of directly over the vlan interface. That gave MTU problems, though, which made us just forget about ipsec altogether and use vtund instead, hardly better than nothing. (Now that Herbert has submitted a number of fixes for ipsec MTU issues with tunnels, I guess I should go and give the GRE-over-ipsec setup a go again.) Then again, maybe I totally misunderstood the way this is supposed to work. --L ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 17:17 ` Lennert Buytenhek @ 2005-03-20 17:49 ` Patrick McHardy 2005-03-20 18:11 ` Ludo Stellingwerff 2005-03-30 9:49 ` Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS) Herbert Xu 0 siblings, 2 replies; 114+ messages in thread From: Patrick McHardy @ 2005-03-20 17:49 UTC (permalink / raw) To: Lennert Buytenhek; +Cc: Ludo Stellingwerff, Herbert Xu, netdev Lennert Buytenhek wrote: > In my situation, there are three different sites in the same city > (Amsterdam), interconnected using a shared L2 vlan, and six routers > (A1, A2, B1, B2, C1, C2) on that vlan, two per site for redundancy > reasons. Each router runs ospf. > > The vlan is provided to us by a telco that we do not necessarily trust. > So ideally, we'd like all traffic that goes over the vlan (modulo ARPs > and STP and stuff) to be encrypted. ("-o eth3 -j ENCRYPT") > > The problem I kept running into with tunnel mode is that tunnel > mode SPD rules appear to dictate routing policy in a way that's not > compatible with dynamic routing. > > I.e., a line like: > > spdadd 10.10.1.0/24 10.0.1.0/24 any -P out ipsec esp/tunnel/1.2.3.4-5.6.7.8/require; > > effectively says "All traffic from 10.10.1.0/24 to 10.0.1.0/24 will > be sent over a tunnel with local endpoint 1.2.3.4 and remote endpoint > 5.6.7.8", but: > - I have no idea beforehand what address ranges are going to be routed > over this vlan. (Customers might send traffic with source addresses > in address ranges that they don't announce to us (asymmetric routing), > and I don't want those packets to remain unencrypted just because they > don't match the SPD rule.) A 0.0.0.0/0 0.0.0.0/0 rule would not be > appropriate either since that'd suck _all_ traffic into this tunnel You can specify an ifindex for oif in the selector, but you need to use the xfrm_user interface. > - I have no idea beforehand what the remote nexthop is going to be. A1 > might ordinarily send its traffic for site B to B1, but if B1 fails > it'll want to start using B2 instead, which would be prevented by the > SPD rule hardcoding the remote tunnel endpoint to B1. > > The workaround we tried at first was to create GRE tunnels between each > pair of routers on the vlan, and to run ospf over the tunnels instead of > directly over the vlan interface. That gave MTU problems, though, which > made us just forget about ipsec altogether and use vtund instead, hardly > better than nothing. (Now that Herbert has submitted a number of fixes > for ipsec MTU issues with tunnels, I guess I should go and give the > GRE-over-ipsec setup a go again.) Hmm .. sounds like using the routing realm in the selector would solve this while avoiding the GRE overhead. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 17:49 ` Patrick McHardy @ 2005-03-20 18:11 ` Ludo Stellingwerff 2005-03-20 18:22 ` Patrick McHardy 2005-03-30 9:49 ` Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS) Herbert Xu 1 sibling, 1 reply; 114+ messages in thread From: Ludo Stellingwerff @ 2005-03-20 18:11 UTC (permalink / raw) To: netdev -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Patrick McHardy wrote: | Lennert Buytenhek wrote: |> - I have no idea beforehand what the remote nexthop is going to |> be. A1 might ordinarily send its traffic for site B to B1, but |> if B1 fails it'll want to start using B2 instead, which would be |> prevented by the SPD rule hardcoding the remote tunnel endpoint |> to B1. |> | | Hmm .. sounds like using the routing realm in the selector would | solve this while avoiding the GRE overhead. | | Regards Patrick | I'm hoping that using the fwmark as a selector can provide a workable solution for both mine and Lennert's problem, any many more related situations. Netfilter has a (almost) complete range of selectors. e.g. Lennerts problem could be solved using a combination of the "realm" match of iptables, in combination with a fwmark for SPD matching. Greetings, Ludo. PS. On a side note: Wouldn't it be possible to have a netfilter target stating that an transformation should be done? - -- Ludo Stellingwerff V&S B.V. The Netherlands ProTactive firewall solution. Tel: +31 172 416116 Fax: +31 172 416124 site: www.protactive.nl demo: http://www.protactive.nl:81/netview.html -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.0 (GNU/Linux) iD8DBQFCPbzNOF3sCpZ+AJgRApxBAJ9akLfP1onp+WKRgmJ1YDImkrXLHwCgkPS4 GvwO1PoUwkJnVTOjeaf/ZEw= =OebA -----END PGP SIGNATURE----- ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 18:11 ` Ludo Stellingwerff @ 2005-03-20 18:22 ` Patrick McHardy 2005-03-20 18:43 ` jamal 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-20 18:22 UTC (permalink / raw) To: Ludo Stellingwerff; +Cc: netdev Ludo Stellingwerff wrote: > I'm hoping that using the fwmark as a selector can provide a workable > solution for both mine and Lennert's problem, any many more related > situations. Netfilter has a (almost) complete range of selectors. > e.g. Lennerts problem could be solved using a combination of the > "realm" match of iptables, in combination with a fwmark for SPD matching. Routing of local packets is done before the first netfilter hook is called, but I forgot about ip_route_me_harder(). So you're right, the realm can be translated to nfmark values using iptables. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 18:22 ` Patrick McHardy @ 2005-03-20 18:43 ` jamal 2005-03-20 19:10 ` Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: jamal @ 2005-03-20 18:43 UTC (permalink / raw) To: Patrick McHardy; +Cc: Ludo Stellingwerff, netdev On Sun, 2005-03-20 at 13:22, Patrick McHardy wrote: > Ludo Stellingwerff wrote: > > I'm hoping that using the fwmark as a selector can provide a workable > > solution for both mine and Lennert's problem, any many more related > > situations. Netfilter has a (almost) complete range of selectors. > > e.g. Lennerts problem could be solved using a combination of the > > "realm" match of iptables, in combination with a fwmark for SPD matching. > > Routing of local packets is done before the first netfilter hook > is called, but I forgot about ip_route_me_harder(). So you're right, > the realm can be translated to nfmark values using iptables. BTW, is there any reason the SPD couldnt have been implemented from day one using netfilter classification ? Why did we need another speacilized classifier? the actions are clearly implementable as targets. cheers, jamal ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 18:43 ` jamal @ 2005-03-20 19:10 ` Patrick McHardy 0 siblings, 0 replies; 114+ messages in thread From: Patrick McHardy @ 2005-03-20 19:10 UTC (permalink / raw) To: hadi; +Cc: Ludo Stellingwerff, netdev jamal wrote: > BTW, is there any reason the SPD couldnt have been implemented from day > one using netfilter classification ? Why did we need another speacilized > classifier? the actions are clearly implementable as targets. IMO iptables isn't so great that one would actually want to do this. The entire ruleset needs to be one continous area in memory, so it can not be changed, only replaced. To make it useable over pfkey would mean many things that are currently done by iptables in userspace need to be done in the kernel. There are multiple other reasons, but I don't think its even worth discussing this. This of course doesn't mean I'm against reducing the number of different classification engines. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS) 2005-03-20 17:49 ` Patrick McHardy 2005-03-20 18:11 ` Ludo Stellingwerff @ 2005-03-30 9:49 ` Herbert Xu 1 sibling, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-30 9:49 UTC (permalink / raw) To: Patrick McHardy; +Cc: Lennert Buytenhek, Ludo Stellingwerff, netdev On Sun, Mar 20, 2005 at 06:49:43PM +0100, Patrick McHardy wrote: > > >- I have no idea beforehand what address ranges are going to be routed > > over this vlan. (Customers might send traffic with source addresses > > in address ranges that they don't announce to us (asymmetric routing), > > and I don't want those packets to remain unencrypted just because they > > don't match the SPD rule.) A 0.0.0.0/0 0.0.0.0/0 rule would not be > > appropriate either since that'd suck _all_ traffic into this tunnel > > You can specify an ifindex for oif in the selector, but you need > to use the xfrm_user interface. Unfortunately that doesn't work currently because we don't fill in the oif field in the flow. More importantly, having an interface specific outbound policy isn't very useful when you can't have an interface specific inbound/forward policy to match. This actually leads to another issue. The xfrm_selector structure only contains one iface field. So we can't specify both iif and oif for forwarded packets. Extending xfrm_selector is not trivial. Ludo Stellingwerff also wanted to do that in order use fwmark in the selector. Here is one possible solution that allows xfrm_selector to be extended. We store new selector fields in a separate netlink payload. This works because all the current messages that carry xfrm_selector carry exactly one selector. When the kernel receives a message containing an xfrm_selector it'll initialise the kernel xfrm_selector to contain that plus whatever is in the correspond netlink payload. If the payload is absent the kernel should fill in sane default values for compatibility. For messages sent by the kernel we also attach a netlink payload to cover the new values. This should be backwards compatible as appliations should ignore unknown payloads. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 15:46 ` Patrick McHardy 2005-03-20 16:32 ` Ludo Stellingwerff @ 2005-03-23 3:49 ` David S. Miller 2005-03-23 4:03 ` Patrick McHardy 2005-03-23 9:24 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu 2 siblings, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-23 3:49 UTC (permalink / raw) To: Patrick McHardy; +Cc: herbert, kuznet, yoshfuji, netdev On Sun, 20 Mar 2005 16:46:34 +0100 Patrick McHardy <kaber@trash.net> wrote: > So what's holding back these patches is getting some consensus on what > exactly we want to do and finding a better method for determining when > decapsulation is done. One possibility would be stealing packets > in xfrm_policy_check(), but I haven't thought much about this yet. That latter idea sounds pursuable. I guess you'd do a netfilter hook in xfrm_policy_check() right? So then you'd need to pass struct sk_buff ** instead of a direct pointer. And that looks fine too, as nobody seems to cache skb->XXX state across xfrm_policy_check() calls. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-23 3:49 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller @ 2005-03-23 4:03 ` Patrick McHardy 2005-03-24 5:05 ` Netfilter+IPsec Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-23 4:03 UTC (permalink / raw) To: David S. Miller; +Cc: herbert, kuznet, yoshfuji, netdev David S. Miller wrote: > On Sun, 20 Mar 2005 16:46:34 +0100 > Patrick McHardy <kaber@trash.net> wrote: > > >>So what's holding back these patches is getting some consensus on what >>exactly we want to do and finding a better method for determining when >>decapsulation is done. One possibility would be stealing packets >>in xfrm_policy_check(), but I haven't thought much about this yet. > > > That latter idea sounds pursuable. I guess you'd do a netfilter > hook in xfrm_policy_check() right? It would call netif_rx(). The packet should pass all hooks as usual, so everything works as expected. It is cleaner than my current approach, but has the same problems wrt. statistics and AF_PACKET/raw sockets. I'll post a patch (probably tomorrow, its late here) so we have something concrete to talk about. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Netfilter+IPsec 2005-03-23 4:03 ` Patrick McHardy @ 2005-03-24 5:05 ` Patrick McHardy 2005-03-24 5:43 ` Netfilter+IPsec David S. Miller 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-24 5:05 UTC (permalink / raw) To: David S. Miller; +Cc: herbert, kuznet, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 1101 bytes --] Patrick McHardy wrote: > It would call netif_rx(). The packet should pass all hooks as usual, > so everything works as expected. It is cleaner than my current > approach, but has the same problems wrt. statistics and AF_PACKET/raw > sockets. I'll post a patch (probably tomorrow, its late here) so we > have something concrete to talk about. Unfortunately I have to delay again. This patch (not entirely reviewed myself yet) contains the parts necessary for hooking output IPsec packets for netfilter. dst_output() in ipv4/ and ipv6/ are replaced by ip_dst_output() and ip6_dst_output(), which pass the packets through POST_ROUTING before IPsec. All replaced calls should happen directly after NF_HOOK(LOCAL_OUT, ...). The packet is then marked as transformed in xfrm{4,6}_output() and passed through LOCAL_OUT in ip_output() again. This resembles the behaviour of tunnel-devices, a packet is first visible in plain on OUTPUT/FORWARD -> POST_ROUTING, then encapsulated on OUTPUT -> POST_ROUTING again. This part doesn't have any known problems, the input patch will follow tomorrow. Regards Patrick [-- Attachment #2: x --] [-- Type: text/plain, Size: 13535 bytes --] ===== include/linux/ipv6.h 1.29 vs edited ===== --- 1.29/include/linux/ipv6.h 2005-03-16 00:27:17 +01:00 +++ edited/include/linux/ipv6.h 2005-03-24 04:41:37 +01:00 @@ -177,19 +177,20 @@ #include <linux/tcp.h> #include <linux/udp.h> -/* - This structure contains results of exthdrs parsing - as offsets from skb->nh. - */ struct inet6_skb_parm { + /* results of exthdrs parsing as offsets from skb->nh. */ int iif; __u16 ra; __u16 hop; __u16 dst0; __u16 srcrt; __u16 dst1; + /* flags */ + __u16 flags; }; + +#define IP6SKB_XFRM_TRANSFORMED 0x1 #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) ===== include/linux/netfilter.h 1.18 vs edited ===== --- 1.18/include/linux/netfilter.h 2005-03-12 04:12:50 +01:00 +++ edited/include/linux/netfilter.h 2005-03-23 06:19:51 +01:00 @@ -139,9 +139,10 @@ /* This is gross, but inline doesn't cut it for avoiding the function call in fast path: gcc doesn't inline (needs value tracking?). --RR */ #ifdef CONFIG_NETFILTER_DEBUG -#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ +#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) \ ({int __ret; \ -if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \ +if (!(cond) || \ + (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \ __ret = (okfn)(skb); \ __ret;}) #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) \ @@ -150,9 +151,9 @@ __ret = (okfn)(skb); \ __ret;}) #else -#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ +#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) \ ({int __ret; \ -if (list_empty(&nf_hooks[pf][hook]) || \ +if (!(cond) || list_empty(&nf_hooks[pf][hook]) || \ (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \ __ret = (okfn)(skb); \ __ret;}) @@ -163,6 +164,8 @@ __ret = (okfn)(skb); \ __ret;}) #endif +#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \ + NF_HOOK_COND((pf), (hook), (skb), (indev), (outdev), (okfn), 1) int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, @@ -192,6 +195,7 @@ #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) +#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb) static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif /*CONFIG_NETFILTER*/ ===== include/net/ip.h 1.38 vs edited ===== --- 1.38/include/net/ip.h 2005-01-27 07:03:17 +01:00 +++ edited/include/net/ip.h 2005-03-23 06:20:11 +01:00 @@ -30,6 +30,8 @@ #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/in_route.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> #include <net/route.h> #include <net/arp.h> #include <net/snmp.h> @@ -45,6 +47,7 @@ #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 #define IPSKB_XFRM_TUNNEL_SIZE 8 +#define IPSKB_XFRM_TRANSFORMED 16 }; struct ipcm_cookie @@ -210,6 +213,12 @@ iph->id = 0; } else __ip_select_ident(iph, dst, more); +} + +static inline int ip_dst_output(struct sk_buff *skb) +{ + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, + skb->dst->dev, dst_output, skb->dst->xfrm != NULL); } /* ===== include/net/ipv6.h 1.44 vs edited ===== --- 1.44/include/net/ipv6.h 2005-03-03 06:12:44 +01:00 +++ edited/include/net/ipv6.h 2005-03-23 06:14:52 +01:00 @@ -17,6 +17,8 @@ #include <linux/ipv6.h> #include <linux/hardirq.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> #include <net/ndisc.h> #include <net/flow.h> #include <net/snmp.h> @@ -335,6 +337,12 @@ { return ((a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3] ) == 0); +} + +static inline int ip6_dst_output(struct sk_buff *skb) +{ + return NF_HOOK_COND(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, + skb->dst->dev, dst_output, skb->dst->xfrm != NULL); } /* ===== net/ipv4/igmp.c 1.61 vs edited ===== --- 1.61/net/ipv4/igmp.c 2004-12-28 06:30:43 +01:00 +++ edited/net/ipv4/igmp.c 2005-03-23 05:53:14 +01:00 @@ -343,7 +343,7 @@ pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen); return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, - dst_output); + ip_dst_output); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -674,7 +674,7 @@ ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + ip_dst_output); } static void igmp_gq_timer_expire(unsigned long data) ===== net/ipv4/ip_forward.c 1.11 vs edited ===== --- 1.11/net/ipv4/ip_forward.c 2004-07-08 00:17:28 +02:00 +++ edited/net/ipv4/ip_forward.c 2005-03-23 05:53:14 +01:00 @@ -51,7 +51,7 @@ if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output(skb); + return ip_dst_output(skb); } int ip_forward(struct sk_buff *skb) ===== net/ipv4/ip_output.c 1.80 vs edited ===== --- 1.80/net/ipv4/ip_output.c 2005-03-18 19:43:26 +01:00 +++ edited/net/ipv4/ip_output.c 2005-03-23 06:20:28 +01:00 @@ -166,7 +166,7 @@ /* Send it out. */ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + ip_dst_output); } static inline int ip_finish_output2(struct sk_buff *skb) @@ -284,7 +284,7 @@ return ip_finish_output(skb); } -int ip_output(struct sk_buff *skb) +static inline int ip_output2(struct sk_buff *skb) { IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); @@ -294,6 +294,16 @@ return ip_finish_output(skb); } +int ip_output(struct sk_buff *skb) +{ + int transformed = IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED; + + if (transformed) + nf_reset(skb); + return NF_HOOK_COND(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, + skb->dst->dev, ip_output2, transformed); +} + int ip_queue_xmit(struct sk_buff *skb, int ipfragok) { struct sock *sk = skb->sk; @@ -374,7 +384,7 @@ skb->priority = sk->sk_priority; return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + ip_dst_output); no_route: IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); @@ -1189,7 +1199,7 @@ /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + skb->dst->dev, ip_dst_output); if (err) { if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; ===== net/ipv4/ipmr.c 1.47 vs edited ===== --- 1.47/net/ipv4/ipmr.c 2005-03-18 19:36:11 +01:00 +++ edited/net/ipv4/ipmr.c 2005-03-23 05:53:13 +01:00 @@ -1119,7 +1119,7 @@ if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output(skb); + return ip_dst_output(skb); } /* ===== net/ipv4/raw.c 1.63 vs edited ===== --- 1.63/net/ipv4/raw.c 2005-03-16 00:20:37 +01:00 +++ edited/net/ipv4/raw.c 2005-03-23 05:53:13 +01:00 @@ -310,7 +310,7 @@ } err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + ip_dst_output); if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; if (err) ===== net/ipv4/xfrm4_output.c 1.10 vs edited ===== --- 1.10/net/ipv4/xfrm4_output.c 2005-03-18 19:41:26 +01:00 +++ edited/net/ipv4/xfrm4_output.c 2005-03-23 05:53:13 +01:00 @@ -129,6 +129,7 @@ err = -EHOSTUNREACH; goto error_nolock; } + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; err = NET_XMIT_BYPASS; out_exit: ===== net/ipv4/ipvs/ip_vs_xmit.c 1.13 vs edited ===== --- 1.13/net/ipv4/ipvs/ip_vs_xmit.c 2005-03-18 19:38:59 +01:00 +++ edited/net/ipv4/ipvs/ip_vs_xmit.c 2005-03-24 05:05:57 +01:00 @@ -131,7 +131,7 @@ (skb)->nfcache |= NFC_IPVS_PROPERTY; \ (skb)->ip_summed = CHECKSUM_NONE; \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ + (rt)->u.dst.dev, ip_dst_output); \ } while (0) ===== net/ipv4/netfilter/ipt_REJECT.c 1.37 vs edited ===== --- 1.37/net/ipv4/netfilter/ipt_REJECT.c 2005-03-17 19:05:37 +01:00 +++ edited/net/ipv4/netfilter/ipt_REJECT.c 2005-03-23 06:05:51 +01:00 @@ -213,7 +213,7 @@ nf_ct_attach(nskb, oldskb); NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, - dst_output); + ip_dst_output); return; free_nskb: ===== net/ipv6/ip6_input.c 1.23 vs edited ===== --- 1.23/net/ipv6/ip6_input.c 2005-03-10 06:12:11 +01:00 +++ edited/net/ipv6/ip6_input.c 2005-03-24 05:06:19 +01:00 @@ -241,9 +241,9 @@ if (deliver) { skb2 = skb_clone(skb, GFP_ATOMIC); - dst_output(skb2); + ip6_dst_output(skb2); } else { - dst_output(skb); + ip6_dst_output(skb); return 0; } } ===== net/ipv6/ip6_output.c 1.91 vs edited ===== --- 1.91/net/ipv6/ip6_output.c 2005-03-18 19:44:52 +01:00 +++ edited/net/ipv6/ip6_output.c 2005-03-24 04:52:01 +01:00 @@ -108,7 +108,7 @@ } -static int ip6_output2(struct sk_buff *skb) +static int ip6_output3(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct net_device *dev = dst->dev; @@ -145,12 +145,22 @@ return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish); } -int ip6_output(struct sk_buff *skb) +static inline int ip6_output2(struct sk_buff *skb) { if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst)) - return ip6_fragment(skb, ip6_output2); + return ip6_fragment(skb, ip6_output3); else - return ip6_output2(skb); + return ip6_output3(skb); +} + +int ip6_output(struct sk_buff *skb) +{ + int transformed = IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED; + + if (transformed) + nf_reset(skb); + return NF_HOOK_COND(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, + skb->dst->dev, ip6_output2, transformed); } #ifdef CONFIG_NETFILTER @@ -195,7 +205,7 @@ } } #endif /* CONFIG_NETFILTER */ - return dst_output(skb); + return ip6_dst_output(skb); } /* @@ -342,7 +352,7 @@ static inline int ip6_forward_finish(struct sk_buff *skb) { - return dst_output(skb); + return ip6_dst_output(skb); } int ip6_forward(struct sk_buff *skb) @@ -1146,7 +1156,7 @@ skb->dst = dst_clone(&rt->u.dst); IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, ip6_dst_output); if (err) { if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; ===== net/ipv6/ip6_tunnel.c 1.30 vs edited ===== --- 1.30/net/ipv6/ip6_tunnel.c 2005-03-15 19:19:23 +01:00 +++ edited/net/ipv6/ip6_tunnel.c 2005-03-23 06:08:09 +01:00 @@ -744,7 +744,7 @@ nf_reset(skb); pkt_len = skb->len; err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + skb->dst->dev, ip6_dst_output); if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) { stats->tx_bytes += pkt_len; ===== net/ipv6/ndisc.c 1.124 vs edited ===== --- 1.124/net/ipv6/ndisc.c 2005-03-16 23:52:27 +01:00 +++ edited/net/ipv6/ndisc.c 2005-03-23 06:08:42 +01:00 @@ -501,7 +501,7 @@ skb->dst = dst; idev = in6_dev_get(dst->dev); IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_dst_output); if (!err) { ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS); ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); @@ -586,7 +586,7 @@ skb->dst = dst; idev = in6_dev_get(dst->dev); IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_dst_output); if (!err) { ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORSOLICITS); ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); @@ -660,7 +660,7 @@ skb->dst = dst; idev = in6_dev_get(dst->dev); IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_dst_output); if (!err) { ICMP6_INC_STATS(idev, ICMP6_MIB_OUTROUTERSOLICITS); ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); @@ -1446,7 +1446,7 @@ buff->dst = dst; idev = in6_dev_get(dst->dev); IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, dst_output); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, ip6_dst_output); if (!err) { ICMP6_INC_STATS(idev, ICMP6_MIB_OUTREDIRECTS); ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); ===== net/ipv6/raw.c 1.79 vs edited ===== --- 1.79/net/ipv6/raw.c 2005-03-03 06:12:38 +01:00 +++ edited/net/ipv6/raw.c 2005-03-23 06:08:54 +01:00 @@ -541,7 +541,7 @@ IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + ip6_dst_output); if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; if (err) ===== net/ipv6/xfrm6_output.c 1.11 vs edited ===== --- 1.11/net/ipv6/xfrm6_output.c 2005-03-18 19:41:26 +01:00 +++ edited/net/ipv6/xfrm6_output.c 2005-03-24 04:46:59 +01:00 @@ -131,6 +131,7 @@ err = -EHOSTUNREACH; goto error_nolock; } + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; err = NET_XMIT_BYPASS; out_exit: ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Netfilter+IPsec 2005-03-24 5:05 ` Netfilter+IPsec Patrick McHardy @ 2005-03-24 5:43 ` David S. Miller 2005-03-25 2:53 ` Netfilter+IPsec Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-24 5:43 UTC (permalink / raw) To: Patrick McHardy; +Cc: herbert, kuznet, yoshfuji, netdev On Thu, 24 Mar 2005 06:05:50 +0100 Patrick McHardy <kaber@trash.net> wrote: > This patch (not entirely reviewed myself yet) contains the parts > necessary for hooking output IPsec packets for netfilter. This is actually much cleaner than I had ever anticipated. I like it. I suppose the input side will be quite a bit more involved? ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Netfilter+IPsec 2005-03-24 5:43 ` Netfilter+IPsec David S. Miller @ 2005-03-25 2:53 ` Herbert Xu 2005-03-25 5:10 ` Netfilter+IPsec Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-25 2:53 UTC (permalink / raw) To: David S. Miller; +Cc: Patrick McHardy, kuznet, yoshfuji, netdev On Wed, Mar 23, 2005 at 09:43:40PM -0800, David S. Miller wrote: > On Thu, 24 Mar 2005 06:05:50 +0100 > Patrick McHardy <kaber@trash.net> wrote: > > > This patch (not entirely reviewed myself yet) contains the parts > > necessary for hooking output IPsec packets for netfilter. > > This is actually much cleaner than I had ever anticipated. > I like it. I completely agree. The output patch is an elegant piece of work. > I suppose the input side will be quite a bit more involved? Maybe it won't be that bad when we actually see it :) BTW Patrick, what about the other bits in your original patch set? In particular, have you still got the bit that does policy lookups after SNAT? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Netfilter+IPsec 2005-03-25 2:53 ` Netfilter+IPsec Herbert Xu @ 2005-03-25 5:10 ` Patrick McHardy 0 siblings, 0 replies; 114+ messages in thread From: Patrick McHardy @ 2005-03-25 5:10 UTC (permalink / raw) To: Herbert Xu; +Cc: David S. Miller, kuznet, yoshfuji, netdev Herbert Xu wrote: > On Wed, Mar 23, 2005 at 09:43:40PM -0800, David S. Miller wrote: >> >>>This patch (not entirely reviewed myself yet) contains the parts >>>necessary for hooking output IPsec packets for netfilter. >> >>This is actually much cleaner than I had ever anticipated. >>I like it. > > I completely agree. The output patch is an elegant piece of work. Thanks. Unfortunantely it might need to be replaced because of issues with the input side. >>I suppose the input side will be quite a bit more involved? > > Maybe it won't be that bad when we actually see it :) Stealing the packets in xfrm_policy_check() didn't work out, a packet can be checked multiple times, and before all IPsec processing is done, because of raw sockets. Even worse, a raw socket can have its own policy and accept packets that will be further processed by IPsec. This suggests that the whole idea of skipping netfilter hooks before all IPsec processing is done was wrong and we need to call them on each pass through the stack as usual to be able to filter before raw sockets. For symetry in the output path we would need to pass the packet through POST_ROUTING and OUTPUT for each tunnel mode transform. I wanted to avoid this so far because I can't think of anything useful netfilter could do between two transforms on output, but the good part is that it shouldn't require any changes in the input path. I'm trying it now .. > BTW Patrick, what about the other bits in your original patch set? > In particular, have you still got the bit that does policy lookups > after SNAT? I haven't got up-to-date patches, but Christophe Saoute has ported them to 2.6.12-rc1 and published on his page: http://www.saout.de/misc/linux-2.6.12-rc1-ipsec-nat/ There are two patches that will probably be required either way, the policy lookup after SNAT patch you mentioned, and a patch that adds a function to restore struct flowi as it would have looked without NAT for policy checks. Both are small and should be painless. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS 2005-03-20 15:46 ` Patrick McHardy 2005-03-20 16:32 ` Ludo Stellingwerff 2005-03-23 3:49 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller @ 2005-03-23 9:24 ` Herbert Xu 2 siblings, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-23 9:24 UTC (permalink / raw) To: Patrick McHardy; +Cc: David S. Miller, kuznet, yoshfuji, netdev On Sun, Mar 20, 2005 at 04:46:34PM +0100, Patrick McHardy wrote: > > been done. The current input patch makes packets that have been > handled by IPsec skip the netfilter hooks until we know no further > IPsec processing will be done (route is non-local or protocol handler > is not marked as xfrm_prot). The packet is then marked as completely Could you send me (or the list) a copy of the current input patch? Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit 2005-03-18 9:03 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu 2005-03-18 9:11 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu @ 2005-03-18 18:39 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-18 18:39 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Fri, 18 Mar 2005 20:03:10 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Somehow I missed four files with dst_pmtu usages in them. I'm going > to split them along the sames lines I did before: bug fixes and then > the trivial changes. Must be some bug in your grep binary... kidding ;-) > Here is a patch that replaces dst_pmtu with dst_pmtu in ipmr.c > since this is straight IPIP tunneling equivalent to what we have > in ipip.c. > > As it is we may send ICMP packets when IPsec is present which is > exactly what the comment says that we shouldn't do. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-15 9:19 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu @ 2005-03-15 18:18 ` David S. Miller 2005-03-16 11:31 ` Herbert Xu 2 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-15 18:18 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Tue, 15 Mar 2005 20:19:04 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This patch fixes the IPsec overhead handling in ip_append_data and > ip6_append_data. As it is they assume that the IPsec overhead is > constant. This is not true as with ESP the IPsec overhead will vary > as the MTU varies. > > The result is that they may produce packets that will exceed the MTU > when ESP is used. Had it taken the trailer_len into account, it would > have produced packets less than the real MTU. > > By switching to dst_mtu we get the optimal result. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Looks nice. Applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-15 9:19 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu 2005-03-15 18:18 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data David S. Miller @ 2005-03-16 11:31 ` Herbert Xu 2005-03-16 22:02 ` David S. Miller 2005-03-21 16:14 ` Mika Penttilä 2 siblings, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-16 11:31 UTC (permalink / raw) To: David S. Miller Cc: Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev Hi Dave: On Tue, Mar 15, 2005 at 08:19:04PM +1100, herbert wrote: > > This patch fixes the IPsec overhead handling in ip_append_data and > ip6_append_data. As it is they assume that the IPsec overhead is > constant. This is not true as with ESP the IPsec overhead will vary > as the MTU varies. This patch is wrong. This is the *one* place where we do need to use the path MTU. The reason is that when the packet is fragmented we only pay for the IPsec overhead once over all and not once for each fragment. Please revert it for now. The trailer_len in ip_append_data is not quite right as the trailer's length depends on the length of the entire packet. However, it should be harmless since ESP knows how to extend the packet when necessary. Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-16 11:31 ` Herbert Xu @ 2005-03-16 22:02 ` David S. Miller 2005-03-21 16:14 ` Mika Penttilä 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-16 22:02 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Wed, 16 Mar 2005 22:31:49 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This patch is wrong. This is the *one* place where we do need to > use the path MTU. The reason is that when the packet is fragmented > we only pay for the IPsec overhead once over all and not once for > each fragment. > > Please revert it for now. Ok, done. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-16 11:31 ` Herbert Xu 2005-03-16 22:02 ` David S. Miller @ 2005-03-21 16:14 ` Mika Penttilä 2005-03-21 20:28 ` Herbert Xu 1 sibling, 1 reply; 114+ messages in thread From: Mika Penttilä @ 2005-03-21 16:14 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev Herbert Xu wrote: >Hi Dave: > >On Tue, Mar 15, 2005 at 08:19:04PM +1100, herbert wrote: > > >>This patch fixes the IPsec overhead handling in ip_append_data and >>ip6_append_data. As it is they assume that the IPsec overhead is >>constant. This is not true as with ESP the IPsec overhead will vary >>as the MTU varies. >> >> > >This patch is wrong. This is the *one* place where we do need to >use the path MTU. The reason is that when the packet is fragmented >we only pay for the IPsec overhead once over all and not once for >each fragment. > >Please revert it for now. > >The trailer_len in ip_append_data is not quite right as the trailer's >length depends on the length of the entire packet. However, it should >be harmless since ESP knows how to extend the packet when necessary. > >Thanks, > > Shouldn't ip_output also use the path variant, dst_mtu(skb->dst->path), it's surely after ipsec- processing? --Mika ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-21 16:14 ` Mika Penttilä @ 2005-03-21 20:28 ` Herbert Xu 2005-03-21 21:29 ` Mika Penttilä 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-21 20:28 UTC (permalink / raw) To: Mika Penttil? Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev On Mon, Mar 21, 2005 at 06:14:07PM +0200, Mika Penttil? wrote: > > Shouldn't ip_output also use the path variant, dst_mtu(skb->dst->path), > it's surely after ipsec- processing? That's the reason why it shouldn't use dst->path. The only time you should use dst_mtu(dst->path) is when dst may contain IPsec AND you need the MTU outside IPsec. In this case dst cannot contain IPsec so dst_mtu(dst) is correct. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-21 20:28 ` Herbert Xu @ 2005-03-21 21:29 ` Mika Penttilä 2005-03-21 22:04 ` Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: Mika Penttilä @ 2005-03-21 21:29 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev Herbert Xu wrote: >On Mon, Mar 21, 2005 at 06:14:07PM +0200, Mika Penttil? wrote: > > >>Shouldn't ip_output also use the path variant, dst_mtu(skb->dst->path), >>it's surely after ipsec- processing? >> >> > >That's the reason why it shouldn't use dst->path. The only time you >should use dst_mtu(dst->path) is when dst may contain IPsec AND you >need the MTU outside IPsec. > >In this case dst cannot contain IPsec so dst_mtu(dst) is correct. > >Cheers, > > ok I see it now, but this is really easy to get wrong... Thanks, Mika ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data 2005-03-21 21:29 ` Mika Penttilä @ 2005-03-21 22:04 ` Herbert Xu 0 siblings, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-21 22:04 UTC (permalink / raw) To: Mika Penttil? Cc: David S. Miller, Alexey Kuznetsov, YOSHIFUJI Hideaki, Patrick McHardy, netdev On Mon, Mar 21, 2005 at 11:29:58PM +0200, Mika Penttil? wrote: > > ok I see it now, but this is really easy to get wrong... Well the rule is actually quite simple. If IPsec can't be present, then you should always use dst_mtu(dst). If IPsec may be present, then you should also use dst_mtu(dst) *unless* you're performing an operation such as fragmentation that has to be done after IPsec. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [13/*] [IPV4] Fix room calculation in icmp_send 2005-03-14 10:53 ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu 2005-03-14 11:10 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu @ 2005-03-15 5:26 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-15 5:26 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Mon, 14 Mar 2005 21:53:13 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > I'm now cleaning up all users of dst_pmtu with the aim of replacing > dst_pmtu with dst_mtu. I'm going to start with the ones that actually > fix bugs. > > This patch fixes the length calculation in icmp_send. As it is > we're overestimating the space available by including the space > that would be used up by IPsec encapsulation. > > IPv6 doesn't have this problem since its calculation is based > on 1280 instead of the PMTU. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [12/*] [IPSEC] Handle local_df in IPv4 2005-03-14 10:26 ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu 2005-03-14 10:53 ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu @ 2005-03-15 5:25 ` David S. Miller 2005-03-15 18:25 ` YOSHIFUJI Hideaki / 吉藤英明 1 sibling, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-15 5:25 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, yoshfuji, kaber, netdev On Mon, 14 Mar 2005 21:26:14 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > When cleaning up the remaining users of dst_pmtu I noticed that > local_df wasn't being treated correctly in IPsec. In fact, if > you socket's dst went over IPsec, local_df is essentailly ignored. > > This patch fixes that. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. > I was going to do the same thing to IPv6. Unfortunately it seems > that we don't have any local_df support over there. That is, we > always fragment outbound UDP/raw packets. Did I miss something? I have no idea offhand. Perhaps Yoshifuji can figure it out? ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [12/*] [IPSEC] Handle local_df in IPv4 2005-03-15 5:25 ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller @ 2005-03-15 18:25 ` YOSHIFUJI Hideaki / 吉藤英明 2005-03-15 18:28 ` YOSHIFUJI Hideaki / 吉藤英明 0 siblings, 1 reply; 114+ messages in thread From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-15 18:25 UTC (permalink / raw) To: davem, herbert; +Cc: kuznet, kaber, netdev, yoshfuji In article <20050314212557.23f0ab09.davem@davemloft.net> (at Mon, 14 Mar 2005 21:25:57 -0800), "David S. Miller" <davem@davemloft.net> says: > > I was going to do the same thing to IPv6. Unfortunately it seems > > that we don't have any local_df support over there. That is, we > > always fragment outbound UDP/raw packets. Did I miss something? > > I have no idea offhand. Perhaps Yoshifuji can figure it out? Currently, yes. And, I know this is one of places we may need to revisit... --yoshfuji ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [12/*] [IPSEC] Handle local_df in IPv4 2005-03-15 18:25 ` YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-15 18:28 ` YOSHIFUJI Hideaki / 吉藤英明 0 siblings, 0 replies; 114+ messages in thread From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-03-15 18:28 UTC (permalink / raw) To: davem, herbert; +Cc: kuznet, kaber, netdev, yoshfuji In article <20050316.032537.72822422.yoshfuji@linux-ipv6.org> (at Wed, 16 Mar 2005 03:25:37 +0900 (JST)), YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org> says: > > > that we don't have any local_df support over there. That is, we > > > always fragment outbound UDP/raw packets. Did I miss something? > > > > I have no idea offhand. Perhaps Yoshifuji can figure it out? > > Currently, yes. I mean: we always fragment outbount UDP/raw packets. --yoshfuji ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [4/4] [IPSEC] Store MTU at each xfrm_dst 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu ` (2 preceding siblings ...) 2005-03-07 5:32 ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller @ 2005-03-28 20:10 ` Patrick McHardy 2005-03-28 23:30 ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu 2005-03-28 23:39 ` Checking SPI in xfrm_state_find Herbert Xu 3 siblings, 2 replies; 114+ messages in thread From: Patrick McHardy @ 2005-03-28 20:10 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev Hi Herbert, Herbert Xu wrote: > Finally this is the patch that sets the MTU values of each xfrm_dst > and keeps it up-to-date. while looking for ways to exchange the xfrm pointer of a live dst_entry I noticed what looks like an ABBA deadlock introduced by the call to xfrm_state_mtu() in xfrm_bundle_ok(). spin_lock x->lock (xfrm_state_delete) read_lock xfrm_policy_lock (xfrm_prune_bundles) write_lock policy->lock (xfrm_prune_bundles) read/write_lock policy->lock (__xfrm46_find_acq/xfrm_lookup) spin_lock x->lock (xfrm_state_mtu) I haven't though of a way to avoid this yet. It would be nice though if we could keep the rule that xfrm_policy_lock and policy->lock nest with x->lock. Something unrelated I was also wondering about, from xfrm_find_state(): list_for_each_entry(x, xfrm_state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == tmpl->reqid && xfrm_state_addr_check(x, daddr, saddr, family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto) { Shouldn't we check for (tmpl->id.spi == x->id.spi || !tmpl->id.spi) ? Regards Patrick > > -int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family) > +int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family) > { > - struct dst_entry *dst = &xdst->u.dst; > + struct dst_entry *dst = &first->u.dst; > + struct xfrm_dst *last; > + u32 mtu; > > if (dst->path->obsolete > 0 || > (dst->dev && !netif_running(dst->dev))) > return 0; > > + last = NULL; > + > do { > + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; > + > if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family)) > return 0; > if (dst->xfrm->km.state != XFRM_STATE_VALID) > return 0; > + > + mtu = dst_pmtu(dst->child); > + if (xdst->child_mtu_cached != mtu) { > + last = xdst; > + xdst->child_mtu_cached = mtu; > + } > + > + mtu = dst_pmtu(xdst->route); > + if (xdst->child_mtu_cached != mtu) { > + last = xdst; > + xdst->route_mtu_cached = mtu; > + } > + > dst = dst->child; > } while (dst->xfrm); > + > + if (likely(!last)) > + return 1; > + > + mtu = last->child_mtu_cached; > + for (;;) { > + dst = &last->u.dst; > + > + mtu = xfrm_state_mtu(dst->xfrm, mtu); > + if (mtu > last->route_mtu_cached) > + mtu = last->route_mtu_cached; > + dst->metrics[RTAX_MTU-1] = mtu; > + > + if (last == first) > + break; > + > + last = last->u.next; > + last->child_mtu_cached = mtu; > + } > > return 1; > } ^ permalink raw reply [flat|nested] 114+ messages in thread
* [IPSEC] Move xfrm_flush_bundles into xfrm_state GC 2005-03-28 20:10 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy @ 2005-03-28 23:30 ` Herbert Xu 2005-03-31 0:10 ` Patrick McHardy 2005-04-01 5:21 ` David S. Miller 2005-03-28 23:39 ` Checking SPI in xfrm_state_find Herbert Xu 1 sibling, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-28 23:30 UTC (permalink / raw) To: Patrick McHardy Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 1149 bytes --] Hi Patrick: On Mon, Mar 28, 2005 at 10:10:54PM +0200, Patrick McHardy wrote: > > I haven't though of a way to avoid this yet. It would be nice though > if we could keep the rule that xfrm_policy_lock and policy->lock nest > with x->lock. Thanks for catching this. The locking in xfrm_state/xfrm_policy has always struck me as being an overkill. A lot of the locks should be replaced by rules that ensure the validity of most operations while a ref count is held. Now I have an excuse to do just that :) For 2.6.12 let's go for a simpler fix that breaks the dead lock. __xfrm_state_delete does not need to flush the bundles immediately. In fact, it is more efficient if we delay the flush to the GC worker since the flush is not dependent on any particular xfrm state. By delaying it we can do one single flush even when you're deleteing the entire xfrm state list. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: p --] [-- Type: text/plain, Size: 1269 bytes --] ===== net/xfrm/xfrm_state.c 1.56 vs edited ===== --- 1.56/net/xfrm/xfrm_state.c 2005-03-11 13:39:54 +11:00 +++ edited/net/xfrm/xfrm_state.c 2005-03-29 09:16:13 +10:00 @@ -48,6 +48,8 @@ static struct list_head xfrm_state_gc_list = LIST_HEAD_INIT(xfrm_state_gc_list); static DEFINE_SPINLOCK(xfrm_state_gc_lock); +static int xfrm_state_gc_flush_bundles; + static void __xfrm_state_delete(struct xfrm_state *x); static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family); @@ -81,6 +83,11 @@ struct list_head *entry, *tmp; struct list_head gc_list = LIST_HEAD_INIT(gc_list); + if (xfrm_state_gc_flush_bundles) { + xfrm_state_gc_flush_bundles = 0; + xfrm_flush_bundles(); + } + spin_lock_bh(&xfrm_state_gc_lock); list_splice_init(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); @@ -228,8 +235,10 @@ * our caller holds. A larger value means that * there are DSTs attached to this xfrm_state. */ - if (atomic_read(&x->refcnt) > 2) - xfrm_flush_bundles(); + if (atomic_read(&x->refcnt) > 2) { + xfrm_state_gc_flush_bundles = 1; + schedule_work(&xfrm_state_gc_work); + } /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC] Move xfrm_flush_bundles into xfrm_state GC 2005-03-28 23:30 ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu @ 2005-03-31 0:10 ` Patrick McHardy 2005-04-01 5:21 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: Patrick McHardy @ 2005-03-31 0:10 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev Herbert Xu wrote: > The locking in xfrm_state/xfrm_policy has always struck me as being > an overkill. A lot of the locks should be replaced by rules that > ensure the validity of most operations while a ref count is held. > Now I have an excuse to do just that :) I agree, it really looks like slight overkill. > For 2.6.12 let's go for a simpler fix that breaks the dead lock. > > __xfrm_state_delete does not need to flush the bundles immediately. > In fact, it is more efficient if we delay the flush to the GC worker > since the flush is not dependent on any particular xfrm state. By > delaying it we can do one single flush even when you're deleteing > the entire xfrm state list. Looks good to me. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC] Move xfrm_flush_bundles into xfrm_state GC 2005-03-28 23:30 ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu 2005-03-31 0:10 ` Patrick McHardy @ 2005-04-01 5:21 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-04-01 5:21 UTC (permalink / raw) To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev On Tue, 29 Mar 2005 09:30:32 +1000 Herbert Xu <herbert@gondor.apana.org.au> wrote: > The locking in xfrm_state/xfrm_policy has always struck me as being > an overkill. A lot of the locks should be replaced by rules that > ensure the validity of most operations while a ref count is held. > Now I have an excuse to do just that :) > > For 2.6.12 let's go for a simpler fix that breaks the dead lock. > > __xfrm_state_delete does not need to flush the bundles immediately. > In fact, it is more efficient if we delay the flush to the GC worker > since the flush is not dependent on any particular xfrm state. By > delaying it we can do one single flush even when you're deleteing > the entire xfrm state list. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Looks good, patch applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Checking SPI in xfrm_state_find 2005-03-28 20:10 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy 2005-03-28 23:30 ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu @ 2005-03-28 23:39 ` Herbert Xu 2005-03-31 0:13 ` Patrick McHardy 1 sibling, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-28 23:39 UTC (permalink / raw) To: Patrick McHardy Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev On Mon, Mar 28, 2005 at 10:10:54PM +0200, Patrick McHardy wrote: > > Something unrelated I was also wondering about, from xfrm_find_state(): > > list_for_each_entry(x, xfrm_state_bydst+h, bydst) { > if (x->props.family == family && > x->props.reqid == tmpl->reqid && > xfrm_state_addr_check(x, daddr, saddr, family) && > tmpl->mode == x->props.mode && > tmpl->id.proto == x->id.proto) { > > Shouldn't we check for (tmpl->id.spi == x->id.spi || !tmpl->id.spi) ? Absolutely. We should also fix the larval state generation in that same function to fail the operation if that SPI already exists. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Checking SPI in xfrm_state_find 2005-03-28 23:39 ` Checking SPI in xfrm_state_find Herbert Xu @ 2005-03-31 0:13 ` Patrick McHardy 2005-03-31 0:46 ` Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-31 0:13 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev [-- Attachment #1: Type: text/plain, Size: 788 bytes --] Herbert Xu wrote: > On Mon, Mar 28, 2005 at 10:10:54PM +0200, Patrick McHardy wrote: > >>Something unrelated I was also wondering about, from xfrm_find_state(): >> >> list_for_each_entry(x, xfrm_state_bydst+h, bydst) { >> if (x->props.family == family && >> x->props.reqid == tmpl->reqid && >> xfrm_state_addr_check(x, daddr, saddr, family) && >> tmpl->mode == x->props.mode && >> tmpl->id.proto == x->id.proto) { >> >>Shouldn't we check for (tmpl->id.spi == x->id.spi || !tmpl->id.spi) ? > > > Absolutely. We should also fix the larval state generation in that > same function to fail the operation if that SPI already exists. Thanks, both done by these two patches. Regards Patrick [-- Attachment #2: x1 --] [-- Type: text/plain, Size: 964 bytes --] # This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2005/03/30 06:02:45+02:00 kaber@coreworks.de # [IPSEC]: Check SPI in xfrm_state_find() # # Signed-off-by: Patrick McHardy <kaber@trash.net> # # net/xfrm/xfrm_state.c # 2005/03/30 06:02:36+02:00 kaber@coreworks.de +2 -1 # [IPSEC]: Check SPI in xfrm_state_find() # # Signed-off-by: Patrick McHardy <kaber@trash.net> # diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c --- a/net/xfrm/xfrm_state.c 2005-03-31 02:12:12 +02:00 +++ b/net/xfrm/xfrm_state.c 2005-03-31 02:12:12 +02:00 @@ -306,7 +306,8 @@ x->props.reqid == tmpl->reqid && xfrm_state_addr_check(x, daddr, saddr, family) && tmpl->mode == x->props.mode && - tmpl->id.proto == x->id.proto) { + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) { /* Resolution logic: 1. There is a valid state with matching selector. Done. [-- Attachment #3: x2 --] [-- Type: text/plain, Size: 2235 bytes --] # This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2005/03/31 02:07:54+02:00 kaber@coreworks.de # [IPSEC]: Check if SPI exists before creating acquire state # # Signed-off-by: Patrick McHardy <kaber@trash.net> # # net/xfrm/xfrm_state.c # 2005/03/31 02:07:42+02:00 kaber@coreworks.de +25 -7 # [IPSEC]: Check if SPI exists before creating acquire state # # Signed-off-by: Patrick McHardy <kaber@trash.net> # diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c --- a/net/xfrm/xfrm_state.c 2005-03-31 02:12:57 +02:00 +++ b/net/xfrm/xfrm_state.c 2005-03-31 02:12:57 +02:00 @@ -295,10 +295,17 @@ unsigned short family) { unsigned h = xfrm_dst_hash(daddr, family); - struct xfrm_state *x; + struct xfrm_state *x, *x0; int acquire_in_progress = 0; int error = 0; struct xfrm_state *best = NULL; + struct xfrm_state_afinfo *afinfo; + + afinfo = xfrm_state_get_afinfo(family); + if (afinfo == NULL) { + *err = -EAFNOSUPPORT; + return NULL; + } spin_lock_bh(&xfrm_state_lock); list_for_each_entry(x, xfrm_state_bydst+h, bydst) { @@ -334,14 +341,24 @@ } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { if (xfrm_selector_match(&x->sel, fl, family)) - error = 1; + error = -ESRCH; } } } x = best; - if (!x && !error && !acquire_in_progress && - ((x = xfrm_state_alloc()) != NULL)) { + if (!x && !error && !acquire_in_progress) { + x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto); + if (x0 != NULL) { + xfrm_state_put(x0); + error = -EEXIST; + goto out; + } + x = xfrm_state_alloc(); + if (x == NULL) { + error = -ENOMEM; + goto out; + } /* Initialize temporary selector matching only * to current session. */ xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); @@ -363,15 +380,16 @@ x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); x = NULL; - error = 1; + error = -ESRCH; } } +out: if (x) xfrm_state_hold(x); else - *err = acquire_in_progress ? -EAGAIN : - (error ? -ESRCH : -ENOMEM); + *err = acquire_in_progress ? -EAGAIN : error; spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); return x; } ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Checking SPI in xfrm_state_find 2005-03-31 0:13 ` Patrick McHardy @ 2005-03-31 0:46 ` Herbert Xu 2005-04-01 5:23 ` David S. Miller 2005-04-03 17:00 ` Checking SPI in xfrm_state_find Patrick McHardy 0 siblings, 2 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-31 0:46 UTC (permalink / raw) To: Patrick McHardy Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev On Thu, Mar 31, 2005 at 02:13:54AM +0200, Patrick McHardy wrote: > > Thanks, both done by these two patches. Great. > # This is a BitKeeper generated diff -Nru style patch. > # > # ChangeSet > # 2005/03/30 06:02:45+02:00 kaber@coreworks.de > # [IPSEC]: Check SPI in xfrm_state_find() > # > # Signed-off-by: Patrick McHardy <kaber@trash.net> Looks good. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > # This is a BitKeeper generated diff -Nru style patch. > # > # ChangeSet > # 2005/03/31 02:07:54+02:00 kaber@coreworks.de > # [IPSEC]: Check if SPI exists before creating acquire state > # > # Signed-off-by: Patrick McHardy <kaber@trash.net> > > x = best; > - if (!x && !error && !acquire_in_progress && > - ((x = xfrm_state_alloc()) != NULL)) { > + if (!x && !error && !acquire_in_progress) { > + x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto); It just occured to me that it would be much simpler if you did the existence check in the first loop. So something like if (x->props.family != family || !xfrm_state_addr_check(x, daddr, saddr, family) || tmpl->id.proto == x->id.proto) continue; if (tmpl->id.spi) { if (tmpl->id.spi != x->id.spi) continue; error = -EEXIST; } if (x->props.reqid == tmpl->reqid && tmpl->mode == x->props.mode) { } Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Checking SPI in xfrm_state_find 2005-03-31 0:46 ` Herbert Xu @ 2005-04-01 5:23 ` David S. Miller 2005-04-02 0:49 ` [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel Herbert Xu 2005-04-03 17:00 ` Checking SPI in xfrm_state_find Patrick McHardy 1 sibling, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-04-01 5:23 UTC (permalink / raw) To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev On Thu, 31 Mar 2005 10:46:58 +1000 Herbert Xu <herbert@gondor.apana.org.au> wrote: > > # This is a BitKeeper generated diff -Nru style patch. > > # > > # ChangeSet > > # 2005/03/30 06:02:45+02:00 kaber@coreworks.de > > # [IPSEC]: Check SPI in xfrm_state_find() > > # > > # Signed-off-by: Patrick McHardy <kaber@trash.net> > > Looks good. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> To me too, both patches applied, thanks Patrick. ^ permalink raw reply [flat|nested] 114+ messages in thread
* [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-01 5:23 ` David S. Miller @ 2005-04-02 0:49 ` Herbert Xu 2005-04-02 1:20 ` David S. Miller 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-04-02 0:49 UTC (permalink / raw) To: David S. Miller; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 1336 bytes --] Hi Dave: On Thu, Mar 31, 2005 at 09:23:25PM -0800, David S. Miller wrote: > On Thu, 31 Mar 2005 10:46:58 +1000 > Herbert Xu <herbert@gondor.apana.org.au> wrote: > > > > # This is a BitKeeper generated diff -Nru style patch. > > > # > > > # ChangeSet > > > # 2005/03/30 06:02:45+02:00 kaber@coreworks.de > > > # [IPSEC]: Check SPI in xfrm_state_find() > > > # > > > # Signed-off-by: Patrick McHardy <kaber@trash.net> > > > > Looks good. > > > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > > To me too, both patches applied, thanks Patrick. Actually I only signed off on the first patch :) The second patch creates a dead lock since it does a nested read lock. The solution is simply to get rid of xfrm_init_tempsel and call the afinfo version directly. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> BTW I'd like to start cleaning up the locking in net/xfrm. I don't want these changes to go into 2.6.12. However, I'd like to have them sit in mm for a while so that they get some testing coverage. What's the best way to do this? Could you create a tree slated for 2.6.13? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: p --] [-- Type: text/plain, Size: 1024 bytes --] ===== net/xfrm/xfrm_state.c 1.60 vs edited ===== --- 1.60/net/xfrm/xfrm_state.c 2005-04-01 15:19:54 +10:00 +++ edited/net/xfrm/xfrm_state.c 2005-04-02 10:35:06 +10:00 @@ -283,20 +283,6 @@ } EXPORT_SYMBOL(xfrm_state_flush); -static int -xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr, - unsigned short family) -{ - struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); - if (!afinfo) - return -1; - afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); - xfrm_state_put_afinfo(afinfo); - return 0; -} - struct xfrm_state * xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct flowi *fl, struct xfrm_tmpl *tmpl, @@ -370,7 +356,7 @@ } /* Initialize temporary selector matching only * to current session. */ - xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); + afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); if (km_query(x, tmpl, pol) == 0) { x->km.state = XFRM_STATE_ACQ; ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-02 0:49 ` [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel Herbert Xu @ 2005-04-02 1:20 ` David S. Miller 2005-04-02 2:09 ` Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-04-02 1:20 UTC (permalink / raw) To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev On Sat, 2 Apr 2005 10:49:56 +1000 Herbert Xu <herbert@gondor.apana.org.au> wrote: > The second patch creates a dead lock since it does a nested read > lock. The solution is simply to get rid of xfrm_init_tempsel > and call the afinfo version directly. read locks nest even in the presence of pending writers ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-02 1:20 ` David S. Miller @ 2005-04-02 2:09 ` Herbert Xu 2005-04-03 16:48 ` Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-04-02 2:09 UTC (permalink / raw) To: David S. Miller; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev On Fri, Apr 01, 2005 at 05:20:07PM -0800, David S. Miller wrote: > On Sat, 2 Apr 2005 10:49:56 +1000 > Herbert Xu <herbert@gondor.apana.org.au> wrote: > > > The second patch creates a dead lock since it does a nested read > > lock. The solution is simply to get rid of xfrm_init_tempsel > > and call the afinfo version directly. > > read locks nest even in the presence of pending writers Doh! I should've read the code first :) It's still a valid clean-up patch though. There is another reason why it won't dead lock. We don't actually ever hold the write lock on afinfo :) Is there any reason why we dont't just use xfrm_state_afinfo_lock instead of afinfo->lock? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-02 2:09 ` Herbert Xu @ 2005-04-03 16:48 ` Patrick McHardy 2005-04-05 10:39 ` Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-04-03 16:48 UTC (permalink / raw) To: Herbert Xu; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 779 bytes --] Herbert Xu wrote: > It's still a valid clean-up patch though. Agreed. There is also a bug in my patch, tmpl->daddr can be 0 in which case the daddr passed as an argument to xfrm_state_find() will be used. My patch only checked tmpl->daddr, this patch fixes it. It also uses afinfo->init_tempsel directly, but I didn't kill xfrm_init_tempsel() yet because I need it for xfrm resolution. > There is another reason why it won't dead lock. We don't actually > ever hold the write lock on afinfo :) Is there any reason why we > dont't just use xfrm_state_afinfo_lock instead of afinfo->lock? I don't think so. I also don't see a reason why the lock needs to be held between xfrm_state_get_afinfo() and xfrm_state_put_afinfo(), a reference count should be enough. Regards Patrick [-- Attachment #2: x --] [-- Type: text/plain, Size: 1698 bytes --] # This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2005/04/03 18:41:22+02:00 kaber@coreworks.de # [IPSEC]: Use correct daddr for duplicate state check # # Signed-off-by: Patrick McHardy <kaber@trash.net> # # net/xfrm/xfrm_state.c # 2005/04/03 18:41:14+02:00 kaber@coreworks.de +9 -9 # [IPSEC]: Use correct daddr for duplicate state check # # Signed-off-by: Patrick McHardy <kaber@trash.net> # diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c --- a/net/xfrm/xfrm_state.c 2005-04-03 18:41:41 +02:00 +++ b/net/xfrm/xfrm_state.c 2005-04-03 18:41:41 +02:00 @@ -357,12 +357,6 @@ x = best; if (!x && !error && !acquire_in_progress) { - x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto); - if (x0 != NULL) { - xfrm_state_put(x0); - error = -EEXIST; - goto out; - } x = xfrm_state_alloc(); if (x == NULL) { error = -ENOMEM; @@ -370,9 +364,11 @@ } /* Initialize temporary selector matching only * to current session. */ - xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); + afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); + + x0 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto); - if (km_query(x, tmpl, pol) == 0) { + if (!x0 && km_query(x, tmpl, pol) == 0) { x->km.state = XFRM_STATE_ACQ; list_add_tail(&x->bydst, xfrm_state_bydst+h); xfrm_state_hold(x); @@ -386,10 +382,14 @@ x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; add_timer(&x->timer); } else { + error = -ESRCH; + if (x0) { + xfrm_state_put(x0); + error = -EEXIST; + } x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); x = NULL; - error = -ESRCH; } } out: ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-03 16:48 ` Patrick McHardy @ 2005-04-05 10:39 ` Herbert Xu 2005-04-05 20:01 ` Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-04-05 10:39 UTC (permalink / raw) To: Patrick McHardy; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev On Sun, Apr 03, 2005 at 06:48:17PM +0200, Patrick McHardy wrote: > > Agreed. There is also a bug in my patch, tmpl->daddr can be 0 in which > case the daddr passed as an argument to xfrm_state_find() will be used. > My patch only checked tmpl->daddr, this patch fixes it. It also uses Why not just use daddr? It's always guaranteed to be correct. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-05 10:39 ` Herbert Xu @ 2005-04-05 20:01 ` Patrick McHardy 2005-04-06 2:21 ` Herbert Xu 0 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-04-05 20:01 UTC (permalink / raw) To: Herbert Xu; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 581 bytes --] Herbert Xu wrote: > On Sun, Apr 03, 2005 at 06:48:17PM +0200, Patrick McHardy wrote: > >>Agreed. There is also a bug in my patch, tmpl->daddr can be 0 in which >>case the daddr passed as an argument to xfrm_state_find() will be used. >>My patch only checked tmpl->daddr, this patch fixes it. It also uses > > > Why not just use daddr? It's always guaranteed to be correct. Good idea, I didn't think of this. This patch is without replacement of xfrm_init_tempsel(), it caused an "unused function" warning and, as I mentioned before, I still need the function. Regards Patrick [-- Attachment #2: x --] [-- Type: text/plain, Size: 457 bytes --] ===== net/xfrm/xfrm_state.c 1.60 vs edited ===== --- 1.60/net/xfrm/xfrm_state.c 2005-04-01 07:19:54 +02:00 +++ edited/net/xfrm/xfrm_state.c 2005-04-05 21:58:08 +02:00 @@ -357,7 +357,7 @@ x = best; if (!x && !error && !acquire_in_progress) { - x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto); + x0 = afinfo->state_lookup(daddr, tmpl->id.spi, tmpl->id.proto); if (x0 != NULL) { xfrm_state_put(x0); error = -EEXIST; ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-05 20:01 ` Patrick McHardy @ 2005-04-06 2:21 ` Herbert Xu 2005-04-21 23:35 ` David S. Miller 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-04-06 2:21 UTC (permalink / raw) To: Patrick McHardy; +Cc: David S. Miller, kuznet, jmorris, yoshfuji, netdev On Tue, Apr 05, 2005 at 10:01:38PM +0200, Patrick McHardy wrote: > > Good idea, I didn't think of this. This patch is without replacement of > xfrm_init_tempsel(), it caused an "unused function" warning and, as I > mentioned before, I still need the function. Thanks. Just one more issue that I can think of, the check should only be done when tmpl->id.spi != 0. Otherwise the presence of valid states with differing state selectors will prevent new sessions from starting up. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-06 2:21 ` Herbert Xu @ 2005-04-21 23:35 ` David S. Miller 2005-04-21 23:52 ` Herbert Xu 2005-04-21 23:53 ` Patrick McHardy 0 siblings, 2 replies; 114+ messages in thread From: David S. Miller @ 2005-04-21 23:35 UTC (permalink / raw) To: Herbert Xu; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev On Wed, 6 Apr 2005 12:21:55 +1000 Herbert Xu <herbert@gondor.apana.org.au> wrote: > On Tue, Apr 05, 2005 at 10:01:38PM +0200, Patrick McHardy wrote: > > > > Good idea, I didn't think of this. This patch is without replacement of > > xfrm_init_tempsel(), it caused an "unused function" warning and, as I > > mentioned before, I still need the function. > > Thanks. Just one more issue that I can think of, the check should > only be done when tmpl->id.spi != 0. Otherwise the presence of > valid states with differing state selectors will prevent new > sessions from starting up. Is it really worthwhile, right now, to change that tmpl->id.daddr to daddr? That seems to be all that Patrick's most recent patch does. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-21 23:35 ` David S. Miller @ 2005-04-21 23:52 ` Herbert Xu 2005-04-21 23:53 ` Patrick McHardy 1 sibling, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-04-21 23:52 UTC (permalink / raw) To: David S. Miller; +Cc: kaber, kuznet, jmorris, yoshfuji, netdev On Thu, Apr 21, 2005 at 04:35:26PM -0700, David S. Miller wrote: > > > Thanks. Just one more issue that I can think of, the check should > > only be done when tmpl->id.spi != 0. Otherwise the presence of > > valid states with differing state selectors will prevent new > > sessions from starting up. > > Is it really worthwhile, right now, to change that tmpl->id.daddr to > daddr? That seems to be all that Patrick's most recent patch does. Yes, because tmpl->id.daddr can be zero when daddr is not zero. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-21 23:35 ` David S. Miller 2005-04-21 23:52 ` Herbert Xu @ 2005-04-21 23:53 ` Patrick McHardy 2005-04-22 3:13 ` David S. Miller 1 sibling, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-04-21 23:53 UTC (permalink / raw) To: David S. Miller; +Cc: Herbert Xu, kuznet, jmorris, yoshfuji, netdev [-- Attachment #1: Type: text/plain, Size: 844 bytes --] David S. Miller wrote: > On Wed, 6 Apr 2005 12:21:55 +1000 > Herbert Xu <herbert@gondor.apana.org.au> wrote: > >>Thanks. Just one more issue that I can think of, the check should >>only be done when tmpl->id.spi != 0. Otherwise the presence of >>valid states with differing state selectors will prevent new >>sessions from starting up. > > Is it really worthwhile, right now, to change that tmpl->id.daddr to > daddr? That seems to be all that Patrick's most recent patch does. Yes, tmpl->id.daddr might be 0, in which case the destination of the packet or previous tunnel mode transforms is used. daddr always contains the correct adress, so we should use it to check for duplicate SPIs. But as Herbert noted, we shouldn't perform the check if tmpl->id.spi == 0, so here is a new patch. Signed-off-by: Patrick McHardy <kaber@trash.net> [-- Attachment #2: x --] [-- Type: text/plain, Size: 544 bytes --] ===== net/xfrm/xfrm_state.c 1.60 vs edited ===== --- 1.60/net/xfrm/xfrm_state.c 2005-04-01 07:19:54 +02:00 +++ edited/net/xfrm/xfrm_state.c 2005-04-22 01:51:37 +02:00 @@ -357,8 +357,9 @@ x = best; if (!x && !error && !acquire_in_progress) { - x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto); - if (x0 != NULL) { + if (tmpl->id.spi && + (x0 = afinfo->state_lookup(daddr, tmpl->id.spi, + tmpl->id.proto)) != NULL) { xfrm_state_put(x0); error = -EEXIST; goto out; ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel 2005-04-21 23:53 ` Patrick McHardy @ 2005-04-22 3:13 ` David S. Miller 0 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-04-22 3:13 UTC (permalink / raw) To: Patrick McHardy; +Cc: herbert, kuznet, jmorris, yoshfuji, netdev On Fri, 22 Apr 2005 01:53:43 +0200 Patrick McHardy <kaber@trash.net> wrote: > Yes, tmpl->id.daddr might be 0, in which case the destination > of the packet or previous tunnel mode transforms is used. daddr > always contains the correct adress, so we should use it to check > for duplicate SPIs. But as Herbert noted, we shouldn't perform > the check if tmpl->id.spi == 0, so here is a new patch. > > Signed-off-by: Patrick McHardy <kaber@trash.net> Ok, that makes sense. Patch applied, thanks guys. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: Checking SPI in xfrm_state_find 2005-03-31 0:46 ` Herbert Xu 2005-04-01 5:23 ` David S. Miller @ 2005-04-03 17:00 ` Patrick McHardy 1 sibling, 0 replies; 114+ messages in thread From: Patrick McHardy @ 2005-04-03 17:00 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev Herbert Xu wrote: > It just occured to me that it would be much simpler if you did the > existence check in the first loop. > > So something like > > if (x->props.family != family || > !xfrm_state_addr_check(x, daddr, saddr, family) || > tmpl->id.proto == x->id.proto) > continue; > if (tmpl->id.spi) { > if (tmpl->id.spi != x->id.spi) > continue; > error = -EEXIST; > } > if (x->props.reqid == tmpl->reqid && > tmpl->mode == x->props.mode) { > } You're right, sorry for getting back to you so late. But since its already in now and not very important, I'm going to leave it until I have a better reason to touch that code, if you're ok with that. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu @ 2005-02-15 8:10 ` Mika Penttilä 2005-02-15 9:53 ` Herbert Xu 2005-03-07 5:28 ` David S. Miller ` (2 subsequent siblings) 4 siblings, 1 reply; 114+ messages in thread From: Mika Penttilä @ 2005-02-15 8:10 UTC (permalink / raw) To: Herbert Xu; +Cc: netdev Herbert Xu wrote: >This patch adds a pointer to the route corresponding to the specific >flow over the SA of an xfrm_dst that's being used. > >It also sets the next pointer of each xfrm_dst to the one above it. >This allows to traverse the list upwards from the bottom. > >Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > >Cheers, > > >------------------------------------------------------------------------ > /* Decapsulation state, used by the input to store data during >===== net/ipv4/xfrm4_policy.c 1.14 vs edited ===== >--- 1.14/net/ipv4/xfrm4_policy.c 2005-02-14 14:02:13 +11:00 >+++ edited/net/ipv4/xfrm4_policy.c 2005-02-14 14:29:35 +11:00 >@@ -55,18 +55,29 @@ > struct rtable *rt = rt0; > u32 remote = fl->fl4_dst; > u32 local = fl->fl4_src; >+ struct flowi fl_tunnel = { >+ .nl_u = { >+ .ip4_u = { >+ .saddr = local, >+ .daddr = remote >+ } >+ } >+ }; > int i; > int err; > int header_len = 0; > int trailer_len = 0; > > dst = dst_prev = NULL; > > Shouldn't this pinning happen inside the loop for eaxh xdst, to be balanced with the dst_release(xdst->route) in xfrm_dst_destroy ? >+ dst_hold(&rt->u.dst); > > for (i = 0; i < nx; i++) { > struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); >+ struct xfrm_dst *xdst; > > if (unlikely(dst1 == NULL)) { > err = -ENOBUFS; > > ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-02-15 8:10 ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä @ 2005-02-15 9:53 ` Herbert Xu 2005-02-15 10:22 ` Mika Penttilä 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-02-15 9:53 UTC (permalink / raw) To: Mika Penttil?; +Cc: netdev On Tue, Feb 15, 2005 at 10:10:06AM +0200, Mika Penttil? wrote: > > Shouldn't this pinning happen inside the loop for eaxh xdst, to be > balanced with the dst_release(xdst->route) in xfrm_dst_destroy ? > > >+ dst_hold(&rt->u.dst); > > > > for (i = 0; i < nx; i++) { > > struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); > >+ struct xfrm_dst *xdst; > > > > if (unlikely(dst1 == NULL)) { > > err = -ENOBUFS; There is a dst_release here. The reference is either released here if the allocation fails, or it is held by dst1 which is either returned or freed as part of the bundle. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-02-15 9:53 ` Herbert Xu @ 2005-02-15 10:22 ` Mika Penttilä 0 siblings, 0 replies; 114+ messages in thread From: Mika Penttilä @ 2005-02-15 10:22 UTC (permalink / raw) To: Herbert Xu; +Cc: netdev Herbert Xu wrote: >On Tue, Feb 15, 2005 at 10:10:06AM +0200, Mika Penttil? wrote: > > >>Shouldn't this pinning happen inside the loop for eaxh xdst, to be >>balanced with the dst_release(xdst->route) in xfrm_dst_destroy ? >> >> >> >>>+ dst_hold(&rt->u.dst); >>> >>> for (i = 0; i < nx; i++) { >>> struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); >>>+ struct xfrm_dst *xdst; >>> >>> if (unlikely(dst1 == NULL)) { >>> err = -ENOBUFS; >>> >>> > >There is a dst_release here. > >The reference is either released here if the allocation fails, or it >is held by dst1 which is either returned or freed as part of the bundle. > >Cheers, > > ok I see now, thanks Mika ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu 2005-02-15 8:10 ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä @ 2005-03-07 5:28 ` David S. Miller 2005-03-07 10:02 ` Herbert Xu 2005-03-07 10:16 ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu 2005-03-14 11:52 ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy 4 siblings, 1 reply; 114+ messages in thread From: David S. Miller @ 2005-03-07 5:28 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Tue, 15 Feb 2005 09:14:33 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This patch adds a pointer to the route corresponding to the specific > flow over the SA of an xfrm_dst that's being used. > > It also sets the next pointer of each xfrm_dst to the one above it. > This allows to traverse the list upwards from the bottom. Applied, thanks Herbert. I guess this back traversal list makes it impossible for us to share xfrm_dst's for multiple unique flows even if that could be possible, right? ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-03-07 5:28 ` David S. Miller @ 2005-03-07 10:02 ` Herbert Xu 0 siblings, 0 replies; 114+ messages in thread From: Herbert Xu @ 2005-03-07 10:02 UTC (permalink / raw) To: David S. Miller; +Cc: kuznet, jmorris, yoshfuji, netdev On Sun, Mar 06, 2005 at 09:28:00PM -0800, David S. Miller wrote: > > I guess this back traversal list makes it impossible for us > to share xfrm_dst's for multiple unique flows even if that > could be possible, right? I used the back traversal only because it was convenient. There is nothing fundamental about it. So if we ever implemented xdst sharing, we could easily modify the couple of functions involved to walk the bundle forward and then in reverse. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu ` (2 preceding siblings ...) 2005-03-07 5:28 ` David S. Miller @ 2005-03-07 10:16 ` Herbert Xu 2005-03-07 17:35 ` David S. Miller 2005-03-14 11:52 ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy 4 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-07 10:16 UTC (permalink / raw) To: David S. Miller, netdev [-- Attachment #1: Type: text/plain, Size: 413 bytes --] Hi Dave: Here's a trivial patch to get rid of a redundant check that I added in patch 3/4. dst_release already checks for dst == NULL. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt [-- Attachment #2: xfrm-8 --] [-- Type: text/plain, Size: 354 bytes --] ===== net/xfrm/xfrm_policy.c 1.69 vs edited ===== --- 1.69/net/xfrm/xfrm_policy.c 2005-03-07 16:31:30 +11:00 +++ edited/net/xfrm/xfrm_policy.c 2005-03-07 21:12:23 +11:00 @@ -1028,8 +1028,7 @@ { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - if (xdst->route) - dst_release(xdst->route); + dst_release(xdst->route); if (!dst->xfrm) return; ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy 2005-03-07 10:16 ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu @ 2005-03-07 17:35 ` David S. Miller 0 siblings, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-07 17:35 UTC (permalink / raw) To: Herbert Xu; +Cc: netdev On Mon, 7 Mar 2005 21:16:57 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > Here's a trivial patch to get rid of a redundant check that I added > in patch 3/4. dst_release already checks for dst == NULL. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu ` (3 preceding siblings ...) 2005-03-07 10:16 ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu @ 2005-03-14 11:52 ` Patrick McHardy 2005-03-14 20:32 ` Herbert Xu 4 siblings, 1 reply; 114+ messages in thread From: Patrick McHardy @ 2005-03-14 11:52 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev Herbert Xu wrote: > This patch adds a pointer to the route corresponding to the specific > flow over the SA of an xfrm_dst that's being used. > > It also sets the next pointer of each xfrm_dst to the one above it. > This allows to traverse the list upwards from the bottom. Looking at this again, I noticed a problem: > + if (remote != fl_tunnel.fl4_dst) { > + fl_tunnel.fl4_src = local; > + fl_tunnel.fl4_dst = remote; > + err = xfrm_dst_lookup((struct xfrm_dst **)&rt, > + &fl_tunnel, AF_INET); > + if (err) > + goto error; > + } else > + dst_hold(&rt->u.dst); > } > + > dst_prev->child = &rt->u.dst; > + dst->path = &rt->u.dst; > + > + *dst_p = dst; > + dst = dst_prev; > + > + dst_prev = *dst_p; > i = 0; > - for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { > + for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { Since the tunnel dst is not necessarily the last in the bundle anymore, we might miss to initialize some dsts, for example with ipcomp/tunnel + esp/transport. If we have nested tunnels we'll fiddle with entries in the routing cache. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-03-14 11:52 ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy @ 2005-03-14 20:32 ` Herbert Xu 2005-03-15 19:05 ` Patrick McHardy 0 siblings, 1 reply; 114+ messages in thread From: Herbert Xu @ 2005-03-14 20:32 UTC (permalink / raw) To: Patrick McHardy Cc: David S. Miller, Alexey Kuznetsov, James Morris, YOSHIFUJI Hideaki, netdev On Mon, Mar 14, 2005 at 12:52:16PM +0100, Patrick McHardy wrote: > > >+ if (remote != fl_tunnel.fl4_dst) { > >+ fl_tunnel.fl4_src = local; > >+ fl_tunnel.fl4_dst = remote; > >+ err = xfrm_dst_lookup((struct xfrm_dst **)&rt, > >+ &fl_tunnel, AF_INET); > >+ if (err) > >+ goto error; > >+ } else > >+ dst_hold(&rt->u.dst); > > } > >+ > > dst_prev->child = &rt->u.dst; > >+ dst->path = &rt->u.dst; > >+ > >+ *dst_p = dst; > >+ dst = dst_prev; > >+ > >+ dst_prev = *dst_p; > > i = 0; > >- for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = > >dst_prev->child) { > >+ for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { > > Since the tunnel dst is not necessarily the last in the bundle anymore, > we might miss to initialize some dsts, for example with ipcomp/tunnel + > esp/transport. If we have nested tunnels we'll fiddle with entries in > the routing cache. Sorry, but I don't get it :) First of all what do you mean by the tunnel dst? If you mean &rt->u.dst then as far as I can see it's still the last child in the bundle. It may also appear in ->route elements earlier on but that does not come into play in this loop. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [3/4] [IPSEC] Add route element to xfrm_dst 2005-03-14 20:32 ` Herbert Xu @ 2005-03-15 19:05 ` Patrick McHardy 0 siblings, 0 replies; 114+ messages in thread From: Patrick McHardy @ 2005-03-15 19:05 UTC (permalink / raw) To: Herbert Xu; +Cc: netdev Herbert Xu wrote: > On Mon, Mar 14, 2005 at 12:52:16PM +0100, Patrick McHardy wrote: > >>Since the tunnel dst is not necessarily the last in the bundle anymore, >>we might miss to initialize some dsts, for example with ipcomp/tunnel + >>esp/transport. If we have nested tunnels we'll fiddle with entries in >>the routing cache. > > > Sorry, but I don't get it :) First of all what do you mean by the > tunnel dst? > > If you mean &rt->u.dst then as far as I can see it's still the last > child in the bundle. It may also appear in ->route elements earlier > on but that does not come into play in this loop. You're right, I must have misread the code somehow. Sorry for the noise. Regards Patrick ^ permalink raw reply [flat|nested] 114+ messages in thread
* Re: [2/4] [IPSEC] Add xfrm_state_mtu 2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu @ 2005-03-07 5:23 ` David S. Miller 1 sibling, 0 replies; 114+ messages in thread From: David S. Miller @ 2005-03-07 5:23 UTC (permalink / raw) To: Herbert Xu; +Cc: kuznet, jmorris, yoshfuji, netdev On Tue, 15 Feb 2005 09:12:00 +1100 Herbert Xu <herbert@gondor.apana.org.au> wrote: > This patch introduces the function xfrm_state_mtu which calculates > the MTU under a specific SA. This function can be simplified once > we get rid all other uses of get_max_size as we can move the calculation > into the invidual SA type. Applied, thanks Herbert. ^ permalink raw reply [flat|nested] 114+ messages in thread
end of thread, other threads:[~2005-04-22 3:13 UTC | newest] Thread overview: 114+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2005-02-14 22:10 [1/4] [IPSEC] Merge xfrm[46]_bundle/stale_bundle Herbert Xu 2005-02-14 22:12 ` [2/4] [IPSEC] Add xfrm_state_mtu Herbert Xu 2005-02-14 22:14 ` [3/4] [IPSEC] Add route element to xfrm_dst Herbert Xu 2005-02-14 22:16 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Herbert Xu 2005-02-15 15:53 ` James Morris 2005-02-15 20:31 ` Herbert Xu 2005-02-16 10:37 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output Herbert Xu 2005-02-16 11:08 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update top dst Herbert Xu 2005-02-16 11:38 ` [7/*] [IPSEC] Get metrics for xfrm_dst from " Herbert Xu 2005-03-07 5:47 ` David S. Miller 2005-03-07 10:41 ` Herbert Xu 2005-03-07 5:35 ` [6/*] [IPSEC] Fix xfrm[46]_update_pmtu to update " David S. Miller 2005-03-07 10:39 ` Herbert Xu 2005-03-07 5:33 ` [5/*] [IPSEC] Use dst_mtu in xfrm[46]_output David S. Miller 2005-03-07 11:45 ` [10/*] [TCP] Get rid of dst_ptmu/ext2_header_len Herbert Xu 2005-03-07 17:33 ` David S. Miller 2005-03-07 5:32 ` [4/4] [IPSEC] Store MTU at each xfrm_dst David S. Miller 2005-03-07 10:35 ` [9/*] [IPSEC] Check dst validity harder in xfrm_bundle_ok Herbert Xu 2005-03-07 17:32 ` David S. Miller 2005-03-08 10:27 ` [11/*] [NET] Move dst_release out of dst->ops->check Herbert Xu 2005-03-08 12:50 ` YOSHIFUJI Hideaki / 吉藤英明 2005-03-11 2:17 ` David S. Miller 2005-03-14 10:26 ` [12/*] [IPSEC] Handle local_df in IPv4 Herbert Xu 2005-03-14 10:53 ` [13/*] [IPV4] Fix room calculation in icmp_send Herbert Xu 2005-03-14 11:10 ` [14/*] [IPV6] Reload skb->dst after xfrm6_route_forward Herbert Xu 2005-03-15 5:27 ` David S. Miller 2005-03-15 9:19 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data Herbert Xu 2005-03-15 9:58 ` [16/*] [INET] Take IPsec overhead into account in tunnels Herbert Xu 2005-03-15 10:05 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu 2005-03-15 18:24 ` David S. Miller 2005-03-15 19:02 ` Patrick McHardy 2005-03-15 20:40 ` Replace send_unreach with icmp_send Herbert Xu 2005-03-15 20:48 ` Patrick McHardy 2005-03-16 10:51 ` [IPV4] Make ipt_REJECT use icmp_send again Herbert Xu 2005-03-16 19:00 ` Patrick McHardy 2005-03-16 22:44 ` David S. Miller 2005-03-17 10:51 ` [IPV4] Send TCP reset through dst_output in ipt_REJECT Herbert Xu 2005-03-17 18:06 ` David S. Miller 2005-03-15 20:31 ` [17/*] [NET] Replace dst_pmtu with dst_mtu Herbert Xu 2005-03-15 10:20 ` [16/*] [INET] Take IPsec overhead into account in tunnels Lennert Buytenhek 2005-03-15 10:27 ` Herbert Xu 2005-03-15 18:20 ` David S. Miller 2005-03-18 9:03 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit Herbert Xu 2005-03-18 9:11 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu 2005-03-18 9:19 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu Herbert Xu 2005-03-18 10:07 ` [24/*] [IPSEC] Get ttl from child instead of path Herbert Xu 2005-03-18 10:11 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric Herbert Xu 2005-03-18 11:06 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric Herbert Xu 2005-03-18 11:28 ` [27/*] [NET] Make dst_allfrag use dst instead of dst->path Herbert Xu 2005-03-18 18:47 ` David S. Miller 2005-03-18 18:46 ` [26/*] [NET] Kill dst_pmtu/dst_path_metric David S. Miller 2005-03-18 18:44 ` [25/*] [NET] Kill unnecessary uses of dst_path_metric David S. Miller 2005-03-18 18:43 ` [24/*] [IPSEC] Get ttl from child instead of path David S. Miller 2005-03-18 18:41 ` [23/*] [IPV4] Kill remaining unnecessary uses of dst_pmtu David S. Miller 2005-03-18 18:40 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller 2005-03-20 15:46 ` Patrick McHardy 2005-03-20 16:32 ` Ludo Stellingwerff 2005-03-20 17:17 ` Lennert Buytenhek 2005-03-20 17:49 ` Patrick McHardy 2005-03-20 18:11 ` Ludo Stellingwerff 2005-03-20 18:22 ` Patrick McHardy 2005-03-20 18:43 ` jamal 2005-03-20 19:10 ` Patrick McHardy 2005-03-30 9:49 ` Extending xfrm_selector (Was: [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS) Herbert Xu 2005-03-23 3:49 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS David S. Miller 2005-03-23 4:03 ` Patrick McHardy 2005-03-24 5:05 ` Netfilter+IPsec Patrick McHardy 2005-03-24 5:43 ` Netfilter+IPsec David S. Miller 2005-03-25 2:53 ` Netfilter+IPsec Herbert Xu 2005-03-25 5:10 ` Netfilter+IPsec Patrick McHardy 2005-03-23 9:24 ` [22/*] [NETFILTER] Use correct IPsec MTU in TCPMSS Herbert Xu 2005-03-18 18:39 ` [21/*] [IPv4] Fix MTU check in ipmr_queue_xmit David S. Miller 2005-03-15 18:18 ` [15/*] [INET] Fix IPsec calculation in ip_append_data/ip6_append_data David S. Miller 2005-03-16 11:31 ` Herbert Xu 2005-03-16 22:02 ` David S. Miller 2005-03-21 16:14 ` Mika Penttilä 2005-03-21 20:28 ` Herbert Xu 2005-03-21 21:29 ` Mika Penttilä 2005-03-21 22:04 ` Herbert Xu 2005-03-15 5:26 ` [13/*] [IPV4] Fix room calculation in icmp_send David S. Miller 2005-03-15 5:25 ` [12/*] [IPSEC] Handle local_df in IPv4 David S. Miller 2005-03-15 18:25 ` YOSHIFUJI Hideaki / 吉藤英明 2005-03-15 18:28 ` YOSHIFUJI Hideaki / 吉藤英明 2005-03-28 20:10 ` [4/4] [IPSEC] Store MTU at each xfrm_dst Patrick McHardy 2005-03-28 23:30 ` [IPSEC] Move xfrm_flush_bundles into xfrm_state GC Herbert Xu 2005-03-31 0:10 ` Patrick McHardy 2005-04-01 5:21 ` David S. Miller 2005-03-28 23:39 ` Checking SPI in xfrm_state_find Herbert Xu 2005-03-31 0:13 ` Patrick McHardy 2005-03-31 0:46 ` Herbert Xu 2005-04-01 5:23 ` David S. Miller 2005-04-02 0:49 ` [IPSEC]: Kill nested read lock by deleting xfrm_init_tempsel Herbert Xu 2005-04-02 1:20 ` David S. Miller 2005-04-02 2:09 ` Herbert Xu 2005-04-03 16:48 ` Patrick McHardy 2005-04-05 10:39 ` Herbert Xu 2005-04-05 20:01 ` Patrick McHardy 2005-04-06 2:21 ` Herbert Xu 2005-04-21 23:35 ` David S. Miller 2005-04-21 23:52 ` Herbert Xu 2005-04-21 23:53 ` Patrick McHardy 2005-04-22 3:13 ` David S. Miller 2005-04-03 17:00 ` Checking SPI in xfrm_state_find Patrick McHardy 2005-02-15 8:10 ` [3/4] [IPSEC] Add route element to xfrm_dst Mika Penttilä 2005-02-15 9:53 ` Herbert Xu 2005-02-15 10:22 ` Mika Penttilä 2005-03-07 5:28 ` David S. Miller 2005-03-07 10:02 ` Herbert Xu 2005-03-07 10:16 ` [IPSEC] Kill redundan dst_release check in xfrm_dst_destroy Herbert Xu 2005-03-07 17:35 ` David S. Miller 2005-03-14 11:52 ` [3/4] [IPSEC] Add route element to xfrm_dst Patrick McHardy 2005-03-14 20:32 ` Herbert Xu 2005-03-15 19:05 ` Patrick McHardy 2005-03-07 5:23 ` [2/4] [IPSEC] Add xfrm_state_mtu David S. Miller
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).