* [net-next fragmentation icmp v4 1/4] ipv4: introduce frag_expire_skip_icmp()
2015-05-14 2:27 [net-next fragmentation icmp v4 0/4] fragmentation ICMP Andy Zhou
@ 2015-05-14 2:27 ` Andy Zhou
2015-05-14 2:28 ` [net-next fragmentation icmp v4 2/4] IPv4: skip ICMP for bridge contrack users when defrag expires Andy Zhou
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Andy Zhou @ 2015-05-14 2:27 UTC (permalink / raw)
To: davem; +Cc: netdev, Andy Zhou
Improve readability of skip ICMP for de-fragmentation expiration logic.
This change will also make the logic easier to maintain when the
following patches in this series are applied.
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
include/net/ip.h | 10 ++++++++++
net/ipv4/ip_fragment.c | 13 +++++++++----
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 0ed6d76..43f6f39 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -478,6 +478,16 @@ enum ip_defrag_users {
IP_DEFRAG_MACVLAN,
};
+/* Return true if the value of 'user' is between 'lower_bond'
+ * and 'upper_bond' inclusively.
+ */
+static inline bool ip_defrag_user_in_between(u32 user,
+ enum ip_defrag_users lower_bond,
+ enum ip_defrag_users upper_bond)
+{
+ return user >= lower_bond && user <= upper_bond;
+}
+
int ip_defrag(struct sk_buff *skb, u32 user);
#ifdef CONFIG_INET
struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cc1da6d..83424f1 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -173,6 +173,13 @@ static void ipq_kill(struct ipq *ipq)
inet_frag_kill(&ipq->q, &ip4_frags);
}
+static bool frag_expire_skip_icmp(u32 user)
+{
+ return user == IP_DEFRAG_AF_PACKET ||
+ ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
+ __IP_DEFRAG_CONNTRACK_IN_END);
+}
+
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
@@ -217,10 +224,8 @@ static void ip_expire(unsigned long arg)
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
- if (qp->user == IP_DEFRAG_AF_PACKET ||
- ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
- (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
- (skb_rtable(head)->rt_type != RTN_LOCAL)))
+ if (frag_expire_skip_icmp(qp->user) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL))
goto out_rcu_unlock;
/* Send an ICMP "Fragment Reassembly Timeout" message. */
--
1.9.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [net-next fragmentation icmp v4 3/4] bridge_netfilter: No ICMP packet on IPv4 defragmentation timeout
2015-05-14 2:27 [net-next fragmentation icmp v4 0/4] fragmentation ICMP Andy Zhou
2015-05-14 2:27 ` [net-next fragmentation icmp v4 1/4] ipv4: introduce frag_expire_skip_icmp() Andy Zhou
2015-05-14 2:28 ` [net-next fragmentation icmp v4 2/4] IPv4: skip ICMP for bridge contrack users when defrag expires Andy Zhou
@ 2015-05-14 2:28 ` Andy Zhou
2015-05-14 8:59 ` Florian Westphal
2015-05-14 2:28 ` [net-next fragmentation icmp v4 4/4] bridge_netfilter: No ICMP packet on IPv4 fragmentation error Andy Zhou
3 siblings, 1 reply; 9+ messages in thread
From: Andy Zhou @ 2015-05-14 2:28 UTC (permalink / raw)
To: davem; +Cc: netdev, Andy Zhou
Currently, on defragmentation timeout error, ICMP error message
will be generated. This is fine when they are used in a routing context,
but does not make sense in the context of bridging netfilter.
This patch adds a bit (IPSKB_NO_FRAG_ICMP) in IPCB to control
whether ICMP error message should be generated. br_netfiler sets
this bit.
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
include/net/inet_frag.h | 4 +++-
include/net/ip.h | 1 +
net/bridge/br_netfilter.c | 5 +++++
net/ipv4/ip_fragment.c | 12 ++++++++++--
4 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 8d17655..e3c8840 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -22,12 +22,14 @@ struct netns_frags {
* @INET_FRAG_LAST_IN: final fragment has arrived
* @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
* @INET_FRAG_EVICTED: frag queue is being evicted
+ * @INET_FRAG_NO_ICMP: Do not send icmp message on incomplete defrag
*/
enum {
INET_FRAG_FIRST_IN = BIT(0),
INET_FRAG_LAST_IN = BIT(1),
INET_FRAG_COMPLETE = BIT(2),
- INET_FRAG_EVICTED = BIT(3)
+ INET_FRAG_EVICTED = BIT(3),
+ INET_FRAG_NO_ICMP = BIT(4)
};
/**
diff --git a/include/net/ip.h b/include/net/ip.h
index 43f6f39..af44c9f 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm {
#define IPSKB_FRAG_COMPLETE BIT(3)
#define IPSKB_REROUTED BIT(4)
#define IPSKB_DOREDIRECT BIT(5)
+#define IPSKB_NO_FRAG_ICMP BIT(6)
u16 frag_max_size;
};
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index ab55e24..6a2adba 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -663,6 +663,11 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
if (br_parse_ip_options(skb))
return NF_DROP;
+ /* In case this is a fragmented packet, do not send icmp packet on
+ * defragmentation error
+ */
+ IPCB(skb)->flags |= IPSKB_NO_FRAG_ICMP;
+
nf_bridge_put(skb->nf_bridge);
if (!nf_bridge_alloc(skb))
return NF_DROP;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 47fa64e..60afaa7 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -226,7 +226,8 @@ static void ip_expire(unsigned long arg)
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
- if (frag_expire_skip_icmp(qp->user) &&
+ if ((qp->q.flags & INET_FRAG_NO_ICMP) ||
+ frag_expire_skip_icmp(qp->user) ||
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out_rcu_unlock;
@@ -330,6 +331,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
int ihl, end;
int err = -ENOENT;
u8 ecn;
+ bool no_icmp;
if (qp->q.flags & INET_FRAG_COMPLETE)
goto err;
@@ -347,6 +349,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
offset &= IP_OFFSET;
offset <<= 3; /* offset is in 8-byte chunks */
ihl = ip_hdrlen(skb);
+ no_icmp = IPCB(skb)->flags & IPSKB_NO_FRAG_ICMP;
/* Determine the position of this fragment. */
end = offset + skb->len - ihl;
@@ -485,7 +488,12 @@ found:
skb->len + ihl > qp->q.max_size)
qp->q.max_size = skb->len + ihl;
- if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ if (no_icmp) {
+ qp->q.flags |= INET_FRAG_NO_ICMP;
+ }
+
+ if (((qp->q.flags & ~INET_FRAG_NO_ICMP) ==
+ (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN)) &&
qp->q.meat == qp->q.len) {
unsigned long orefdst = skb->_skb_refdst;
--
1.9.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [net-next fragmentation icmp v4 4/4] bridge_netfilter: No ICMP packet on IPv4 fragmentation error
2015-05-14 2:27 [net-next fragmentation icmp v4 0/4] fragmentation ICMP Andy Zhou
` (2 preceding siblings ...)
2015-05-14 2:28 ` [net-next fragmentation icmp v4 3/4] bridge_netfilter: No ICMP packet on IPv4 defragmentation timeout Andy Zhou
@ 2015-05-14 2:28 ` Andy Zhou
3 siblings, 0 replies; 9+ messages in thread
From: Andy Zhou @ 2015-05-14 2:28 UTC (permalink / raw)
To: davem; +Cc: netdev, Andy Zhou
When bridge netfilter re-fragments an IP packet for output, all
packets that can not be re-fragmented to their original input size
should be silently discarded.
However, current bridge netfilter output path generates an ICMP packet
with 'size exceeded MTU' message for such packets, this is a bug.
This patch refactors the ip_fragment() API to allow two separate
use cases. The bridge netfilter user case will not
send ICMP, the routing output will, as before.
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
include/net/ip.h | 4 ++--
net/bridge/br_netfilter.c | 21 ++++++++++++++++++++-
net/ipv4/ip_output.c | 40 ++++++++++++++++++++++++++++------------
3 files changed, 50 insertions(+), 15 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index af44c9f..47e62ea 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -109,8 +109,8 @@ int ip_local_deliver(struct sk_buff *skb);
int ip_mr_input(struct sk_buff *skb);
int ip_output(struct sock *sk, struct sk_buff *skb);
int ip_mc_output(struct sock *sk, struct sk_buff *skb);
-int ip_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *));
+int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *));
int ip_do_nat(struct sk_buff *skb);
void ip_send_check(struct iphdr *ip);
int __ip_local_out(struct sk_buff *skb);
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 6a2adba..f83a35c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -849,6 +849,25 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
return br_dev_queue_push_xmit(sk, skb);
}
+static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *))
+{
+ unsigned int mtu = ip_skb_dst_mtu(skb);
+ struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
+
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(sk, skb, output);
+}
+
static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
{
int ret;
@@ -880,7 +899,7 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
- ret = ip_fragment(sk, skb, br_nf_push_frag_xmit);
+ ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit);
} else {
ret = br_dev_queue_push_xmit(sk, skb);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2acc5dc..8d91b92 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -83,6 +83,9 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
+static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *));
+
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
@@ -478,6 +481,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
+static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned int mtu = ip_skb_dst_mtu(skb);
+
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
+
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return ip_do_fragment(sk, skb, output);
+}
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -485,8 +510,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
* single device frame, and queue such a frame for sending.
*/
-int ip_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -507,15 +532,6 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
- (IPCB(skb)->frag_max_size &&
- IPCB(skb)->frag_max_size > mtu))) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- kfree_skb(skb);
- return -EMSGSIZE;
- }
/*
* Setup starting values.
@@ -751,7 +767,7 @@ fail:
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
}
-EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_do_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
--
1.9.1
^ permalink raw reply related [flat|nested] 9+ messages in thread