* [PATCH net-next v2 1/3] net: ip_fragment: Add ip_defrag_ignore_cb support
2020-07-07 4:55 [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
@ 2020-07-07 4:55 ` wenxu
2020-07-07 4:55 ` [PATCH net-next v2 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear wenxu
` (2 subsequent siblings)
3 siblings, 0 replies; 8+ messages in thread
From: wenxu @ 2020-07-07 4:55 UTC (permalink / raw)
To: netdev; +Cc: netfilter-devel, fw
From: wenxu <wenxu@ucloud.cn>
Add ip_defrag_ignore_cb for conntrack defrag and it will
elide the CB clear when packets are defragmented by
connection tracking.
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
include/net/ip.h | 2 ++
net/ipv4/ip_fragment.c | 55 ++++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 49 insertions(+), 8 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 862c954..31779a5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -671,6 +671,8 @@ static inline bool ip_defrag_user_in_between(u32 user,
}
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
+int ip_defrag_ignore_cb(struct net *net, struct sk_buff *skb,
+ u32 user, u16 *frag_max_size);
#ifdef CONFIG_INET
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user);
#else
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cfeb889..afc2b3d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -76,7 +76,8 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ bool ignore_skb_cb, u16 *frag_max_size);
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -269,7 +270,8 @@ static int ip_frag_reinit(struct ipq *qp)
}
/* Add new segment to existing queue. */
-static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb,
+ bool ignore_skb_cb, u16 *frag_max_size)
{
struct net *net = qp->q.fqdir->net;
int ihl, end, flags, offset;
@@ -282,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (qp->q.flags & INET_FRAG_COMPLETE)
goto err;
- if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+ if ((ignore_skb_cb || !(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE)) &&
unlikely(ip_frag_too_far(qp)) &&
unlikely(err = ip_frag_reinit(qp))) {
ipq_kill(qp);
@@ -368,7 +370,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, skb, prev_tail, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev, ignore_skb_cb,
+ frag_max_size);
skb->_skb_refdst = orefdst;
if (err)
inet_frag_kill(&qp->q);
@@ -400,7 +403,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp)
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ bool ignore_skb_cb, u16 *frag_max_size)
{
struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
@@ -430,7 +434,10 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
ip_frag_coalesce_ok(qp));
skb->dev = dev;
- IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+ if (!ignore_skb_cb)
+ IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+ else if (frag_max_size)
+ *frag_max_size = max(qp->max_df_size, qp->q.max_size);
iph = ip_hdr(skb);
iph->tot_len = htons(len);
@@ -445,7 +452,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
* from one very small df-fragment and one large non-df frag.
*/
if (qp->max_df_size == qp->q.max_size) {
- IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
+ if (!ignore_skb_cb)
+ IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
iph->frag_off = htons(IP_DF);
} else {
iph->frag_off = 0;
@@ -487,7 +495,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
spin_lock(&qp->q.lock);
- ret = ip_frag_queue(qp, skb);
+ ret = ip_frag_queue(qp, skb, false, NULL);
spin_unlock(&qp->q.lock);
ipq_put(qp);
@@ -500,6 +508,37 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_defrag);
+/* Process an incoming IP datagram fragment. */
+int ip_defrag_ignore_cb(struct net *net, struct sk_buff *skb,
+ u32 user, u16 *frag_max_size)
+{
+ struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+ int vif = l3mdev_master_ifindex_rcu(dev);
+ struct ipq *qp;
+
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
+ skb_orphan(skb);
+
+ /* Lookup (or create) queue header */
+ qp = ip_find(net, ip_hdr(skb), user, vif);
+ if (qp) {
+ int ret;
+
+ spin_lock_bh(&qp->q.lock);
+
+ ret = ip_frag_queue(qp, skb, true, frag_max_size);
+
+ spin_unlock_bh(&qp->q.lock);
+ ipq_put(qp);
+ return ret;
+ }
+
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(ip_defrag_ignore_cb);
+
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct iphdr iph;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH net-next v2 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear
2020-07-07 4:55 [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
2020-07-07 4:55 ` [PATCH net-next v2 1/3] net: ip_fragment: Add ip_defrag_ignore_cb support wenxu
@ 2020-07-07 4:55 ` wenxu
2020-07-07 4:55 ` [PATCH net-next v2 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag wenxu
2020-07-15 20:26 ` [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear Jakub Kicinski
3 siblings, 0 replies; 8+ messages in thread
From: wenxu @ 2020-07-07 4:55 UTC (permalink / raw)
To: netdev; +Cc: netfilter-devel, fw
From: wenxu <wenxu@ucloud.cn>
Make nf_ct_frag6_gather elide the CB clear when packets are defragmented
by connection tracking. This can make each subsystem such as br_netfilter
, openvswitch, act_ct do defrag without restore the CB. And avoid
serious crashes and problems in ct subsystem. Because Some packet
schedulers store pointers in the qdisc CB private area and Parallel
accesses to the SKB.
Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
include/linux/netfilter_ipv6.h | 9 +++++----
include/net/netfilter/ipv6/nf_defrag_ipv6.h | 3 ++-
net/bridge/netfilter/nf_conntrack_bridge.c | 7 ++-----
net/ipv6/netfilter/nf_conntrack_reasm.c | 19 ++++++++++++-------
net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 3 ++-
net/openvswitch/conntrack.c | 8 +++-----
net/sched/act_ct.c | 3 +--
7 files changed, 27 insertions(+), 25 deletions(-)
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index aac42c2..ef9fa28 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -58,7 +58,8 @@ struct nf_ipv6_ops {
int (*output)(struct net *, struct sock *, struct sk_buff *));
int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
#if IS_MODULE(CONFIG_IPV6)
- int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user);
+ int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user,
+ u16 *frag_max_size);
int (*br_fragment)(struct net *net, struct sock *sk,
struct sk_buff *skb,
struct nf_bridge_frag_data *data,
@@ -118,7 +119,7 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
- u32 user)
+ u32 user, u16 *frag_max_size)
{
#if IS_MODULE(CONFIG_IPV6)
const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
@@ -126,9 +127,9 @@ static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
if (!v6_ops)
return 1;
- return v6_ops->br_defrag(net, skb, user);
+ return v6_ops->br_defrag(net, skb, user, frag_max_size);
#elif IS_BUILTIN(CONFIG_IPV6)
- return nf_ct_frag6_gather(net, skb, user);
+ return nf_ct_frag6_gather(net, skb, user, frag_max_size);
#else
return 1;
#endif
diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index 6d31cd0..d83add2 100644
--- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -9,7 +9,8 @@
int nf_ct_frag6_init(void);
void nf_ct_frag6_cleanup(void);
-int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user);
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb,
+ u32 user, u16 *frag_max_size);
struct inet_frags_ctl;
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 8096732..a11927b 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -170,7 +170,6 @@ static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
{
u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
enum ip_conntrack_info ctinfo;
- struct br_input_skb_cb cb;
const struct nf_conn *ct;
int err;
@@ -178,15 +177,13 @@ static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
if (ct)
zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
- br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
-
err = nf_ipv6_br_defrag(state->net, skb,
- IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id,
+ &BR_INPUT_SKB_CB(skb)->frag_max_size);
/* queued */
if (err == -EINPROGRESS)
return NF_STOLEN;
- br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
return err == 0 ? NF_ACCEPT : NF_DROP;
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index fed9666..ab28390 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -128,7 +128,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
#endif
static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ u16 *frag_max_size);
static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
{
@@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
- const struct frag_hdr *fhdr, int nhoff)
+ const struct frag_hdr *fhdr, int nhoff,
+ u16 *frag_max_size)
{
unsigned int payload_len;
struct net_device *dev;
@@ -286,7 +288,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = nf_ct_frag6_reasm(fq, skb, prev, dev);
+ err = nf_ct_frag6_reasm(fq, skb, prev, dev, frag_max_size);
skb->_skb_refdst = orefdst;
/* After queue has assumed skb ownership, only 0 or
@@ -313,7 +315,8 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
* the last and the first frames arrived and all the bits are here.
*/
static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ u16 *frag_max_size)
{
void *reasm_data;
int payload_len;
@@ -354,7 +357,8 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
skb->dev = dev;
ipv6_hdr(skb)->payload_len = htons(payload_len);
ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
- IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+ if (frag_max_size)
+ *frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
/* Yes, and fold redundant checksum back. 8) */
if (skb->ip_summed == CHECKSUM_COMPLETE)
@@ -436,7 +440,8 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
return 0;
}
-int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb,
+ u32 user, u16 *frag_max_size)
{
u16 savethdr = skb->transport_header;
int fhoff, nhoff, ret;
@@ -471,7 +476,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
spin_lock_bh(&fq->q.lock);
- ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
+ ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, frag_max_size);
if (ret == -EPROTO) {
skb->transport_header = savethdr;
ret = 0;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 6646a87..7d532ed 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -64,7 +64,8 @@ static unsigned int ipv6_defrag(void *priv,
#endif
err = nf_ct_frag6_gather(state->net, skb,
- nf_ct6_defrag_user(state->hook, skb));
+ nf_ct6_defrag_user(state->hook, skb),
+ &IP6CB(skb)->frag_max_size);
/* queued */
if (err == -EINPROGRESS)
return NF_STOLEN;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 4340f25..5984a84 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -493,11 +493,11 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
static int handle_fragments(struct net *net, struct sw_flow_key *key,
u16 zone, struct sk_buff *skb)
{
- struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
int err;
if (key->eth.type == htons(ETH_P_IP)) {
enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+ struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
err = ip_defrag(net, skb, user);
@@ -505,12 +505,12 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
return err;
ovs_cb.mru = IPCB(skb)->frag_max_size;
+ *OVS_CB(skb) = ovs_cb;
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
} else if (key->eth.type == htons(ETH_P_IPV6)) {
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
- memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
- err = nf_ct_frag6_gather(net, skb, user);
+ err = nf_ct_frag6_gather(net, skb, user, &OVS_CB(skb)->mru);
if (err) {
if (err != -EINPROGRESS)
kfree_skb(skb);
@@ -518,7 +518,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
}
key->ip.proto = ipv6_hdr(skb)->nexthdr;
- ovs_cb.mru = IP6CB(skb)->frag_max_size;
#endif
} else {
kfree_skb(skb);
@@ -533,7 +532,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
key->ip.frag = OVS_FRAG_TYPE_NONE;
skb_clear_hash(skb);
skb->ignore_df = 1;
- *OVS_CB(skb) = ovs_cb;
return 0;
}
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 1b9c6d4..20f3d11 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -707,8 +707,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
- memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
- err = nf_ct_frag6_gather(net, skb, user);
+ err = nf_ct_frag6_gather(net, skb, user, NULL);
if (err && err != -EINPROGRESS)
goto out_free;
#else
--
1.8.3.1
^ permalink raw reply related [flat|nested] 8+ messages in thread