From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org
Subject: [PATCH 26/36] netfilter: conntrack: introduce clash resolution on insertion race
Date: Mon, 9 May 2016 20:46:44 +0200 [thread overview]
Message-ID: <1462819614-5402-27-git-send-email-pablo@netfilter.org> (raw)
In-Reply-To: <1462819614-5402-1-git-send-email-pablo@netfilter.org>
This patch introduces nf_ct_resolve_clash() to resolve race condition on
conntrack insertions.
This is particularly a problem for connection-less protocols such as
UDP, with no initial handshake. Two or more packets may race to insert
the entry resulting in packet drops.
Another problematic scenario are packets enqueued to userspace via
NFQUEUE after the raw table, that make it easier to trigger this
race.
To resolve this, the idea is to reset the conntrack entry to the one
that won race. Packet and bytes counters are also merged.
The 'insert_failed' stats still accounts for this situation, after
this patch, the drop counter is bumped whenever we drop packets, so we
can watch for unresolved clashes.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_conntrack_l4proto.h | 3 ++
net/netfilter/nf_conntrack_core.c | 53 ++++++++++++++++++++++++++--
net/netfilter/nf_conntrack_proto_udp.c | 2 ++
net/netfilter/nf_conntrack_proto_udplite.c | 2 ++
4 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 956d8a6..1a5fb36 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -23,6 +23,9 @@ struct nf_conntrack_l4proto {
/* L4 Protocol number. */
u_int8_t l4proto;
+ /* Resolve clashes on insertion races. */
+ bool allow_clash;
+
/* Try to fill in the third arg: dataoff is offset past network protocol
hdr. Return true if possible. */
bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 25e0c26..f58a704 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -617,6 +617,48 @@ static inline void nf_ct_acct_update(struct nf_conn *ct,
}
}
+static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ const struct nf_conn *loser_ct)
+{
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(loser_ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+ enum ip_conntrack_info ctinfo;
+ unsigned int bytes;
+
+ /* u32 should be fine since we must have seen one packet. */
+ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
+ nf_ct_acct_update(ct, ctinfo, bytes);
+ }
+}
+
+/* Resolve race on insertion if this protocol allows this. */
+static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_tuple_hash *h)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct nf_conntrack_l4proto *l4proto;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->allow_clash &&
+ !nf_ct_is_dying(ct) &&
+ atomic_inc_not_zero(&ct->ct_general.use)) {
+ nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
+ nf_conntrack_put(skb->nfct);
+ /* Assign conntrack already in hashes to this skbuff. Don't
+ * modify skb->nfctinfo to ensure consistent stateful filtering.
+ */
+ skb->nfct = &ct->ct_general;
+ return NF_ACCEPT;
+ }
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
@@ -631,6 +673,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
+ int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
@@ -673,8 +716,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
*/
nf_ct_del_from_dying_or_unconfirmed_list(ct);
- if (unlikely(nf_ct_is_dying(ct)))
- goto out;
+ if (unlikely(nf_ct_is_dying(ct))) {
+ nf_ct_add_to_dying_list(ct);
+ goto dying;
+ }
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
@@ -725,10 +770,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
out:
nf_ct_add_to_dying_list(ct);
+ ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+dying:
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return NF_DROP;
+ return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 478f92f..4fd0405 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
@@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDP,
.name = "udp",
+ .allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 1ac8ee1..9d692f5 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
@@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
+ .allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
--
2.1.4
next prev parent reply other threads:[~2016-05-09 18:46 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-05-09 18:46 [PATCH 00/36] Netfilter updates for net-next Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 01/36] ipvs: handle connections started by real-servers Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 02/36] ipvs: optimize release of connections in OPS mode Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 03/36] ipvs: don't alter conntrack " Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 04/36] netfilter: conntrack: move generation seqcnt out of netns_ct Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 05/36] netfilter: conntrack: use get_random_once for nat and expectations Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 06/36] netfilter: conntrack: use get_random_once for conntrack hash seed Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 07/36] netfilter: nf_tables: introduce nft_setelem_parse_flags() helper Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 08/36] netfilter: nf_tables: parse element flags from nft_del_setelem() Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 09/36] netfilter: nft_rbtree: introduce nft_rbtree_interval_end() helper Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 10/36] netfilter: nft_rbtree: allow adjacent intervals with dynamic updates Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 11/36] netfilter: nf_ct_helper: disable automatic helper assignment Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 12/36] netfilter: ip6t_SYNPROXY: unnecessary to check whether ip6_route_output returns NULL Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 13/36] netfilter: fix IS_ERR_VALUE usage Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 14/36] netfilter: nftables: add connlabel set support Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 15/36] netfilter: conntrack: keep BH enabled during lookup Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 16/36] netfilter: conntrack: fix lookup race during hash resize Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 17/36] netfilter: conntrack: don't attempt to iterate over empty table Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 18/36] netfilter: conntrack: use nf_ct_key_equal() in more places Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 19/36] netfilter: conntrack: small refactoring of conntrack seq_printf Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 20/36] netfilter: conntrack: check netns when comparing conntrack objects Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 21/36] netfilter: conntrack: make netns address part of hash Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 22/36] netfilter: conntrack: use a single hashtable for all namespaces Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 23/36] netfilter: conntrack: consider ct netns in early_drop logic Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 24/36] netfilter: conntrack: __nf_ct_l4proto_find() always returns valid pointer Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 25/36] netfilter: conntrack: introduce nf_ct_acct_update() Pablo Neira Ayuso
2016-05-09 18:46 ` Pablo Neira Ayuso [this message]
2016-05-09 18:46 ` [PATCH 27/36] openvswitch: __nf_ct_l{3,4}proto_find() always return a valid pointer Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 28/36] netfilter: x_tables: get rid of old and inconsistent debugging Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 29/36] netfilter: nf_tables: allow set names up to 32 bytes Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 30/36] ipvs: make drop_entry protection effective for SIP-pe Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 31/36] netfilter: conntrack: check netns when walking expect hash Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 32/36] netfilter: conntrack: make netns address part of " Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 33/36] netfilter: conntrack: use a single expectation table for all namespaces Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 34/36] netfilter: conntrack: make netns address part of nat bysrc hash Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 35/36] netfilter: conntrack: use a single nat bysource table for all namespaces Pablo Neira Ayuso
2016-05-09 18:46 ` [PATCH 36/36] netfilter: conntrack: use single slab cache Pablo Neira Ayuso
2016-05-09 19:15 ` [PATCH 00/36] Netfilter updates for net-next David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1462819614-5402-27-git-send-email-pablo@netfilter.org \
--to=pablo@netfilter.org \
--cc=davem@davemloft.net \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).