From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org, kuba@kernel.org,
pabeni@redhat.com, edumazet@google.com, fw@strlen.de
Subject: [PATCH net-next 14/14] netfilter: flowtable: add CLOSING state
Date: Sun, 19 Jan 2025 18:20:51 +0100 [thread overview]
Message-ID: <20250119172051.8261-15-pablo@netfilter.org> (raw)
In-Reply-To: <20250119172051.8261-1-pablo@netfilter.org>
tcp rst/fin packet triggers an immediate teardown of the flow which
results in sending flows back to the classic forwarding path.
This behaviour was introduced by:
da5984e51063 ("netfilter: nf_flow_table: add support for sending flows back to the slow path")
b6f27d322a0a ("netfilter: nf_flow_table: tear down TCP flows if RST or FIN was seen")
whose goal is to expedite removal of flow entries from the hardware
table. Before these patches, the flow was released after the flow entry
timed out.
However, this approach leads to packet races when restoring the
conntrack state as well as late flow re-offload situations when the TCP
connection is ending.
This patch adds a new CLOSING state that is is entered when tcp rst/fin
packet is seen. This allows for an early removal of the flow entry from
the hardware table. But the flow entry still remains in software, so tcp
packets to shut down the flow are not sent back to slow path.
If syn packet is seen from this new CLOSING state, then this flow enters
teardown state, ct state is set to TCP_CONNTRACK_CLOSE state and packet
is sent to slow path, so this TCP reopen scenario can be handled by
conntrack. TCP_CONNTRACK_CLOSE provides a small timeout that aims at
quickly releasing this stale entry from the conntrack table.
Moreover, skip hardware re-offload from flowtable software packet if the
flow is in CLOSING state.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_flow_table.h | 1 +
net/netfilter/nf_flow_table_core.c | 70 ++++++++++++++++++++-------
net/netfilter/nf_flow_table_ip.c | 6 ++-
3 files changed, 58 insertions(+), 19 deletions(-)
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index b63d53bb9dd6..d711642e78b5 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -163,6 +163,7 @@ struct flow_offload_tuple_rhash {
enum nf_flow_flags {
NF_FLOW_SNAT,
NF_FLOW_DNAT,
+ NF_FLOW_CLOSING,
NF_FLOW_TEARDOWN,
NF_FLOW_HW,
NF_FLOW_HW_DYING,
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 35396f568ecd..9d8361526f82 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -161,11 +161,23 @@ void flow_offload_route_init(struct flow_offload *flow,
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);
-static void flow_offload_fixup_tcp(struct nf_conn *ct)
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+}
+
+static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
{
struct ip_ct_tcp *tcp = &ct->proto.tcp;
spin_lock_bh(&ct->lock);
+ if (tcp->state != tcp_state)
+ tcp->state = tcp_state;
+
+ /* syn packet triggers the TCP reopen case from conntrack. */
+ if (tcp->state == TCP_CONNTRACK_CLOSE)
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
/* Conntrack state is outdated due to offload bypass.
* Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks
* TCP reset validation will fail.
@@ -177,36 +189,58 @@ static void flow_offload_fixup_tcp(struct nf_conn *ct)
spin_unlock_bh(&ct->lock);
}
-static void flow_offload_fixup_ct(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct flow_offload *flow)
{
+ struct nf_conn *ct = flow->ct;
struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
+ bool expired, closing = false;
+ u32 offload_timeout = 0;
s32 timeout;
if (l4num == IPPROTO_TCP) {
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ const struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ u8 tcp_state;
- flow_offload_fixup_tcp(ct);
+ /* Enter CLOSE state if fin/rst packet has been seen, this
+ * allows TCP reopen from conntrack. Otherwise, pick up from
+ * the last seen TCP state.
+ */
+ closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
+ if (closing) {
+ flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
+ timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
+ expired = false;
+ } else {
+ tcp_state = READ_ONCE(ct->proto.tcp.state);
+ flow_offload_fixup_tcp(ct, tcp_state);
+ timeout = READ_ONCE(tn->timeouts[tcp_state]);
+ expired = nf_flow_has_expired(flow);
+ }
+ offload_timeout = READ_ONCE(tn->offload_timeout);
- timeout = tn->timeouts[ct->proto.tcp.state];
- timeout -= tn->offload_timeout;
} else if (l4num == IPPROTO_UDP) {
- struct nf_udp_net *tn = nf_udp_pernet(net);
+ const struct nf_udp_net *tn = nf_udp_pernet(net);
enum udp_conntrack state =
test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
UDP_CT_REPLIED : UDP_CT_UNREPLIED;
- timeout = tn->timeouts[state];
- timeout -= tn->offload_timeout;
+ timeout = READ_ONCE(tn->timeouts[state]);
+ expired = nf_flow_has_expired(flow);
+ offload_timeout = READ_ONCE(tn->offload_timeout);
} else {
return;
}
+ if (expired)
+ timeout -= offload_timeout;
+
if (timeout < 0)
timeout = 0;
- if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
- WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
+ if (closing ||
+ nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ nf_ct_refresh(ct, timeout);
}
static void flow_offload_route_release(struct flow_offload *flow)
@@ -326,18 +360,14 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
else
return;
- if (likely(!nf_flowtable_hw_offload(flow_table)))
+ if (likely(!nf_flowtable_hw_offload(flow_table)) ||
+ test_bit(NF_FLOW_CLOSING, &flow->flags))
return;
nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
- return nf_flow_timeout_delta(flow->timeout) <= 0;
-}
-
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
@@ -354,7 +384,7 @@ void flow_offload_teardown(struct flow_offload *flow)
{
clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
set_bit(NF_FLOW_TEARDOWN, &flow->flags);
- flow_offload_fixup_ct(flow->ct);
+ flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -542,6 +572,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
} else {
flow_offload_del(flow_table, flow);
}
+ } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
+ test_bit(NF_FLOW_HW, &flow->flags) &&
+ !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
+ nf_flow_offload_del(flow_table, flow);
} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
nf_flow_offload_stats(flow_table, flow);
}
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index a22856106383..97c6eb8847a0 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -28,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
return 0;
tcph = (void *)(skb_network_header(skb) + thoff);
- if (unlikely(tcph->fin || tcph->rst)) {
+ if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) {
flow_offload_teardown(flow);
return -1;
}
+ if ((tcph->fin || tcph->rst) &&
+ !test_bit(NF_FLOW_CLOSING, &flow->flags))
+ set_bit(NF_FLOW_CLOSING, &flow->flags);
+
return 0;
}
--
2.30.2
next prev parent reply other threads:[~2025-01-19 17:21 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-19 17:20 [PATCH net-next,v2 00/14] Netfilter updates for net-next Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 01/14] netfilter: nf_tables: fix set size with rbtree backend Pablo Neira Ayuso
2025-01-20 20:10 ` patchwork-bot+netdevbpf
2025-01-19 17:20 ` [PATCH net-next 02/14] netfilter: br_netfilter: remove unused conditional and dead code Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 03/14] netfilter: nf_tables: Flowtable hook's pf value never varies Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 04/14] netfilter: nf_tables: Store user-defined hook ifname Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 05/14] netfilter: nf_tables: Use stored ifname in netdev hook dumps Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 06/14] netfilter: nf_tables: Compare netdev hooks based on stored name Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 07/14] netfilter: nf_tables: Tolerate chains with no remaining hooks Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 08/14] netfilter: nf_tables: Simplify chain netdev notifier Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 09/14] netfilter: nft_flow_offload: clear tcp MAXACK flag before moving to slowpath Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 10/14] netfilter: nft_flow_offload: update tcp state flags under lock Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 11/14] netfilter: conntrack: remove skb argument from nf_ct_refresh Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 12/14] netfilter: conntrack: rework offload nf_conn timeout extension logic Pablo Neira Ayuso
2025-01-19 17:20 ` [PATCH net-next 13/14] netfilter: flowtable: teardown flow if cached mtu is stale Pablo Neira Ayuso
2025-01-19 17:20 ` Pablo Neira Ayuso [this message]
-- strict thread matches above, loose matches on Subject: below --
2025-01-16 17:18 [PATCH net-next 00/14] Netfilter updates for net-next Pablo Neira Ayuso
2025-01-16 17:19 ` [PATCH net-next 14/14] netfilter: flowtable: add CLOSING state Pablo Neira Ayuso
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250119172051.8261-15-pablo@netfilter.org \
--to=pablo@netfilter.org \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=fw@strlen.de \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).