Netdev List
 help / color / mirror / Atom feed
* [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
@ 2026-06-29  9:34 xietangxin
  2026-06-29 13:09 ` Victor Nogueira
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: xietangxin @ 2026-06-29  9:34 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: gaoxingwang, huyizhen, netfilter-devel, coreteam, netdev,
	linux-kernel, stable

Problem observed in Kubernetes environments where the MASQUERADE target with
--random-fully is configured by default. After commit
165573e41f2f ("tcp: secure_seq: add back ports to TS offset"), TCP short
connection QPS dropped from ~20000 to ~10000. That commit added the source
and destination ports to the TS offset calculation.

However, with MASQUERADE --random-fully, when multiple internal connections
(e.g. sport 10000 and 20000) are mapped to the same external port (e.g. 30000),
their TS offsets are calculated as ts_offset(10000) and ts_offset(20000).
If the server reuses the TIME_WAIT slot from the first connection, there is
a chance that ts_offset(20000) < ts_offset(10000), breaking TSval
monotonicity for the same 4-tuple and causing RST packets:
  Client -> Server 24870 -> 80 [SYN] TSval=2294041168
  Server -> Client 80 -> 24870 [ACK] TSecr=2846236456
  Client -> Server 24870 -> 80 [RST] Seq=855605690

After nf_nat_setup_info() successfully assigns a new randomized
source port, recalculate the TS offset using the new port and
update the SYN packet's TSval accordingly.

Test results on 4U4G VM with
`./wrk -t8 -c200 -H "Connection: close" -d10s --latency http://5.5.5.5:80`
Before:
  random:10712 req/s, random-fully:10986 req/s
After:
  random:21463 req/s, random-fully:19181 req/s

Fixes: 165573e41f2f ("tcp: secure_seq: add back ports to TS offset")
Cc: stable@vger.kernel.org
Closes: https://lore.kernel.org/all/92935c00-e0be-4591-ac44-5978c7804d57@yeah.net/
Signed-off-by: xietangxin <xietangxin@h-partners.com>
---
 net/netfilter/nf_nat_masquerade.c | 91 ++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index 4de6e0a51701..8c9ca5a051cc 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -6,8 +6,11 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
+#include <linux/tcp.h>
 
+#include <net/tcp.h>
 #include <net/netfilter/nf_nat_masquerade.h>
+#include <net/secure_seq.h>
 
 struct masq_dev_work {
 	struct work_struct work;
@@ -24,6 +27,76 @@ static DEFINE_MUTEX(masq_mutex);
 static unsigned int masq_refcnt __read_mostly;
 static atomic_t masq_worker_count __read_mostly;
 
+static __be32 *tcp_ts_option_ptr(const struct sk_buff *skb)
+{
+	const struct tcphdr *th;
+	unsigned char *ptr;
+	unsigned char opsize;
+	unsigned int optlen, offset;
+
+	th = tcp_hdr(skb);
+	optlen = (th->doff - 5) * 4;
+	ptr = (unsigned char *)(th + 1);
+	offset = 0;
+
+	while (offset < optlen) {
+		unsigned char opcode = ptr[offset];
+
+		if (opcode == TCPOPT_EOL)
+			break;
+		if (opcode == TCPOPT_NOP) {
+			offset++;
+			continue;
+		}
+
+		if (offset + 1 >= optlen)
+			break;
+
+		opsize = ptr[offset + 1];
+		if (opsize < 2 || offset + opsize > optlen)
+			break;
+
+		if (opcode == TCPOPT_TIMESTAMP && opsize == TCPOLEN_TIMESTAMP)
+			return (__be32 *)(ptr + offset + 2);
+
+		offset += opsize;
+	}
+
+	return NULL;
+}
+
+static void masquerade_update_tcp_ts_offset(struct nf_conn *ct, struct sk_buff *skb)
+{
+	__be32 *tsptr;
+	struct net *net;
+	struct tcphdr *th;
+	struct tcp_sock *tp;
+	union tcp_seq_and_ts_off st;
+	struct nf_conntrack_tuple *tuple;
+
+	th = tcp_hdr(skb);
+	net = nf_ct_net(ct);
+	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+	if (th && th->syn && !th->ack && skb->sk &&
+	    READ_ONCE(net->ipv4.sysctl_tcp_timestamps) == 1) {
+		tp = tcp_sk(skb->sk);
+		tsptr = tcp_ts_option_ptr(skb);
+		if (!tsptr)
+			return;
+
+		if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+			st = secure_tcp_seq_and_ts_off(net, tuple->src.u3.ip, tuple->dst.u3.ip,
+				tuple->src.u.tcp.port, tuple->dst.u.tcp.port);
+		else
+			st = secure_tcpv6_seq_and_ts_off(net, tuple->src.u3.ip6,
+				tuple->dst.u3.ip6, tuple->src.u.tcp.port, tuple->dst.u.tcp.port);
+
+		*tsptr = htonl(tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + st.ts_off);
+		WRITE_ONCE(tp->tsoffset, st.ts_off);
+	}
+}
+
 unsigned int
 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
 		       const struct nf_nat_range2 *range,
@@ -35,6 +108,7 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
 	struct nf_nat_range2 newrange;
 	const struct rtable *rt;
 	__be32 newsrc, nh;
+	unsigned int ret;
 
 	WARN_ON(hooknum != NF_INET_POST_ROUTING);
 
@@ -71,7 +145,13 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
 	newrange.max_proto   = range->max_proto;
 
 	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+	ret = nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+
+	if (ret == NF_ACCEPT && nf_ct_protonum(ct) == IPPROTO_TCP &&
+	    (range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+		masquerade_update_tcp_ts_offset(ct, skb);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
 
@@ -229,6 +309,7 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 	struct in6_addr src;
 	struct nf_conn *ct;
 	struct nf_nat_range2 newrange;
+	unsigned int ret;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
@@ -248,7 +329,13 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 	newrange.min_proto	= range->min_proto;
 	newrange.max_proto	= range->max_proto;
 
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+	ret = nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+
+	if (ret == NF_ACCEPT && nf_ct_protonum(ct) == IPPROTO_TCP &&
+	    (range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+		masquerade_update_tcp_ts_offset(ct, skb);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
  2026-06-29  9:34 [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized xietangxin
@ 2026-06-29 13:09 ` Victor Nogueira
  2026-06-29 15:23 ` Florian Westphal
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Victor Nogueira @ 2026-06-29 13:09 UTC (permalink / raw)
  To: xietangxin, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: gaoxingwang, huyizhen, netfilter-devel, coreteam, netdev,
	linux-kernel, stable

Hi!

On 29/06/2026 06:34, xietangxin wrote:
> Problem observed in Kubernetes environments where MASQUERADE target with
> --random-fully is configured by default. after commit
> 165573e41f2f ("tcp: secure_seq: add back ports to TS offset") TCP short
> connection QPS dropped from ~20000 to ~10000. This added source and
> destination ports into TS offset calculation.
> 
> However, with MASQUERADE --random-fully, when multiple internal connections
> (e.g sport 10000,20000) are mapped to the same external port (e.g 30000),
> their TS offsets are calculated as ts_offset(10000) and ts_offset(20000).
> If the server reuses the TIME_WAIT slot from the first connection, there is
> a chance that ts_offset(20000) < ts_offset(10000), breaking TSval
> monotonicity for the same 4-tuple and causing RST packets:
>    Client -> Server 24870 -> 80 [SYN] TSval=2294041168
>    Server -> Client 80 -> 24870 [ACK] TSecr=2846236456
>    Client -> Server 24870 -> 80 [RST] Seq=855605690
> 
> After nf_nat_setup_info() successfully assigns a new randomized
> source port, recalculate the TS offset using the new port and
> update the SYN packet's TSval accordingly.
> 
> Test results on 4U4G VM with
> `./wrk -t8 -c200 -H "Connection: close" -d10s --latency http://5.5.5.5:80`
> Before:
>    random:10712 req/s, random-fully:10986 req/s
> After:
>    random:21463 req/s, random-fully:19181 req/s
> 
> Fixes: 165573e41f2f ("tcp: secure_seq: add back ports to TS offset")
> Cc: stable@vger.kernel.org
> Closes:https://lore.kernel.org/all/92935c00-e0be-4591-ac44-5978c7804d57@yeah.net/
> Signed-off-by: xietangxin <xietangxin@h-partners.com>
> [...]
> +
> +static void masquerade_update_tcp_ts_offset(struct nf_conn *ct, struct sk_buff *skb)
> +{
> [...]
> +
> +		if (nf_ct_l3num(ct) == NFPROTO_IPV4)
> +			st = secure_tcp_seq_and_ts_off(net, tuple->src.u3.ip, tuple->dst.u3.ip,
> +				tuple->src.u.tcp.port, tuple->dst.u.tcp.port);
> +		else
> +			st = secure_tcpv6_seq_and_ts_off(net, tuple->src.u3.ip6,
> +				tuple->dst.u3.ip6, tuple->src.u.tcp.port, tuple->dst.u.tcp.port);

This breaks the build when CONFIG_IPV6 is not set.

.config:4948:warning: override: reassigning to symbol NET
.config:4949:warning: override: reassigning to symbol NET_CORE
.config:4950:warning: override: reassigning to symbol NETDEVICES
.config:4951:warning: override: reassigning to symbol NETWORK_FILESYSTEMS
ERROR: modpost: "secure_tcpv6_seq_and_ts_off" [net/netfilter/nf_nat.ko] 
undefined!

cheers,
Victor

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
  2026-06-29  9:34 [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized xietangxin
  2026-06-29 13:09 ` Victor Nogueira
@ 2026-06-29 15:23 ` Florian Westphal
  2026-07-01 14:09   ` xietangxin
  2026-06-29 21:10 ` kernel test robot
  2026-07-01  1:44 ` Jiayuan Chen
  3 siblings, 1 reply; 8+ messages in thread
From: Florian Westphal @ 2026-06-29 15:23 UTC (permalink / raw)
  To: xietangxin
  Cc: Pablo Neira Ayuso, Phil Sutter, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, gaoxingwang, huyizhen,
	netfilter-devel, coreteam, netdev, linux-kernel, stable

xietangxin <xietangxin@h-partners.com> wrote:
> Problem observed in Kubernetes environments where MASQUERADE target with
> --random-fully is configured by default. after commit
> 165573e41f2f ("tcp: secure_seq: add back ports to TS offset") TCP short
> connection QPS dropped from ~20000 to ~10000. This added source and
> destination ports into TS offset calculation.
> 
> However, with MASQUERADE --random-fully, when multiple internal connections
> (e.g sport 10000,20000) are mapped to the same external port (e.g 30000),
> their TS offsets are calculated as ts_offset(10000) and ts_offset(20000).
> If the server reuses the TIME_WAIT slot from the first connection, there is
> a chance that ts_offset(20000) < ts_offset(10000), breaking TSval
> monotonicity for the same 4-tuple and causing RST packets:
>   Client -> Server 24870 -> 80 [SYN] TSval=2294041168
>   Server -> Client 80 -> 24870 [ACK] TSecr=2846236456
>   Client -> Server 24870 -> 80 [RST] Seq=855605690
> 
> After nf_nat_setup_info() successfully assigns a new randomized
> source port, recalculate the TS offset using the new port and
> update the SYN packet's TSval accordingly.

I don't think this is related to masquerade but to snat (port address
rewrite) in general.

I think you could place your new helper in nf_nat_core.c and call it
from nf_nat_l4proto_unique_tuple() once we've found a usable tuple:

 668 another_round:
 669         for (i = 0; i < attempts; i++, off++) {
 670                 *keyptr = htons(min + off % range_size);
 671                 if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))

	 		     ... here.
 672                         return;
 673         }


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
  2026-06-29  9:34 [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized xietangxin
  2026-06-29 13:09 ` Victor Nogueira
  2026-06-29 15:23 ` Florian Westphal
@ 2026-06-29 21:10 ` kernel test robot
  2026-07-01  1:44 ` Jiayuan Chen
  3 siblings, 0 replies; 8+ messages in thread
From: kernel test robot @ 2026-06-29 21:10 UTC (permalink / raw)
  To: xietangxin, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: oe-kbuild-all, gaoxingwang, huyizhen, netfilter-devel, coreteam,
	netdev, linux-kernel, stable

Hi xietangxin,

kernel test robot noticed the following build errors:

[auto build test ERROR on net/main]

url:    https://github.com/intel-lab-lkp/linux/commits/xietangxin/netfilter-nf_nat_masquerade-recalculate-TCP-TS-offset-when-port-is-randomized/20260629-173037
base:   net/main
patch link:    https://lore.kernel.org/r/20260629093408.3927103-1-xietangxin%40h-partners.com
patch subject: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
config: arm-randconfig-004-20260630 (https://download.01.org/0day-ci/archive/20260630/202606300522.3jMZ6dLb-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260630/202606300522.3jMZ6dLb-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202606300522.3jMZ6dLb-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "secure_tcpv6_seq_and_ts_off" [net/netfilter/nf_nat.ko] undefined!

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
  2026-06-29  9:34 [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized xietangxin
                   ` (2 preceding siblings ...)
  2026-06-29 21:10 ` kernel test robot
@ 2026-07-01  1:44 ` Jiayuan Chen
  2026-07-01 14:11   ` xietangxin
  3 siblings, 1 reply; 8+ messages in thread
From: Jiayuan Chen @ 2026-07-01  1:44 UTC (permalink / raw)
  To: xietangxin, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: gaoxingwang, huyizhen, netfilter-devel, coreteam, netdev,
	linux-kernel, stable


On 6/29/26 5:34 PM, xietangxin wrote:
> Problem observed in Kubernetes environments where MASQUERADE target with
> --random-fully is configured by default. after commit
> 165573e41f2f ("tcp: secure_seq: add back ports to TS offset") TCP short
> connection QPS dropped from ~20000 to ~10000. This added source and
> destination ports into TS offset calculation.
>
> However, with MASQUERADE --random-fully, when multiple internal connections
> (e.g sport 10000,20000) are mapped to the same external port (e.g 30000),
> their TS offsets are calculated as ts_offset(10000) and ts_offset(20000).
> If the server reuses the TIME_WAIT slot from the first connection, there is
> a chance that ts_offset(20000) < ts_offset(10000), breaking TSval
> monotonicity for the same 4-tuple and causing RST packets:
>    Client -> Server 24870 -> 80 [SYN] TSval=2294041168
>    Server -> Client 80 -> 24870 [ACK] TSecr=2846236456
>    Client -> Server 24870 -> 80 [RST] Seq=855605690
>
> After nf_nat_setup_info() successfully assigns a new randomized
> source port, recalculate the TS offset using the new port and
> update the SYN packet's TSval accordingly.
>
> Test results on 4U4G VM with
> `./wrk -t8 -c200 -H "Connection: close" -d10s --latency http://5.5.5.5:80`
> Before:
>    random:10712 req/s, random-fully:10986 req/s
> After:
>    random:21463 req/s, random-fully:19181 req/s
>
> Fixes: 165573e41f2f ("tcp: secure_seq: add back ports to TS offset")
> Cc: stable@vger.kernel.org


I'd treat it as a feature, not a fix.


> Closes:https://lore.kernel.org/all/92935c00-e0be-4591-ac44-5978c7804d57@yeah.net/
> Signed-off-by: xietangxin <xietangxin@h-partners.com>
> ---
>   net/netfilter/nf_nat_masquerade.c | 91 ++++++++++++++++++++++++++++++-
>   1 file changed, 89 insertions(+), 2 deletions(-)
>
> diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
> index 4de6e0a51701..8c9ca5a051cc 100644
> --- a/net/netfilter/nf_nat_masquerade.c
> +++ b/net/netfilter/nf_nat_masquerade.c
> @@ -6,8 +6,11 @@
>   #include <linux/netfilter.h>
>   #include <linux/netfilter_ipv4.h>
>   #include <linux/netfilter_ipv6.h>
> +#include <linux/tcp.h>
>   
> +#include <net/tcp.h>
>   #include <net/netfilter/nf_nat_masquerade.h>
> +#include <net/secure_seq.h>
>   
>   struct masq_dev_work {
>   	struct work_struct work;
> @@ -24,6 +27,76 @@ static DEFINE_MUTEX(masq_mutex);
>   static unsigned int masq_refcnt __read_mostly;
>   static atomic_t masq_worker_count __read_mostly;
>   
> +static __be32 *tcp_ts_option_ptr(const struct sk_buff *skb)
> +{
> +	const struct tcphdr *th;
> +	unsigned char *ptr;
> +	unsigned char opsize;
> +	unsigned int optlen, offset;
> +
> +	th = tcp_hdr(skb);
> +	optlen = (th->doff - 5) * 4;
> +	ptr = (unsigned char *)(th + 1);
> +	offset = 0;
> +
> +	while (offset < optlen) {
> +		unsigned char opcode = ptr[offset];
> +
> +		if (opcode == TCPOPT_EOL)
> +			break;
> +		if (opcode == TCPOPT_NOP) {
> +			offset++;
> +			continue;
> +		}
> +
> +		if (offset + 1 >= optlen)
> +			break;
> +
> +		opsize = ptr[offset + 1];
> +		if (opsize < 2 || offset + opsize > optlen)
> +			break;
> +
> +		if (opcode == TCPOPT_TIMESTAMP && opsize == TCPOLEN_TIMESTAMP)
> +			return (__be32 *)(ptr + offset + 2);
> +
> +		offset += opsize;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void masquerade_update_tcp_ts_offset(struct nf_conn *ct, struct sk_buff *skb)
> +{
> +	__be32 *tsptr;
> +	struct net *net;
> +	struct tcphdr *th;
> +	struct tcp_sock *tp;
> +	union tcp_seq_and_ts_off st;
> +	struct nf_conntrack_tuple *tuple;
> +
> +	th = tcp_hdr(skb);
> +	net = nf_ct_net(ct);
> +	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
> +

Why use the reply tuple rather than the original one, or am I missing something?



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
  2026-06-29 15:23 ` Florian Westphal
@ 2026-07-01 14:09   ` xietangxin
  2026-07-01 14:17     ` Florian Westphal
  0 siblings, 1 reply; 8+ messages in thread
From: xietangxin @ 2026-07-01 14:09 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, Phil Sutter, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, gaoxingwang, huyizhen,
	netfilter-devel, coreteam, netdev, linux-kernel, stable



On 6/29/2026 11:23 PM, Florian Westphal wrote:
> xietangxin <xietangxin@h-partners.com> wrote:
>> Problem observed in Kubernetes environments where MASQUERADE target with
>> --random-fully is configured by default. after commit
>> 165573e41f2f ("tcp: secure_seq: add back ports to TS offset") TCP short
>> connection QPS dropped from ~20000 to ~10000. This added source and
>> destination ports into TS offset calculation.
>>
>> However, with MASQUERADE --random-fully, when multiple internal connections
>> (e.g sport 10000,20000) are mapped to the same external port (e.g 30000),
>> their TS offsets are calculated as ts_offset(10000) and ts_offset(20000).
>> If the server reuses the TIME_WAIT slot from the first connection, there is
>> a chance that ts_offset(20000) < ts_offset(10000), breaking TSval
>> monotonicity for the same 4-tuple and causing RST packets:
>>   Client -> Server 24870 -> 80 [SYN] TSval=2294041168
>>   Server -> Client 80 -> 24870 [ACK] TSecr=2846236456
>>   Client -> Server 24870 -> 80 [RST] Seq=855605690
>>
>> After nf_nat_setup_info() successfully assigns a new randomized
>> source port, recalculate the TS offset using the new port and
>> update the SYN packet's TSval accordingly.
> 
> I don't think this is related to masquerade but to snat (port address
> rewrite) in general.
> 
> I think you could place your new helper in nf_nat_core.c and call it
> from nf_nat_l4proto_unique_tuple() once we've found a usable tuple:
> 
>  668 another_round:
>  669         for (i = 0; i < attempts; i++, off++) {
>  670                 *keyptr = htons(min + off % range_size);
>  671                 if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
> 
> 	 		     ... here.
>  672                         return;
>  673         }
> 
Hi Florian,

Thank you for the insightful feedback. You are absolutely right that
this issue is related to SNAT with port rewrite, rather than to masquerade.

Shifting the helper down to nf_nat_l4proto_unique_tuple() as you suggested
encounters a structural roadblock: we don't have access to the skb there.
Adding skb to all intermediate callers (like nf_nat_setup_info, get_unique_tuple)
would severely pollute the core NAT APIs.

Would it be acceptable to place this logic in nf_nat_inet_fn() before do_nat?

 963 do_nat:
             ..here
 964         return nf_nat_packet(ct, ctinfo, state->hook, skb);
 965
 966 oif_changed:
 967         nf_ct_kill_acct(ct, ctinfo, skb);
 968         return NF_DROP;
 969 }

Best regards,
Tangxin Xie


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
  2026-07-01  1:44 ` Jiayuan Chen
@ 2026-07-01 14:11   ` xietangxin
  0 siblings, 0 replies; 8+ messages in thread
From: xietangxin @ 2026-07-01 14:11 UTC (permalink / raw)
  To: Jiayuan Chen, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: gaoxingwang, huyizhen, netfilter-devel, coreteam, netdev,
	linux-kernel, stable



On 7/1/2026 9:44 AM, Jiayuan Chen wrote:
> 
> On 6/29/26 5:34 PM, xietangxin wrote:
>> Problem observed in Kubernetes environments where MASQUERADE target with
>> --random-fully is configured by default. after commit
>> 165573e41f2f ("tcp: secure_seq: add back ports to TS offset") TCP short
>> connection QPS dropped from ~20000 to ~10000. This added source and
>> destination ports into TS offset calculation.
>>
>> However, with MASQUERADE --random-fully, when multiple internal connections
>> (e.g sport 10000,20000) are mapped to the same external port (e.g 30000),
>> their TS offsets are calculated as ts_offset(10000) and ts_offset(20000).
>> If the server reuses the TIME_WAIT slot from the first connection, there is
>> a chance that ts_offset(20000) < ts_offset(10000), breaking TSval
>> monotonicity for the same 4-tuple and causing RST packets:
>>    Client -> Server 24870 -> 80 [SYN] TSval=2294041168
>>    Server -> Client 80 -> 24870 [ACK] TSecr=2846236456
>>    Client -> Server 24870 -> 80 [RST] Seq=855605690
>>
>> After nf_nat_setup_info() successfully assigns a new randomized
>> source port, recalculate the TS offset using the new port and
>> update the SYN packet's TSval accordingly.
>>
>> Test results on 4U4G VM with
>> `./wrk -t8 -c200 -H "Connection: close" -d10s --latency http://5.5.5.5:80`
>> Before:
>>    random:10712 req/s, random-fully:10986 req/s
>> After:
>>    random:21463 req/s, random-fully:19181 req/s
>>
>> Fixes: 165573e41f2f ("tcp: secure_seq: add back ports to TS offset")
>> Cc: stable@vger.kernel.org
> 
> 
> I'd treat it as a feature not a fix.

I prefer to treat it as a bugfix, because after commit
165573e41f2f ("tcp: secure_seq: add back ports to TS offset") TCP short
connection QPS dropped from ~20000 to ~10000 with MASQUERADE --random-fully.

> 
> 
>> Closes:https://lore.kernel.org/all/92935c00-e0be-4591-ac44-5978c7804d57@yeah.net/
>> Signed-off-by: xietangxin <xietangxin@h-partners.com>
>> ---
>>   net/netfilter/nf_nat_masquerade.c | 91 ++++++++++++++++++++++++++++++-
>>   1 file changed, 89 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
>> index 4de6e0a51701..8c9ca5a051cc 100644
>> --- a/net/netfilter/nf_nat_masquerade.c
>> +++ b/net/netfilter/nf_nat_masquerade.c
>> @@ -6,8 +6,11 @@
>>   #include <linux/netfilter.h>
>>   #include <linux/netfilter_ipv4.h>
>>   #include <linux/netfilter_ipv6.h>
>> +#include <linux/tcp.h>
>>   +#include <net/tcp.h>
>>   #include <net/netfilter/nf_nat_masquerade.h>
>> +#include <net/secure_seq.h>
>>     struct masq_dev_work {
>>       struct work_struct work;
>> @@ -24,6 +27,76 @@ static DEFINE_MUTEX(masq_mutex);
>>   static unsigned int masq_refcnt __read_mostly;
>>   static atomic_t masq_worker_count __read_mostly;
>>   +static __be32 *tcp_ts_option_ptr(const struct sk_buff *skb)
>> +{
>> +    const struct tcphdr *th;
>> +    unsigned char *ptr;
>> +    unsigned char opsize;
>> +    unsigned int optlen, offset;
>> +
>> +    th = tcp_hdr(skb);
>> +    optlen = (th->doff - 5) * 4;
>> +    ptr = (unsigned char *)(th + 1);
>> +    offset = 0;
>> +
>> +    while (offset < optlen) {
>> +        unsigned char opcode = ptr[offset];
>> +
>> +        if (opcode == TCPOPT_EOL)
>> +            break;
>> +        if (opcode == TCPOPT_NOP) {
>> +            offset++;
>> +            continue;
>> +        }
>> +
>> +        if (offset + 1 >= optlen)
>> +            break;
>> +
>> +        opsize = ptr[offset + 1];
>> +        if (opsize < 2 || offset + opsize > optlen)
>> +            break;
>> +
>> +        if (opcode == TCPOPT_TIMESTAMP && opsize == TCPOLEN_TIMESTAMP)
>> +            return (__be32 *)(ptr + offset + 2);
>> +
>> +        offset += opsize;
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static void masquerade_update_tcp_ts_offset(struct nf_conn *ct, struct sk_buff *skb)
>> +{
>> +    __be32 *tsptr;
>> +    struct net *net;
>> +    struct tcphdr *th;
>> +    struct tcp_sock *tp;
>> +    union tcp_seq_and_ts_off st;
>> +    struct nf_conntrack_tuple *tuple;
>> +
>> +    th = tcp_hdr(skb);
>> +    net = nf_ct_net(ct);
>> +    tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
>> +
> 
> why use reply not original, or do I miss something ?
> 
> 

We use IP_CT_DIR_REPLY here because we need the post-NAT (translated)
4-tuple to correctly recalculate the new ts_offset.

Best regards,
Tangxin Xie


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized
  2026-07-01 14:09   ` xietangxin
@ 2026-07-01 14:17     ` Florian Westphal
  0 siblings, 0 replies; 8+ messages in thread
From: Florian Westphal @ 2026-07-01 14:17 UTC (permalink / raw)
  To: xietangxin
  Cc: Pablo Neira Ayuso, Phil Sutter, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, gaoxingwang, huyizhen,
	netfilter-devel, coreteam, netdev, linux-kernel, stable

xietangxin <xietangxin@h-partners.com> wrote:
> Shifting the helper down to nf_nat_l4proto_unique_tuple() as you suggested
> encounters a structural roadblock. we don't have access to the skb there.
> Adding skb to all intermediate callers (like nf_nat_setup_info, get_unique_tuple)
> would severely pollute the core NAT APIs.

Right, propagating the skb is too much code churn.

> would it be acceptable to place this logic in nf_nat_inet_fn() before do_nat?
> 
>  963 do_nat:
>              ..here

This is hit for every packet, not just the first one after
nf_nat_setup_info().  I suggest a slightly earlier spot in the
same function.

 936                                 ret = e->hooks[i].hook(e->hooks[i].priv, skb,
 937                                                        state);
 938                                 if (ret != NF_ACCEPT)
 939                                         return ret;
 940                                 if (nf_nat_initialized(ct, maniptype))
 941                                         goto do_nat;
 942                         }
 943 null_bind:
 944                         ret = nf_nat_alloc_null_binding(ct, state->hook);
 945                         if (ret != NF_ACCEPT)
 946                                 return ret;

 .... Here.

 947                 } else {

This spot runs only for new connections, right after a nf_nat_setup_info() call.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-07-01 14:17 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-29  9:34 [PATCH net] netfilter: nf_nat_masquerade: recalculate TCP TS offset when port is randomized xietangxin
2026-06-29 13:09 ` Victor Nogueira
2026-06-29 15:23 ` Florian Westphal
2026-07-01 14:09   ` xietangxin
2026-07-01 14:17     ` Florian Westphal
2026-06-29 21:10 ` kernel test robot
2026-07-01  1:44 ` Jiayuan Chen
2026-07-01 14:11   ` xietangxin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox