Linux Netfilter development
 help / color / mirror / Atom feed
* [RFC PATCH] netfilter: conntrack: simplify sctp state machine
@ 2023-01-04 11:31 Sriram Yagnaraman
  2023-01-04 12:41 ` Florian Westphal
  2023-01-04 15:02 ` Pablo Neira Ayuso
  0 siblings, 2 replies; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-04 11:31 UTC (permalink / raw)
  To: netfilter-devel
  Cc: Florian Westphal, Marcelo Ricardo Leitner, Long Xin,
	Sriram Yagnaraman

All the paths in an SCTP connection are kept alive either by actual
DATA/SACK running through the connection or by HEARTBEAT. This patch
proposes a simple state machine with only two states OPEN_WAIT and
ESTABLISHED (similar to UDP). The reason for this change is a full
stateful approach to SCTP is difficult when the association is
multihomed since the endpoints could use different paths in the network
during the lifetime of an association.

Default timeouts are:
OPEN_WAIT:   3 seconds   (rto_initial)
ESTABLISHED: 210 seconds (rto_max + hb_interval * path_max_retrans)

Important changes/notes
- Timeout is used to clean up conntrack entries
- VTAG checks are kept as is (can be moved to a conntrack extension if
  desired)
- SCTP chunks are parsed only once, and a map is populated with the
  information on the chunks present in the packet
- ASSURED bit is NOT set in this version of the patch, need help
  understanding when to set it

Note that this patch has changed uapi headers.

Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
---
 .../uapi/linux/netfilter/nf_conntrack_sctp.h  |  10 +-
 .../linux/netfilter/nfnetlink_cttimeout.h     |  10 +-
 net/netfilter/nf_conntrack_proto_sctp.c       | 589 ++++--------------
 net/netfilter/nf_conntrack_standalone.c       |  72 +--
 4 files changed, 143 insertions(+), 538 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
index c742469afe21..89381a57021a 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
@@ -7,16 +7,8 @@
 
 enum sctp_conntrack {
 	SCTP_CONNTRACK_NONE,
-	SCTP_CONNTRACK_CLOSED,
-	SCTP_CONNTRACK_COOKIE_WAIT,
-	SCTP_CONNTRACK_COOKIE_ECHOED,
+	SCTP_CONNTRACK_OPEN_WAIT,
 	SCTP_CONNTRACK_ESTABLISHED,
-	SCTP_CONNTRACK_SHUTDOWN_SENT,
-	SCTP_CONNTRACK_SHUTDOWN_RECD,
-	SCTP_CONNTRACK_SHUTDOWN_ACK_SENT,
-	SCTP_CONNTRACK_HEARTBEAT_SENT,
-	SCTP_CONNTRACK_HEARTBEAT_ACKED,
-	SCTP_CONNTRACK_DATA_SENT,
 	SCTP_CONNTRACK_MAX
 };
 
diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
index 94e74034706d..372dfe7c07ed 100644
--- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
+++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
@@ -86,16 +86,8 @@ enum ctattr_timeout_dccp {
 
 enum ctattr_timeout_sctp {
 	CTA_TIMEOUT_SCTP_UNSPEC,
-	CTA_TIMEOUT_SCTP_CLOSED,
-	CTA_TIMEOUT_SCTP_COOKIE_WAIT,
-	CTA_TIMEOUT_SCTP_COOKIE_ECHOED,
+	CTA_TIMEOUT_SCTP_OPEN_WAIT,
 	CTA_TIMEOUT_SCTP_ESTABLISHED,
-	CTA_TIMEOUT_SCTP_SHUTDOWN_SENT,
-	CTA_TIMEOUT_SCTP_SHUTDOWN_RECD,
-	CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
-	CTA_TIMEOUT_SCTP_HEARTBEAT_SENT,
-	CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED,
-	CTA_TIMEOUT_SCTP_DATA_SENT,
 	__CTA_TIMEOUT_SCTP_MAX
 };
 #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1)
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index d88b92a8ffca..d79ed476b764 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -5,12 +5,13 @@
  * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
  * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
  *
- * SCTP is defined in RFC 2960. References to various sections in this code
+ * SCTP is defined in RFC 4960. References to various sections in this code
  * are to this RFC.
  */
 
 #include <linux/types.h>
 #include <linux/timer.h>
+#include <linux/jiffies.h>
 #include <linux/netfilter.h>
 #include <linux/in.h>
 #include <linux/ip.h>
@@ -27,127 +28,19 @@
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_timeout.h>
 
-/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
-   closely.  They're more complex. --RR
-
-   And so for me for SCTP :D -Kiran */
+#define	SCTP_FLAG_HEARTBEAT_VTAG_FAILED	1
 
 static const char *const sctp_conntrack_names[] = {
 	"NONE",
-	"CLOSED",
-	"COOKIE_WAIT",
-	"COOKIE_ECHOED",
+	"OPEN_WAIT",
 	"ESTABLISHED",
-	"SHUTDOWN_SENT",
-	"SHUTDOWN_RECD",
-	"SHUTDOWN_ACK_SENT",
-	"HEARTBEAT_SENT",
-	"HEARTBEAT_ACKED",
 };
 
 #define SECS  * HZ
-#define MINS  * 60 SECS
-#define HOURS * 60 MINS
-#define DAYS  * 24 HOURS
 
 static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
-	[SCTP_CONNTRACK_CLOSED]			= 10 SECS,
-	[SCTP_CONNTRACK_COOKIE_WAIT]		= 3 SECS,
-	[SCTP_CONNTRACK_COOKIE_ECHOED]		= 3 SECS,
-	[SCTP_CONNTRACK_ESTABLISHED]		= 5 DAYS,
-	[SCTP_CONNTRACK_SHUTDOWN_SENT]		= 300 SECS / 1000,
-	[SCTP_CONNTRACK_SHUTDOWN_RECD]		= 300 SECS / 1000,
-	[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]	= 3 SECS,
-	[SCTP_CONNTRACK_HEARTBEAT_SENT]		= 30 SECS,
-	[SCTP_CONNTRACK_HEARTBEAT_ACKED]	= 210 SECS,
-	[SCTP_CONNTRACK_DATA_SENT]		= 30 SECS,
-};
-
-#define	SCTP_FLAG_HEARTBEAT_VTAG_FAILED	1
-
-#define sNO SCTP_CONNTRACK_NONE
-#define	sCL SCTP_CONNTRACK_CLOSED
-#define	sCW SCTP_CONNTRACK_COOKIE_WAIT
-#define	sCE SCTP_CONNTRACK_COOKIE_ECHOED
-#define	sES SCTP_CONNTRACK_ESTABLISHED
-#define	sSS SCTP_CONNTRACK_SHUTDOWN_SENT
-#define	sSR SCTP_CONNTRACK_SHUTDOWN_RECD
-#define	sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
-#define	sHS SCTP_CONNTRACK_HEARTBEAT_SENT
-#define	sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
-#define	sDS SCTP_CONNTRACK_DATA_SENT
-#define	sIV SCTP_CONNTRACK_MAX
-
-/*
-	These are the descriptions of the states:
-
-NOTE: These state names are tantalizingly similar to the states of an
-SCTP endpoint. But the interpretation of the states is a little different,
-considering that these are the states of the connection and not of an end
-point. Please note the subtleties. -Kiran
-
-NONE              - Nothing so far.
-COOKIE WAIT       - We have seen an INIT chunk in the original direction, or also
-		    an INIT_ACK chunk in the reply direction.
-COOKIE ECHOED     - We have seen a COOKIE_ECHO chunk in the original direction.
-ESTABLISHED       - We have seen a COOKIE_ACK in the reply direction.
-SHUTDOWN_SENT     - We have seen a SHUTDOWN chunk in the original direction.
-SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply direction.
-SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
-		    to that of the SHUTDOWN chunk.
-CLOSED            - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
-		    the SHUTDOWN chunk. Connection is closed.
-HEARTBEAT_SENT    - We have seen a HEARTBEAT in a new flow.
-HEARTBEAT_ACKED   - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction
-		    opposite to that of the HEARTBEAT/DATA chunk. Secondary connection
-		    is established.
-DATA_SENT         - We have seen a DATA/SACK in a new flow.
-*/
-
-/* TODO
- - I have assumed that the first INIT is in the original direction.
- This messes things when an INIT comes in the reply direction in CLOSED
- state.
- - Check the error type in the reply dir before transitioning from
-cookie echoed to closed.
- - Sec 5.2.4 of RFC 2960
- - Full Multi Homing support.
-*/
-
-/* SCTP conntrack state transitions */
-static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = {
-	{
-/*	ORIGINAL	*/
-/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
-/* init         */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW},
-/* init_ack     */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},
-/* abort        */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown     */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA},
-/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/
-/* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */
-/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL},
-/* heartbeat    */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
-/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS},
-/* data/sack    */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}
-	},
-	{
-/*	REPLY	*/
-/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */
-/* init         */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */
-/* init_ack     */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},
-/* abort        */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV},
-/* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV},
-/* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV},
-/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */
-/* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV},
-/* heartbeat    */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA},
-/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
-/* data/sack    */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA},
-	}
+	[SCTP_CONNTRACK_OPEN_WAIT]			= 3 SECS,
+	[SCTP_CONNTRACK_ESTABLISHED]		= 210 SECS,
 };
 
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
@@ -158,184 +51,6 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 }
 #endif
 
-#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count)	\
-for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0;	\
-	(offset) < (skb)->len &&					\
-	((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch)));	\
-	(offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)
-
-/* Some validity checks to make sure the chunks are fine */
-static int do_basic_checks(struct nf_conn *ct,
-			   const struct sk_buff *skb,
-			   unsigned int dataoff,
-			   unsigned long *map)
-{
-	u_int32_t offset, count;
-	struct sctp_chunkhdr _sch, *sch;
-	int flag;
-
-	flag = 0;
-
-	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
-		pr_debug("Chunk Num: %d  Type: %d\n", count, sch->type);
-
-		if (sch->type == SCTP_CID_INIT ||
-		    sch->type == SCTP_CID_INIT_ACK ||
-		    sch->type == SCTP_CID_SHUTDOWN_COMPLETE)
-			flag = 1;
-
-		/*
-		 * Cookie Ack/Echo chunks not the first OR
-		 * Init / Init Ack / Shutdown compl chunks not the only chunks
-		 * OR zero-length.
-		 */
-		if (((sch->type == SCTP_CID_COOKIE_ACK ||
-		      sch->type == SCTP_CID_COOKIE_ECHO ||
-		      flag) &&
-		     count != 0) || !sch->length) {
-			pr_debug("Basic checks failed\n");
-			return 1;
-		}
-
-		if (map)
-			set_bit(sch->type, map);
-	}
-
-	pr_debug("Basic checks passed\n");
-	return count == 0;
-}
-
-static int sctp_new_state(enum ip_conntrack_dir dir,
-			  enum sctp_conntrack cur_state,
-			  int chunk_type)
-{
-	int i;
-
-	pr_debug("Chunk type: %d\n", chunk_type);
-
-	switch (chunk_type) {
-	case SCTP_CID_INIT:
-		pr_debug("SCTP_CID_INIT\n");
-		i = 0;
-		break;
-	case SCTP_CID_INIT_ACK:
-		pr_debug("SCTP_CID_INIT_ACK\n");
-		i = 1;
-		break;
-	case SCTP_CID_ABORT:
-		pr_debug("SCTP_CID_ABORT\n");
-		i = 2;
-		break;
-	case SCTP_CID_SHUTDOWN:
-		pr_debug("SCTP_CID_SHUTDOWN\n");
-		i = 3;
-		break;
-	case SCTP_CID_SHUTDOWN_ACK:
-		pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
-		i = 4;
-		break;
-	case SCTP_CID_ERROR:
-		pr_debug("SCTP_CID_ERROR\n");
-		i = 5;
-		break;
-	case SCTP_CID_COOKIE_ECHO:
-		pr_debug("SCTP_CID_COOKIE_ECHO\n");
-		i = 6;
-		break;
-	case SCTP_CID_COOKIE_ACK:
-		pr_debug("SCTP_CID_COOKIE_ACK\n");
-		i = 7;
-		break;
-	case SCTP_CID_SHUTDOWN_COMPLETE:
-		pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
-		i = 8;
-		break;
-	case SCTP_CID_HEARTBEAT:
-		pr_debug("SCTP_CID_HEARTBEAT");
-		i = 9;
-		break;
-	case SCTP_CID_HEARTBEAT_ACK:
-		pr_debug("SCTP_CID_HEARTBEAT_ACK");
-		i = 10;
-		break;
-	case SCTP_CID_DATA:
-	case SCTP_CID_SACK:
-		pr_debug("SCTP_CID_DATA/SACK");
-		i = 11;
-		break;
-	default:
-		/* Other chunks like DATA or SACK do not change the state */
-		pr_debug("Unknown chunk type, Will stay in %s\n",
-			 sctp_conntrack_names[cur_state]);
-		return cur_state;
-	}
-
-	pr_debug("dir: %d   cur_state: %s  chunk_type: %d  new_state: %s\n",
-		 dir, sctp_conntrack_names[cur_state], chunk_type,
-		 sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
-
-	return sctp_conntracks[dir][i][cur_state];
-}
-
-/* Don't need lock here: this conntrack not in circulation yet */
-static noinline bool
-sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
-	 const struct sctphdr *sh, unsigned int dataoff)
-{
-	enum sctp_conntrack new_state;
-	const struct sctp_chunkhdr *sch;
-	struct sctp_chunkhdr _sch;
-	u32 offset, count;
-
-	memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
-	new_state = SCTP_CONNTRACK_MAX;
-	for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) {
-		new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
-					   SCTP_CONNTRACK_NONE, sch->type);
-
-		/* Invalid: delete conntrack */
-		if (new_state == SCTP_CONNTRACK_NONE ||
-		    new_state == SCTP_CONNTRACK_MAX) {
-			pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
-			return false;
-		}
-
-		/* Copy the vtag into the state info */
-		if (sch->type == SCTP_CID_INIT) {
-			struct sctp_inithdr _inithdr, *ih;
-			/* Sec 8.5.1 (A) */
-			if (sh->vtag)
-				return false;
-
-			ih = skb_header_pointer(skb, offset + sizeof(_sch),
-						sizeof(_inithdr), &_inithdr);
-			if (!ih)
-				return false;
-
-			pr_debug("Setting vtag %x for new conn\n",
-				 ih->init_tag);
-
-			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
-		} else if (sch->type == SCTP_CID_HEARTBEAT ||
-			   sch->type == SCTP_CID_DATA ||
-			   sch->type == SCTP_CID_SACK) {
-			pr_debug("Setting vtag %x for secondary conntrack\n",
-				 sh->vtag);
-			ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
-		} else {
-		/* If it is a shutdown ack OOTB packet, we expect a return
-		   shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
-			pr_debug("Setting vtag %x for new conn OOTB\n",
-				 sh->vtag);
-			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
-		}
-
-		ct->proto.sctp.state = SCTP_CONNTRACK_NONE;
-	}
-
-	return true;
-}
-
 static bool sctp_error(struct sk_buff *skb,
 		       unsigned int dataoff,
 		       const struct nf_hook_state *state)
@@ -367,6 +82,12 @@ static bool sctp_error(struct sk_buff *skb,
 	return true;
 }
 
+#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count)	\
+for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0;	\
+	(offset) < (skb)->len &&					\
+	((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch)));	\
+	(offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)
+
 /* Returns verdict for packet, or -NF_ACCEPT for invalid. */
 int nf_conntrack_sctp_packet(struct nf_conn *ct,
 			     struct sk_buff *skb,
@@ -374,195 +95,168 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
 			     enum ip_conntrack_info ctinfo,
 			     const struct nf_hook_state *state)
 {
-	enum sctp_conntrack new_state, old_state;
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	const struct sctphdr *sh;
-	struct sctphdr _sctph;
-	const struct sctp_chunkhdr *sch;
-	struct sctp_chunkhdr _sch;
-	u_int32_t offset, count;
-	unsigned int *timeouts;
 	unsigned long map[256 / sizeof(unsigned long)] = { 0 };
-	bool ignore = false;
+	unsigned int *timeouts;
+	u_int32_t init_vtag = 0;
+	u_int32_t offset, count;
+	struct sctphdr _sctph, *sctph;
+	struct sctp_chunkhdr _sch, *sch;
 
 	if (sctp_error(skb, dataoff, state))
-		return -NF_ACCEPT;
+		goto out_drop;
+
+	sctph = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+	if (sctph == NULL)
+		goto out_drop;
 
-	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
-	if (sh == NULL)
-		goto out;
+	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+		pr_debug("Chunk Num: %d  Type: %d\n", count, sch->type);
+		set_bit(sch->type, map);
 
-	if (do_basic_checks(ct, skb, dataoff, map) != 0)
-		goto out;
+		if (sch->type == SCTP_CID_INIT ||
+			sch->type == SCTP_CID_INIT_ACK) {
+			struct sctp_inithdr _inith, *inith;
+			inith = skb_header_pointer(skb, offset + sizeof(_sch),
+						sizeof(_inith), &_inith);
+			if (inith)
+				init_vtag = inith->init_tag;
+			else
+				goto out_drop;
+		}
+	}
 
+	spin_lock_bh(&ct->lock);
 	if (!nf_ct_is_confirmed(ct)) {
 		/* If an OOTB packet has any of these chunks discard (Sec 8.4) */
 		if (test_bit(SCTP_CID_ABORT, map) ||
 		    test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
 		    test_bit(SCTP_CID_COOKIE_ACK, map))
-			return -NF_ACCEPT;
+			goto out_unlock;
 
-		if (!sctp_new(ct, skb, sh, dataoff))
-			return -NF_ACCEPT;
+		memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
+		ct->proto.sctp.state = SCTP_CONNTRACK_OPEN_WAIT;
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+		if (test_bit(SCTP_CID_INIT, map))
+			ct->proto.sctp.vtag[!dir] = init_vtag;
+		else if (test_bit(SCTP_CID_SHUTDOWN_ACK, map))
+			/* If it is a shutdown ack OOTB packet, we expect a return
+			shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+			ct->proto.sctp.vtag[!dir] = sctph->vtag;
+		else
+			ct->proto.sctp.vtag[dir] = sctph->vtag;
 	} else {
-		/* Check the verification tag (Sec 8.5) */
-		if (!test_bit(SCTP_CID_INIT, map) &&
-		    !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
-		    !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
-		    !test_bit(SCTP_CID_ABORT, map) &&
-		    !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
-		    !test_bit(SCTP_CID_HEARTBEAT, map) &&
-		    !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
-		    sh->vtag != ct->proto.sctp.vtag[dir]) {
-			pr_debug("Verification tag check failed\n");
-			goto out;
+		if (!ct->proto.sctp.vtag[!dir] &&
+			test_bit(SCTP_CID_INIT_ACK, map)) {
+			ct->proto.sctp.vtag[!dir] = init_vtag;
+		}
+		if (!ct->proto.sctp.vtag[dir]) {
+			ct->proto.sctp.vtag[dir] = sctph->vtag;
 		}
-	}
 
-	old_state = new_state = SCTP_CONNTRACK_NONE;
-	spin_lock_bh(&ct->lock);
-	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
-		/* Special cases of Verification tag check (Sec 8.5.1) */
-		if (sch->type == SCTP_CID_INIT) {
-			/* Sec 8.5.1 (A) */
-			if (sh->vtag != 0)
-				goto out_unlock;
-		} else if (sch->type == SCTP_CID_ABORT) {
-			/* Sec 8.5.1 (B) */
-			if (sh->vtag != ct->proto.sctp.vtag[dir] &&
-			    sh->vtag != ct->proto.sctp.vtag[!dir])
-				goto out_unlock;
-		} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
-			/* Sec 8.5.1 (C) */
-			if (sh->vtag != ct->proto.sctp.vtag[dir] &&
-			    sh->vtag != ct->proto.sctp.vtag[!dir] &&
-			    sch->flags & SCTP_CHUNK_FLAG_T)
-				goto out_unlock;
-		} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
-			/* Sec 8.5.1 (D) */
-			if (sh->vtag != ct->proto.sctp.vtag[dir])
-				goto out_unlock;
-		} else if (sch->type == SCTP_CID_HEARTBEAT) {
-			if (ct->proto.sctp.vtag[dir] == 0) {
-				pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
-				ct->proto.sctp.vtag[dir] = sh->vtag;
-			} else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
-				if (test_bit(SCTP_CID_DATA, map) || ignore)
-					goto out_unlock;
-
-				ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
-				ct->proto.sctp.last_dir = dir;
-				ignore = true;
-				continue;
-			} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
-				ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
-			}
-		} else if (sch->type == SCTP_CID_HEARTBEAT_ACK) {
-			if (ct->proto.sctp.vtag[dir] == 0) {
-				pr_debug("Setting vtag %x for dir %d\n",
-					 sh->vtag, dir);
-				ct->proto.sctp.vtag[dir] = sh->vtag;
-			} else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
-				if (test_bit(SCTP_CID_DATA, map) || ignore)
-					goto out_unlock;
-
-				if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 ||
-				    ct->proto.sctp.last_dir == dir)
-					goto out_unlock;
-
-				ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
-				ct->proto.sctp.vtag[dir] = sh->vtag;
-				ct->proto.sctp.vtag[!dir] = 0;
-			} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
-				ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
-			}
-		} else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) {
-			if (ct->proto.sctp.vtag[dir] == 0) {
-				pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir);
-				ct->proto.sctp.vtag[dir] = sh->vtag;
-			}
+		/* we have seen traffic both ways, go to established */
+		if (dir == IP_CT_DIR_REPLY &&
+			ct->proto.sctp.state == SCTP_CONNTRACK_OPEN_WAIT) {
+			ct->proto.sctp.state = SCTP_CONNTRACK_ESTABLISHED;
+			nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 		}
+	}
 
-		old_state = ct->proto.sctp.state;
-		new_state = sctp_new_state(dir, old_state, sch->type);
+	/* Check the verification tag (Sec 8.5) */
+	if (!test_bit(SCTP_CID_INIT, map) &&
+		!test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
+		!test_bit(SCTP_CID_COOKIE_ECHO, map) &&
+		!test_bit(SCTP_CID_ABORT, map) &&
+		!test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+		!test_bit(SCTP_CID_HEARTBEAT, map) &&
+		!test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
+		sctph->vtag != ct->proto.sctp.vtag[dir]) {
+		pr_debug("Verification tag check failed\n");
+		goto out_unlock;
+	}
 
-		/* Invalid */
-		if (new_state == SCTP_CONNTRACK_MAX) {
-			pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
-				 "conntrack=%u\n",
-				 dir, sch->type, old_state);
+	/* Special cases of Verification tag check (Sec 8.5.1) */
+	if (test_bit(SCTP_CID_INIT, map)) {
+		/* Sec 8.5.1 (A) */
+		if (sctph->vtag != 0)
 			goto out_unlock;
-		}
+		else if (nf_ct_is_confirmed(ct))
+			/* don't renew timeout on init retransmit so
+			* port reuse by client or NAT middlebox cannot
+			* keep entry alive indefinitely (incl. nat info).
+			*/
+			goto out;
+	}
+	if (test_bit(SCTP_CID_ABORT, map)) {
+		/* Sec 8.5.1 (B) */
+		if (sctph->vtag != ct->proto.sctp.vtag[dir] &&
+			sctph->vtag != ct->proto.sctp.vtag[!dir])
+			goto out_unlock;
+	}
+	if (test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map)) {
+		/* Sec 8.5.1 (C) */
+		if (sctph->vtag != ct->proto.sctp.vtag[dir] &&
+			sctph->vtag != ct->proto.sctp.vtag[!dir] &&
+			sch->flags & SCTP_CHUNK_FLAG_T)
+			goto out_unlock;
+	}
+	if (test_bit(SCTP_CID_COOKIE_ECHO, map)) {
+		/* Sec 8.5.1 (D) */
+		if (sctph->vtag != ct->proto.sctp.vtag[dir])
+			goto out_unlock;
+	}
+	if (test_bit(SCTP_CID_HEARTBEAT, map)) {
+		if (sctph->vtag != ct->proto.sctp.vtag[dir]) {
+			if (test_bit(SCTP_CID_DATA, map))
+				goto out_unlock;
 
-		/* If it is an INIT or an INIT ACK note down the vtag */
-		if (sch->type == SCTP_CID_INIT ||
-		    sch->type == SCTP_CID_INIT_ACK) {
-			struct sctp_inithdr _inithdr, *ih;
+			ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+			ct->proto.sctp.last_dir = dir;
+		} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+			ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+		}
+	}
+	if (sch->type == SCTP_CID_HEARTBEAT_ACK) {
+		if (sctph->vtag != ct->proto.sctp.vtag[dir]) {
+			if (test_bit(SCTP_CID_DATA, map))
+				goto out_unlock;
 
-			ih = skb_header_pointer(skb, offset + sizeof(_sch),
-						sizeof(_inithdr), &_inithdr);
-			if (ih == NULL)
+			if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 ||
+				ct->proto.sctp.last_dir == dir)
 				goto out_unlock;
-			pr_debug("Setting vtag %x for dir %d\n",
-				 ih->init_tag, !dir);
-			ct->proto.sctp.vtag[!dir] = ih->init_tag;
 
-			/* don't renew timeout on init retransmit so
-			 * port reuse by client or NAT middlebox cannot
-			 * keep entry alive indefinitely (incl. nat info).
-			 */
-			if (new_state == SCTP_CONNTRACK_CLOSED &&
-			    old_state == SCTP_CONNTRACK_CLOSED &&
-			    nf_ct_is_confirmed(ct))
-				ignore = true;
+			ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+			ct->proto.sctp.vtag[dir] = sctph->vtag;
+			ct->proto.sctp.vtag[!dir] = 0;
+		} else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+			ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
 		}
-
-		ct->proto.sctp.state = new_state;
-		if (old_state != new_state)
-			nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 	}
-	spin_unlock_bh(&ct->lock);
-
-	/* allow but do not refresh timeout */
-	if (ignore)
-		return NF_ACCEPT;
 
 	timeouts = nf_ct_timeout_lookup(ct);
 	if (!timeouts)
 		timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts;
 
-	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
+	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[ct->proto.sctp.state]);
 
-	if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
-	    dir == IP_CT_DIR_REPLY &&
-	    new_state == SCTP_CONNTRACK_ESTABLISHED) {
-		pr_debug("Setting assured bit\n");
-		set_bit(IPS_ASSURED_BIT, &ct->status);
-		nf_conntrack_event_cache(IPCT_ASSURED, ct);
-	}
+	/* Need some thought on how to set the assured bit */
+	// if (dir == IP_CT_DIR_REPLY &&
+	// 	!(test_bit(IPS_ASSURED_BIT, &ct->status))) {
+	// 	  set_bit(IPS_ASSURED_BIT, &ct->status);
+	// 	  nf_conntrack_event_cache(IPCT_ASSURED, ct);
+	// }
 
+out:
+	spin_unlock_bh(&ct->lock);
 	return NF_ACCEPT;
 
 out_unlock:
 	spin_unlock_bh(&ct->lock);
-out:
+out_drop:
 	return -NF_ACCEPT;
 }
 
-static bool sctp_can_early_drop(const struct nf_conn *ct)
-{
-	switch (ct->proto.sctp.state) {
-	case SCTP_CONNTRACK_SHUTDOWN_SENT:
-	case SCTP_CONNTRACK_SHUTDOWN_RECD:
-	case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT:
-		return true;
-	default:
-		break;
-	}
-
-	return false;
-}
-
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
@@ -670,7 +364,7 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
 		}
 	}
 
-	timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED];
+	timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_OPEN_WAIT];
 	return 0;
 }
 
@@ -692,16 +386,8 @@ sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
 
 static const struct nla_policy
 sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
-	[CTA_TIMEOUT_SCTP_CLOSED]		= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_COOKIE_WAIT]		= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_COOKIE_ECHOED]	= { .type = NLA_U32 },
+	[CTA_TIMEOUT_SCTP_OPEN_WAIT]		= { .type = NLA_U32 },
 	[CTA_TIMEOUT_SCTP_ESTABLISHED]		= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_SHUTDOWN_SENT]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_SHUTDOWN_RECD]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_HEARTBEAT_SENT]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_SCTP_DATA_SENT]		= { .type = NLA_U32 },
 };
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
@@ -716,7 +402,7 @@ void nf_conntrack_sctp_init_net(struct net *net)
 	/* timeouts[0] is unused, init it so ->timeouts[0] contains
 	 * 'new' timeout, like udp or icmp.
 	 */
-	sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED];
+	sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_OPEN_WAIT];
 }
 
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
@@ -724,7 +410,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack	= sctp_print_conntrack,
 #endif
-	.can_early_drop		= sctp_can_early_drop,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_size		= SCTP_NLATTR_SIZE,
 	.to_nlattr		= sctp_to_nlattr,
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 0250725e38a4..07da9db31783 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -593,16 +593,8 @@ enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6,
 #ifdef CONFIG_NF_CT_PROTO_SCTP
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED,
+	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_OPEN_WAIT,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT,
 #endif
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST,
@@ -839,20 +831,8 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 #ifdef CONFIG_NF_CT_PROTO_SCTP
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED] = {
-		.procname	= "nf_conntrack_sctp_timeout_closed",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT] = {
-		.procname	= "nf_conntrack_sctp_timeout_cookie_wait",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED] = {
-		.procname	= "nf_conntrack_sctp_timeout_cookie_echoed",
+	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_OPEN_WAIT] = {
+		.procname	= "nf_conntrack_sctp_timeout_open_wait",
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -863,42 +843,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT] = {
-		.procname	= "nf_conntrack_sctp_timeout_shutdown_sent",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD] = {
-		.procname	= "nf_conntrack_sctp_timeout_shutdown_recd",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = {
-		.procname	= "nf_conntrack_sctp_timeout_shutdown_ack_sent",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT] = {
-		.procname	= "nf_conntrack_sctp_timeout_heartbeat_sent",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = {
-		.procname       = "nf_conntrack_sctp_timeout_heartbeat_acked",
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = {
-		.procname       = "nf_conntrack_sctp_timeout_data_sent",
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec_jiffies,
-	},
 #endif
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = {
@@ -1034,16 +978,8 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
 	table[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ ## XNAME].data = \
 			&(sn)->timeouts[SCTP_CONNTRACK_ ## XNAME]
 
-	XASSIGN(CLOSED, sn);
-	XASSIGN(COOKIE_WAIT, sn);
-	XASSIGN(COOKIE_ECHOED, sn);
+	XASSIGN(OPEN_WAIT, sn);
 	XASSIGN(ESTABLISHED, sn);
-	XASSIGN(SHUTDOWN_SENT, sn);
-	XASSIGN(SHUTDOWN_RECD, sn);
-	XASSIGN(SHUTDOWN_ACK_SENT, sn);
-	XASSIGN(HEARTBEAT_SENT, sn);
-	XASSIGN(HEARTBEAT_ACKED, sn);
-	XASSIGN(DATA_SENT, sn);
 #undef XASSIGN
 #endif
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-04 11:31 [RFC PATCH] netfilter: conntrack: simplify sctp state machine Sriram Yagnaraman
@ 2023-01-04 12:41 ` Florian Westphal
  2023-01-05 11:25   ` Sriram Yagnaraman
  2023-01-04 15:02 ` Pablo Neira Ayuso
  1 sibling, 1 reply; 14+ messages in thread
From: Florian Westphal @ 2023-01-04 12:41 UTC (permalink / raw)
  To: Sriram Yagnaraman
  Cc: netfilter-devel, Florian Westphal, Marcelo Ricardo Leitner,
	Long Xin

Sriram Yagnaraman <sriram.yagnaraman@est.tech> wrote:
> All the paths in an SCTP connection are kept alive either by actual
> DATA/SACK running through the connection or by HEARTBEAT. This patch
> proposes a simple state machine with only two states OPEN_WAIT and
> ESTABLISHED (similar to UDP). The reason for this change is a full
> stateful approach to SCTP is difficult when the association is
> multihomed since the endpoints could use different paths in the network
> during the lifetime of an association.
> 
> Default timeouts are:
> OPEN_WAIT:   3 seconds   (rto_initial)
> ESTABLISHED: 210 seconds (rto_max + hb_interval * path_max_retrans)
> 
> Important changes/notes
> - Timeout is used to clean up conntrack entries
> - VTAG checks are kept as is (can be moved to a conntrack extension if
>   desired)
> - SCTP chunks are parsed only once, and a map is populated with the
>   information on the chunks present in the packet
> - ASSURED bit is NOT set in this version of the patch, need help
>   understanding when to set it
> 
> Note that this patch has changed uapi headers.

Don't do that please, this will cause trouble.

> Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> ---
>  .../uapi/linux/netfilter/nf_conntrack_sctp.h  |  10 +-
>  .../linux/netfilter/nfnetlink_cttimeout.h     |  10 +-
>  net/netfilter/nf_conntrack_proto_sctp.c       | 589 ++++--------------
>  net/netfilter/nf_conntrack_standalone.c       |  72 +--
>  4 files changed, 143 insertions(+), 538 deletions(-)
> 
> diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
> index c742469afe21..89381a57021a 100644
> --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h
> +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
> @@ -7,16 +7,8 @@
>  
>  enum sctp_conntrack {
>  	SCTP_CONNTRACK_NONE,
> -	SCTP_CONNTRACK_CLOSED,
> -	SCTP_CONNTRACK_COOKIE_WAIT,
> -	SCTP_CONNTRACK_COOKIE_ECHOED,
> +	SCTP_CONNTRACK_OPEN_WAIT,
>  	SCTP_CONNTRACK_ESTABLISHED,
> -	SCTP_CONNTRACK_SHUTDOWN_SENT,
> -	SCTP_CONNTRACK_SHUTDOWN_RECD,
> -	SCTP_CONNTRACK_SHUTDOWN_ACK_SENT,
> -	SCTP_CONNTRACK_HEARTBEAT_SENT,
> -	SCTP_CONNTRACK_HEARTBEAT_ACKED,
> -	SCTP_CONNTRACK_DATA_SENT,
>  	SCTP_CONNTRACK_MAX

Please keep all as-is.

You might want to add a /* no loner used */ or similar.

You could hijack an existing enum to avoid adding a new one:

SCTP_CONNTRACK_OPEN_WAIT = SCTP_CONNTRACK_COOKIE_WAIT,

> diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
> index 94e74034706d..372dfe7c07ed 100644
> --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
> +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
> @@ -86,16 +86,8 @@ enum ctattr_timeout_dccp {
>  
>  enum ctattr_timeout_sctp {
>  	CTA_TIMEOUT_SCTP_UNSPEC,
> -	CTA_TIMEOUT_SCTP_CLOSED,
> -	CTA_TIMEOUT_SCTP_COOKIE_WAIT,
> -	CTA_TIMEOUT_SCTP_COOKIE_ECHOED,
> +	CTA_TIMEOUT_SCTP_OPEN_WAIT,
>  	CTA_TIMEOUT_SCTP_ESTABLISHED,
> -	CTA_TIMEOUT_SCTP_SHUTDOWN_SENT,
> -	CTA_TIMEOUT_SCTP_SHUTDOWN_RECD,
> -	CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
> -	CTA_TIMEOUT_SCTP_HEARTBEAT_SENT,
> -	CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED,
> -	CTA_TIMEOUT_SCTP_DATA_SENT,
>  	__CTA_TIMEOUT_SCTP_MAX

Same, this is frozen, you can add to it but you
cannot remove this.

You can add a kernel internal enum if you like, to replace the existing
ones, with kernel mapping the new ones to old (and ignoring the old ones
on input from userspace).

This would allow to shrink struct nf_sctp_net size for example.

>  #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1)
> diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
> index d88b92a8ffca..d79ed476b764 100644
> --- a/net/netfilter/nf_conntrack_proto_sctp.c
> +++ b/net/netfilter/nf_conntrack_proto_sctp.c
> @@ -5,12 +5,13 @@
>   * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
>   * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
>   *
> - * SCTP is defined in RFC 2960. References to various sections in this code
> + * SCTP is defined in RFC 4960. References to various sections in this code
>   * are to this RFC.
>   */
>  
>  #include <linux/types.h>
>  #include <linux/timer.h>
> +#include <linux/jiffies.h>
>  #include <linux/netfilter.h>
>  #include <linux/in.h>
>  #include <linux/ip.h>
> @@ -27,127 +28,19 @@
>  #include <net/netfilter/nf_conntrack_ecache.h>
>  #include <net/netfilter/nf_conntrack_timeout.h>
>  
> -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
> -   closely.  They're more complex. --RR
> -
> -   And so for me for SCTP :D -Kiran */
> +#define	SCTP_FLAG_HEARTBEAT_VTAG_FAILED	1
>  
>  static const char *const sctp_conntrack_names[] = {
>  	"NONE",
> -	"CLOSED",
> -	"COOKIE_WAIT",
> -	"COOKIE_ECHOED",
> +	"OPEN_WAIT",
>  	"ESTABLISHED",
> -	"SHUTDOWN_SENT",
> -	"SHUTDOWN_RECD",
> -	"SHUTDOWN_ACK_SENT",
> -	"HEARTBEAT_SENT",
> -	"HEARTBEAT_ACKED",
>  };

> -	}
> +	[SCTP_CONNTRACK_OPEN_WAIT]			= 3 SECS,
> +	[SCTP_CONNTRACK_ESTABLISHED]		= 210 SECS,
>  };
  
>  
> +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count)	\
> +for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0;	\
> +	(offset) < (skb)->len &&					\

I think skb_header_pointer() will return NULL if offset + sizeof(_sch)
exceeds skb->len, so this offset < skb->len test is redundant.

> +	(offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)

What if sch->length == 0?

> +	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
> +		pr_debug("Chunk Num: %d  Type: %d\n", count, sch->type);

Is this pr_debug() needed?  Its pretty useless because it would print
for every packet (its not an error path).

> +		set_bit(sch->type, map);
>  
> -	if (do_basic_checks(ct, skb, dataoff, map) != 0)
> -		goto out;
> +		if (sch->type == SCTP_CID_INIT ||
> +			sch->type == SCTP_CID_INIT_ACK) {
> +			struct sctp_inithdr _inith, *inith;
> +			inith = skb_header_pointer(skb, offset + sizeof(_sch),
> +						sizeof(_inith), &_inith);
> +			if (inith)
> +				init_vtag = inith->init_tag;
> +			else
> +				goto out_drop;

			if (!inith)
				goto out_drop;

			init_vtag = inith->init_tag;

Also, please run your patch through scripts/checkpatch.pl script, I'm
sure there are several coding style warnings here.

> +	spin_lock_bh(&ct->lock);

Why is this spinlock needed?

>  	if (!nf_ct_is_confirmed(ct)) {
>  		/* If an OOTB packet has any of these chunks discard (Sec 8.4) */
>  		if (test_bit(SCTP_CID_ABORT, map) ||
>  		    test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
>  		    test_bit(SCTP_CID_COOKIE_ACK, map))
> -			return -NF_ACCEPT;
> +			goto out_unlock;
>  
> -		if (!sctp_new(ct, skb, sh, dataoff))
> -			return -NF_ACCEPT;

Any reason for deleting sctp_new()?
It makes this body a lot larger, the lines below could have been done in
sctp_new().

> +		memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
> +		ct->proto.sctp.state = SCTP_CONNTRACK_OPEN_WAIT;
> +		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
> +
> +		if (test_bit(SCTP_CID_INIT, map))
> +			ct->proto.sctp.vtag[!dir] = init_vtag;
> +		else if (test_bit(SCTP_CID_SHUTDOWN_ACK, map))
> +			/* If it is a shutdown ack OOTB packet, we expect a return
> +			shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
> +			ct->proto.sctp.vtag[!dir] = sctph->vtag;
> +		else
> +			ct->proto.sctp.vtag[dir] = sctph->vtag;

Maybe the else branch below can be elided by adding a
goto here?

AFAICS the spinlock is only needed for some parts of the else branch,
so the spin_lock_bh can be moved.
> +		/* we have seen traffic both ways, go to established */
> +		if (dir == IP_CT_DIR_REPLY &&
> +			ct->proto.sctp.state == SCTP_CONNTRACK_OPEN_WAIT) {
> +			ct->proto.sctp.state = SCTP_CONNTRACK_ESTABLISHED;
> +			nf_conntrack_event_cache(IPCT_PROTOINFO, ct);

> +	/* Check the verification tag (Sec 8.5) */
> +	if (!test_bit(SCTP_CID_INIT, map) &&
> +		!test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
> +		!test_bit(SCTP_CID_COOKIE_ECHO, map) &&
> +		!test_bit(SCTP_CID_ABORT, map) &&
> +		!test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
> +		!test_bit(SCTP_CID_HEARTBEAT, map) &&
> +		!test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
> +		sctph->vtag != ct->proto.sctp.vtag[dir]) {
> +		pr_debug("Verification tag check failed\n");

Please have a look at
https://patchwork.ozlabs.org/project/netfilter-devel/patch/20230102114612.15860-2-fw@strlen.de/

I hope it will be applied shortly so you can rebase.
I don't have any other sctp patches.

This should be
nf_ct_l4proto_log_invalid(skb, ct, state,
			  "verification tag check failed %x vs %x for dir %d",
			  sh->vtag, ct->proto.sctp.vtag[dir], dir);

instead of pr_debug().

> +	/* Special cases of Verification tag check (Sec 8.5.1) */

Please extend the comments a bit so I don't have to look at the RFC
while reviewing, just quote the relevant part, i.e.

> +	if (test_bit(SCTP_CID_INIT, map)) {
> +		/* Sec 8.5.1 (A) */
> +		if (sctph->vtag != 0)
>  			goto out_unlock;
> -		}

if (sctph->vtag != 0) /* A) init vtag MUST be 0 */
	goto out_unlock;

> +		else if (nf_ct_is_confirmed(ct))

No need to 'else if', just use 'if'.

> +	/* Need some thought on how to set the assured bit */
> +	// if (dir == IP_CT_DIR_REPLY &&
> +	// 	!(test_bit(IPS_ASSURED_BIT, &ct->status))) {
> +	// 	  set_bit(IPS_ASSURED_BIT, &ct->status);
> +	// 	  nf_conntrack_event_cache(IPCT_ASSURED, ct);

Probably do a test_and_set_bit() when the connection switches to
ESTABLISHED?

>  sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
> -	[CTA_TIMEOUT_SCTP_CLOSED]		= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_COOKIE_WAIT]		= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_COOKIE_ECHOED]	= { .type = NLA_U32 },
> +	[CTA_TIMEOUT_SCTP_OPEN_WAIT]		= { .type = NLA_U32 },
>  	[CTA_TIMEOUT_SCTP_ESTABLISHED]		= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_SHUTDOWN_SENT]	= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_SHUTDOWN_RECD]	= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT]	= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_HEARTBEAT_SENT]	= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED]	= { .type = NLA_U32 },
> -	[CTA_TIMEOUT_SCTP_DATA_SENT]		= { .type = NLA_U32 },

Please retain this as-is for now.

I'm fine with removing the sysctls though.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-04 11:31 [RFC PATCH] netfilter: conntrack: simplify sctp state machine Sriram Yagnaraman
  2023-01-04 12:41 ` Florian Westphal
@ 2023-01-04 15:02 ` Pablo Neira Ayuso
  2023-01-05 11:41   ` Sriram Yagnaraman
  1 sibling, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2023-01-04 15:02 UTC (permalink / raw)
  To: Sriram Yagnaraman
  Cc: netfilter-devel, Florian Westphal, Marcelo Ricardo Leitner,
	Long Xin

On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman wrote:
> All the paths in an SCTP connection are kept alive either by actual
> DATA/SACK running through the connection or by HEARTBEAT. This patch
> proposes a simple state machine with only two states OPEN_WAIT and
> ESTABLISHED (similar to UDP). The reason for this change is a full
> stateful approach to SCTP is difficult when the association is
> multihomed since the endpoints could use different paths in the network
> during the lifetime of an association.

Do you mean the router/firewall might not see all packets for
association is multihomed?

Could you please provide an example?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-04 12:41 ` Florian Westphal
@ 2023-01-05 11:25   ` Sriram Yagnaraman
  0 siblings, 0 replies; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-05 11:25 UTC (permalink / raw)
  To: Florian Westphal
  Cc: netfilter-devel@vger.kernel.org, Marcelo Ricardo Leitner,
	Long Xin

> -----Original Message-----
> From: Florian Westphal <fw@strlen.de>
> Sent: Wednesday, 4 January 2023 13:41
> To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> <lxin@redhat.com>
> Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> 
> Sriram Yagnaraman <sriram.yagnaraman@est.tech> wrote:
> > All the paths in an SCTP connection are kept alive either by actual
> > DATA/SACK running through the connection or by HEARTBEAT. This patch
> > proposes a simple state machine with only two states OPEN_WAIT and
> > ESTABLISHED (similar to UDP). The reason for this change is a full
> > stateful approach to SCTP is difficult when the association is
> > multihomed since the endpoints could use different paths in the
> > network during the lifetime of an association.
> >
> > Default timeouts are:
> > OPEN_WAIT:   3 seconds   (rto_initial)
> > ESTABLISHED: 210 seconds (rto_max + hb_interval * path_max_retrans)
> >
> > Important changes/notes
> > - Timeout is used to clean up conntrack entries
> > - VTAG checks are kept as is (can be moved to a conntrack extension if
> >   desired)
> > - SCTP chunks are parsed only once, and a map is populated with the
> >   information on the chunks present in the packet
> > - ASSURED bit is NOT set in this version of the patch, need help
> >   understanding when to set it
> >
> > Note that this patch has changed uapi headers.
> 
> Don't do that please, this will cause trouble.

Yes, I understand, will remove the changes and reuse COOKIE_WAIT value for OPEN_WAIT.

> 
> > Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > ---
> >  .../uapi/linux/netfilter/nf_conntrack_sctp.h  |  10 +-
> >  .../linux/netfilter/nfnetlink_cttimeout.h     |  10 +-
> >  net/netfilter/nf_conntrack_proto_sctp.c       | 589 ++++--------------
> >  net/netfilter/nf_conntrack_standalone.c       |  72 +--
> >  4 files changed, 143 insertions(+), 538 deletions(-)
> >
> > diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h
> > b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
> > index c742469afe21..89381a57021a 100644
> > --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h
> > +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h
> > @@ -7,16 +7,8 @@
> >
> >  enum sctp_conntrack {
> >  	SCTP_CONNTRACK_NONE,
> > -	SCTP_CONNTRACK_CLOSED,
> > -	SCTP_CONNTRACK_COOKIE_WAIT,
> > -	SCTP_CONNTRACK_COOKIE_ECHOED,
> > +	SCTP_CONNTRACK_OPEN_WAIT,
> >  	SCTP_CONNTRACK_ESTABLISHED,
> > -	SCTP_CONNTRACK_SHUTDOWN_SENT,
> > -	SCTP_CONNTRACK_SHUTDOWN_RECD,
> > -	SCTP_CONNTRACK_SHUTDOWN_ACK_SENT,
> > -	SCTP_CONNTRACK_HEARTBEAT_SENT,
> > -	SCTP_CONNTRACK_HEARTBEAT_ACKED,
> > -	SCTP_CONNTRACK_DATA_SENT,
> >  	SCTP_CONNTRACK_MAX
> 
> Please keep all as-is.
> 
> You might want to add a /* no loner used */ or similar.
> 
> You could hijack an existing enum to avoid adding a new one:
> 
> SCTP_CONNTRACK_OPEN_WAIT = SCTP_CONNTRACK_COOKIE_WAIT,
> 
> > diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
> > b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
> > index 94e74034706d..372dfe7c07ed 100644
> > --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
> > +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h
> > @@ -86,16 +86,8 @@ enum ctattr_timeout_dccp {
> >
> >  enum ctattr_timeout_sctp {
> >  	CTA_TIMEOUT_SCTP_UNSPEC,
> > -	CTA_TIMEOUT_SCTP_CLOSED,
> > -	CTA_TIMEOUT_SCTP_COOKIE_WAIT,
> > -	CTA_TIMEOUT_SCTP_COOKIE_ECHOED,
> > +	CTA_TIMEOUT_SCTP_OPEN_WAIT,
> >  	CTA_TIMEOUT_SCTP_ESTABLISHED,
> > -	CTA_TIMEOUT_SCTP_SHUTDOWN_SENT,
> > -	CTA_TIMEOUT_SCTP_SHUTDOWN_RECD,
> > -	CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
> > -	CTA_TIMEOUT_SCTP_HEARTBEAT_SENT,
> > -	CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED,
> > -	CTA_TIMEOUT_SCTP_DATA_SENT,
> >  	__CTA_TIMEOUT_SCTP_MAX
> 
> Same, this is frozen, you can add to it but you cannot remove this.
> 
> You can add a kernel internal enum if you like, to replace the existing ones,
> with kernel mapping the new ones to old (and ignoring the old ones on input
> from userspace).
> 
> This would allow to shrink struct nf_sctp_net size for example.
> 
> >  #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --
> git
> > a/net/netfilter/nf_conntrack_proto_sctp.c
> > b/net/netfilter/nf_conntrack_proto_sctp.c
> > index d88b92a8ffca..d79ed476b764 100644
> > --- a/net/netfilter/nf_conntrack_proto_sctp.c
> > +++ b/net/netfilter/nf_conntrack_proto_sctp.c
> > @@ -5,12 +5,13 @@
> >   * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
> >   * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
> >   *
> > - * SCTP is defined in RFC 2960. References to various sections in
> > this code
> > + * SCTP is defined in RFC 4960. References to various sections in
> > + this code
> >   * are to this RFC.
> >   */
> >
> >  #include <linux/types.h>
> >  #include <linux/timer.h>
> > +#include <linux/jiffies.h>
> >  #include <linux/netfilter.h>
> >  #include <linux/in.h>
> >  #include <linux/ip.h>
> > @@ -27,127 +28,19 @@
> >  #include <net/netfilter/nf_conntrack_ecache.h>
> >  #include <net/netfilter/nf_conntrack_timeout.h>
> >
> > -/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
> > -   closely.  They're more complex. --RR
> > -
> > -   And so for me for SCTP :D -Kiran */
> > +#define	SCTP_FLAG_HEARTBEAT_VTAG_FAILED	1
> >
> >  static const char *const sctp_conntrack_names[] = {
> >  	"NONE",
> > -	"CLOSED",
> > -	"COOKIE_WAIT",
> > -	"COOKIE_ECHOED",
> > +	"OPEN_WAIT",
> >  	"ESTABLISHED",
> > -	"SHUTDOWN_SENT",
> > -	"SHUTDOWN_RECD",
> > -	"SHUTDOWN_ACK_SENT",
> > -	"HEARTBEAT_SENT",
> > -	"HEARTBEAT_ACKED",
> >  };
> 
> > -	}
> > +	[SCTP_CONNTRACK_OPEN_WAIT]			= 3 SECS,
> > +	[SCTP_CONNTRACK_ESTABLISHED]		= 210 SECS,
> >  };
> 
> >
> > +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count)	\
> > +for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0;	\
> > +	(offset) < (skb)->len &&					\
> 
> I think skb_header_pointer() will return NULL if offset + sizeof(_sch) exceeds
> skb->len, so this offset < skb->len test is redundant.
> 
> > +	(offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)
> 
> What if sch->length == 0?

Oops, infinite loop. I will fix it.

> 
> > +	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
> > +		pr_debug("Chunk Num: %d  Type: %d\n", count, sch->type);
> 
> Is this pr_debug() needed?  Its pretty useless because it would print for every
> packet (its not an error path).

Removed now.

> 
> > +		set_bit(sch->type, map);
> >
> > -	if (do_basic_checks(ct, skb, dataoff, map) != 0)
> > -		goto out;
> > +		if (sch->type == SCTP_CID_INIT ||
> > +			sch->type == SCTP_CID_INIT_ACK) {
> > +			struct sctp_inithdr _inith, *inith;
> > +			inith = skb_header_pointer(skb, offset + sizeof(_sch),
> > +						sizeof(_inith), &_inith);
> > +			if (inith)
> > +				init_vtag = inith->init_tag;
> > +			else
> > +				goto out_drop;
> 
> 			if (!inith)
> 				goto out_drop;
> 
> 			init_vtag = inith->init_tag;
> 
> Also, please run your patch through scripts/checkpatch.pl script, I'm sure
> there are several coding style warnings here.
> 
> > +	spin_lock_bh(&ct->lock);
> 
> Why is this spinlock needed?

I have reworked the code a bit now, and will use the spinlock only for the parts that write to ct.

> 
> >  	if (!nf_ct_is_confirmed(ct)) {
> >  		/* If an OOTB packet has any of these chunks discard (Sec 8.4)
> */
> >  		if (test_bit(SCTP_CID_ABORT, map) ||
> >  		    test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
> >  		    test_bit(SCTP_CID_COOKIE_ACK, map))
> > -			return -NF_ACCEPT;
> > +			goto out_unlock;
> >
> > -		if (!sctp_new(ct, skb, sh, dataoff))
> > -			return -NF_ACCEPT;
> 
> Any reason for deleting sctp_new()?
> It makes this body a lot larger, the lines below could have been done in
> sctp_new().

I will bring back sctp_new() and move vtag checks into its own function sctp_vtag_check().

> 
> > +		memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
> > +		ct->proto.sctp.state = SCTP_CONNTRACK_OPEN_WAIT;
> > +		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
> > +
> > +		if (test_bit(SCTP_CID_INIT, map))
> > +			ct->proto.sctp.vtag[!dir] = init_vtag;
> > +		else if (test_bit(SCTP_CID_SHUTDOWN_ACK, map))
> > +			/* If it is a shutdown ack OOTB packet, we expect a
> return
> > +			shutdown complete, otherwise an ABORT Sec 8.4 (5)
> and (8) */
> > +			ct->proto.sctp.vtag[!dir] = sctph->vtag;
> > +		else
> > +			ct->proto.sctp.vtag[dir] = sctph->vtag;
> 
> Maybe the else branch below can be elided by adding a goto here?
> 
> AFAICS the spinlock is only needed for some parts of the else branch, so the
> spin_lock_bh can be moved.
> > +		/* we have seen traffic both ways, go to established */
> > +		if (dir == IP_CT_DIR_REPLY &&
> > +			ct->proto.sctp.state ==
> SCTP_CONNTRACK_OPEN_WAIT) {
> > +			ct->proto.sctp.state =
> SCTP_CONNTRACK_ESTABLISHED;
> > +			nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
> 
> > +	/* Check the verification tag (Sec 8.5) */
> > +	if (!test_bit(SCTP_CID_INIT, map) &&
> > +		!test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
> > +		!test_bit(SCTP_CID_COOKIE_ECHO, map) &&
> > +		!test_bit(SCTP_CID_ABORT, map) &&
> > +		!test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
> > +		!test_bit(SCTP_CID_HEARTBEAT, map) &&
> > +		!test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
> > +		sctph->vtag != ct->proto.sctp.vtag[dir]) {
> > +		pr_debug("Verification tag check failed\n");
> 
> Please have a look at
> https://patchwork.ozlabs.org/project/netfilter-
> devel/patch/20230102114612.15860-2-fw@strlen.de/
> 
> I hope it will be applied shortly so you can rebase.
> I don't have any other sctp patches.
> 
> This should be
> nf_ct_l4proto_log_invalid(skb, ct, state,
> 			  "verification tag check failed %x vs %x for dir %d",
> 			  sh->vtag, ct->proto.sctp.vtag[dir], dir);
> 
> instead of pr_debug().

Yes, will do. Will also remove other pr_debug() if any.

> 
> > +	/* Special cases of Verification tag check (Sec 8.5.1) */
> 
> Please extend the comments a bit so I don't have to look at the RFC while
> reviewing, just quote the relevant part, i.e.
> 
> > +	if (test_bit(SCTP_CID_INIT, map)) {
> > +		/* Sec 8.5.1 (A) */
> > +		if (sctph->vtag != 0)
> >  			goto out_unlock;
> > -		}
> 
> if (sctph->vtag != 0) /* A) init vtag MUST be 0 */
> 	goto out_unlock;
> ö
> > +		else if (nf_ct_is_confirmed(ct))
> 
> No need to 'else if', just use 'if'.
> 
> > +	/* Need some thought on how to set the assured bit */
> > +	// if (dir == IP_CT_DIR_REPLY &&
> > +	// 	!(test_bit(IPS_ASSURED_BIT, &ct->status))) {
> > +	// 	  set_bit(IPS_ASSURED_BIT, &ct->status);
> > +	// 	  nf_conntrack_event_cache(IPCT_ASSURED, ct);
> 
> Probably do a test_and_set_bit() when the connection switches to
> ESTABLISHED?

Ok, makes sense.

> 
> >  sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
> > -	[CTA_TIMEOUT_SCTP_CLOSED]		= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_COOKIE_WAIT]		= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_COOKIE_ECHOED]	= { .type = NLA_U32 },
> > +	[CTA_TIMEOUT_SCTP_OPEN_WAIT]		= { .type = NLA_U32 },
> >  	[CTA_TIMEOUT_SCTP_ESTABLISHED]		= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_SHUTDOWN_SENT]	= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_SHUTDOWN_RECD]	= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT]	= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_HEARTBEAT_SENT]	= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED]	= { .type = NLA_U32 },
> > -	[CTA_TIMEOUT_SCTP_DATA_SENT]		= { .type = NLA_U32 },
> 
> Please retain this as-is for now.
> 
> I'm fine with removing the sysctls though.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-04 15:02 ` Pablo Neira Ayuso
@ 2023-01-05 11:41   ` Sriram Yagnaraman
  2023-01-05 11:53     ` Pablo Neira Ayuso
  0 siblings, 1 reply; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-05 11:41 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri

> -----Original Message-----
> From: Pablo Neira Ayuso <pablo@netfilter.org>
> Sent: Wednesday, 4 January 2023 16:02
> To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> <lxin@redhat.com>
> Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> 
> On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman wrote:
> > All the paths in an SCTP connection are kept alive either by actual
> > DATA/SACK running through the connection or by HEARTBEAT. This patch
> > proposes a simple state machine with only two states OPEN_WAIT and
> > ESTABLISHED (similar to UDP). The reason for this change is a full
> > stateful approach to SCTP is difficult when the association is
> > multihomed since the endpoints could use different paths in the
> > network during the lifetime of an association.
> 
> Do you mean the router/firewall might not see all packets for association is
> multihomed?
> 
> Could you please provide an example?

Let's say the primary and alternate/secondary paths between the SCTP endpoints traverse different middle boxes. 
If an SCTP endpoint detects network failure on the primary path, it will switch to using the secondary path and all subsequent packets will not be seen by the middlebox on the primary path, including SHUTDOWN sequences if they happen at that time.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-05 11:41   ` Sriram Yagnaraman
@ 2023-01-05 11:53     ` Pablo Neira Ayuso
  2023-01-05 12:11       ` Sriram Yagnaraman
  0 siblings, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2023-01-05 11:53 UTC (permalink / raw)
  To: Sriram Yagnaraman
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri

On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > -----Original Message-----
> > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > Sent: Wednesday, 4 January 2023 16:02
> > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> > Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> > <lxin@redhat.com>
> > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> > 
> > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman wrote:
> > > All the paths in an SCTP connection are kept alive either by actual
> > > DATA/SACK running through the connection or by HEARTBEAT. This patch
> > > proposes a simple state machine with only two states OPEN_WAIT and
> > > ESTABLISHED (similar to UDP). The reason for this change is a full
> > > stateful approach to SCTP is difficult when the association is
> > > multihomed since the endpoints could use different paths in the
> > > network during the lifetime of an association.
> > 
> > Do you mean the router/firewall might not see all packets for association is
> > multihomed?
> > 
> > Could you please provide an example?
> 
> Let's say the primary and alternate/secondary paths between the SCTP
> endpoints traverse different middle boxes. If an SCTP endpoint
> detects network failure on the primary path, it will switch to using
> the secondary path and all subsequent packets will not be seen by
> the middlebox on the primary path, including SHUTDOWN sequences if
> they happen at that time.

OK, then on the primary middle box the SCTP flow will just timeout?
(because no more packets are seen).

Or the scenario you describe is this? Assuming a middle box that is an
alternate/secondary path, then such middle box (which did not see any
other packets before for this connection) starts seeing packets for
this flow, so packets are dropped by the secondary middle box (which
now became the primary path after the network failure)?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-05 11:53     ` Pablo Neira Ayuso
@ 2023-01-05 12:11       ` Sriram Yagnaraman
  2023-01-06  0:50         ` Pablo Neira Ayuso
  0 siblings, 1 reply; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-05 12:11 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri

> -----Original Message-----
> From: Pablo Neira Ayuso <pablo@netfilter.org>
> Sent: Thursday, 5 January 2023 12:54
> To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> 
> On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > > -----Original Message-----
> > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > Sent: Wednesday, 4 January 2023 16:02
> > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>; Long
> > > Xin <lxin@redhat.com>
> > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > machine
> > >
> > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman wrote:
> > > > All the paths in an SCTP connection are kept alive either by
> > > > actual DATA/SACK running through the connection or by HEARTBEAT.
> > > > This patch proposes a simple state machine with only two states
> > > > OPEN_WAIT and ESTABLISHED (similar to UDP). The reason for this
> > > > change is a full stateful approach to SCTP is difficult when the
> > > > association is multihomed since the endpoints could use different
> > > > paths in the network during the lifetime of an association.
> > >
> > > Do you mean the router/firewall might not see all packets for
> > > association is multihomed?
> > >
> > > Could you please provide an example?
> >
> > Let's say the primary and alternate/secondary paths between the SCTP
> > endpoints traverse different middle boxes. If an SCTP endpoint detects
> > network failure on the primary path, it will switch to using the
> > secondary path and all subsequent packets will not be seen by the
> > middlebox on the primary path, including SHUTDOWN sequences if they
> > happen at that time.
> 
> OK, then on the primary middle box the SCTP flow will just timeout?
> (because no more packets are seen).

Yes, they will timeout unless the primary path comes up before the SHUTDOWN sequence. And the default timeout for an ESTABLISHED SCTP connection is 5 days, which is a "long" time to clean-up this entry.

> 
> Or the scenario you describe is this? Assuming a middle box that is an
> alternate/secondary path, then such middle box (which did not see any other
> packets before for this connection) starts seeing packets for this flow, so
> packets are dropped by the secondary middle box (which now became the
> primary path after the network failure)?

With d7ee35190427 ("netfilter: nf_ct_sctp: minimal multihoming support") and bff3d0534804 ("netfilter: conntrack: add sctp DATA_SENT state ") there is some multihoming support, so secondary middleboxes should have an entry already. SCTP endpoints can send packets to secondary transport addresses according to the RFC, and they do path monitoring of secondary paths using HEARTBEAT when needed.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-05 12:11       ` Sriram Yagnaraman
@ 2023-01-06  0:50         ` Pablo Neira Ayuso
  2023-01-11  9:36           ` Sriram Yagnaraman
  0 siblings, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2023-01-06  0:50 UTC (permalink / raw)
  To: Sriram Yagnaraman
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri

On Thu, Jan 05, 2023 at 12:11:44PM +0000, Sriram Yagnaraman wrote:
> > -----Original Message-----
> > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > Sent: Thursday, 5 January 2023 12:54
> > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> > Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> > <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> > 
> > On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > > > -----Original Message-----
> > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > Sent: Wednesday, 4 January 2023 16:02
> > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>; Long
> > > > Xin <lxin@redhat.com>
> > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > > machine
> > > >
> > > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman wrote:
> > > > > All the paths in an SCTP connection are kept alive either by
> > > > > actual DATA/SACK running through the connection or by HEARTBEAT.
> > > > > This patch proposes a simple state machine with only two states
> > > > > OPEN_WAIT and ESTABLISHED (similar to UDP). The reason for this
> > > > > change is a full stateful approach to SCTP is difficult when the
> > > > > association is multihomed since the endpoints could use different
> > > > > paths in the network during the lifetime of an association.
> > > >
> > > > Do you mean the router/firewall might not see all packets for
> > > > association is multihomed?
> > > >
> > > > Could you please provide an example?
> > >
> > > Let's say the primary and alternate/secondary paths between the SCTP
> > > endpoints traverse different middle boxes. If an SCTP endpoint detects
> > > network failure on the primary path, it will switch to using the
> > > secondary path and all subsequent packets will not be seen by the
> > > middlebox on the primary path, including SHUTDOWN sequences if they
> > > happen at that time.
> > 
> > OK, then on the primary middle box the SCTP flow will just timeout?
> > (because no more packets are seen).
> 
> Yes, they will timeout unless the primary path comes up before the
> SHUTDOWN sequence. And the default timeout for an ESTABLISHED SCTP
> connection is 5 days, which is a "long" time to clean-up this entry.

Does the middle box have a chance to see any packet that provides a
hint to shorten this timeout? no HEARTBEAT packets are seen in this
case on the former primary path?

What I am missing are a more detailed list of issues with the existing
approach. Your patch description says "SCTP tracking with multihoming
is difficult", probably a list of scenarios would help to understand
the motivation to simplify the state machine.

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-06  0:50         ` Pablo Neira Ayuso
@ 2023-01-11  9:36           ` Sriram Yagnaraman
  2023-01-11 13:53             ` Marcelo Ricardo Leitner
  2023-01-12 11:50             ` Pablo Neira Ayuso
  0 siblings, 2 replies; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-11  9:36 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri

> -----Original Message-----
> From: Pablo Neira Ayuso <pablo@netfilter.org>
> Sent: Friday, 6 January 2023 01:50
> To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> 
> On Thu, Jan 05, 2023 at 12:11:44PM +0000, Sriram Yagnaraman wrote:
> > > -----Original Message-----
> > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > Sent: Thursday, 5 January 2023 12:54
> > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>; Long
> > > Xin <lxin@redhat.com>; Claudio Porfiri
> > > <claudio.porfiri@ericsson.com>
> > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > machine
> > >
> > > On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > > > > -----Original Message-----
> > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > Sent: Wednesday, 4 January 2023 16:02
> > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>;
> > > > > Long Xin <lxin@redhat.com>
> > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > state machine
> > > > >
> > > > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman
> wrote:
> > > > > > All the paths in an SCTP connection are kept alive either by
> > > > > > actual DATA/SACK running through the connection or by HEARTBEAT.
> > > > > > This patch proposes a simple state machine with only two
> > > > > > states OPEN_WAIT and ESTABLISHED (similar to UDP). The reason
> > > > > > for this change is a full stateful approach to SCTP is
> > > > > > difficult when the association is multihomed since the
> > > > > > endpoints could use different paths in the network during the lifetime
> of an association.
> > > > >
> > > > > Do you mean the router/firewall might not see all packets for
> > > > > association is multihomed?
> > > > >
> > > > > Could you please provide an example?
> > > >
> > > > Let's say the primary and alternate/secondary paths between the
> > > > SCTP endpoints traverse different middle boxes. If an SCTP
> > > > endpoint detects network failure on the primary path, it will
> > > > switch to using the secondary path and all subsequent packets will
> > > > not be seen by the middlebox on the primary path, including
> > > > SHUTDOWN sequences if they happen at that time.
> > >
> > > OK, then on the primary middle box the SCTP flow will just timeout?
> > > (because no more packets are seen).
> >
> > Yes, they will timeout unless the primary path comes up before the
> > SHUTDOWN sequence. And the default timeout for an ESTABLISHED SCTP
> > connection is 5 days, which is a "long" time to clean-up this entry.
> 
> Does the middle box have a chance to see any packet that provides a hint to
> shorten this timeout? no HEARTBEAT packets are seen in this case on the
> former primary path?

There will be HEARTBEAT as soon as a path becomes unreachable from the SCTP endpoints. But depending on the location of the network failure, the middlebox may or may not see the HEARTBEAT. Also, HEARTBEAT is sent when there are no data to be transmitted or if the path is unreachable/unconfirmed, so I think there is no deterministic way of finding out when to shorten the timeout. Of course, a user has the option of setting the ESTABLISHED state timeout to a more reasonable value, for e.g., same as the HEARTBEAT_ACKED state timeout (210 sec), OR we could reduce the default timeout of ESTABLISHED to 210 sec. 

> 
> What I am missing are a more detailed list of issues with the existing
> approach. Your patch description says "SCTP tracking with multihoming is
> difficult", probably a list of scenarios would help to understand the motivation
> to simplify the state machine.

Thank you for reviewing and asking these questions, it made me step back and think. I list below some background
- I want to simplify the state machine, because it is possible to track an SCTP connection with fewer states, for e.g., SCTP with UDP encapsulation uses UDP conntrack with just UNREPLIED/REPLIED states and it works perfectly fine
- My stakeholders, at the behest of whom I am proposing these changes hit some problems running SCTP client endpoints behind NAT (inside Kubernetes pods) towards multihomed SCTP server endpoints (1a-g) and (2a-c) below
- Some upcoming SCTP protocol changes in IETF (if approved/implemented) will make it hard to read beyond the SCTP common header, for e.g., DTLS over SCTP https://datatracker.ietf.org/doc/draft-ietf-tsvwg-dtls-over-sctp-bis/, proposes to encrypt all SCTP chunks, conntrack will only be able to see SCTP common header, these changes hopefully will make it easier to adapt to such changes in SCTP protocol
- While at it, I also made some other "improvements"
	a) Avoid multiple walk-throughs of SCTP chunks in sctp_new(), sctp_basic_checks() and nf_conntrack_sctp_packet(), and parse it only once
	b) SCTP conntrack has the same state regardless of it is a primary or a secondary path

Let's say there are two SCTP endpoints A and B with addresses A' and B, B'' correspondingly.
Primary path is A' <----> B' that traverses middlebox C, and secondary path is A' <----> B'' that traverses middlebox D.
1) SHUTDOWN sent on secondary path
1a) SCTP endpoint A sets up an association towards SCTP endpoint B
1b) Middlebox C sees INIT sequence and creates "primary" conntrack entry (5 days)
1c) Middlebox D sees HEARTBEAT sequence and creates "secondary" conntrack entry (210 seconds)
1d) Path failure between A and C, and SCTP endpoint A switches to secondary path and continues sending data on the association
1e) SCTP endpoint A decides to SHUTDOWN the connection
1f) Middlebox C is in ESTABLISHED state, doesn't see any SHUTDOWN sequence or HEARTBEAT, waits for timeout (5 days)
1g) Middlebox D is in HEARTBEAT_ACKED state, doesn't care about SHUTDOWN sequence, waits for timeout (210 seconds)

2) Recently fixed by bff3d0534804 ("netfilter: conntrack: add sctp DATA_SENT state ")
2a) SCTP endpoint A sets up an association towards SCTP endpoint B
2b) Middlebox C sees INIT sequence and creates "primary" conntrack entry (5 days)
2c) Middlebox D sees DATA/SACK, and DROPS packets until HEARTBEAT is seen to setup "secondary" conntrack entry (210 seconds)

> 
> Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-11  9:36           ` Sriram Yagnaraman
@ 2023-01-11 13:53             ` Marcelo Ricardo Leitner
  2023-01-12 11:38               ` Sriram Yagnaraman
  2023-01-12 11:50             ` Pablo Neira Ayuso
  1 sibling, 1 reply; 14+ messages in thread
From: Marcelo Ricardo Leitner @ 2023-01-11 13:53 UTC (permalink / raw)
  To: Sriram Yagnaraman
  Cc: Pablo Neira Ayuso, netfilter-devel@vger.kernel.org,
	Florian Westphal, Long Xin, Claudio Porfiri

On Wed, Jan 11, 2023 at 09:36:38AM +0000, Sriram Yagnaraman wrote:
> > -----Original Message-----
> > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > Sent: Friday, 6 January 2023 01:50
> > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> > Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> > <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> >
> > On Thu, Jan 05, 2023 at 12:11:44PM +0000, Sriram Yagnaraman wrote:
> > > > -----Original Message-----
> > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > Sent: Thursday, 5 January 2023 12:54
> > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>; Long
> > > > Xin <lxin@redhat.com>; Claudio Porfiri
> > > > <claudio.porfiri@ericsson.com>
> > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > > machine
> > > >
> > > > On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > > > > > -----Original Message-----
> > > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > > Sent: Wednesday, 4 January 2023 16:02
> > > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>;
> > > > > > Long Xin <lxin@redhat.com>
> > > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > > state machine
> > > > > >
> > > > > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman
> > wrote:
> > > > > > > All the paths in an SCTP connection are kept alive either by
> > > > > > > actual DATA/SACK running through the connection or by HEARTBEAT.
> > > > > > > This patch proposes a simple state machine with only two
> > > > > > > states OPEN_WAIT and ESTABLISHED (similar to UDP). The reason
> > > > > > > for this change is a full stateful approach to SCTP is
> > > > > > > difficult when the association is multihomed since the
> > > > > > > endpoints could use different paths in the network during the lifetime
> > of an association.
> > > > > >
> > > > > > Do you mean the router/firewall might not see all packets for
> > > > > > association is multihomed?
> > > > > >
> > > > > > Could you please provide an example?
> > > > >
> > > > > Let's say the primary and alternate/secondary paths between the
> > > > > SCTP endpoints traverse different middle boxes. If an SCTP
> > > > > endpoint detects network failure on the primary path, it will
> > > > > switch to using the secondary path and all subsequent packets will
> > > > > not be seen by the middlebox on the primary path, including
> > > > > SHUTDOWN sequences if they happen at that time.
> > > >
> > > > OK, then on the primary middle box the SCTP flow will just timeout?
> > > > (because no more packets are seen).
> > >
> > > Yes, they will timeout unless the primary path comes up before the
> > > SHUTDOWN sequence. And the default timeout for an ESTABLISHED SCTP
> > > connection is 5 days, which is a "long" time to clean-up this entry.
> >
> > Does the middle box have a chance to see any packet that provides a hint to
> > shorten this timeout? no HEARTBEAT packets are seen in this case on the
> > former primary path?
>
> There will be HEARTBEAT as soon as a path becomes unreachable from the SCTP endpoints. But depending on the location of the network failure, the middlebox may or may not see the HEARTBEAT. Also, HEARTBEAT is sent when there are no data to be transmitted or if the path is unreachable/unconfirmed, so I think there is no deterministic way of finding out when to shorten the timeout. Of course, a user has the option of setting the ESTABLISHED state timeout to a more reasonable value, for e.g., same as the HEARTBEAT_ACKED state timeout (210 sec), OR we could reduce the default timeout of ESTABLISHED to 210 sec.
>
> >
> > What I am missing are a more detailed list of issues with the existing
> > approach. Your patch description says "SCTP tracking with multihoming is
> > difficult", probably a list of scenarios would help to understand the motivation
> > to simplify the state machine.
>
> Thank you for reviewing and asking these questions, it made me step back and think. I list below some background
> - I want to simplify the state machine, because it is possible to track an SCTP connection with fewer states, for e.g., SCTP with UDP encapsulation uses UDP conntrack with just UNREPLIED/REPLIED states and it works perfectly fine

[*] (more below)

> - My stakeholders, at the behest of whom I am proposing these changes hit some problems running SCTP client endpoints behind NAT (inside Kubernetes pods) towards multihomed SCTP server endpoints (1a-g) and (2a-c) below
> - Some upcoming SCTP protocol changes in IETF (if approved/implemented) will make it hard to read beyond the SCTP common header, for e.g., DTLS over SCTP https://datatracker.ietf.org/doc/draft-ietf-tsvwg-dtls-over-sctp-bis/, proposes to encrypt all SCTP chunks, conntrack will only be able to see SCTP common header, these changes hopefully will make it easier to adapt to such changes in SCTP protocol

If this is really the case, then I'm confused by this new RFC (be
warned I don't know the gory details of DTLS). I don't see it
specifying that other chunk headers will be encrypted, but instead I
see texts like:

3.7.  Message Sizes

   ...
   The sequence of DTLS records is then fragmented into DATA or I-DATA
   Chunks to fit the path MTU by SCTP.  These changes ensure that DTLS/
   SCTP has the same capability as SCTP to support user messages of any
   size. ...

this is just packing encrypted payload.

Sections 4.1 and 4.5 also lead to that, and:

6.  DTLS over SCTP Service

   The adoption of DTLS over SCTP according to the current specification
   is meant to add to SCTP the option for transferring encrypted data.
   When DTLS over SCTP is used, all data being transferred MUST be
   protected by chunk authentication and DTLS encrypted.

and finally:

9.6.  Privacy Considerations

   For each SCTP user message, the user also provides a stream
   identifier, a flag to indicate whether the message is sent ordered or
   unordered, and a payload protocol identifier.  Although DTLS/SCTP
   provides privacy for the actual user message, the other three
   information fields are not confidentiality protected.  They are sent
   as clear text because they are part of the SCTP DATA chunk header.


So AFAICU the claim here is that a middle box wouldn't be able to
inspect SCTP payload, but the protocol itself is still possible.
Did I miss something maybe?

> - While at it, I also made some other "improvements"
> 	a) Avoid multiple walk-throughs of SCTP chunks in sctp_new(), sctp_basic_checks() and nf_conntrack_sctp_packet(), and parse it only once
> 	b) SCTP conntrack has the same state regardless of it is a primary or a secondary path

This a good change and may, actually, be the core reasoning behind the
change here. 'Primary' is more in the sense of 'active' than anything
else (and it gets more confusing with CMT-SCTP heh). It's not because
a path saw the INIT handshake that it is more reliable and so from
conntrack side.

With that, conntrack handling all paths similarly makes it more
consistent and closer with what the actual SCTP sockets are seeing for
such paths.

One easy example here of a bad situation due to the current difference
between paths is that an application can start an association through
a path and tear it down over another one. There's no shutdown seen by
the first path, and the conntrack entry will be left there af it is
was in Established state for 5 days. (this is also illustrated in
Sriram's example below, just easier to read)

Item I marked with [*] above is vague. You need to consider what it
would be loosing by such simplification and argument that it is okay.
We can't just cut the fat out and pray for the best later on, right?
But with this description here, it not only provides a solid
justification on why it (specificly the new solution) is needed but
also why it is currently already broken/not reliable in multi-homing
setups. Yay! :-)

Then, next question is: would it make sense to keep the current
processing for single path associations? That's a lot of
code/maintenance and I fail to see a reasoning that justifies it.

>
> Let's say there are two SCTP endpoints A and B with addresses A' and B, B'' correspondingly.
> Primary path is A' <----> B' that traverses middlebox C, and secondary path is A' <----> B'' that traverses middlebox D.
> 1) SHUTDOWN sent on secondary path
> 1a) SCTP endpoint A sets up an association towards SCTP endpoint B
> 1b) Middlebox C sees INIT sequence and creates "primary" conntrack entry (5 days)
> 1c) Middlebox D sees HEARTBEAT sequence and creates "secondary" conntrack entry (210 seconds)
> 1d) Path failure between A and C, and SCTP endpoint A switches to secondary path and continues sending data on the association
> 1e) SCTP endpoint A decides to SHUTDOWN the connection
> 1f) Middlebox C is in ESTABLISHED state, doesn't see any SHUTDOWN sequence or HEARTBEAT, waits for timeout (5 days)
> 1g) Middlebox D is in HEARTBEAT_ACKED state, doesn't care about SHUTDOWN sequence, waits for timeout (210 seconds)
>
> 2) Recently fixed by bff3d0534804 ("netfilter: conntrack: add sctp DATA_SENT state ")
> 2a) SCTP endpoint A sets up an association towards SCTP endpoint B
> 2b) Middlebox C sees INIT sequence and creates "primary" conntrack entry (5 days)
> 2c) Middlebox D sees DATA/SACK, and DROPS packets until HEARTBEAT is seen to setup "secondary" conntrack entry (210 seconds)
>
> >
> > Thanks.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-11 13:53             ` Marcelo Ricardo Leitner
@ 2023-01-12 11:38               ` Sriram Yagnaraman
  0 siblings, 0 replies; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-12 11:38 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Pablo Neira Ayuso, netfilter-devel@vger.kernel.org,
	Florian Westphal, Long Xin, Claudio Porfiri



> -----Original Message-----
> From: Marcelo Ricardo Leitner <mleitner@redhat.com>
> Sent: Wednesday, 11 January 2023 14:54
> To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> Cc: Pablo Neira Ayuso <pablo@netfilter.org>; netfilter-devel@vger.kernel.org;
> Florian Westphal <fw@strlen.de>; Long Xin <lxin@redhat.com>; Claudio
> Porfiri <claudio.porfiri@ericsson.com>
> Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> 
> On Wed, Jan 11, 2023 at 09:36:38AM +0000, Sriram Yagnaraman wrote:
> > > -----Original Message-----
> > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > Sent: Friday, 6 January 2023 01:50
> > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>; Long
> > > Xin <lxin@redhat.com>; Claudio Porfiri
> > > <claudio.porfiri@ericsson.com>
> > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > machine
> > >
> > > On Thu, Jan 05, 2023 at 12:11:44PM +0000, Sriram Yagnaraman wrote:
> > > > > -----Original Message-----
> > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > Sent: Thursday, 5 January 2023 12:54
> > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>;
> > > > > Long Xin <lxin@redhat.com>; Claudio Porfiri
> > > > > <claudio.porfiri@ericsson.com>
> > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > state machine
> > > > >
> > > > > On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > > > > > > -----Original Message-----
> > > > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > > > Sent: Wednesday, 4 January 2023 16:02
> > > > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > > > <fw@strlen.de>; Marcelo Ricardo Leitner
> > > > > > > <mleitner@redhat.com>; Long Xin <lxin@redhat.com>
> > > > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > > > state machine
> > > > > > >
> > > > > > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman
> > > wrote:
> > > > > > > > All the paths in an SCTP connection are kept alive either
> > > > > > > > by actual DATA/SACK running through the connection or by
> HEARTBEAT.
> > > > > > > > This patch proposes a simple state machine with only two
> > > > > > > > states OPEN_WAIT and ESTABLISHED (similar to UDP). The
> > > > > > > > reason for this change is a full stateful approach to SCTP
> > > > > > > > is difficult when the association is multihomed since the
> > > > > > > > endpoints could use different paths in the network during
> > > > > > > > the lifetime
> > > of an association.
> > > > > > >
> > > > > > > Do you mean the router/firewall might not see all packets
> > > > > > > for association is multihomed?
> > > > > > >
> > > > > > > Could you please provide an example?
> > > > > >
> > > > > > Let's say the primary and alternate/secondary paths between
> > > > > > the SCTP endpoints traverse different middle boxes. If an SCTP
> > > > > > endpoint detects network failure on the primary path, it will
> > > > > > switch to using the secondary path and all subsequent packets
> > > > > > will not be seen by the middlebox on the primary path,
> > > > > > including SHUTDOWN sequences if they happen at that time.
> > > > >
> > > > > OK, then on the primary middle box the SCTP flow will just timeout?
> > > > > (because no more packets are seen).
> > > >
> > > > Yes, they will timeout unless the primary path comes up before the
> > > > SHUTDOWN sequence. And the default timeout for an ESTABLISHED
> SCTP
> > > > connection is 5 days, which is a "long" time to clean-up this entry.
> > >
> > > Does the middle box have a chance to see any packet that provides a
> > > hint to shorten this timeout? no HEARTBEAT packets are seen in this
> > > case on the former primary path?
> >
> > There will be HEARTBEAT as soon as a path becomes unreachable from the
> SCTP endpoints. But depending on the location of the network failure, the
> middlebox may or may not see the HEARTBEAT. Also, HEARTBEAT is sent
> when there are no data to be transmitted or if the path is
> unreachable/unconfirmed, so I think there is no deterministic way of finding
> out when to shorten the timeout. Of course, a user has the option of setting
> the ESTABLISHED state timeout to a more reasonable value, for e.g., same as
> the HEARTBEAT_ACKED state timeout (210 sec), OR we could reduce the
> default timeout of ESTABLISHED to 210 sec.
> >
> > >
> > > What I am missing are a more detailed list of issues with the
> > > existing approach. Your patch description says "SCTP tracking with
> > > multihoming is difficult", probably a list of scenarios would help
> > > to understand the motivation to simplify the state machine.
> >
> > Thank you for reviewing and asking these questions, it made me step
> > back and think. I list below some background
> > - I want to simplify the state machine, because it is possible to
> > track an SCTP connection with fewer states, for e.g., SCTP with UDP
> > encapsulation uses UDP conntrack with just UNREPLIED/REPLIED states
> > and it works perfectly fine
> 
> [*] (more below)
> 
> > - My stakeholders, at the behest of whom I am proposing these changes
> > hit some problems running SCTP client endpoints behind NAT (inside
> > Kubernetes pods) towards multihomed SCTP server endpoints (1a-g) and
> > (2a-c) below
> > - Some upcoming SCTP protocol changes in IETF (if
> > approved/implemented) will make it hard to read beyond the SCTP common
> > header, for e.g., DTLS over SCTP
> > https://datatracker.ietf.org/doc/draft-ietf-tsvwg-dtls-over-sctp-bis/,
> > proposes to encrypt all SCTP chunks, conntrack will only be able to
> > see SCTP common header, these changes hopefully will make it easier to
> > adapt to such changes in SCTP protocol
> 
> If this is really the case, then I'm confused by this new RFC (be warned I don't
> know the gory details of DTLS). I don't see it specifying that other chunk
> headers will be encrypted, but instead I see texts like:
> 
> 3.7.  Message Sizes
> 
>    ...
>    The sequence of DTLS records is then fragmented into DATA or I-DATA
>    Chunks to fit the path MTU by SCTP.  These changes ensure that DTLS/
>    SCTP has the same capability as SCTP to support user messages of any
>    size. ...
> 
> this is just packing encrypted payload.
> 
> Sections 4.1 and 4.5 also lead to that, and:
> 
> 6.  DTLS over SCTP Service
> 
>    The adoption of DTLS over SCTP according to the current specification
>    is meant to add to SCTP the option for transferring encrypted data.
>    When DTLS over SCTP is used, all data being transferred MUST be
>    protected by chunk authentication and DTLS encrypted.
> 
> and finally:
> 
> 9.6.  Privacy Considerations
> 
>    For each SCTP user message, the user also provides a stream
>    identifier, a flag to indicate whether the message is sent ordered or
>    unordered, and a payload protocol identifier.  Although DTLS/SCTP
>    provides privacy for the actual user message, the other three
>    information fields are not confidentiality protected.  They are sent
>    as clear text because they are part of the SCTP DATA chunk header.
> 
> 
> So AFAICU the claim here is that a middle box wouldn't be able to inspect
> SCTP payload, but the protocol itself is still possible.
> Did I miss something maybe?

Okay, my bad for using somebody else's words instead of reading the draft myself. You are right, the proposal is only to apply SCTP-AUTH to all the chunks, it doesn’t propose encryption/confidentiality. 
Anyhow that was just an example of why making a simplified state machine will avoid changing conntrack when the protocol changes are made, especially with the protocol still being actively improved.

> 
> > - While at it, I also made some other "improvements"
> > 	a) Avoid multiple walk-throughs of SCTP chunks in sctp_new(),
> sctp_basic_checks() and nf_conntrack_sctp_packet(), and parse it only once
> > 	b) SCTP conntrack has the same state regardless of it is a primary or
> > a secondary path
> 
> This a good change and may, actually, be the core reasoning behind the
> change here. 'Primary' is more in the sense of 'active' than anything else (and
> it gets more confusing with CMT-SCTP heh). It's not because a path saw the
> INIT handshake that it is more reliable and so from conntrack side.
> 
> With that, conntrack handling all paths similarly makes it more consistent and
> closer with what the actual SCTP sockets are seeing for such paths.
> 

Yes, I agree. 
 
> One easy example here of a bad situation due to the current difference
> between paths is that an application can start an association through a path
> and tear it down over another one. There's no shutdown seen by the first
> path, and the conntrack entry will be left there af it is was in Established state
> for 5 days. (this is also illustrated in Sriram's example below, just easier to
> read)
> 
> Item I marked with [*] above is vague. You need to consider what it would be
> loosing by such simplification and argument that it is okay.
> We can't just cut the fat out and pray for the best later on, right?
> But with this description here, it not only provides a solid justification on why
> it (specificly the new solution) is needed but also why it is currently already
> broken/not reliable in multi-homing setups. Yay! :-)

Agreed, I will update the change log with this description.

> 
> Then, next question is: would it make sense to keep the current processing for
> single path associations? That's a lot of code/maintenance and I fail to see a
> reasoning that justifies it.

I tried to simplify as much as possible, but of course, we can keep some states.
For e.g., keeping SHUTDOWN_* states can reduce the timeout of an ESTABLISHED connection even further. Of course, we will see that only on one of the paths.

> 
> >
> > Let's say there are two SCTP endpoints A and B with addresses A' and B, B''
> correspondingly.
> > Primary path is A' <----> B' that traverses middlebox C, and secondary path is
> A' <----> B'' that traverses middlebox D.
> > 1) SHUTDOWN sent on secondary path
> > 1a) SCTP endpoint A sets up an association towards SCTP endpoint B
> > 1b) Middlebox C sees INIT sequence and creates "primary" conntrack
> > entry (5 days)
> > 1c) Middlebox D sees HEARTBEAT sequence and creates "secondary"
> > conntrack entry (210 seconds)
> > 1d) Path failure between A and C, and SCTP endpoint A switches to
> > secondary path and continues sending data on the association
> > 1e) SCTP endpoint A decides to SHUTDOWN the connection
> > 1f) Middlebox C is in ESTABLISHED state, doesn't see any SHUTDOWN
> > sequence or HEARTBEAT, waits for timeout (5 days)
> > 1g) Middlebox D is in HEARTBEAT_ACKED state, doesn't care about
> > SHUTDOWN sequence, waits for timeout (210 seconds)
> >
> > 2) Recently fixed by bff3d0534804 ("netfilter: conntrack: add sctp
> > DATA_SENT state ")
> > 2a) SCTP endpoint A sets up an association towards SCTP endpoint B
> > 2b) Middlebox C sees INIT sequence and creates "primary" conntrack
> > entry (5 days)
> > 2c) Middlebox D sees DATA/SACK, and DROPS packets until HEARTBEAT is
> > seen to setup "secondary" conntrack entry (210 seconds)
> >
> > >
> > > Thanks.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-11  9:36           ` Sriram Yagnaraman
  2023-01-11 13:53             ` Marcelo Ricardo Leitner
@ 2023-01-12 11:50             ` Pablo Neira Ayuso
  2023-01-13  9:04               ` Sriram Yagnaraman
  1 sibling, 1 reply; 14+ messages in thread
From: Pablo Neira Ayuso @ 2023-01-12 11:50 UTC (permalink / raw)
  To: Sriram Yagnaraman
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri

On Wed, Jan 11, 2023 at 09:36:38AM +0000, Sriram Yagnaraman wrote:
> > -----Original Message-----
> > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > Sent: Friday, 6 January 2023 01:50
> > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> > Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> > <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> > 
> > On Thu, Jan 05, 2023 at 12:11:44PM +0000, Sriram Yagnaraman wrote:
> > > > -----Original Message-----
> > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > Sent: Thursday, 5 January 2023 12:54
> > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>; Long
> > > > Xin <lxin@redhat.com>; Claudio Porfiri
> > > > <claudio.porfiri@ericsson.com>
> > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > > machine
> > > >
> > > > On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > > > > > -----Original Message-----
> > > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > > Sent: Wednesday, 4 January 2023 16:02
> > > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>;
> > > > > > Long Xin <lxin@redhat.com>
> > > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > > state machine
> > > > > >
> > > > > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman
> > wrote:
> > > > > > > All the paths in an SCTP connection are kept alive either by
> > > > > > > actual DATA/SACK running through the connection or by HEARTBEAT.
> > > > > > > This patch proposes a simple state machine with only two
> > > > > > > states OPEN_WAIT and ESTABLISHED (similar to UDP). The reason
> > > > > > > for this change is a full stateful approach to SCTP is
> > > > > > > difficult when the association is multihomed since the
> > > > > > > endpoints could use different paths in the network during the lifetime
> > of an association.
> > > > > >
> > > > > > Do you mean the router/firewall might not see all packets for
> > > > > > association is multihomed?
> > > > > >
> > > > > > Could you please provide an example?
> > > > >
> > > > > Let's say the primary and alternate/secondary paths between the
> > > > > SCTP endpoints traverse different middle boxes. If an SCTP
> > > > > endpoint detects network failure on the primary path, it will
> > > > > switch to using the secondary path and all subsequent packets will
> > > > > not be seen by the middlebox on the primary path, including
> > > > > SHUTDOWN sequences if they happen at that time.
> > > >
> > > > OK, then on the primary middle box the SCTP flow will just timeout?
> > > > (because no more packets are seen).
> > >
> > > Yes, they will timeout unless the primary path comes up before the
> > > SHUTDOWN sequence. And the default timeout for an ESTABLISHED SCTP
> > > connection is 5 days, which is a "long" time to clean-up this entry.
> > 
> > Does the middle box have a chance to see any packet that provides a hint to
> > shorten this timeout? no HEARTBEAT packets are seen in this case on the
> > former primary path?
> 
> There will be HEARTBEAT as soon as a path becomes unreachable from
> the SCTP endpoints. But depending on the location of the network
> failure, the middlebox may or may not see the HEARTBEAT.

Conntrack assumes you have see all traffic that belongs the flow for
other protocols too.

> Also, HEARTBEAT is sent when there are no data to be transmitted or
> if the path is unreachable/unconfirmed, so I think there is no
> deterministic way of finding out when to shorten the timeout. Of
> course, a user has the option of setting the ESTABLISHED state
> timeout to a more reasonable value, for e.g., same as the
> HEARTBEAT_ACKED state timeout (210 sec), OR we could reduce the
> default timeout of ESTABLISHED to 210 sec.

Then just set up a short ESTABLISHED when multihoming is in place
since the beginning.

> > What I am missing are a more detailed list of issues with the existing
> > approach. Your patch description says "SCTP tracking with multihoming is
> > difficult", probably a list of scenarios would help to understand the motivation
> > to simplify the state machine.
> 
> Thank you for reviewing and asking these questions, it made me step back and think. I list below some background
> - I want to simplify the state machine, because it is possible to
> track an SCTP connection with fewer states, for e.g., SCTP with UDP
> encapsulation uses UDP conntrack with just UNREPLIED/REPLIED states
> and it works perfectly fine

I think it would preferrable to add some configuration via ruleset to
track SCTP over UDP, rather than deranking SCTP to become almost
stateless.

> - My stakeholders, at the behest of whom I am proposing these
> changes hit some problems running SCTP client endpoints behind NAT
> (inside Kubernetes pods) towards multihomed SCTP server endpoints
> (1a-g) and (2a-c) below
> - Some upcoming SCTP protocol changes in IETF (if
> approved/implemented) will make it hard to read beyond the SCTP
> common header, for e.g., DTLS over SCTP
> https://datatracker.ietf.org/doc/draft-ietf-tsvwg-dtls-over-sctp-bis/,
> proposes to encrypt all SCTP chunks, conntrack will only be able to
> see SCTP common header, these changes hopefully will make it easier
> to adapt to such changes in SCTP protocol - While at it, I also made
> some other "improvements"

For this DTLS case it should be possible to fall back to the SCTP
"stateless" approach.

> 	a) Avoid multiple walk-throughs of SCTP chunks in sctp_new(), sctp_basic_checks() and nf_conntrack_sctp_packet(), and parse it only once
> 	b) SCTP conntrack has the same state regardless of it is a primary or a secondary path
> 
> Let's say there are two SCTP endpoints A and B with addresses A' and B, B'' correspondingly.
> Primary path is A' <----> B' that traverses middlebox C, and secondary path is A' <----> B'' that traverses middlebox D.
> 1) SHUTDOWN sent on secondary path
> 1a) SCTP endpoint A sets up an association towards SCTP endpoint B
> 1b) Middlebox C sees INIT sequence and creates "primary" conntrack entry (5 days)
> 1c) Middlebox D sees HEARTBEAT sequence and creates "secondary" conntrack entry (210 seconds)
> 1d) Path failure between A and C, and SCTP endpoint A switches to secondary path and continues sending data on the association
> 1e) SCTP endpoint A decides to SHUTDOWN the connection
> 1f) Middlebox C is in ESTABLISHED state, doesn't see any SHUTDOWN sequence or HEARTBEAT, waits for timeout (5 days)
> 1g) Middlebox D is in HEARTBEAT_ACKED state, doesn't care about SHUTDOWN sequence, waits for timeout (210 seconds)

I guess similar problem will occur with MP-TCP, and I am not sure
taking TCP to be more stateless is the way to address this.

> 2) Recently fixed by bff3d0534804 ("netfilter: conntrack: add sctp DATA_SENT state ")
> 2a) SCTP endpoint A sets up an association towards SCTP endpoint B
> 2b) Middlebox C sees INIT sequence and creates "primary" conntrack entry (5 days)
> 2c) Middlebox D sees DATA/SACK, and DROPS packets until HEARTBEAT is seen to setup "secondary" conntrack entry (210 seconds)

I assume this is already fixed.

Another possibility would be to introduce this alternative
state-machine and use it for multihoming?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-12 11:50             ` Pablo Neira Ayuso
@ 2023-01-13  9:04               ` Sriram Yagnaraman
  2023-01-16  9:37                 ` Sriram Yagnaraman
  0 siblings, 1 reply; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-13  9:04 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri



> -----Original Message-----
> From: Pablo Neira Ayuso <pablo@netfilter.org>
> Sent: Thursday, 12 January 2023 12:50
> To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> 
> On Wed, Jan 11, 2023 at 09:36:38AM +0000, Sriram Yagnaraman wrote:
> > > -----Original Message-----
> > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > Sent: Friday, 6 January 2023 01:50
> > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>; Long
> > > Xin <lxin@redhat.com>; Claudio Porfiri
> > > <claudio.porfiri@ericsson.com>
> > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > machine
> > >
> > > On Thu, Jan 05, 2023 at 12:11:44PM +0000, Sriram Yagnaraman wrote:
> > > > > -----Original Message-----
> > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > Sent: Thursday, 5 January 2023 12:54
> > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>;
> > > > > Long Xin <lxin@redhat.com>; Claudio Porfiri
> > > > > <claudio.porfiri@ericsson.com>
> > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > state machine
> > > > >
> > > > > On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman wrote:
> > > > > > > -----Original Message-----
> > > > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > > > Sent: Wednesday, 4 January 2023 16:02
> > > > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > > > <fw@strlen.de>; Marcelo Ricardo Leitner
> > > > > > > <mleitner@redhat.com>; Long Xin <lxin@redhat.com>
> > > > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > > > state machine
> > > > > > >
> > > > > > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram Yagnaraman
> > > wrote:
> > > > > > > > All the paths in an SCTP connection are kept alive either
> > > > > > > > by actual DATA/SACK running through the connection or by
> HEARTBEAT.
> > > > > > > > This patch proposes a simple state machine with only two
> > > > > > > > states OPEN_WAIT and ESTABLISHED (similar to UDP). The
> > > > > > > > reason for this change is a full stateful approach to SCTP
> > > > > > > > is difficult when the association is multihomed since the
> > > > > > > > endpoints could use different paths in the network during
> > > > > > > > the lifetime
> > > of an association.
> > > > > > >
> > > > > > > Do you mean the router/firewall might not see all packets
> > > > > > > for association is multihomed?
> > > > > > >
> > > > > > > Could you please provide an example?
> > > > > >
> > > > > > Let's say the primary and alternate/secondary paths between
> > > > > > the SCTP endpoints traverse different middle boxes. If an SCTP
> > > > > > endpoint detects network failure on the primary path, it will
> > > > > > switch to using the secondary path and all subsequent packets
> > > > > > will not be seen by the middlebox on the primary path,
> > > > > > including SHUTDOWN sequences if they happen at that time.
> > > > >
> > > > > OK, then on the primary middle box the SCTP flow will just timeout?
> > > > > (because no more packets are seen).
> > > >
> > > > Yes, they will timeout unless the primary path comes up before the
> > > > SHUTDOWN sequence. And the default timeout for an ESTABLISHED
> SCTP
> > > > connection is 5 days, which is a "long" time to clean-up this entry.
> > >
> > > Does the middle box have a chance to see any packet that provides a
> > > hint to shorten this timeout? no HEARTBEAT packets are seen in this
> > > case on the former primary path?
> >
> > There will be HEARTBEAT as soon as a path becomes unreachable from the
> > SCTP endpoints. But depending on the location of the network failure,
> > the middlebox may or may not see the HEARTBEAT.
> 
> Conntrack assumes you have see all traffic that belongs the flow for other
> protocols too.
> 
> > Also, HEARTBEAT is sent when there are no data to be transmitted or if
> > the path is unreachable/unconfirmed, so I think there is no
> > deterministic way of finding out when to shorten the timeout. Of
> > course, a user has the option of setting the ESTABLISHED state timeout
> > to a more reasonable value, for e.g., same as the HEARTBEAT_ACKED
> > state timeout (210 sec), OR we could reduce the default timeout of
> > ESTABLISHED to 210 sec.
> 
> Then just set up a short ESTABLISHED when multihoming is in place since the
> beginning.
> 
> > > What I am missing are a more detailed list of issues with the
> > > existing approach. Your patch description says "SCTP tracking with
> > > multihoming is difficult", probably a list of scenarios would help
> > > to understand the motivation to simplify the state machine.
> >
> > Thank you for reviewing and asking these questions, it made me step
> > back and think. I list below some background
> > - I want to simplify the state machine, because it is possible to
> > track an SCTP connection with fewer states, for e.g., SCTP with UDP
> > encapsulation uses UDP conntrack with just UNREPLIED/REPLIED states
> > and it works perfectly fine
> 
> I think it would preferrable to add some configuration via ruleset to track SCTP
> over UDP, rather than deranking SCTP to become almost stateless.

Okay 😊

> 
> > - My stakeholders, at the behest of whom I am proposing these changes
> > hit some problems running SCTP client endpoints behind NAT (inside
> > Kubernetes pods) towards multihomed SCTP server endpoints
> > (1a-g) and (2a-c) below
> > - Some upcoming SCTP protocol changes in IETF (if
> > approved/implemented) will make it hard to read beyond the SCTP common
> > header, for e.g., DTLS over SCTP
> > https://datatracker.ietf.org/doc/draft-ietf-tsvwg-dtls-over-sctp-bis/,
> > proposes to encrypt all SCTP chunks, conntrack will only be able to
> > see SCTP common header, these changes hopefully will make it easier to
> > adapt to such changes in SCTP protocol - While at it, I also made some
> > other "improvements"
> 
> For this DTLS case it should be possible to fall back to the SCTP "stateless"
> approach.
> 
> > 	a) Avoid multiple walk-throughs of SCTP chunks in sctp_new(),
> sctp_basic_checks() and nf_conntrack_sctp_packet(), and parse it only once
> > 	b) SCTP conntrack has the same state regardless of it is a primary or
> > a secondary path
> >
> > Let's say there are two SCTP endpoints A and B with addresses A' and B, B''
> correspondingly.
> > Primary path is A' <----> B' that traverses middlebox C, and secondary path is
> A' <----> B'' that traverses middlebox D.
> > 1) SHUTDOWN sent on secondary path
> > 1a) SCTP endpoint A sets up an association towards SCTP endpoint B
> > 1b) Middlebox C sees INIT sequence and creates "primary" conntrack
> > entry (5 days)
> > 1c) Middlebox D sees HEARTBEAT sequence and creates "secondary"
> > conntrack entry (210 seconds)
> > 1d) Path failure between A and C, and SCTP endpoint A switches to
> > secondary path and continues sending data on the association
> > 1e) SCTP endpoint A decides to SHUTDOWN the connection
> > 1f) Middlebox C is in ESTABLISHED state, doesn't see any SHUTDOWN
> > sequence or HEARTBEAT, waits for timeout (5 days)
> > 1g) Middlebox D is in HEARTBEAT_ACKED state, doesn't care about
> > SHUTDOWN sequence, waits for timeout (210 seconds)
> 
> I guess similar problem will occur with MP-TCP, and I am not sure taking TCP
> to be more stateless is the way to address this.

Ok, I am a newbie to this area and am most probably mistaken, so forgive my naive question below.
Shouldn't conntrack understand as less as possible about the protocol, and parse the bare minimum from the packet to detect that an active connection?
For packet filtering/firewall, I understand we will need deep packet inspection, but is conntrack the place to do that?

> 
> > 2) Recently fixed by bff3d0534804 ("netfilter: conntrack: add sctp
> > DATA_SENT state ")
> > 2a) SCTP endpoint A sets up an association towards SCTP endpoint B
> > 2b) Middlebox C sees INIT sequence and creates "primary" conntrack
> > entry (5 days)
> > 2c) Middlebox D sees DATA/SACK, and DROPS packets until HEARTBEAT is
> > seen to setup "secondary" conntrack entry (210 seconds)
> 
> I assume this is already fixed.
> 
> Another possibility would be to introduce this alternative state-machine and
> use it for multihoming?

Or I could unify the established states for both the connection that saw an INIT/INIT_ACK sequence and HEARTBEAT/HEARTBEAT_ACK sequence and use the HEARTBEAT_ACKED state timeout for both. That way, there is no difference from a conntrack perspective between "primary" and "secondary" connections. I can send another patch if the group here thinks this is a good idea.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
  2023-01-13  9:04               ` Sriram Yagnaraman
@ 2023-01-16  9:37                 ` Sriram Yagnaraman
  0 siblings, 0 replies; 14+ messages in thread
From: Sriram Yagnaraman @ 2023-01-16  9:37 UTC (permalink / raw)
  To: Sriram Yagnaraman, Pablo Neira Ayuso
  Cc: netfilter-devel@vger.kernel.org, Florian Westphal,
	Marcelo Ricardo Leitner, Long Xin, Claudio Porfiri



> -----Original Message-----
> From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> Sent: Friday, 13 January 2023 10:04
> To: Pablo Neira Ayuso <pablo@netfilter.org>
> Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> Subject: RE: [RFC PATCH] netfilter: conntrack: simplify sctp state machine
> 
> 
> 
> > -----Original Message-----
> > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > Sent: Thursday, 12 January 2023 12:50
> > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > Cc: netfilter-devel@vger.kernel.org; Florian Westphal <fw@strlen.de>;
> > Marcelo Ricardo Leitner <mleitner@redhat.com>; Long Xin
> > <lxin@redhat.com>; Claudio Porfiri <claudio.porfiri@ericsson.com>
> > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > machine
> >
> > On Wed, Jan 11, 2023 at 09:36:38AM +0000, Sriram Yagnaraman wrote:
> > > > -----Original Message-----
> > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > Sent: Friday, 6 January 2023 01:50
> > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>;
> > > > Long Xin <lxin@redhat.com>; Claudio Porfiri
> > > > <claudio.porfiri@ericsson.com>
> > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp state
> > > > machine
> > > >
> > > > On Thu, Jan 05, 2023 at 12:11:44PM +0000, Sriram Yagnaraman wrote:
> > > > > > -----Original Message-----
> > > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > > Sent: Thursday, 5 January 2023 12:54
> > > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > > <fw@strlen.de>; Marcelo Ricardo Leitner <mleitner@redhat.com>;
> > > > > > Long Xin <lxin@redhat.com>; Claudio Porfiri
> > > > > > <claudio.porfiri@ericsson.com>
> > > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify sctp
> > > > > > state machine
> > > > > >
> > > > > > On Thu, Jan 05, 2023 at 11:41:13AM +0000, Sriram Yagnaraman
> wrote:
> > > > > > > > -----Original Message-----
> > > > > > > > From: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > > > > > Sent: Wednesday, 4 January 2023 16:02
> > > > > > > > To: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> > > > > > > > Cc: netfilter-devel@vger.kernel.org; Florian Westphal
> > > > > > > > <fw@strlen.de>; Marcelo Ricardo Leitner
> > > > > > > > <mleitner@redhat.com>; Long Xin <lxin@redhat.com>
> > > > > > > > Subject: Re: [RFC PATCH] netfilter: conntrack: simplify
> > > > > > > > sctp state machine
> > > > > > > >
> > > > > > > > On Wed, Jan 04, 2023 at 12:31:43PM +0100, Sriram
> > > > > > > > Yagnaraman
> > > > wrote:
> > > > > > > > > All the paths in an SCTP connection are kept alive
> > > > > > > > > either by actual DATA/SACK running through the
> > > > > > > > > connection or by
> > HEARTBEAT.
> > > > > > > > > This patch proposes a simple state machine with only two
> > > > > > > > > states OPEN_WAIT and ESTABLISHED (similar to UDP). The
> > > > > > > > > reason for this change is a full stateful approach to
> > > > > > > > > SCTP is difficult when the association is multihomed
> > > > > > > > > since the endpoints could use different paths in the
> > > > > > > > > network during the lifetime
> > > > of an association.
> > > > > > > >
> > > > > > > > Do you mean the router/firewall might not see all packets
> > > > > > > > for association is multihomed?
> > > > > > > >
> > > > > > > > Could you please provide an example?
> > > > > > >
> > > > > > > Let's say the primary and alternate/secondary paths between
> > > > > > > the SCTP endpoints traverse different middle boxes. If an
> > > > > > > SCTP endpoint detects network failure on the primary path,
> > > > > > > it will switch to using the secondary path and all
> > > > > > > subsequent packets will not be seen by the middlebox on the
> > > > > > > primary path, including SHUTDOWN sequences if they happen at
> that time.
> > > > > >
> > > > > > OK, then on the primary middle box the SCTP flow will just timeout?
> > > > > > (because no more packets are seen).
> > > > >
> > > > > Yes, they will timeout unless the primary path comes up before
> > > > > the SHUTDOWN sequence. And the default timeout for an
> > > > > ESTABLISHED
> > SCTP
> > > > > connection is 5 days, which is a "long" time to clean-up this entry.
> > > >
> > > > Does the middle box have a chance to see any packet that provides
> > > > a hint to shorten this timeout? no HEARTBEAT packets are seen in
> > > > this case on the former primary path?
> > >
> > > There will be HEARTBEAT as soon as a path becomes unreachable from
> > > the SCTP endpoints. But depending on the location of the network
> > > failure, the middlebox may or may not see the HEARTBEAT.
> >
> > Conntrack assumes you have see all traffic that belongs the flow for
> > other protocols too.
> >
> > > Also, HEARTBEAT is sent when there are no data to be transmitted or
> > > if the path is unreachable/unconfirmed, so I think there is no
> > > deterministic way of finding out when to shorten the timeout. Of
> > > course, a user has the option of setting the ESTABLISHED state
> > > timeout to a more reasonable value, for e.g., same as the
> > > HEARTBEAT_ACKED state timeout (210 sec), OR we could reduce the
> > > default timeout of ESTABLISHED to 210 sec.
> >
> > Then just set up a short ESTABLISHED when multihoming is in place
> > since the beginning.
> >
> > > > What I am missing are a more detailed list of issues with the
> > > > existing approach. Your patch description says "SCTP tracking with
> > > > multihoming is difficult", probably a list of scenarios would help
> > > > to understand the motivation to simplify the state machine.
> > >
> > > Thank you for reviewing and asking these questions, it made me step
> > > back and think. I list below some background
> > > - I want to simplify the state machine, because it is possible to
> > > track an SCTP connection with fewer states, for e.g., SCTP with UDP
> > > encapsulation uses UDP conntrack with just UNREPLIED/REPLIED states
> > > and it works perfectly fine
> >
> > I think it would preferrable to add some configuration via ruleset to
> > track SCTP over UDP, rather than deranking SCTP to become almost
> stateless.
> 
> Okay 😊
> 
> >
> > > - My stakeholders, at the behest of whom I am proposing these
> > > changes hit some problems running SCTP client endpoints behind NAT
> > > (inside Kubernetes pods) towards multihomed SCTP server endpoints
> > > (1a-g) and (2a-c) below
> > > - Some upcoming SCTP protocol changes in IETF (if
> > > approved/implemented) will make it hard to read beyond the SCTP
> > > common header, for e.g., DTLS over SCTP
> > > https://datatracker.ietf.org/doc/draft-ietf-tsvwg-dtls-over-sctp-bis
> > > /, proposes to encrypt all SCTP chunks, conntrack will only be able
> > > to see SCTP common header, these changes hopefully will make it
> > > easier to adapt to such changes in SCTP protocol - While at it, I
> > > also made some other "improvements"
> >
> > For this DTLS case it should be possible to fall back to the SCTP "stateless"
> > approach.
> >
> > > 	a) Avoid multiple walk-throughs of SCTP chunks in sctp_new(),
> > sctp_basic_checks() and nf_conntrack_sctp_packet(), and parse it only
> > once
> > > 	b) SCTP conntrack has the same state regardless of it is a primary
> > > or a secondary path
> > >
> > > Let's say there are two SCTP endpoints A and B with addresses A' and B, B''
> > correspondingly.
> > > Primary path is A' <----> B' that traverses middlebox C, and
> > > secondary path is
> > A' <----> B'' that traverses middlebox D.
> > > 1) SHUTDOWN sent on secondary path
> > > 1a) SCTP endpoint A sets up an association towards SCTP endpoint B
> > > 1b) Middlebox C sees INIT sequence and creates "primary" conntrack
> > > entry (5 days)
> > > 1c) Middlebox D sees HEARTBEAT sequence and creates "secondary"
> > > conntrack entry (210 seconds)
> > > 1d) Path failure between A and C, and SCTP endpoint A switches to
> > > secondary path and continues sending data on the association
> > > 1e) SCTP endpoint A decides to SHUTDOWN the connection
> > > 1f) Middlebox C is in ESTABLISHED state, doesn't see any SHUTDOWN
> > > sequence or HEARTBEAT, waits for timeout (5 days)
> > > 1g) Middlebox D is in HEARTBEAT_ACKED state, doesn't care about
> > > SHUTDOWN sequence, waits for timeout (210 seconds)
> >
> > I guess similar problem will occur with MP-TCP, and I am not sure
> > taking TCP to be more stateless is the way to address this.
> 
> Ok, I am a newbie to this area and am most probably mistaken, so forgive my
> naive question below.
> Shouldn't conntrack understand as less as possible about the protocol, and
> parse the bare minimum from the packet to detect that an active connection?
> For packet filtering/firewall, I understand we will need deep packet inspection,
> but is conntrack the place to do that?
> 
> >
> > > 2) Recently fixed by bff3d0534804 ("netfilter: conntrack: add sctp
> > > DATA_SENT state ")
> > > 2a) SCTP endpoint A sets up an association towards SCTP endpoint B
> > > 2b) Middlebox C sees INIT sequence and creates "primary" conntrack
> > > entry (5 days)
> > > 2c) Middlebox D sees DATA/SACK, and DROPS packets until HEARTBEAT is
> > > seen to setup "secondary" conntrack entry (210 seconds)
> >
> > I assume this is already fixed.
> >
> > Another possibility would be to introduce this alternative
> > state-machine and use it for multihoming?
> 
> Or I could unify the established states for both the connection that saw an
> INIT/INIT_ACK sequence and HEARTBEAT/HEARTBEAT_ACK sequence and use
> the HEARTBEAT_ACKED state timeout for both. That way, there is no
> difference from a conntrack perspective between "primary" and "secondary"
> connections. I can send another patch if the group here thinks this is a good
> idea.
Here it is: https://lore.kernel.org/netfilter-devel/20230116093556.9437-1-sriram.yagnaraman@est.tech/T/#t


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2023-01-16  9:37 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-01-04 11:31 [RFC PATCH] netfilter: conntrack: simplify sctp state machine Sriram Yagnaraman
2023-01-04 12:41 ` Florian Westphal
2023-01-05 11:25   ` Sriram Yagnaraman
2023-01-04 15:02 ` Pablo Neira Ayuso
2023-01-05 11:41   ` Sriram Yagnaraman
2023-01-05 11:53     ` Pablo Neira Ayuso
2023-01-05 12:11       ` Sriram Yagnaraman
2023-01-06  0:50         ` Pablo Neira Ayuso
2023-01-11  9:36           ` Sriram Yagnaraman
2023-01-11 13:53             ` Marcelo Ricardo Leitner
2023-01-12 11:38               ` Sriram Yagnaraman
2023-01-12 11:50             ` Pablo Neira Ayuso
2023-01-13  9:04               ` Sriram Yagnaraman
2023-01-16  9:37                 ` Sriram Yagnaraman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox