All of lore.kernel.org
 help / color / mirror / Atom feed
* Expected SCTP DATA chunk per second performance
@ 2020-03-02  9:35 Harald Welte
  2020-03-02 11:41 ` Michael Tuexen
                   ` (5 more replies)
  0 siblings, 6 replies; 8+ messages in thread
From: Harald Welte @ 2020-03-02  9:35 UTC (permalink / raw)
  To: linux-sctp

[-- Attachment #1: Type: text/plain, Size: 1980 bytes --]

Hi!

I've been trying to implement some DIAMETER load testing, and I discovered that
it's apparently not my application layer code that is throttling the rate,
but the kernel SCTP stack.  I currently cannot get to more than 500 SCTP data
chunks per second on a 5.4.19 kernel (happy to try other versions).

The most simplistic setup to reproduce is:
* run a single-threaded SCTP server and SCTP client on the same machine
* use loopback / localhost for communication
* have the transmitter continuously transmit sctp_sendmsg() of 100-200 bytes
* have the receiver just sctp_recvmsg() and discard the data
* use a single stream in a single association for now to establish a base-line

Whether I use a complex diameter stack and test framework or whether I use a
simplistic 120 line C program that just transmits small data chunks, the
rate always is limited to about 500 DATA chunks per second.

In wireshark, I can see that up to 9 DATA chunks are aggregated into each SCTP
packet.  However, it typically takes the stack 203-201ms to send a SACK to each
of those packets.  Only after that SACK is received, it seems the sender is
transmitting more DATA chunks in the next packet.

I wonder if this is expected behavior?  As far as I understand, SCTP only has
a congestion window based on number of bytes, and not on number of chunks. The
windows as per INIT/INIT_ACK is at 160496 bytes, while 144 bytes * 9 chunks is
only 1296 bytes, i.e. the window cannot be full at all.

Any ideas what's happening here and how to increase the throughput in terms of
number of DATA chunks per second?

A demo program is attached for your reference

Thanks in advance.

Regards,
	Harald

-- 
- Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
============================================================================
"Privacy in residential applications is a desirable marketing option."
                                                  (ETSI EN 300 175-7 Ch. A6)

[-- Attachment #2: sctptest.c --]
[-- Type: text/x-csrc, Size: 4790 bytes --]

/*
 * Simple SCTP test program, original version by Daniel Mack
 * at https://gist.github.com/zonque/7d03568eab14a2bb57cb
 *
 * Modified in 2020 by Harald Welte <laforge@gnumonks.org> for
 * - DATA chunk rate testing.
 * - initial support for userspace SCTP stack testing
 *
 * Compile:
 *
 *   gcc sctptest.c -o server -lsctp -Wall
 *   ln -s server client
 *
 * Invoke:
 *
 *   ./client
 *   ./server
 */

/* Feature-test macros only take effect when defined before the first system header. */
#define _GNU_SOURCE

#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libgen.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#include <getopt.h>

#define HAVE_KERNEL_SCTP

#ifdef HAVE_KERNEL_SCTP
#include <netinet/sctp.h>
#define ext_socket socket
#define ext_bind bind
#define ext_setsockopt setsockopt
#define ext_listen listen
#define ext_accept accept
#define ext_close close
#define ext_connect connect
#else
/* sctplib + socketapi */
#include <ext_socket.h>
#include <sctp.h>
#endif

#define MY_PORT_NUM 62324

/*
 * Compute the difference between two timespecs: *result = *stop - *start.
 * When the raw nanosecond difference is negative, one second is borrowed
 * so that the nanosecond field is shifted up by 1e9.
 */
static void timespec_diff(const struct timespec *start, const struct timespec *stop,
			  struct timespec *result)
{
	time_t secs = stop->tv_sec - start->tv_sec;
	long nsecs = stop->tv_nsec - start->tv_nsec;

	if (nsecs < 0) {
		/* borrow one second from the seconds field */
		secs -= 1;
		nsecs += 1000000000;
	}

	result->tv_sec = secs;
	result->tv_nsec = nsecs;
}

/*
 * Fatal-error helper: print `s` together with the text for the current
 * errno via perror(), then terminate the process with exit status 1.
 */
static void die(const char *s)
{
	perror(s);
	exit(1);
}

/*
 * Server side: bind to MY_PORT_NUM on all local IPv4 addresses, then accept
 * one connection at a time, read and discard everything the peer sends, and
 * print how many receive calls returned data before the connection ended.
 *
 * Never returns.  Failures in socket setup or accept() terminate the process
 * via die().  argc/argv are accepted for symmetry with client() and unused.
 */
static void server(int argc, char **argv)
{
	struct sockaddr_in servaddr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(MY_PORT_NUM),
	};
	struct sctp_initmsg initmsg = {
		.sinit_num_ostreams = 5,
		.sinit_max_instreams = 5,
		.sinit_max_attempts = 4,
	};
	struct sctp_sndrcvinfo sndrcvinfo;
	int listen_fd, conn_fd, ret;

	(void) argc;
	(void) argv;

	listen_fd = ext_socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
	if (listen_fd < 0)
		die("socket");

	ret = ext_bind(listen_fd, (struct sockaddr *) &servaddr, sizeof(servaddr));
	if (ret < 0)
		die("bind");

	ret = ext_setsockopt(listen_fd, IPPROTO_SCTP, SCTP_INITMSG, &initmsg, sizeof(initmsg));
	if (ret < 0)
		die("setsockopt");

	/* the listen backlog simply reuses the configured inbound stream count */
	ret = ext_listen(listen_fd, initmsg.sinit_max_instreams);
	if (ret < 0)
		die("listen");

	for (;;) {
		char buffer[1024];
		unsigned int num_chunks_rcvd = 0;

		printf("Waiting for connection\n");
		fflush(stdout);

		conn_fd = ext_accept(listen_fd, (struct sockaddr *) NULL, NULL);
		if (conn_fd < 0)
			die("accept()");

		printf("New client connected\n");
		fflush(stdout);

		for (;;) {
			/*
			 * Start every call with cleared flags instead of an
			 * uninitialised or stale value.
			 * NOTE(review): lksctp-tools' sctp_recvmsg() is understood to
			 * pass *msg_flags to recvmsg() as input flags and then store the
			 * returned message flags in it -- confirm against the library
			 * version in use.
			 */
			int flags = 0;
			int in = sctp_recvmsg(conn_fd, buffer, sizeof(buffer), NULL, 0,
					      &sndrcvinfo, &flags);

			/* report a receive error instead of treating it as a silent EOF */
			if (in < 0)
				perror("sctp_recvmsg");
			if (in <= 0)
				break;
			num_chunks_rcvd++;
		}

		printf("Server: Received %u chunks, closing\n", num_chunks_rcvd);
		fflush(stdout);

		ext_close(conn_fd);
	}
}

/*
 * Parse the decimal value `arg` given for command-line option `name`.
 * Prints a diagnostic and exits with status 1 unless `arg` is a plain
 * non-negative integer in the range [min, UINT_MAX].
 */
static unsigned int parse_uint_arg(const char *name, const char *arg, unsigned int min)
{
	char *end = NULL;
	unsigned long val;

	errno = 0;
	val = strtoul(arg, &end, 10);
	/* strtoul() accepts a leading '-' and negates the value, so reject signs explicitly */
	if (end == arg || *end != '\0' || errno == ERANGE ||
	    strchr(arg, '-') != NULL || val > UINT_MAX || val < min) {
		fprintf(stderr, "invalid value '%s' for %s\n", arg, name);
		exit(1);
	}

	return (unsigned int) val;
}

/*
 * Client side: connect to 127.0.0.1:MY_PORT_NUM, send `num_chunks` messages of
 * `chunksize` bytes each with sctp_sendmsg(), and print the elapsed time and
 * the resulting message rate.
 *
 * Options (parsed from argc/argv):
 *   -n, --num-chunks N   number of messages to send (default 10000)
 *   -s, --chunk-size N   payload size in bytes, at least 1 (default 150)
 *
 * Any failing system or library call terminates the process via die().
 */
static void client(int argc, char **argv) {
	static const struct option long_options[] = {
		{ "num-chunks", 1, 0, 'n' },
		{ "chunk-size", 1, 0, 's' },
		{ 0, 0, 0, 0 }
	};
	struct sockaddr_in servaddr = {
		.sin_family = AF_INET,
		.sin_port = htons(MY_PORT_NUM),
		.sin_addr.s_addr = inet_addr("127.0.0.1"),
	};
	struct timespec ts_start, ts_stop, ts_diff;
	uint8_t *payload;
	unsigned int num_chunks = 10000;
	unsigned int chunksize = 150;
	int conn_fd, ret;

	for (;;) {
		int option_index = 0;
		int c = getopt_long(argc, argv, "n:s:", long_options, &option_index);

		if (c == -1)
			break;

		switch (c) {
		case 'n':
			num_chunks = parse_uint_arg("num-chunks", optarg, 0);
			break;
		case 's':
			chunksize = parse_uint_arg("chunk-size", optarg, 1);
			break;
		default:
			/* unknown options are ignored, as before */
			break;
		}
	}

	printf("About to send %u chunks of each %u bytes\n", num_chunks, chunksize);

	/* zero-filled so that no uninitialised heap contents are put on the wire */
	payload = calloc(1, chunksize);
	if (!payload)
		die("malloc()");

	conn_fd = ext_socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
	if (conn_fd < 0)
		die("socket()");

	ret = ext_connect(conn_fd, (struct sockaddr *) &servaddr, sizeof(servaddr));
	if (ret < 0)
		die("connect()");

	ret = clock_gettime(CLOCK_MONOTONIC_RAW, &ts_start);
	if (ret < 0)
		die("clock_gettime()");

	/* unsigned index: matches num_chunks and cannot overflow for large counts */
	for (unsigned int i = 0; i < num_chunks; i++) {
		ret = sctp_sendmsg(conn_fd, payload, chunksize, NULL, 0, 0, 0, 0, 0, 0 );
		if (ret < 0)
			die("sctp_sendmsg");
	}

	ret = clock_gettime(CLOCK_MONOTONIC_RAW, &ts_stop);
	if (ret < 0)
		die("clock_gettime()");
	timespec_diff(&ts_start, &ts_stop, &ts_diff);

	/* double keeps nanosecond resolution; float has only ~7 significant digits */
	double elapsed = (double) ts_diff.tv_sec + (double) ts_diff.tv_nsec / 1000000000.0;
	printf("%u DATA chunks of %u bytes each in %5.2f seconds: %5.2f DATA chunks per second\n",
		num_chunks, chunksize, elapsed, (double) num_chunks / elapsed);

	/* NOTE(review): server() closes via ext_close(); consider the same here for non-kernel stacks */
	close(conn_fd);
	free(payload);
}

/*
 * Entry point: the role is chosen from the name the binary was invoked as.
 * If the basename of argv[0] contains "server" the server role runs,
 * otherwise the client role runs.
 */
int main(int argc, char **argv)
{
	const char *progname = basename(argv[0]);

	if (strstr(progname, "server") != NULL)
		server(argc, argv);
	else
		client(argc, argv);

	return 0;
}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Expected SCTP DATA chunk per second performance
  2020-03-02  9:35 Expected SCTP DATA chunk per second performance Harald Welte
@ 2020-03-02 11:41 ` Michael Tuexen
  2020-03-02 12:37 ` Harald Welte
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Michael Tuexen @ 2020-03-02 11:41 UTC (permalink / raw)
  To: linux-sctp

> On 2. Mar 2020, at 10:35, Harald Welte <laforge@gnumonks.org> wrote:
> 
> Hi!
> 
> I've been trying to implement some DIAMETER load testing, and I discovered that
> it's apparently not my application layer code that is throttling the rate,
> but the kernel SCTP stack.  I currently cannot get to more than 500 SCTP data
> chunks per second on a 5.4.19 kernel (happy to try other versions).
> 
> The most simplistic setup to reproduce is:
> * run a single-threaded SCTP server and SCTP client on the same machine
> * use loopback / localhost for communication
> * have the transmitter continuously transmit sctp_sendmsg() of 100-200 bytes
> * have the receiver just sctp_recvmsg() and discard the data
> * use a single stream in a single association for now to establish a base-line
> 
> Whether I use a complex diameter stack and test framework or whether I use a
> simplistic 120 line C program that just transmits small data chunks, the
> rate always is limited to about 500 DATA chunks per second.
> 
> In wireshark, I can see that up to 9 DATA chunks are aggregated into each SCTP
> packet.  However, it typically takes the stack 203-201ms to send a SACK to each
That looks suspicious. It seems this is the 200ms delayed ACK timer. That is fine.
The question is why the sender is not sending more? I guess you can work around this
issue by disabling the Nagle Algorithm:
https://tools.ietf.org/html/rfc6458#section-8.1.5
Enable SCTP_NODELAY on the sender side. Does that fix the issue?
However, Nagle should not step into the game here...

Best regards
Michael
> of those packets.  Only after that SACK is received, it seems the sender is
> transmitting more DATA chunks in the next packet.
> 
> I wonder if this is expected behavior?  As far as I understand, SCTP only has
> a congestion window based on number of bytes, and not on number of chunks. The
> windows as per INIT/INIT_ACK is at 160496 bytes, while 144 bytes * 9 chunks is
> only 1296 bytes, i.e. the window cannot be full at all.
> 
> Any ideas what's happening here and how to increase the throughput in terms of
> number of DATA chunks per second?
> 
> A demo program is attached for your reference
> 
> Thanks in advance.
> 
> Regards,
> 	Harald
> 
> -- 
> - Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
> ======================================
> "Privacy in residential applications is a desirable marketing option."
>                                                  (ETSI EN 300 175-7 Ch. A6)
> <sctptest.c>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Expected SCTP DATA chunk per second performance
  2020-03-02  9:35 Expected SCTP DATA chunk per second performance Harald Welte
  2020-03-02 11:41 ` Michael Tuexen
@ 2020-03-02 12:37 ` Harald Welte
  2020-03-02 13:28 ` Michael Tuexen
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Harald Welte @ 2020-03-02 12:37 UTC (permalink / raw)
  To: linux-sctp

Hi Michael,

On Mon, Mar 02, 2020 at 12:41:57PM +0100, Michael Tuexen wrote:
> > In wireshark, I can see that up to 9 DATA chunks are aggregated into each SCTP
> > packet.  However, it typically takes the stack 203-201ms to send a SACK to each
>
> That looks suspicious. It seems this is the 200ms delayed ACK timer. That is fine.
> The question is why the sender is not sending more? I guess you can work around this
> issue by disabling the Nagle Algorithm:
> https://tools.ietf.org/html/rfc6458#section-8.1.5
> Enable SCTP_NODELAY on the sender side. Does that fix the issue?
> However, Nagle should not step into the game here...

I was thinking of SCTP_NODELAY before, but didn't do it as I thought it
would only impact the lower latency bound in sporadic communication, but
not throttle the transmit message rate?

I've just tried your suggestion, and indeed:

with SCTP_NODELAY=0
10000 DATA chunks of 150 bytes each in 19.59 seconds: 510.53 DATA chunks per second

with SCTP_NODELAY=1
10000 DATA chunks of 150 bytes each in  0.26 seconds: 38360.42 DATA chunks per second

So AFAICT there now is a work-around... but still I assume there is a bug in lksctp
if it throttles the overall message rate down to 1.3% of what it could
be when Nagle is enabled?

Regards,
	Harald

-- 
- Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
======================================
"Privacy in residential applications is a desirable marketing option."
                                                  (ETSI EN 300 175-7 Ch. A6)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Expected SCTP DATA chunk per second performance
  2020-03-02  9:35 Expected SCTP DATA chunk per second performance Harald Welte
  2020-03-02 11:41 ` Michael Tuexen
  2020-03-02 12:37 ` Harald Welte
@ 2020-03-02 13:28 ` Michael Tuexen
  2020-03-02 13:47 ` Harald Welte
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Michael Tuexen @ 2020-03-02 13:28 UTC (permalink / raw)
  To: linux-sctp

> On 2. Mar 2020, at 13:37, Harald Welte <laforge@gnumonks.org> wrote:
> 
> Hi Michael,
> 
> On Mon, Mar 02, 2020 at 12:41:57PM +0100, Michael Tuexen wrote:
>>> In wireshark, I can see that up to 9 DATA chunks are aggregated into each SCTP
>>> packet.  However, it typically takes the stack 203-201ms to send a SACK to each
>> 
>> That looks suspicious. It seems this is the 200ms delayed ACK timer. That is fine.
>> The question is why the sender is not sending more? I guess you can work around this
>> issue by disabling the Nagle Algorithm:
>> https://tools.ietf.org/html/rfc6458#section-8.1.5
>> Enable SCTP_NODELAY on the sender side. Does that fix the issue?
>> However, Nagle should not step into the game here...
> 
> I was thinking of SCTP_NODELAY before, but didn't do it as I thought it
> would only impact the lower latency bound in sporadic communication, but
> not throttle the transmit message rate?
> 
> I've just tried your suggestion, and indeed:
> 
> with SCTP_NODELAY=0
> 10000 DATA chunks of 150 bytes each in 19.59 seconds: 510.53 DATA chunks per second
> 
> with SCTP_NODELAY=1
> 10000 DATA chunks of 150 bytes each in  0.26 seconds: 38360.42 DATA chunks per second
> 
> So AFAICT there now is a work-around... but still I assume there is a bug in lksctp
> if it throttles the overall message rate down to 1.3% of what it could
> be when Nagle is enabled?
I consider it a bug. Nagle normally is implemented by not sending small packets.
From the numbers you provided, I guess the SCTP packets are about 1500 bytes. But
I guess Linux has an MTU on the loopback interface which is much larger.
So I guess one part of the code thinks the packet is full (you can't put another
chunk into it), so send it. Another part thinks the packet is not full, since the
MTU is much larger.

Similar bugs were in the FreeBSD stack. However, I'm not familiar with the Linux
code base. Someone else has to chime in. But it shouldn't be hard to find and fix.

Best regards
Michael
> 
> Regards,
> 	Harald
> 
> -- 
> - Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
> ======================================
> "Privacy in residential applications is a desirable marketing option."
>                                                  (ETSI EN 300 175-7 Ch. A6)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Expected SCTP DATA chunk per second performance
  2020-03-02  9:35 Expected SCTP DATA chunk per second performance Harald Welte
                   ` (2 preceding siblings ...)
  2020-03-02 13:28 ` Michael Tuexen
@ 2020-03-02 13:47 ` Harald Welte
  2020-03-02 15:26 ` David Laight
  2021-11-07 21:09 ` SCTP <= 500 pps unless SCTP_NODELAY set (was: Expected SCTP DATA chunk per second performance) Harald Welte
  5 siblings, 0 replies; 8+ messages in thread
From: Harald Welte @ 2020-03-02 13:47 UTC (permalink / raw)
  To: linux-sctp

Hi Michael,

On Mon, Mar 02, 2020 at 02:28:26PM +0100, Michael Tuexen wrote:

> I consider it a bug. 

Agreed.

> Nagle normally is implemented by not sending small packets.
> From the numbers you provided, I guess the SCTP packets are about 1500 bytes. But
> I guess Linux has an MTU on the loopback interface which is much larger.

Actually, it depends on the type of sender code I use.  With Eclipse TITAN IPL4asp
(my ultimate target for writing the tests), the packets indeed are only 1500 bytes
in size.

When using the small C program attached, I'm seeing ~34 kByte sized IP
packets on loopback, but only at 500-510 DATA chunks per second overall
rate.

When disabling NAGLE ('client -d' of the attached program), I'm getting
much higher throughput, but there is no single IP packet with more than
a single DATA chunk inside at all anymore.  The latter is expected on
the one hand side (every syscall goes all the way to build a packet and
send it), but given at the high sender rate I would have expected that
every so often multiple DATA chunks arrive from userspace before a
packet has been sent (socket send buffer)?  In any case, no complaints
in this case.

Also interesting: With the application code (TITAN) in place, I am
seeing higher DATA chunk throughput over actual Ethernet than I'm seeing
over loopback.

I'd appreciate any feedback from the lksctp hackers here if I should
open a bugzilla issue about the poor performance with Nagle.

Regards,
	Harald

-- 
- Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
======================================
"Privacy in residential applications is a desirable marketing option."
                                                  (ETSI EN 300 175-7 Ch. A6)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: Expected SCTP DATA chunk per second performance
  2020-03-02  9:35 Expected SCTP DATA chunk per second performance Harald Welte
                   ` (3 preceding siblings ...)
  2020-03-02 13:47 ` Harald Welte
@ 2020-03-02 15:26 ` David Laight
  2021-11-07 21:09 ` SCTP <= 500 pps unless SCTP_NODELAY set (was: Expected SCTP DATA chunk per second performance) Harald Welte
  5 siblings, 0 replies; 8+ messages in thread
From: David Laight @ 2020-03-02 15:26 UTC (permalink / raw)
  To: linux-sctp

From: Harald Welte
> Sent: 02 March 2020 13:48
...
> When disabling NAGLE ('client -d' of the attached program), I'm getting
> much higher throughput, but there is no single IP packet with more than
> a single DATA chunk inside at all anymore.  The latter is expected on
> the one hand side (every syscall goes all the way to build a packet and
> send it), but given at the high sender rate I would have expected that
> every so often multiple DATA chunks arrive from userspace before a
> packet has been sent (socket send buffer)?  In any case, no complaints
> in this case.

You'd have to flow control off (ie no ack from the remote system) the
connections before data chunks get merged.

If you know you have another data chunk to send, set MSG_MORE on the send.
That should stop the packet being sent until it is full.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* SCTP <= 500 pps unless SCTP_NODELAY set (was: Expected SCTP DATA chunk per second performance)
  2020-03-02  9:35 Expected SCTP DATA chunk per second performance Harald Welte
                   ` (4 preceding siblings ...)
  2020-03-02 15:26 ` David Laight
@ 2021-11-07 21:09 ` Harald Welte
  2026-05-13 22:52   ` Jonas Falkevik
  5 siblings, 1 reply; 8+ messages in thread
From: Harald Welte @ 2021-11-07 21:09 UTC (permalink / raw)
  To: linux-sctp

Dear list,

after quite some time I'm again involved in a project that involved performance
testing of SCTP based protocols.

Unfortunately it seems that the problems I reported in March 2020 on kernel 5.4.19
still persist at least up to 5.10.46

Using the same program from that old thread, I can not reach more than 500 DATA
chunks per second unless the SCTP_NODELAY option is set.  When I set SCTP_NODELAY,
I'm reaching up to 77k DATA chunks per second on my venerable Thinkpad x260.

So I still think there is the fundamental problem:

  Nagle should batch up multiple DATA chunks in each packet, but it should not
  reduce the sending rate to a staggering 0.6% of the throughput without nagle

Can anybody let me know from which kernel version onwards this problem was addressed,
if any?

In general, it is of course useful to have Nagle enabled:  The expectation is that
it would batch up DATA chunks until the MTU is reached to ensure fewer larger packets.

When Nagle is disabled with SCTP_NODELAY, it is one IP packet per DATA chunk, which
is of course quite suboptimal in many traditional SCTP use cases such as SIGTRAN
where the encapsulated SS7 messages are unlikely to be > 256 bytes each.

Regards,
	Harald

On Mon, Mar 02, 2020 at 10:35:32AM +0100, Harald Welte wrote:
> Hi!
> 
> I've been trying to implement some DIAMETER load testing, and I discovered that
> it's apparently not my application layer code that is throttling the rate,
> but the kernel SCTP stack.  I currently cannot get to more than 500 SCTP data
> chunks per second on a 5.4.19 kernel (happy to try other versions).
> 
> The most simplistic setup to reproduce is:
> * run a single-threaded SCTP server and SCTP client on the same machine
> * use loopback / localhost for communication
> * have the transmitter continuously transmit sctp_sendmsg() of 100-200 bytes
> * have the receiver just sctp_recvmsg() and discard the data
> * use a single stream in a single association for now to establish a base-line
> 
> Whether I use a complex diameter stack and test framework or whether I use a
> simplistic 120 line C program that just transmits small data chunks, the
> rate always is limited to about 500 DATA chunks per second.
> 
> In wireshark, I can see that up to 9 DATA chunks are aggregated into each SCTP
> packet.  However, it typically takes the stack 203-201ms to send a SACK to each
> of those packets.  Only after that SACK is received, it seems the sender is
> transmitting more DATA chunks in the next packet.
> 
> I wonder if this is expected behavior?  As far as I understand, SCTP only has
> a congestion window based on number of bytes, and not on number of chunks. The
> windows as per INIT/INIT_ACK is at 160496 bytes, while 144 bytes * 9 chunks is
> only 1296 bytes, i.e. the window cannot be full at all.
> 
> Any ideas what's happening here and how to increase the throughput in terms of
> number of DATA chunks per second?
> 
> A demo program is attached for your reference
> 
> Thanks in advance.
> 
> Regards,
> 	Harald
> 
> -- 
> - Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
> ============================================================================
> "Privacy in residential applications is a desirable marketing option."
>                                                   (ETSI EN 300 175-7 Ch. A6)

> /*
>  * Simple SCTP test program, original version by Daniel Mack
>  * at https://gist.github.com/zonque/7d03568eab14a2bb57cb
>  *
>  * Modified in 2020 by Harald Welte <laforge@gnumonks.org> for
>  * - DATA chunk rate testing.
>  * - initial support for userspace SCTP stack testing
>  *
>  * Compile:
>  *
>  *   gcc sctptest.c -o server -lsctp -Wall
>  *   ln -s server client
>  *
>  * Invoke:
>  *
>  *   ./client
>  *   ./server
>  */
> 
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <libgen.h>
> #include <time.h>
> #include <sys/socket.h>
> #include <sys/types.h>
> #include <netinet/in.h>
> #include <arpa/inet.h>
> 
> #define _GNU_SOURCE
> #include <getopt.h>
> 
> #define HAVE_KERNEL_SCTP
> 
> #ifdef HAVE_KERNEL_SCTP
> #include <netinet/sctp.h>
> #define ext_socket socket
> #define ext_bind bind
> #define ext_setsockopt setsockopt
> #define ext_listen listen
> #define ext_accept accept
> #define ext_close close
> #define ext_connect connect
> #else
> /* sctplib + socketapi */
> #include <ext_socket.h>
> #include <sctp.h>
> #endif
> 
> #define MY_PORT_NUM 62324
> 
> /* compute differece between two timespec */
> static void timespec_diff(const struct timespec *start, const struct timespec *stop,
> 			  struct timespec *result)
> {
> 	if ((stop->tv_nsec - start->tv_nsec) < 0) {
> 		result->tv_sec = stop->tv_sec - start->tv_sec - 1;
> 		result->tv_nsec = stop->tv_nsec - start->tv_nsec + 1000000000;
> 	} else {
> 		result->tv_sec = stop->tv_sec - start->tv_sec;
> 		result->tv_nsec = stop->tv_nsec - start->tv_nsec;
> 	}
> }
> 
> static void die(const char *s) {
> 	perror(s);
> 	exit(1);
> }
> 
> static void server(int argc, char **argv)
> {
> 	struct sockaddr_in servaddr = {
> 		.sin_family = AF_INET,
> 		.sin_addr.s_addr = htonl(INADDR_ANY),
> 		.sin_port = htons(MY_PORT_NUM),
> 	};
> 	struct sctp_initmsg initmsg = {
> 		.sinit_num_ostreams = 5,
> 		.sinit_max_instreams = 5,
> 		.sinit_max_attempts = 4,
> 	};
> 	struct sctp_sndrcvinfo sndrcvinfo;
> 	int listen_fd, conn_fd, flags, ret, in;
> 
> 	listen_fd = ext_socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
> 	if (listen_fd < 0)
> 		die("socket");
> 
> 	ret = ext_bind(listen_fd, (struct sockaddr *) &servaddr, sizeof(servaddr));
> 	if (ret < 0)
> 		die("bind");
> 
> 	ret = ext_setsockopt(listen_fd, IPPROTO_SCTP, SCTP_INITMSG, &initmsg, sizeof(initmsg));
> 	if (ret < 0)
> 		die("setsockopt");
> 
> 	ret = ext_listen(listen_fd, initmsg.sinit_max_instreams);
> 	if (ret < 0)
> 		die("listen");
> 
> 	for (;;) {
> 		char buffer[1024];
> 		unsigned int num_chunks_rcvd;
> 
> 		printf("Waiting for connection\n");
> 		fflush(stdout);
> 
> 		conn_fd = ext_accept(listen_fd, (struct sockaddr *) NULL, NULL);
> 		if(conn_fd < 0)
> 			die("accept()");
> 
> 		printf("New client connected\n");
> 		fflush(stdout);
> 		num_chunks_rcvd = 0;
> 
> 		while (1) {
> 			in = sctp_recvmsg(conn_fd, buffer, sizeof(buffer), NULL, 0, &sndrcvinfo, &flags);
> 			if (in <= 0)
> 				break;
> 			num_chunks_rcvd++;
> 		}
> 
> 		printf("Server: Received %u chunks, closing\n", num_chunks_rcvd);
> 		fflush(stdout);
> 
> 		ext_close(conn_fd);
> 	}
> }
> 
> static void client(int argc, char **argv) {
> 	struct sockaddr_in servaddr = {
> 		.sin_family = AF_INET,
> 		.sin_port = htons(MY_PORT_NUM),
> 		.sin_addr.s_addr = inet_addr("127.0.0.1"),
> 	};
> 	struct timespec ts_start, ts_stop, ts_diff;
> 	uint8_t *payload;
> 	unsigned int num_chunks = 10000;
> 	unsigned int chunksize = 150;
> 	int conn_fd, ret;
> 
> 	while (1) {
> 		int option_index = 0, c;
> 		const struct option long_options[] = {
> 			{ "num-chunks", 1, 0, 'n' },
> 			{ "chunk-size", 1, 0, 's' },
> 			{ 0, 0, 0, 0 }
> 		};
> 
> 		c = getopt_long(argc, argv, "n:s:", long_options, &option_index);
> 		if (c == -1)
> 			break;
> 
> 		switch (c) {
> 		case 'n':
> 			num_chunks = atoi(optarg);
> 			break;
> 		case 's':
> 			chunksize = atoi(optarg);
> 			break;
> 		default:
> 			break;
> 		}
> 	}
> 
> 	printf("About to send %u chunks of each %u bytes\n", num_chunks, chunksize);
> 
> 	payload = malloc(chunksize);
> 	if (!payload)
> 		die("malloc()");
> 
> 	conn_fd = ext_socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
> 	if (conn_fd < 0)
> 		die("socket()");
> 
> 	ret = ext_connect(conn_fd, (struct sockaddr *) &servaddr, sizeof(servaddr));
> 	if (ret < 0)
> 		die("connect()");
> 
> 	ret = clock_gettime(CLOCK_MONOTONIC_RAW, &ts_start);
> 	if (ret < 0)
> 		die("clock_gettime()");
> 
> 	for (int i = 0; i < num_chunks; i++) {
> 		ret = sctp_sendmsg(conn_fd, payload, chunksize, NULL, 0, 0, 0, 0, 0, 0 );
> 		if (ret < 0)
> 			die("sctp_sendmsg");
> 	}
> 
> 	ret = clock_gettime(CLOCK_MONOTONIC_RAW, &ts_stop);
> 	if (ret < 0)
> 		die("clock_gettime()");
> 	timespec_diff(&ts_start, &ts_stop, &ts_diff);
> 	float diff_f = (float)ts_diff.tv_sec + (float)ts_diff.tv_nsec/1000000000.0;
> 	printf("%u DATA chunks of %u bytes each in %5.2f seconds: %5.2f DATA chunks per second\n",
> 		num_chunks, chunksize, diff_f, (float)num_chunks/diff_f);
> 
> 	close(conn_fd);
> 
> }
> 
> int main(int argc, char **argv) {
> 
> 	if (strstr(basename(argv[0]), "server"))
> 		server(argc, argv);
> 	else
> 		client(argc, argv);
> 
> 	return 0;
> }


-- 
- Harald Welte <laforge@gnumonks.org>           http://laforge.gnumonks.org/
============================================================================
"Privacy in residential applications is a desirable marketing option."
                                                  (ETSI EN 300 175-7 Ch. A6)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: SCTP <= 500 pps unless SCTP_NODELAY set (was: Expected SCTP DATA chunk per second performance)
  2021-11-07 21:09 ` SCTP <= 500 pps unless SCTP_NODELAY set (was: Expected SCTP DATA chunk per second performance) Harald Welte
@ 2026-05-13 22:52   ` Jonas Falkevik
  0 siblings, 0 replies; 8+ messages in thread
From: Jonas Falkevik @ 2026-05-13 22:52 UTC (permalink / raw)
  To: Harald Welte; +Cc: linux-sctp

> Unfortuantely it seems that the problems I reported in March 2020 on kernel 5.4.19
> still persist at least up to 5.10.46

I ran into this problem as well when doing some testing.
And the problem still exists as far as I can tell.
After looking into the details it seems to be related to pathmtu and
sndbuf size.
Since the messages sent out are quite small, the overhead for bundling
the outgoing data eats up the sndbuf before hitting the pathmtu size boundary.

This makes the send block with only 1 packet in flight,
which is not acked by the remote side until SACK Delay or number of packets
set with SACK freq, usually set to 2.
Sender is blocked by sndbuf and can't fill the Nagle bundle buffer
for the second packet to go out.

Verified send is blocked in sctp_wait_for_sndbuf by:
Small change to make the function traceable.

 /* Helper function to wait for space in the sndbuf.  */
-static int sctp_wait_for_sndbuf(struct sctp_association *asoc,
+static noinline int sctp_wait_for_sndbuf(struct sctp_association *asoc,
                                struct sctp_transport *transport,
                                long *timeo_p, size_t msg_len)

root@virtme-ng:/home/jonas/tmp# bpftrace -e
'kprobe:sctp_wait_for_sndbuf { @callstack[kstack] = count();}'
Attaching 1 probe...
^C

@callstack[
    sctp_wait_for_sndbuf+1
    sctp_sendmsg_to_asoc+362
    sctp_sendmsg+1619
    ____sys_sendmsg+376
    ___sys_sendmsg+153
    __sys_sendmsg+136
    do_syscall_64+270
    entry_SYSCALL_64_after_hwframe+119
]: 47


Then made a small change where the max size for bundling is
min(pathmtu, SCTP_DEFAULT_MAXSEGMENT).
Would such a patch be of interest?
Or add a max data chunk counter to cap on?

$ ./client -n 1000000
About to send 1000000 chunks of each 150 bytes
1000000 DATA chunks of 150 bytes each in  0.95 seconds: 1051674.12
DATA chunks per second

The problem only manifests itself when sending data where you can fill
the sndbuf before hitting pathmtu size.
Bumping the sndbuf to approx 5,6x pathmtu should work as well.

-Jonas

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-05-13 22:52 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-03-02  9:35 Expected SCTP DATA chunk per second performance Harald Welte
2020-03-02 11:41 ` Michael Tuexen
2020-03-02 12:37 ` Harald Welte
2020-03-02 13:28 ` Michael Tuexen
2020-03-02 13:47 ` Harald Welte
2020-03-02 15:26 ` David Laight
2021-11-07 21:09 ` SCTP <= 500 pps unless SCTP_NODELAY set (was: Expected SCTP DATA chunk per second performance) Harald Welte
2026-05-13 22:52   ` Jonas Falkevik

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.