public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: "Ahmed, Aaron" <aarnahmd@amazon.com>
To: Kuniyuki Iwashima <kuniyu@google.com>
Cc: "stable@vger.kernel.org" <stable@vger.kernel.org>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"ncardwell@google.com" <ncardwell@google.com>,
	"edumazet@google.com" <edumazet@google.com>
Subject: Re: [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
Date: Mon, 27 Apr 2026 22:26:58 +0000	[thread overview]
Message-ID: <A7A3F2FE-B18C-4F6D-A5E4-78164D6904F5@amazon.com> (raw)
In-Reply-To: <CAAVpQUCfMsWBpPpywbwBLRCdHUqWqFBoDK=17dwDkG6T0dQxzw@mail.gmail.com>

Hi Kuniyuki!

Thanks for taking a look! To clarify the issue: the problem shows up on long-running servers
with many concurrent connections. The original reproducer exits
right after closing the sockets, so the memory gets cleaned up at
process exit. In production the server never exits, so the memory
just keeps growing. Is this expected behavior?

I've written an updated reproducer that models a persistent
server. You can pass 0 or 1 as an argument to set the l_linger value.

This outputs the following:

When l_linger=0:

  TCP: inuse 7 orphan 0 tw 2 alloc 100009 mem 197259

When l_linger=1:

  TCP: inuse 50008 orphan 0 tw 5 alloc 50009 mem 14426

With l_linger=0, only 7 sockets are in use but ~770 MB of TCP
memory has no owner. With l_linger=1, 50,008 sockets are in use
but only ~56 MB of memory.

Updated reproducer:

  Build:  gcc -O2 -pthread -o tcp_linger_memleak tcp_linger_memleak.c
  Run:    ulimit -n 100000
          sudo sysctl -w net.core.wmem_max=4194304
          sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
          ./tcp_linger_memleak 0    # l_linger=0 -> leaks TCP memory
          ./tcp_linger_memleak 1    # l_linger=1 -> no leak

---8<---
/* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
 *
 * Build:  gcc -O2 -pthread -o tcp_linger_memleak tcp_linger_memleak.c
 * Run:    ulimit -n 100000
 *         sudo sysctl -w net.core.wmem_max=4194304
 *         sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
 *         ./tcp_linger_memleak [linger_sec]
 *           linger_sec=0 (default) -> leaks memory
 *           linger_sec=1           -> no leak
 *
 * Monitor: watch -n5 'cat /proc/net/sockstat; echo ---; free -m'
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <pthread.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>

#define PORT          6666
#define NUM_THREADS   8
#define MAX_CLIENTS   4096
#define NUM_CONNS     25000
#define WRITE_INTERVAL_MS 200
#define MSG_SIZE_MIN  128
#define MSG_SIZE_MAX  2046
#define CLIENT_RDBUF  10240

/* l_linger seconds (from argv[1]) applied when a worker disconnects a client. */
static int g_linger_sec = 0;

/*
 * Per-thread server state.  The accept loop in main() appends newly
 * accepted client fds under @lock; worker_thread() sends to those fds
 * and removes any fd whose send() fails.
 */
struct worker {
	pthread_mutex_t lock;	/* protects fds[], bufsz[] and count */
	int    fds[MAX_CLIENTS];	/* connected client sockets */
	int    bufsz[MAX_CLIENTS];	/* per-client payload size for send() */
	int    count;	/* number of live entries in fds[]/bufsz[] */
	int    pipe_rd;	/* tick pipe, read end: one byte per write round */
	int    pipe_wr;	/* tick pipe, write end (written by tick_thread) */
};

/* One worker per server thread; the accept loop assigns fds round-robin. */
static struct worker workers[NUM_THREADS];

/*
 * Server worker: each tick (one byte read from the pipe) walks the
 * worker's client list and sends one payload per client.  A client whose
 * send() fails is closed with SO_LINGER (l_linger = g_linger_sec) and
 * removed by swapping in the last entry — which is why the index only
 * advances on a successful send.
 */
static void *worker_thread(void *arg)
{
	struct worker *w = arg;
	char payload[MSG_SIZE_MAX];
	char tick;

	memset(payload, 'A', sizeof(payload));

	/* Block until tick_thread signals a round; exit on pipe EOF/error. */
	while (read(w->pipe_rd, &tick, 1) > 0) {
		pthread_mutex_lock(&w->lock);

		for (int idx = 0; idx < w->count; ) {
			if (send(w->fds[idx], payload, w->bufsz[idx],
				 MSG_NOSIGNAL) >= 0) {
				idx++;
				continue;
			}

			/* send() failed (typically EAGAIN on a full send
			 * buffer): disconnect this client with the linger
			 * setting under test. */
			struct linger lg = {
				.l_onoff = 1,
				.l_linger = g_linger_sec
			};

			setsockopt(w->fds[idx], SOL_SOCKET, SO_LINGER,
				   &lg, sizeof(lg));
			close(w->fds[idx]);
			/* Swap-remove; keep idx so the moved-in entry is
			 * handled on the next iteration. */
			w->fds[idx] = w->fds[w->count - 1];
			w->bufsz[idx] = w->bufsz[w->count - 1];
			w->count--;
		}

		pthread_mutex_unlock(&w->lock);
	}
	return NULL;
}

/*
 * Pacing thread: every WRITE_INTERVAL_MS milliseconds, wake every worker
 * by writing one byte into its tick pipe.
 */
static void *tick_thread(void *arg)
{
	const char tick = 1;

	(void)arg;
	for (;;) {
		usleep(WRITE_INTERVAL_MS * 1000);
		for (int t = 0; t < NUM_THREADS; t++)
			write(workers[t].pipe_wr, &tick, 1);
	}
	return NULL;
}

/*
 * Child process: open NUM_CONNS connections to the local server, then
 * drain each socket slowly (small non-blocking reads every 50 ms) so the
 * server-side send buffers stay full.  Never returns.
 */
static void run_client(void)
{
	struct sockaddr_in srv = {
		.sin_family = AF_INET,
		.sin_port = htons(PORT),
	};
	int conns[NUM_CONNS];
	char drain[CLIENT_RDBUF];
	int made = 0;

	inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

	while (made < NUM_CONNS) {
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0) {
			/* fd pressure: back off and retry the same slot. */
			usleep(1000);
			continue;
		}
		if (connect(fd, (struct sockaddr *)&srv,
			    sizeof(srv)) < 0) {
			close(fd);
			usleep(1000);
			continue;
		}

		int one = 1;

		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
			   &one, sizeof(one));
		conns[made] = fd;
		if (made % 1000 == 0)
			printf("Client: %d connections established\n", made);
		made++;
		usleep(100);
	}

	printf("Client: all %d connections established, reading slowly...\n",
	       NUM_CONNS);

	for (;;) {
		for (int i = 0; i < NUM_CONNS; i++) {
			if (conns[i] < 0)
				continue;

			ssize_t got = recv(conns[i], drain, sizeof(drain),
					   MSG_DONTWAIT);

			/* Drop sockets the server closed or reset. */
			if (got == 0 || (got < 0 && errno != EAGAIN &&
					 errno != EWOULDBLOCK)) {
				close(conns[i]);
				conns[i] = -1;
			}
		}
		usleep(50000);
	}
}

/*
 * Server process.  Spawns NUM_THREADS worker threads plus a tick thread,
 * forks the slow-reading client, then accepts connections forever and
 * hands each accepted fd to a worker round-robin.
 *
 * argv[1] (optional) is the SO_LINGER l_linger value used when a worker
 * disconnects a client: 0 reproduces the leak, 1 does not.
 *
 * Fix vs. original: socket/bind/listen/pipe/pthread_create/fork results
 * are now checked.  In particular, a failed bind() (port already in use)
 * previously left the program spinning forever in a hot accept() loop
 * that returned -1 on every call.
 */
int main(int argc, char *argv[])
{
	g_linger_sec = (argc > 1) ? atoi(argv[1]) : 0;

	printf("SO_LINGER l_linger=%d\n", g_linger_sec);
	printf("Monitor: watch -n5 'cat /proc/net/sockstat'\n\n");

	/* Workers write to sockets the peer may have reset; ignore EPIPE. */
	signal(SIGPIPE, SIG_IGN);

	for (int t = 0; t < NUM_THREADS; t++) {
		int pfd[2];
		pthread_t tid;

		pthread_mutex_init(&workers[t].lock, NULL);
		workers[t].count = 0;
		if (pipe(pfd) < 0) {
			perror("pipe");
			return 1;
		}
		workers[t].pipe_rd = pfd[0];
		workers[t].pipe_wr = pfd[1];
		if (pthread_create(&tid, NULL, worker_thread,
				   &workers[t]) != 0) {
			perror("pthread_create");
			return 1;
		}
		pthread_detach(tid);
	}

	pthread_t tick_tid;

	if (pthread_create(&tick_tid, NULL, tick_thread, NULL) != 0) {
		perror("pthread_create");
		return 1;
	}
	pthread_detach(tick_tid);

	/* The child retries connect() until the parent starts listening,
	 * so forking before listen() below is safe. */
	pid_t child = fork();

	if (child < 0) {
		perror("fork");
		return 1;
	}
	if (child == 0) {
		run_client();
		_exit(0);
	}

	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(PORT),
		.sin_addr.s_addr = htonl(INADDR_ANY)
	};
	int opt = 1;
	int lsn = socket(AF_INET, SOCK_STREAM, 0);

	if (lsn < 0) {
		perror("socket");
		return 1;
	}
	setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
	/* Fail fast: without these checks a failed bind/listen leaves the
	 * accept() loop below spinning on -1 forever. */
	if (bind(lsn, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		return 1;
	}
	if (listen(lsn, 4096) < 0) {
		perror("listen");
		return 1;
	}

	int thread_idx = 0;
	unsigned long accepted = 0;

	while (1) {
		int fd = accept(lsn, NULL, NULL);

		if (fd < 0)
			continue;

		opt = 1;
		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
			   &opt, sizeof(opt));

		/* Non-blocking so worker send() returns EAGAIN instead of
		 * stalling when a client stops reading. */
		int flags = fcntl(fd, F_GETFL, 0);

		fcntl(fd, F_SETFL, flags | O_NONBLOCK);

		/* Large SO_SNDBUF so each stuck connection can queue a lot
		 * of unsent data (requires net.core.wmem_max raised). */
		int sndbuf = 4 * 1024 * 1024;

		setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
			   &sndbuf, sizeof(sndbuf));

		struct worker *w = &workers[thread_idx % NUM_THREADS];

		pthread_mutex_lock(&w->lock);
		if (w->count < MAX_CLIENTS) {
			w->fds[w->count] = fd;
			w->bufsz[w->count] = MSG_SIZE_MIN +
				(rand() % (MSG_SIZE_MAX - MSG_SIZE_MIN));
			w->count++;
		} else {
			close(fd);	/* worker full: shed the connection */
		}
		pthread_mutex_unlock(&w->lock);

		thread_idx++;
		accepted++;
		if (accepted % 5000 == 0)
			printf("Server: accepted %lu connections\n",
			       accepted);
	}
}
---8<---

Thanks,
Aaron

On 4/17/26, 5:45 PM, "Kuniyuki Iwashima" <kuniyu@google.com <mailto:kuniyu@google.com>> wrote:


CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.






Hi Aaron :)


Thanks for the report.


On Fri, Apr 17, 2026 at 5:20 PM Ahmed, Aaron <aarnahmd@amazon.com <mailto:aarnahmd@amazon.com>> wrote:
>
> Hi,
>
> We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.
>
> Overview:
>
> The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.
>
> Reproducer:
> ```
> /* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
> *
> * Build: gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
> * Run: sudo sysctl -w net.core.wmem_max=4194304
> * sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
> * ./tcp_linger_memleak
> */
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <signal.h>
> #include <sys/socket.h>
> #include <sys/wait.h>
> #include <netinet/in.h>
>
> #define NUM_CONNS 5000
> #define PORT 6666
>
> static void print_mem(const char *label) {
> FILE *f;
> char line[256];
> f = fopen("/proc/meminfo", "r");
> while (fgets(line, sizeof(line), f))
> if (strncmp(line, "MemAvailable:", 13) == 0)
> printf("%s: %s", label, line);
> fclose(f);
> f = fopen("/proc/net/sockstat", "r");
> while (fgets(line, sizeof(line), f))
> if (strncmp(line, "TCP:", 4) == 0)
> printf("%s: %s", label, line);
> fclose(f);
> }
>
> int main(void) {
> struct sockaddr_in addr = {
> .sin_family = AF_INET,
> .sin_port = htons(PORT),
> .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
> };
> int opt = 1;
> signal(SIGPIPE, SIG_IGN);
>
> int lsn = socket(AF_INET, SOCK_STREAM, 0);
> setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
> bind(lsn, (struct sockaddr *)&addr, sizeof(addr));
> listen(lsn, NUM_CONNS);
>
> /* Fork client: connect N times, never read */
> pid_t child = fork();
> if (child == 0) {
> int fds[NUM_CONNS];
> for (int i = 0; i < NUM_CONNS; i++) {
> fds[i] = socket(AF_INET, SOCK_STREAM, 0);
> connect(fds[i], (struct sockaddr *)&addr, sizeof(addr));
> }
> pause(); /* sit forever, never read */
> _exit(0);
> }
>
> /* Accept all connections */
> int clients[NUM_CONNS];
> for (int i = 0; i < NUM_CONNS; i++)
> clients[i] = accept(lsn, NULL, NULL);
>
> /* Freeze client so it stops reading */
> kill(child, SIGSTOP);
> printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
> print_mem("BEFORE");
>
> /* Fill buffers and close with SO_LINGER(1,0) */
> char buf[2048];
> memset(buf, 'A', sizeof(buf));
> for (int i = 0; i < NUM_CONNS; i++) {
> int flags = fcntl(clients[i], F_GETFL, 0);
> fcntl(clients[i], F_SETFL, flags | O_NONBLOCK);
> while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0);
> struct linger lg = { .l_onoff = 1, .l_linger = 0 };
> setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
> close(clients[i]);
> }
>
> sleep(2);
> printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
> print_mem("AFTER");
> kill(child, SIGKILL);
> waitpid(child, NULL, 0);
> close(lsn);
> return 0;
> }
> ```
> Output (Tested on 6.18.20):
> ```
> === 5000 connections established, client frozen ===
> BEFORE: MemAvailable: 95491288 kB
> BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0
>
> === All sockets closed with SO_LINGER(1,0) ===
> AFTER: MemAvailable: 95321800 kB
> AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
> ```


Unfortunately, it dies immediately on my end.


=== 5000 connections established, client frozen ===
Segmentation fault (core dumped) ./linux/tcp_linger




Did you see actual memory leak with kmemleak or is it
just the tcp_mem counter that is really leaked ?


# echo clear > /sys/kernel/debug/kmemleak
~ run repro ~
# echo scan > /sys/kernel/debug/kmemleak




  parent reply	other threads:[~2026-04-27 22:27 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-18  0:19 [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data Ahmed, Aaron
2026-04-18  0:44 ` Kuniyuki Iwashima
2026-04-18  1:06   ` Kuniyuki Iwashima
2026-04-27 22:26   ` Ahmed, Aaron [this message]
2026-04-28  0:15     ` Kuniyuki Iwashima

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=A7A3F2FE-B18C-4F6D-A5E4-78164D6904F5@amazon.com \
    --to=aarnahmd@amazon.com \
    --cc=edumazet@google.com \
    --cc=kuniyu@google.com \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox