From: "Ahmed, Aaron" <aarnahmd@amazon.com>
To: Kuniyuki Iwashima <kuniyu@google.com>
Cc: "stable@vger.kernel.org" <stable@vger.kernel.org>,
"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
"ncardwell@google.com" <ncardwell@google.com>,
"edumazet@google.com" <edumazet@google.com>
Subject: Re: [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
Date: Mon, 27 Apr 2026 22:26:58 +0000 [thread overview]
Message-ID: <A7A3F2FE-B18C-4F6D-A5E4-78164D6904F5@amazon.com> (raw)
In-Reply-To: <CAAVpQUCfMsWBpPpywbwBLRCdHUqWqFBoDK=17dwDkG6T0dQxzw@mail.gmail.com>
Hi Kuniyuki!
Thanks for taking a look! To clarify the issue: the problem shows up on long-running servers
with many concurrent connections. The original reproducer exits
right after closing the sockets, so the memory gets cleaned up at
process exit. In production the server never exits, so the memory
just keeps growing. Is this expected behavior?
I've written an updated reproducer that models a persistent
server. You can pass 0 or 1 as an argument to set the l_linger value.
This outputs the following:
When l_linger=0:
TCP: inuse 7 orphan 0 tw 2 alloc 100009 mem 197259
When l_linger=1:
TCP: inuse 50008 orphan 0 tw 5 alloc 50009 mem 14426
With l_linger=0, only 7 sockets are in use but ~770 MB of TCP
memory has no owner. With l_linger=1, 50,008 sockets are in use
but only ~56 MB of memory.
Updated reproducer:
Build: gcc -O2 -pthread -o tcp_linger_memleak tcp_linger_memleak.c
Run: ulimit -n 100000
sudo sysctl -w net.core.wmem_max=4194304
sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
./tcp_linger_memleak 0
./tcp_linger_memleak 1
---8<---
/* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
*
* Build: gcc -O2 -pthread -o tcp_linger_memleak tcp_linger_memleak.c
* Run: ulimit -n 100000
* sudo sysctl -w net.core.wmem_max=4194304
* sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
* ./tcp_linger_memleak [linger_sec]
* linger_sec=0 (default) -> leaks memory
* linger_sec=1 -> no leak
*
* Monitor: watch -n5 'cat /proc/net/sockstat; echo ---; free -m'
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <pthread.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
/* Server listen port (loopback traffic only). */
#define PORT 6666
/* Number of sender threads; accepted sockets are spread round-robin. */
#define NUM_THREADS 8
/* Max sockets a single worker will manage at once. */
#define MAX_CLIENTS 4096
/* Connections the forked client attempts to establish. */
#define NUM_CONNS 25000
/* Tick period: how often every worker does one write round. */
#define WRITE_INTERVAL_MS 200
/* Per-connection payload size is chosen randomly in [MIN, MAX]. */
#define MSG_SIZE_MIN 128
#define MSG_SIZE_MAX 2046
/* Client-side read buffer (reads are deliberately slow/small). */
#define CLIENT_RDBUF 10240
/* l_linger value used when a worker drops a slow client;
 * set from argv[1] in main(). 0 reproduces the leak. */
static int g_linger_sec = 0;
/* Per-thread client table, protected by .lock; the tick thread
 * signals a write round by writing one byte into pipe_wr. */
struct worker {
	pthread_mutex_t lock;   /* guards fds/bufsz/count */
	int fds[MAX_CLIENTS];   /* connected client sockets */
	int bufsz[MAX_CLIENTS]; /* payload size per client */
	int count;              /* live entries in fds/bufsz */
	int pipe_rd;            /* worker blocks reading here */
	int pipe_wr;            /* tick thread writes here */
};
static struct worker workers[NUM_THREADS];
/*
 * Sender loop for one worker thread.
 *
 * Blocks on its pipe until the tick thread signals a write round, then
 * pushes one payload to every client it owns.  A client whose send()
 * fails (typically EAGAIN on a full, non-blocking send buffer) is
 * dropped: SO_LINGER is set to the configured g_linger_sec and the
 * socket is closed.  Removal is swap-remove, so the index is not
 * advanced after a drop.
 *
 * arg: pointer to this thread's struct worker.  Returns NULL when the
 * pipe is closed or read() fails.
 */
static void *worker_thread(void *arg)
{
	struct worker *wk = arg;
	char payload[MSG_SIZE_MAX];
	char tick;

	memset(payload, 'A', sizeof(payload));
	for (;;) {
		/* Wait for the next write round. */
		if (read(wk->pipe_rd, &tick, 1) <= 0)
			break;
		pthread_mutex_lock(&wk->lock);
		for (int idx = 0; idx < wk->count; ) {
			ssize_t sent = send(wk->fds[idx], payload,
					    wk->bufsz[idx], MSG_NOSIGNAL);
			if (sent >= 0) {
				idx++;
				continue;
			}
			/* Slow client: close with the linger setting
			 * under test, then swap-remove its slot. */
			struct linger lg = {
				.l_onoff = 1,
				.l_linger = g_linger_sec
			};
			setsockopt(wk->fds[idx], SOL_SOCKET,
				   SO_LINGER, &lg, sizeof(lg));
			close(wk->fds[idx]);
			wk->fds[idx] = wk->fds[wk->count - 1];
			wk->bufsz[idx] = wk->bufsz[wk->count - 1];
			wk->count--;
		}
		pthread_mutex_unlock(&wk->lock);
	}
	return NULL;
}
/*
 * Metronome thread: every WRITE_INTERVAL_MS it writes one byte into
 * each worker's pipe, triggering one write round per worker.
 * Runs forever; arg is unused and the return value is never reached.
 */
static void *tick_thread(void *arg)
{
	(void)arg;
	for (;;) {
		usleep(WRITE_INTERVAL_MS * 1000);
		int t = 0;
		while (t < NUM_THREADS) {
			char byte = 1;
			write(workers[t].pipe_wr, &byte, 1);
			t++;
		}
	}
	return NULL;
}
/*
 * Client process body (runs in the forked child).
 *
 * Opens NUM_CONNS loopback connections to the server, retrying
 * socket()/connect() failures after a short sleep, then loops forever
 * reading each connection non-blockingly with small buffers and long
 * pauses — a deliberately slow reader, so the server-side send buffers
 * fill up.  Connections that hit EOF or a hard error are closed and
 * marked with -1.  Never returns.
 */
static void run_client(void)
{
	struct sockaddr_in srv = {
		.sin_family = AF_INET,
		.sin_port = htons(PORT),
	};
	int conn[NUM_CONNS];
	char scratch[CLIENT_RDBUF];
	int made = 0;

	inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

	while (made < NUM_CONNS) {
		int s = socket(AF_INET, SOCK_STREAM, 0);
		if (s < 0) {
			usleep(1000);
			continue;
		}
		if (connect(s, (struct sockaddr *)&srv,
			    sizeof(srv)) < 0) {
			close(s);
			usleep(1000);
			continue;
		}
		int one = 1;
		setsockopt(s, IPPROTO_TCP, TCP_NODELAY,
			   &one, sizeof(one));
		conn[made] = s;
		if (made % 1000 == 0)
			printf("Client: %d connections established\n", made);
		made++;
		usleep(100);
	}
	printf("Client: all %d connections established, reading slowly...\n",
	       NUM_CONNS);
	for (;;) {
		for (int k = 0; k < NUM_CONNS; k++) {
			if (conn[k] < 0)
				continue;
			ssize_t got = recv(conn[k], scratch,
					   sizeof(scratch), MSG_DONTWAIT);
			/* EOF, or an error other than "no data yet",
			 * means the server reset/closed this socket. */
			if (got == 0 || (got < 0 && errno != EAGAIN &&
					 errno != EWOULDBLOCK)) {
				close(conn[k]);
				conn[k] = -1;
			}
		}
		usleep(50000);
	}
}
/*
 * Server side: start the worker and tick threads, fork the slow-reading
 * client, then accept connections forever, handing each accepted socket
 * to a worker round-robin.
 *
 * argv[1] (optional, default 0) selects the SO_LINGER l_linger value a
 * worker uses when it drops a client: 0 reproduces the leak, 1 does not.
 *
 * Fixes vs. the previous version: setup syscalls (pipe/pthread_create/
 * fork/socket/bind/listen) are now checked so a failed setup aborts
 * loudly instead of running a broken reproducer, and the payload-size
 * range is now inclusive of MSG_SIZE_MAX (the old "% (MAX - MIN)" could
 * never produce MSG_SIZE_MAX).
 */
int main(int argc, char *argv[])
{
	g_linger_sec = (argc > 1) ? atoi(argv[1]) : 0;
	printf("SO_LINGER l_linger=%d\n", g_linger_sec);
	printf("Monitor: watch -n5 'cat /proc/net/sockstat'\n\n");
	/* Sends to reset connections must not kill the process. */
	signal(SIGPIPE, SIG_IGN);

	for (int t = 0; t < NUM_THREADS; t++) {
		int pfd[2];
		pthread_t tid;

		pthread_mutex_init(&workers[t].lock, NULL);
		workers[t].count = 0;
		if (pipe(pfd) < 0) {
			perror("pipe");
			exit(1);
		}
		workers[t].pipe_rd = pfd[0];
		workers[t].pipe_wr = pfd[1];
		if (pthread_create(&tid, NULL, worker_thread,
				   &workers[t]) != 0) {
			perror("pthread_create");
			exit(1);
		}
		pthread_detach(tid);
	}

	pthread_t tick_tid;
	if (pthread_create(&tick_tid, NULL, tick_thread, NULL) != 0) {
		perror("pthread_create");
		exit(1);
	}
	pthread_detach(tick_tid);

	pid_t child = fork();
	if (child < 0) {
		perror("fork");
		exit(1);
	}
	if (child == 0) {
		run_client();
		_exit(0);
	}

	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(PORT),
		.sin_addr.s_addr = htonl(INADDR_ANY)
	};
	int opt = 1;
	int lsn = socket(AF_INET, SOCK_STREAM, 0);
	if (lsn < 0) {
		perror("socket");
		exit(1);
	}
	setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
	if (bind(lsn, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		exit(1);
	}
	if (listen(lsn, 4096) < 0) {
		perror("listen");
		exit(1);
	}

	int thread_idx = 0;
	unsigned long accepted = 0;
	while (1) {
		int fd = accept(lsn, NULL, NULL);
		if (fd < 0)
			continue;
		opt = 1;
		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
			   &opt, sizeof(opt));
		/* Non-blocking so a full send buffer yields EAGAIN,
		 * which is what makes a worker drop the client. */
		int flags = fcntl(fd, F_GETFL, 0);
		fcntl(fd, F_SETFL, flags | O_NONBLOCK);
		/* Large send buffer so each dropped socket pins many
		 * pages of TCP memory. */
		int sndbuf = 4 * 1024 * 1024;
		setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
			   &sndbuf, sizeof(sndbuf));

		struct worker *w = &workers[thread_idx % NUM_THREADS];
		pthread_mutex_lock(&w->lock);
		if (w->count < MAX_CLIENTS) {
			w->fds[w->count] = fd;
			/* Inclusive range [MSG_SIZE_MIN, MSG_SIZE_MAX]. */
			w->bufsz[w->count] = MSG_SIZE_MIN +
				(rand() % (MSG_SIZE_MAX - MSG_SIZE_MIN + 1));
			w->count++;
		} else {
			close(fd);
		}
		pthread_mutex_unlock(&w->lock);

		thread_idx++;
		accepted++;
		if (accepted % 5000 == 0)
			printf("Server: accepted %lu connections\n",
			       accepted);
	}
}
---8<---
Thanks,
Aaron
On 4/17/26, 5:45 PM, "Kuniyuki Iwashima" <kuniyu@google.com <mailto:kuniyu@google.com>> wrote:
CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
Hi Aaron :)
Thanks for the report.
On Fri, Apr 17, 2026 at 5:20 PM Ahmed, Aaron <aarnahmd@amazon.com <mailto:aarnahmd@amazon.com>> wrote:
>
> Hi,
>
> We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.
>
> Overview:
>
> The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.
>
> Reproducer:
> ```
> /* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
> *
> * Build: gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
> * Run: sudo sysctl -w net.core.wmem_max=4194304
> * sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
> * ./tcp_linger_memleak
> */
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <signal.h>
> #include <sys/socket.h>
> #include <sys/wait.h>
> #include <netinet/in.h>
>
> #define NUM_CONNS 5000
> #define PORT 6666
>
> static void print_mem(const char *label) {
> FILE *f;
> char line[256];
> f = fopen("/proc/meminfo", "r");
> while (fgets(line, sizeof(line), f))
> if (strncmp(line, "MemAvailable:", 13) == 0)
> printf("%s: %s", label, line);
> fclose(f);
> f = fopen("/proc/net/sockstat", "r");
> while (fgets(line, sizeof(line), f))
> if (strncmp(line, "TCP:", 4) == 0)
> printf("%s: %s", label, line);
> fclose(f);
> }
>
> int main(void) {
> struct sockaddr_in addr = {
> .sin_family = AF_INET,
> .sin_port = htons(PORT),
> .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
> };
> int opt = 1;
> signal(SIGPIPE, SIG_IGN);
>
> int lsn = socket(AF_INET, SOCK_STREAM, 0);
> setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
> bind(lsn, (struct sockaddr *)&addr, sizeof(addr));
> listen(lsn, NUM_CONNS);
>
> /* Fork client: connect N times, never read */
> pid_t child = fork();
> if (child == 0) {
> int fds[NUM_CONNS];
> for (int i = 0; i < NUM_CONNS; i++) {
> fds[i] = socket(AF_INET, SOCK_STREAM, 0);
> connect(fds[i], (struct sockaddr *)&addr, sizeof(addr));
> }
> pause(); /* sit forever, never read */
> _exit(0);
> }
>
> /* Accept all connections */
> int clients[NUM_CONNS];
> for (int i = 0; i < NUM_CONNS; i++)
> clients[i] = accept(lsn, NULL, NULL);
>
> /* Freeze client so it stops reading */
> kill(child, SIGSTOP);
> printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
> print_mem("BEFORE");
>
> /* Fill buffers and close with SO_LINGER(1,0) */
> char buf[2048];
> memset(buf, 'A', sizeof(buf));
> for (int i = 0; i < NUM_CONNS; i++) {
> int flags = fcntl(clients[i], F_GETFL, 0);
> fcntl(clients[i], F_SETFL, flags | O_NONBLOCK);
> while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0);
> struct linger lg = { .l_onoff = 1, .l_linger = 0 };
> setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
> close(clients[i]);
> }
>
> sleep(2);
> printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
> print_mem("AFTER");
> kill(child, SIGKILL);
> waitpid(child, NULL, 0);
> close(lsn);
> return 0;
> }
> ```
> Output (Tested on 6.18.20):
> ```
> === 5000 connections established, client frozen ===
> BEFORE: MemAvailable: 95491288 kB
> BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0
>
> === All sockets closed with SO_LINGER(1,0) ===
> AFTER: MemAvailable: 95321800 kB
> AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
> ```
Unfortunately, it dies immediately on my end.
=== 5000 connections established, client frozen ===
Segmentation fault (core dumped) ./linux/tcp_linger
Did you see actual memory leak with kmemleak or is it
just the tcp_mem counter that is really leaked ?
# echo clear > /sys/kernel/debug/kmemleak
~ run repro ~
# echo scan > /sys/kernel/debug/kmemleak
next prev parent reply other threads:[~2026-04-27 22:27 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-18 0:19 [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data Ahmed, Aaron
2026-04-18 0:44 ` Kuniyuki Iwashima
2026-04-18 1:06 ` Kuniyuki Iwashima
2026-04-27 22:26 ` Ahmed, Aaron [this message]
2026-04-28 0:15 ` Kuniyuki Iwashima
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=A7A3F2FE-B18C-4F6D-A5E4-78164D6904F5@amazon.com \
--to=aarnahmd@amazon.com \
--cc=edumazet@google.com \
--cc=kuniyu@google.com \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox