From: rtm@csail.mit.edu
To: Martin KaFai Lau <martin.lau@linux.dev>,
bpf@vger.kernel.org, netdev@vger.kernel.org
Subject: unregistering tcp_ca struct_ops can cause kernel page fault
Date: Sun, 08 Dec 2024 09:59:36 -0500 [thread overview]
Message-ID: <74665.1733669976@localhost> (raw)
[-- Attachment #1: Type: text/plain, Size: 2295 bytes --]
The attached program, along with the attached bpf_cubic.o eBPF binary,
uses bpftool to install a tcp_ca struct ops, creates a tcp connection,
and then unregisters the struct ops while the tcp connection is still
active. On my 6.13.0 system this causes a kernel fault due to
tcp_tso_segs() calling through a de-allocated struct tcp_congestion_ops.
bpf_cubic.o came from
https://github.com/aroodgar/bpf-tcp-congestion-control-algorithm
Linux ubuntu66 6.13.0-rc1-00337-g7503345ac5f5 #11 SMP Sun Dec 8 08:37:57 EST 2024 x86_64 x86_64 x86_64 GNU/Linux
Oops: general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6b6b: 0000 [#1] SMP DEBUG_PAGEALLOC PTI
CPU: 6 UID: 0 PID: 1594 Comm: a.out Not tainted 6.13.0-rc1-00337-g7503345ac5f5 #11
Hardware name: FreeBSD BHYVE/BHYVE, BIOS 14.0 10/17/2021
RIP: 0010:__x86_indirect_thunk_array+0xa/0x20
Code: 66 0f 1f 00 31 ff e9 15 70 d6 fe cc cc cc cc cc cc cc cc cc cc cc cc cc cc
cc cc cc cc cc cc cc e8 01 00 00 00 cc 48 89 04 24 <c3> cc cc cc cc 90 66 66 2e
0f 1f 84 00 00 00 00 00 0f 1f 44 00 00
RSP: 0018:ffffc90002037c60 EFLAGS: 00010202
RAX: 6b6b6b6b6b6b6b6b RBX: ffff88810afc0b00 RCX: 0000000000000018
RDX: 00000000077b668a RSI: 0000000000008000 RDI: ffff88810afc0b00
RBP: 0000000000008000 R08: 0000000000000000 R09: 0000000000008000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000008000 R14: 0000000000000000 R15: ffffffff842f6140
FS: 00007f5457b84740(0000) GS:ffff88842db80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5457d8c710 CR3: 000000011b4b8005 CR4: 00000000003706f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? die_addr+0x31/0x80
? exc_general_protection+0x1b4/0x3c0
? asm_exc_general_protection+0x26/0x30
? __x86_indirect_thunk_array+0xa/0x20
? tcp_tso_segs+0x1c/0x90
? tcp_write_xmit+0x74/0x1840
? __mod_memcg_state+0x91/0x190
? __tcp_push_pending_frames+0x31/0xc0
? tcp_sendmsg_locked+0xafc/0xd10
? tcp_sendmsg+0x26/0x40
? sock_write_iter+0x167/0x1a0
? vfs_write+0x35d/0x400
? ksys_write+0xc6/0xe0
? do_syscall_64+0x3f/0xd0
? entry_SYSCALL_64_after_hwframe+0x76/0x7e
</TASK>
Robert Morris
rtm@mit.edu
[-- Attachment #2: tcpbpf12a.c --]
[-- Type: application/octet-stream, Size: 2473 bytes --]
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <argp.h>
#include <signal.h>
#include <time.h>
#include <fcntl.h>
#include <sys/resource.h>
/*
 * Request the "bpf_cubic" congestion control algorithm on TCP socket s, then
 * query the socket's congestion control name and print it as "New: <name>".
 *
 * A failure of either socket-option call is reported with perror() and
 * otherwise ignored; the name buffer is printed in every case.
 */
void
setcong(int s)
{
	char name[256];
	socklen_t optlen;
	int rc;

	/* The length handed to setsockopt() excludes the terminating NUL. */
	strcpy(name, "bpf_cubic");
	optlen = strlen(name);
	rc = setsockopt(s, IPPROTO_TCP, TCP_CONGESTION, name, optlen);
	if (rc != 0)
		perror("setsockopt");

	/* Read the name back, offering the whole buffer as capacity. */
	optlen = sizeof(name);
	rc = getsockopt(s, IPPROTO_TCP, TCP_CONGESTION, name, &optlen);
	if (rc != 0)
		perror("getsockopt");

	printf("New: %s\n", name);
}
/*
 * Reproducer driver. Sequence of events:
 *   1. register a struct_ops from bpf_cubic.o through bpftool;
 *   2. listen on TCP port 8011 and fork a child that accepts one connection
 *      and reads from it until EOF or error;
 *   3. in the parent, open a client socket, select "bpf_cubic" on it via
 *      setcong(), connect, and write a few bytes;
 *   4. unregister the struct_ops while the client socket is still open,
 *      write again, close the socket, and unregister a second time.
 *
 * argc and argv are unused. The return values of system(), socket() and
 * setsockopt() are not checked; failures of bind/listen/accept/connect/write
 * are only reported with perror() and execution continues. There is no
 * explicit return statement and the child process is never waited for.
 */
int main(int argc, char **argv)
{
/* Write 1 to the bpf_jit_enable sysctl, then load bpf_cubic.o (relative
 * path, so it must be in the current working directory). */
system("echo 1 > /proc/sys/net/core/bpf_jit_enable");
system("bpftool struct_ops register bpf_cubic.o");
/* Listening socket with SO_REUSEADDR set before bind. */
int ss = socket(AF_INET, SOCK_STREAM, 0);
int yes = 1;
setsockopt(ss, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes));
/* Only family and port are set; the address stays all-zero from memset. */
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_port = htons(8011);
if(bind(ss, (struct sockaddr *)&sin, sizeof(sin)) < 0){
perror("bind");
}
if(listen(ss, 10) < 0){
perror("listen");
}
/* TCP_NODELAY on the listening socket; this inner 'yes' shadows the outer one. */
{ int yes = 1;
setsockopt(ss, IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes));
}
/* Only pid == 0 is tested: a failed fork() (negative pid) falls through to
 * the parent path below. */
int pid = fork();
if(pid == 0){
/* Child: accept a single connection. If accept() fails the error is
 * printed but the negative s1 is still used by the calls that follow. */
socklen_t sinlen = sizeof(sin);
int s1 = accept(ss, (struct sockaddr *) &sin, &sinlen);
if(s1 < 0)
perror("accept");
close(ss);
{ int yes = 1;
setsockopt(s1, IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes));
}
/* Drain the connection, printing each read() result, until read()
 * returns 0 or a negative value. */
char buf[512];
while(1){
int n = read(s1, buf, 512);
printf("read %d\n", n);
if(n <= 0)
break;
}
exit(0);
}
/* Parent: the listening socket is no longer needed here. */
close(ss);
/* Client socket; the congestion control algorithm is selected before connect. */
int cs = socket(AF_INET, SOCK_STREAM, 0);
setcong(cs);
/* Destination is port 8011 at the all-zero address.
 * NOTE(review): presumably this reaches the local listener on Linux - confirm. */
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_port = htons(8011);
if(connect(cs, (struct sockaddr *)&sin, sizeof(sin)) < 0)
perror("connect");
/* Each usleep(200000) pauses for 200 ms; presumably to let the previous
 * step settle before the next one - the exact timing need is not shown. */
usleep(200000);
/* First write, issued while the struct_ops is still registered. */
if(write(cs, "xyz", 3) < 0)
perror("write");
usleep(200000);
/* Unregister while cs is still open. "cubic" is the name handed to bpftool;
 * NOTE(review): presumably the struct_ops map name inside bpf_cubic.o,
 * as distinct from the "bpf_cubic" algorithm name used by setcong() - confirm. */
printf("first unregister:\n");
system("bpftool struct_ops unregister name cubic");
usleep(200000);
/* Second write, issued after the unregister attempt on the same socket.
 * NOTE(review): presumably the step intended to trigger the fault - confirm. */
if(write(cs, "abcd", 4) < 0)
perror("write");
usleep(200000);
printf("close:\n");
close(cs);
usleep(200000);
/* Same unregister command repeated after the client socket is closed. */
printf("second unregister:\n");
system("bpftool struct_ops unregister name cubic");
usleep(200000);
}
[-- Attachment #3: bpf_cubic.o --]
[-- Type: application/octet-stream, Size: 40984 bytes --]
next reply other threads:[~2024-12-08 15:12 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-08 14:59 rtm [this message]
2024-12-09 21:56 ` unregistering tcp_ca struct_ops can cause kernel page fault Martin KaFai Lau
2024-12-11 23:28 ` rtm
2024-12-14 0:56 ` Martin KaFai Lau
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=74665.1733669976@localhost \
--to=rtm@csail.mit.edu \
--cc=bpf@vger.kernel.org \
--cc=martin.lau@linux.dev \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.