All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC bpf-next 0/8] BPF 'force to MPTCP'
@ 2023-07-06  4:08 Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 1/8] bpf: Add new prog type sockinit Geliang Tang
                   ` (8 more replies)
  0 siblings, 9 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp

As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:

"Your app can create sockets with IPPROTO_MPTCP as the proto:
( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
forced to create and use MPTCP sockets instead of TCP ones via the
mptcpize command bundled with the mptcpd daemon."

But the mptcpize (LD_PRELOAD technique) command has some limitations
[2]:

 - it doesn't work if the application is not using libc (e.g. GoLang
apps)
 - in some envs, it might not be easy to set env vars / change the way
apps are launched, e.g. on Android
 - mptcpize needs to be launched with all apps that want MPTCP: we could
have more control from BPF to enable MPTCP only for some apps or all the
ones of a netns or a cgroup, etc.
 - it is not in BPF, we cannot talk about it at netdev conf.

So this patchset attempts to use BPF to implement functions similer to
mptcpize.

The main idea is add a hook in sys_socket() to change the protocol id
from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.

1. This first solution [3] is using "cgroup/sock_create":

Implement a new helper bpf_mptcpify() to change the protocol id:

+BPF_CALL_1(bpf_mptcpify, struct sock *, sk)
+{
+	if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) {
+		sk->sk_protocol = IPPROTO_MPTCP;
+		return (unsigned long)sk;
+	}
+
+	return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_mptcpify_proto = {
+	.func		= bpf_mptcpify,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_BTF_ID_OR_NULL,
+	.ret_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+	.arg1_type	= ARG_PTR_TO_CTX,
+};

Add a hook in "cgroup/sock_create" section, invoking bpf_mptcpify()
helper in this hook:

+SEC("cgroup/sock_create")
+int sock(struct bpf_sock *ctx)
+{
+	struct sock *sk;
+
+	if (ctx->type != SOCK_STREAM)
+		return 1;
+
+	sk = bpf_mptcpify(ctx);
+	if (!sk)
+		return 1;
+
+	protocol = sk->sk_protocol;
+	return 1;
+}

But this solution doesn't work, because the sock_create section is
hooked at the end of inet_create(). It's too late to change the protocol
id.

2. The second solution [4] is using "fentry":

Implement a bpf_mptcpify() helper:

+BPF_CALL_1(bpf_mptcpify, struct socket_args *, args)
+{
+	if (args->family == AF_INET &&
+	    args->type == SOCK_STREAM &&
+	    (!args->protocol || args->protocol == IPPROTO_TCP))
+		args->protocol = IPPROTO_MPTCP;
+
+	return 0;
+}
+
+BTF_ID_LIST(bpf_mptcpify_btf_ids)
+BTF_ID(struct, socket_args)
+
+static const struct bpf_func_proto bpf_mptcpify_proto = {
+	.func		= bpf_mptcpify,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &bpf_mptcpify_btf_ids[0],
+};

Add a new wrapper socket_create() in sys_socket() path, passing a
pointer of struct socket_args (int family; int type; int protocol) to
the wrapper.

+int socket_create(struct socket_args *args, struct socket **res)
+{
+	return sock_create(args->family, args->type, args->protocol, res);
+}
+EXPORT_SYMBOL(socket_create);
+
 /**
  *	sock_create_kern - creates a socket (kernel space)
  *	@net: net namespace
@@ -1608,6 +1614,7 @@  EXPORT_SYMBOL(sock_create_kern);
 
 static struct socket *__sys_socket_create(int family, int type, int protocol)
 {
+	struct socket_args args = { 0 };
 	struct socket *sock;
 	int retval;
 
@@ -1621,7 +1628,10 @@  static struct socket *__sys_socket_create(int family, int type, int protocol)
 		return ERR_PTR(-EINVAL);
 	type &= SOCK_TYPE_MASK;
 
-	retval = sock_create(family, type, protocol, &sock);
+	args.family = family;
+	args.type = type;
+	args.protocol = protocol;
+	retval = socket_create(&args, &sock);
 	if (retval < 0)
 		return ERR_PTR(retval);

Add "fentry" hook to the newly added wrapper, invoking bpf_mptcpify()
in the hook:

+SEC("fentry/socket_create")
+int BPF_PROG(trace_socket_create, void *args,
+		struct socket **res)
+{
+	bpf_mptcpify(args);
+	return 0;
+}

This version works. But it's just a work around version. Since the code
to add a wrapper to sys_socket() is not very elegant indeed, and it
shouldn't be accepted by upstream.

3. The last solution is this patchset itself:

Introduce new program type BPF_PROG_TYPE_CGROUP_SOCKINIT and attach type
BPF_CGROUP_SOCKINIT on cgroup basis.

Define BPF_CGROUP_RUN_PROG_SOCKINIT() helper, and implement
__cgroup_bpf_run_sockinit() helper to run a sockinit program:

+#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol)		       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled(CGROUP_SOCKINIT))			       \
+		__ret = __cgroup_bpf_run_sockinit(family, type, protocol,      \
+						  CGROUP_SOCKINIT);	       \
+	__ret;								       \
+})
+

Invoke BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create() to change
the arguments:

@@ -1469,6 +1469,12 @@  int __sock_create(struct net *net, int family, int type, int protocol,
 	struct socket *sock;
 	const struct net_proto_family *pf;
 
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_SOCKINIT(&family, &type, &protocol);
+		if (err)
+			return err;
+	}
+
 	/*
 	 *      Check protocol is in range
 	 */

Define BPF program in this newly added 'sockinit' SEC, so it will be
hooked in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().

@@ -158,6 +158,11 @@  static struct sec_name_test tests[] = {
 		{0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
 		{0, BPF_CGROUP_SETSOCKOPT},
 	},
+	{
+		"cgroup/sockinit",
+		{0, BPF_PROG_TYPE_CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT},
+		{0, BPF_CGROUP_SOCKINIT},
+	},
 };

+SEC("cgroup/sockinit")
+int mptcpify(struct bpf_sockinit_ctx *ctx)
+{
+	if ((ctx->family == AF_INET || ctx->family == AF_INET6) &&
+	    ctx->type == SOCK_STREAM &&
+	    (!ctx->protocol || ctx->protocol == IPPROTO_TCP)) {
+		ctx->protocol = IPPROTO_MPTCP;
+	}
+
+	return 1;
+}

This version is the best solution we have found so far.

[1]
https://github.com/multipath-tcp/mptcp_net-next/wiki
[2]
https://github.com/multipath-tcp/mptcp_net-next/issues/79
[3]
https://patchwork.kernel.org/project/mptcp/cover/cover.1688215769.git.geliang.tang@suse.com/
[4]
https://patchwork.kernel.org/project/mptcp/cover/cover.1688366249.git.geliang.tang@suse.com/


Geliang Tang (8):
  bpf: Add new prog type sockinit
  bpf: Run a sockinit program
  net: socket: run sockinit hooks
  libbpf: Support sockinit hook
  selftests/bpf: Add mptcpify program
  selftests/bpf: use random netns name for mptcp
  selftests/bpf: add two mptcp netns helpers
  selftests/bpf: Add mptcpify selftest

 include/linux/bpf-cgroup-defs.h               |   1 +
 include/linux/bpf-cgroup.h                    |  14 ++
 include/linux/bpf_types.h                     |   2 +
 include/uapi/linux/bpf.h                      |   8 ++
 kernel/bpf/cgroup.c                           |  90 +++++++++++++
 kernel/bpf/syscall.c                          |   7 +
 kernel/bpf/verifier.c                         |   1 +
 net/socket.c                                  |   6 +
 tools/include/uapi/linux/bpf.h                |   8 ++
 tools/lib/bpf/libbpf.c                        |   3 +
 tools/lib/bpf/libbpf_probes.c                 |   1 +
 .../bpf/cgroup_getset_retval_hooks.h          |   1 +
 .../testing/selftests/bpf/prog_tests/mptcp.c  | 123 ++++++++++++++++--
 .../selftests/bpf/prog_tests/section_names.c  |   5 +
 tools/testing/selftests/bpf/progs/mptcpify.c  |  26 ++++
 15 files changed, 287 insertions(+), 9 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/mptcpify.c

-- 
2.35.3

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [RFC bpf-next 1/8] bpf: Add new prog type sockinit
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 2/8] bpf: Run a sockinit program Geliang Tang
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp

The patch introduces new program type BPF_PROG_TYPE_CGROUP_SOCKINIT
and attach type BPF_CGROUP_SOCKINIT on cgroup basis.

Define this program by BPF_PROG_TYPE(), and implement two operations:
cg_sockinit_prog_ops and cg_sockinit_verifier_ops.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 include/linux/bpf_types.h |  2 ++
 include/uapi/linux/bpf.h  |  8 +++++
 kernel/bpf/cgroup.c       | 66 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c      |  7 +++++
 kernel/bpf/verifier.c     |  1 +
 5 files changed, 84 insertions(+)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..07e1f21e82e9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
 	      struct bpf_sysctl, struct bpf_sysctl_kern)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
 	      struct bpf_sockopt, struct bpf_sockopt_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKINIT, cg_sockinit,
+	      struct bpf_sockinit_ctx, struct bpf_sockinit_ctx)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60a9d59beeab..cb882ab8065d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -980,6 +980,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 	BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	BPF_PROG_TYPE_CGROUP_SOCKINIT,
 	BPF_PROG_TYPE_TRACING,
 	BPF_PROG_TYPE_STRUCT_OPS,
 	BPF_PROG_TYPE_EXT,
@@ -1013,6 +1014,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP6_RECVMSG,
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
+	BPF_CGROUP_SOCKINIT,
 	BPF_TRACE_RAW_TP,
 	BPF_TRACE_FENTRY,
 	BPF_TRACE_FEXIT,
@@ -6829,6 +6831,12 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+struct bpf_sockinit_ctx {
+	__u32 family;
+	__u32 type;
+	__u32 protocol;
+};
+
 /* DIRECT:  Skip the FIB rules and go to FIB table associated with device
  * OUTPUT:  Do lookup from egress perspective; default is ingress
  */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..93b9f404a007 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2505,6 +2505,72 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
 const struct bpf_prog_ops cg_sockopt_prog_ops = {
 };
 
+static const struct bpf_func_proto *
+cgroup_sockinit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	const struct bpf_func_proto *func_proto;
+
+	func_proto = cgroup_common_func_proto(func_id, prog);
+	if (func_proto)
+		return func_proto;
+
+	func_proto = cgroup_current_func_proto(func_id, prog);
+	if (func_proto)
+		return func_proto;
+
+	switch (func_id) {
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static bool cgroup_sockinit_is_valid_access(int off, int size,
+					    enum bpf_access_type type,
+					    const struct bpf_prog *prog,
+					    struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off + size > sizeof(struct bpf_sockinit_ctx))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_sockinit_ctx, family):
+		bpf_ctx_record_field_size(info, size_default);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+			return false;
+		break;
+	case bpf_ctx_range(struct bpf_sockinit_ctx, type):
+		bpf_ctx_record_field_size(info, size_default);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+			return false;
+		break;
+	case bpf_ctx_range(struct bpf_sockinit_ctx, protocol):
+		if (type == BPF_READ) {
+			bpf_ctx_record_field_size(info, size_default);
+			return bpf_ctx_narrow_access_ok(off, size, size_default);
+		} else {
+			return size == size_default;
+		}
+	default:
+		if (size != size_default)
+			return false;
+	}
+
+	return true;
+}
+
+const struct bpf_verifier_ops cg_sockinit_verifier_ops = {
+	.get_func_proto		= cgroup_sockinit_func_proto,
+	.is_valid_access	= cgroup_sockinit_is_valid_access,
+};
+
+const struct bpf_prog_ops cg_sockinit_prog_ops = {
+};
+
 /* Common helpers for cgroup hooks. */
 const struct bpf_func_proto *
 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a2aef900519c..2952dd88c614 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2513,6 +2513,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_SOCKINIT:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
 	case BPF_PROG_TYPE_NETFILTER:
@@ -3574,6 +3575,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+	case BPF_CGROUP_SOCKINIT:
+		return BPF_PROG_TYPE_CGROUP_SOCKINIT;
 	case BPF_TRACE_ITER:
 	case BPF_TRACE_RAW_TP:
 	case BPF_TRACE_FENTRY:
@@ -3640,6 +3643,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_SOCKINIT:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_LSM:
 		if (ptype == BPF_PROG_TYPE_LSM &&
@@ -3682,6 +3686,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_SOCKINIT:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_LSM:
 		return cgroup_bpf_prog_detach(attr, ptype);
@@ -3726,6 +3731,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SYSCTL:
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
+	case BPF_CGROUP_SOCKINIT:
 	case BPF_LSM_CGROUP:
 		return cgroup_bpf_prog_query(attr, uattr);
 	case BPF_LIRC_MODE2:
@@ -4717,6 +4723,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_CGROUP_SOCKINIT:
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_EXT:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 11e54dd8b6dd..d0ade7759123 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14316,6 +14316,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_CGROUP_SOCKINIT:
 		break;
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 		if (!env->prog->aux->attach_btf_id)
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [RFC bpf-next 2/8] bpf: Run a sockinit program
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 1/8] bpf: Add new prog type sockinit Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 3/8] net: socket: run sockinit hooks Geliang Tang
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp

This patch defines BPF_CGROUP_RUN_PROG_SOCKINIT() helper, and implements
__cgroup_bpf_run_sockinit() helper to run a sockinit program.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 include/linux/bpf-cgroup-defs.h |  1 +
 include/linux/bpf-cgroup.h      | 14 ++++++++++++++
 kernel/bpf/cgroup.c             | 24 ++++++++++++++++++++++++
 3 files changed, 39 insertions(+)

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 7b121bd780eb..aa9ee82f5d20 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -37,6 +37,7 @@ enum cgroup_bpf_attach_type {
 	CGROUP_UDP6_RECVMSG,
 	CGROUP_GETSOCKOPT,
 	CGROUP_SETSOCKOPT,
+	CGROUP_SOCKINIT,
 	CGROUP_INET4_GETPEERNAME,
 	CGROUP_INET6_GETPEERNAME,
 	CGROUP_INET4_GETSOCKNAME,
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 57e9e109257e..a2f58f0d2260 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -57,6 +57,7 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
 	CGROUP_ATYPE(CGROUP_UDP6_RECVMSG);
 	CGROUP_ATYPE(CGROUP_GETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_SETSOCKOPT);
+	CGROUP_ATYPE(CGROUP_SOCKINIT);
 	CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
@@ -148,6 +149,9 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 					    int optname, void *optval,
 					    int *optlen, int retval);
 
+int __cgroup_bpf_run_sockinit(int *family, int *type, int *protocol,
+			      enum cgroup_bpf_attach_type atype);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -407,6 +411,15 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol)		       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled(CGROUP_SOCKINIT))			       \
+		__ret = __cgroup_bpf_run_sockinit(family, type, protocol,      \
+						  CGROUP_SOCKINIT);	       \
+	__ret;								       \
+})
+
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog);
 int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -505,6 +518,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 					    optlen, retval) ({ retval; })
 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 				       kernel_optval) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 93b9f404a007..fe294e4d618c 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1996,6 +1996,30 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 
 	return ret;
 }
+
+int __cgroup_bpf_run_sockinit(int *family, int *type, int *protocol,
+			      enum cgroup_bpf_attach_type atype)
+{
+	struct bpf_sockinit_ctx ctx = {
+		.family		= *family,
+		.type		= *type,
+		.protocol	= *protocol,
+	};
+	struct cgroup *cgrp;
+	int ret;
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+				    NULL);
+	rcu_read_unlock();
+
+	*family		= ctx.family;
+	*type		= ctx.type;
+	*protocol	= ctx.protocol;
+
+	return ret;
+}
 #endif
 
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [RFC bpf-next 3/8] net: socket: run sockinit hooks
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 1/8] bpf: Add new prog type sockinit Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 2/8] bpf: Run a sockinit program Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 4/8] libbpf: Support sockinit hook Geliang Tang
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp

The arguments of socket() need to be changed sometimes like the MPTCP
case in the next commits.

It's too late to add the BPF hooks in BPF_CGROUP_RUN_PROG_INET_SOCK()
in inet_create(). So this patch invokes BPF_CGROUP_RUN_PROG_SOCKINIT()
in __socket_create() to change the arguments.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 net/socket.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/socket.c b/net/socket.c
index 2b0e54b2405c..27b423e1800f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1469,6 +1469,12 @@ int __sock_create(struct net *net, int family, int type, int protocol,
 	struct socket *sock;
 	const struct net_proto_family *pf;
 
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_SOCKINIT(&family, &type, &protocol);
+		if (err)
+			return err;
+	}
+
 	/*
 	 *      Check protocol is in range
 	 */
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [RFC bpf-next 4/8] libbpf: Support sockinit hook
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
                   ` (2 preceding siblings ...)
  2023-07-06  4:08 ` [RFC bpf-next 3/8] net: socket: run sockinit hooks Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 5/8] selftests/bpf: Add mptcpify program Geliang Tang
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp

Sync BPF_PROG_TYPE_CGROUP_SOCKINIT related bpf UAPI changes to tools/.
Support BPF_PROG_TYPE_CGROUP_SOCKINIT program in libbpf: identifying
program and attach types by section name, probe.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 tools/include/uapi/linux/bpf.h | 8 ++++++++
 tools/lib/bpf/libbpf.c         | 3 +++
 tools/lib/bpf/libbpf_probes.c  | 1 +
 3 files changed, 12 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 60a9d59beeab..cb882ab8065d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -980,6 +980,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 	BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	BPF_PROG_TYPE_CGROUP_SOCKINIT,
 	BPF_PROG_TYPE_TRACING,
 	BPF_PROG_TYPE_STRUCT_OPS,
 	BPF_PROG_TYPE_EXT,
@@ -1013,6 +1014,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP6_RECVMSG,
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
+	BPF_CGROUP_SOCKINIT,
 	BPF_TRACE_RAW_TP,
 	BPF_TRACE_FENTRY,
 	BPF_TRACE_FEXIT,
@@ -6829,6 +6831,12 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+struct bpf_sockinit_ctx {
+	__u32 family;
+	__u32 type;
+	__u32 protocol;
+};
+
 /* DIRECT:  Skip the FIB rules and go to FIB table associated with device
  * OUTPUT:  Do lookup from egress perspective; default is ingress
  */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 214f828ece6b..03f62d163030 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -95,6 +95,7 @@ static const char * const attach_type_name[] = {
 	[BPF_CGROUP_UDP6_RECVMSG]	= "cgroup_udp6_recvmsg",
 	[BPF_CGROUP_GETSOCKOPT]		= "cgroup_getsockopt",
 	[BPF_CGROUP_SETSOCKOPT]		= "cgroup_setsockopt",
+	[BPF_CGROUP_SOCKINIT]		= "cgroup_sockinit",
 	[BPF_SK_SKB_STREAM_PARSER]	= "sk_skb_stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT]	= "sk_skb_stream_verdict",
 	[BPF_SK_SKB_VERDICT]		= "sk_skb_verdict",
@@ -197,6 +198,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_CGROUP_SYSCTL]		= "cgroup_sysctl",
 	[BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE]	= "raw_tracepoint_writable",
 	[BPF_PROG_TYPE_CGROUP_SOCKOPT]		= "cgroup_sockopt",
+	[BPF_PROG_TYPE_CGROUP_SOCKINIT]		= "cgroup_sockinit",
 	[BPF_PROG_TYPE_TRACING]			= "tracing",
 	[BPF_PROG_TYPE_STRUCT_OPS]		= "struct_ops",
 	[BPF_PROG_TYPE_EXT]			= "ext",
@@ -8734,6 +8736,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("cgroup/getsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE),
 	SEC_DEF("cgroup/setsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE),
 	SEC_DEF("cgroup/dev",		CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT),
+	SEC_DEF("cgroup/sockinit",	CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT, SEC_ATTACHABLE),
 	SEC_DEF("struct_ops+",		STRUCT_OPS, 0, SEC_NONE),
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 9c4db90b92b6..3734fee60d2f 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -180,6 +180,7 @@ static int probe_prog_load(enum bpf_prog_type prog_type,
 	case BPF_PROG_TYPE_SK_REUSEPORT:
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_SOCKINIT:
 		break;
 	case BPF_PROG_TYPE_NETFILTER:
 		opts.expected_attach_type = BPF_NETFILTER;
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [RFC bpf-next 5/8] selftests/bpf: Add mptcpify program
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
                   ` (3 preceding siblings ...)
  2023-07-06  4:08 ` [RFC bpf-next 4/8] libbpf: Support sockinit hook Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 6/8] selftests/bpf: use random netns name for mptcp Geliang Tang
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp

This patch implements a new test program mptcpify: if the family is
AF_INET or AF_INET6, the type is SOCK_STREAM, and the protocol ID is
0 or IPPROTO_TCP, set it to IPPROTO_MPTCP.

This is defined in a newly added 'sockinit' SEC, so it will be hooked
in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 .../bpf/cgroup_getset_retval_hooks.h          |  1 +
 .../selftests/bpf/prog_tests/section_names.c  |  5 ++++
 tools/testing/selftests/bpf/progs/mptcpify.c  | 26 +++++++++++++++++++
 3 files changed, 32 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/mptcpify.c

diff --git a/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h
index a525d3544fd7..0ba09e135df3 100644
--- a/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h
+++ b/tools/testing/selftests/bpf/cgroup_getset_retval_hooks.h
@@ -14,6 +14,7 @@ BPF_RETVAL_HOOK(post_bind6, "cgroup/post_bind6", bpf_sock_addr, 0)
 BPF_RETVAL_HOOK(sendmsg4, "cgroup/sendmsg4", bpf_sock_addr, 0)
 BPF_RETVAL_HOOK(sendmsg6, "cgroup/sendmsg6", bpf_sock_addr, 0)
 BPF_RETVAL_HOOK(sysctl, "cgroup/sysctl", bpf_sysctl, 0)
+BPF_RETVAL_HOOK(sockinit, "cgroup/sockinit", bpf_sockinit_ctx, 0)
 BPF_RETVAL_HOOK(recvmsg4, "cgroup/recvmsg4", bpf_sock_addr, -EINVAL)
 BPF_RETVAL_HOOK(recvmsg6, "cgroup/recvmsg6", bpf_sock_addr, -EINVAL)
 BPF_RETVAL_HOOK(getsockopt, "cgroup/getsockopt", bpf_sockopt, 0)
diff --git a/tools/testing/selftests/bpf/prog_tests/section_names.c b/tools/testing/selftests/bpf/prog_tests/section_names.c
index 8b571890c57e..52319c45de57 100644
--- a/tools/testing/selftests/bpf/prog_tests/section_names.c
+++ b/tools/testing/selftests/bpf/prog_tests/section_names.c
@@ -158,6 +158,11 @@ static struct sec_name_test tests[] = {
 		{0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
 		{0, BPF_CGROUP_SETSOCKOPT},
 	},
+	{
+		"cgroup/sockinit",
+		{0, BPF_PROG_TYPE_CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT},
+		{0, BPF_CGROUP_SOCKINIT},
+	},
 };
 
 static void test_prog_type_by_name(const struct sec_name_test *test)
diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c
new file mode 100644
index 000000000000..f751e6f65eca
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcpify.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023, SUSE. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define	AF_INET		2
+#define	AF_INET6	10
+#define	SOCK_STREAM	1
+#define	IPPROTO_TCP	6
+#define	IPPROTO_MPTCP	262
+
+SEC("cgroup/sockinit")
+int mptcpify(struct bpf_sockinit_ctx *ctx)
+{
+	if ((ctx->family == AF_INET || ctx->family == AF_INET6) &&
+	    ctx->type == SOCK_STREAM &&
+	    (!ctx->protocol || ctx->protocol == IPPROTO_TCP)) {
+		ctx->protocol = IPPROTO_MPTCP;
+	}
+
+	return 1;
+}
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [RFC bpf-next 6/8] selftests/bpf: use random netns name for mptcp
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
                   ` (4 preceding siblings ...)
  2023-07-06  4:08 ` [RFC bpf-next 5/8] selftests/bpf: Add mptcpify program Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 7/8] selftests/bpf: add two mptcp netns helpers Geliang Tang
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp, Matthieu Baerts

Use rand() to generate a random netns name instead of using the fixed
name "mptcp_ns" for every test.

By doing that, we can re-launch the test even if there was an issue
removing the previous netns or if by accident, a netns with this generic
name already existed on the system.

Note that using a different name each will also help adding more
subtests in future commits.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
---
 tools/testing/selftests/bpf/prog_tests/mptcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index cd0c42fff7c0..4ccca3d39a8f 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -7,7 +7,7 @@
 #include "network_helpers.h"
 #include "mptcp_sock.skel.h"
 
-#define NS_TEST "mptcp_ns"
+char NS_TEST[32];
 
 #ifndef TCP_CA_NAME_MAX
 #define TCP_CA_NAME_MAX	16
@@ -147,6 +147,8 @@ static void test_base(void)
 	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
 		return;
 
+	srand(time(NULL));
+	snprintf(NS_TEST, sizeof(NS_TEST), "mptcp_ns_%d", rand());
 	SYS(fail, "ip netns add %s", NS_TEST);
 	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
 
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [RFC bpf-next 7/8] selftests/bpf: add two mptcp netns helpers
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
                   ` (5 preceding siblings ...)
  2023-07-06  4:08 ` [RFC bpf-next 6/8] selftests/bpf: use random netns name for mptcp Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06  4:08 ` [RFC bpf-next 8/8] selftests/bpf: Add mptcpify selftest Geliang Tang
  2023-07-06 17:02 ` [RFC bpf-next 0/8] BPF 'force to MPTCP' Stanislav Fomichev
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp, Matthieu Baerts

Add two netns helpers for mptcp tests: create_netns() and
cleanup_netns(). Use them in test_base().

These new helpers will be re-used in the following commits introducing
new tests.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
---
 .../testing/selftests/bpf/prog_tests/mptcp.c  | 34 +++++++++++++------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index 4ccca3d39a8f..b2a833a900c2 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -22,6 +22,26 @@ struct mptcp_storage {
 	char ca_name[TCP_CA_NAME_MAX];
 };
 
+static struct nstoken *create_netns(void)
+{
+	srand(time(NULL));
+	snprintf(NS_TEST, sizeof(NS_TEST), "mptcp_ns_%d", rand());
+	SYS(fail, "ip netns add %s", NS_TEST);
+	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
+
+	return open_netns(NS_TEST);
+fail:
+	return NULL;
+}
+
+static void cleanup_netns(struct nstoken *nstoken)
+{
+	if (nstoken)
+		close_netns(nstoken);
+
+	SYS_NOFAIL("ip netns del %s &> /dev/null", NS_TEST);
+}
+
 static int verify_tsk(int map_fd, int client_fd)
 {
 	int err, cfd = client_fd;
@@ -147,13 +167,8 @@ static void test_base(void)
 	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
 		return;
 
-	srand(time(NULL));
-	snprintf(NS_TEST, sizeof(NS_TEST), "mptcp_ns_%d", rand());
-	SYS(fail, "ip netns add %s", NS_TEST);
-	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
-
-	nstoken = open_netns(NS_TEST);
-	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
+	nstoken = create_netns();
+	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
 		goto fail;
 
 	/* without MPTCP */
@@ -176,10 +191,7 @@ static void test_base(void)
 	close(server_fd);
 
 fail:
-	if (nstoken)
-		close_netns(nstoken);
-
-	SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null");
+	cleanup_netns(nstoken);
 
 	close(cgroup_fd);
 }
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [RFC bpf-next 8/8] selftests/bpf: Add mptcpify selftest
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
                   ` (6 preceding siblings ...)
  2023-07-06  4:08 ` [RFC bpf-next 7/8] selftests/bpf: add two mptcp netns helpers Geliang Tang
@ 2023-07-06  4:08 ` Geliang Tang
  2023-07-06 17:02 ` [RFC bpf-next 0/8] BPF 'force to MPTCP' Stanislav Fomichev
  8 siblings, 0 replies; 15+ messages in thread
From: Geliang Tang @ 2023-07-06  4:08 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa
  Cc: Geliang Tang, bpf, mptcp

This patch extends the MPTCP test base, add a selftest test_mptcpify()
for the mptcpify case.

Open and load the mptcpify test prog to mptcpify the TCP sockets
dynamically, then use start_server() and connect_to_fd() to create a
TCP socket, but actually what's created is an MPTCP socket, which can
be verified through the output of 'ss' command.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 .../testing/selftests/bpf/prog_tests/mptcp.c  | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index b2a833a900c2..186270e058c7 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -6,6 +6,7 @@
 #include "cgroup_helpers.h"
 #include "network_helpers.h"
 #include "mptcp_sock.skel.h"
+#include "mptcpify.skel.h"
 
 char NS_TEST[32];
 
@@ -196,8 +197,98 @@ static void test_base(void)
 	close(cgroup_fd);
 }
 
+static void send_byte(int fd)
+{
+	char b = 0x55;
+
+	ASSERT_EQ(write(fd, &b, sizeof(b)), 1, "send single byte");
+}
+
+static int verify_mptcpify(void)
+{
+	char cmd[128];
+	int err = 0;
+
+	snprintf(cmd, sizeof(cmd),
+		 "ip netns exec %s ss -tOni | grep -q tcp-ulp-mptcp",
+		 NS_TEST);
+	if (!ASSERT_OK(system(cmd), "No tcp-ulp-mptcp found!"))
+		err++;
+
+	snprintf(cmd, sizeof(cmd),
+		 "ip netns exec %s nstat -asz MPTcpExtMPCapableSYNACKRX | \
+		 awk 'NR==1 {next} {print $2}' | \
+		 grep -q 1", NS_TEST);
+	if (!ASSERT_OK(system(cmd), "No MPTcpExtMPCapableSYNACKRX found!"))
+		err++;
+
+	return err;
+}
+
+static int run_mptcpify(int cgroup_fd)
+{
+	int server_fd, client_fd, err = 0;
+	struct mptcpify *mptcpify_skel;
+
+	mptcpify_skel = mptcpify__open_and_load();
+	if (!ASSERT_OK_PTR(mptcpify_skel, "mptcpify__open_and_load"))
+		return -EIO;
+
+	mptcpify_skel->links.mptcpify =
+		bpf_program__attach_cgroup(mptcpify_skel->progs.mptcpify, cgroup_fd);
+	if (!ASSERT_OK_PTR(mptcpify_skel->links.mptcpify, "bpf_program__attach_cgroup")) {
+		err = -EIO;
+		goto out;
+	}
+
+	/* without MPTCP */
+	server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (!ASSERT_GE(server_fd, 0, "start_server")) {
+		err = -EIO;
+		goto out;
+	}
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (!ASSERT_GE(client_fd, 0, "connect to fd")) {
+		err = -EIO;
+		goto close_server;
+	}
+
+	send_byte(client_fd);
+	err += verify_mptcpify();
+
+	close(client_fd);
+close_server:
+	close(server_fd);
+out:
+	mptcpify__destroy(mptcpify_skel);
+	return err;
+}
+
+static void test_mptcpify(void)
+{
+	struct nstoken *nstoken = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup("/mptcpify");
+	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
+		return;
+
+	nstoken = create_netns();
+	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
+		goto fail;
+
+	ASSERT_OK(run_mptcpify(cgroup_fd), "run_mptcpify");
+
+fail:
+	cleanup_netns(nstoken);
+	close(cgroup_fd);
+}
+
 void test_mptcp(void)
 {
 	if (test__start_subtest("base"))
 		test_base();
+	if (test__start_subtest("mptcpify"))
+		test_mptcpify();
 }
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [RFC bpf-next 0/8] BPF 'force to MPTCP'
  2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
                   ` (7 preceding siblings ...)
  2023-07-06  4:08 ` [RFC bpf-next 8/8] selftests/bpf: Add mptcpify selftest Geliang Tang
@ 2023-07-06 17:02 ` Stanislav Fomichev
  2023-07-13  5:47   ` Geliang Tang
  8 siblings, 1 reply; 15+ messages in thread
From: Stanislav Fomichev @ 2023-07-06 17:02 UTC (permalink / raw)
  To: Geliang Tang
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Hao Luo, Jiri Olsa, bpf, mptcp

On 07/06, Geliang Tang wrote:
> As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:
> 
> "Your app can create sockets with IPPROTO_MPTCP as the proto:
> ( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
> forced to create and use MPTCP sockets instead of TCP ones via the
> mptcpize command bundled with the mptcpd daemon."
> 
> But the mptcpize (LD_PRELOAD technique) command has some limitations
> [2]:
> 
>  - it doesn't work if the application is not using libc (e.g. GoLang
> apps)
>  - in some envs, it might not be easy to set env vars / change the way
> apps are launched, e.g. on Android
>  - mptcpize needs to be launched with all apps that want MPTCP: we could
> have more control from BPF to enable MPTCP only for some apps or all the
> ones of a netns or a cgroup, etc.
>  - it is not in BPF, we cannot talk about it at netdev conf.
> 
> So this patchset attempts to use BPF to implement functions similer to
> mptcpize.
> 
> The main idea is add a hook in sys_socket() to change the protocol id
> from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.
> 
> 1. This first solution [3] is using "cgroup/sock_create":
> 
> Implement a new helper bpf_mptcpify() to change the protocol id:
> 
> +BPF_CALL_1(bpf_mptcpify, struct sock *, sk)
> +{
> +	if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) {
> +		sk->sk_protocol = IPPROTO_MPTCP;
> +		return (unsigned long)sk;
> +	}
> +
> +	return (unsigned long)NULL;
> +}
> +
> +const struct bpf_func_proto bpf_mptcpify_proto = {
> +	.func		= bpf_mptcpify,
> +	.gpl_only	= false,
> +	.ret_type	= RET_PTR_TO_BTF_ID_OR_NULL,
> +	.ret_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +};
> 
> Add a hook in "cgroup/sock_create" section, invoking bpf_mptcpify()
> helper in this hook:
> 
> +SEC("cgroup/sock_create")
> +int sock(struct bpf_sock *ctx)
> +{
> +	struct sock *sk;
> +
> +	if (ctx->type != SOCK_STREAM)
> +		return 1;
> +
> +	sk = bpf_mptcpify(ctx);
> +	if (!sk)
> +		return 1;
> +
> +	protocol = sk->sk_protocol;
> +	return 1;
> +}
> 
> But this solution doesn't work, because the sock_create section is
> hooked at the end of inet_create(). It's too late to change the protocol
> id.
> 
> 2. The second solution [4] is using "fentry":
> 
> Implement a bpf_mptcpify() helper:
> 
> +BPF_CALL_1(bpf_mptcpify, struct socket_args *, args)
> +{
> +	if (args->family == AF_INET &&
> +	    args->type == SOCK_STREAM &&
> +	    (!args->protocol || args->protocol == IPPROTO_TCP))
> +		args->protocol = IPPROTO_MPTCP;
> +
> +	return 0;
> +}
> +
> +BTF_ID_LIST(bpf_mptcpify_btf_ids)
> +BTF_ID(struct, socket_args)
> +
> +static const struct bpf_func_proto bpf_mptcpify_proto = {
> +	.func		= bpf_mptcpify,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_BTF_ID,
> +	.arg1_btf_id	= &bpf_mptcpify_btf_ids[0],
> +};
> 
> Add a new wrapper socket_create() in sys_socket() path, passing a
> pointer of struct socket_args (int family; int type; int protocol) to
> the wrapper.
> 
> +int socket_create(struct socket_args *args, struct socket **res)
> +{
> +	return sock_create(args->family, args->type, args->protocol, res);
> +}
> +EXPORT_SYMBOL(socket_create);
> +
>  /**
>   *	sock_create_kern - creates a socket (kernel space)
>   *	@net: net namespace
> @@ -1608,6 +1614,7 @@  EXPORT_SYMBOL(sock_create_kern);
>  
>  static struct socket *__sys_socket_create(int family, int type, int protocol)
>  {
> +	struct socket_args args = { 0 };
>  	struct socket *sock;
>  	int retval;
>  
> @@ -1621,7 +1628,10 @@  static struct socket *__sys_socket_create(int family, int type, int protocol)
>  		return ERR_PTR(-EINVAL);
>  	type &= SOCK_TYPE_MASK;
>  
> -	retval = sock_create(family, type, protocol, &sock);
> +	args.family = family;
> +	args.type = type;
> +	args.protocol = protocol;
> +	retval = socket_create(&args, &sock);
>  	if (retval < 0)
>  		return ERR_PTR(retval);
> 
> Add "fentry" hook to the newly added wrapper, invoking bpf_mptcpify()
> in the hook:
> 
> +SEC("fentry/socket_create")
> +int BPF_PROG(trace_socket_create, void *args,
> +		struct socket **res)
> +{
> +	bpf_mptcpify(args);
> +	return 0;
> +}
> 
> This version works. But it's just a work around version. Since the code
> to add a wrapper to sys_socket() is not very elegant indeed, and it
> shouldn't be accepted by upstream.
> 
> 3. The last solution is this patchset itself:
> 
> Introduce new program type BPF_PROG_TYPE_CGROUP_SOCKINIT and attach type
> BPF_CGROUP_SOCKINIT on cgroup basis.
> 
> Define BPF_CGROUP_RUN_PROG_SOCKINIT() helper, and implement
> __cgroup_bpf_run_sockinit() helper to run a sockinit program:
> 
> +#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol)		       \
> +({									       \
> +	int __ret = 0;							       \
> +	if (cgroup_bpf_enabled(CGROUP_SOCKINIT))			       \
> +		__ret = __cgroup_bpf_run_sockinit(family, type, protocol,      \
> +						  CGROUP_SOCKINIT);	       \
> +	__ret;								       \
> +})
> +
> 
> Invoke BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create() to change
> the arguments:
> 
> @@ -1469,6 +1469,12 @@  int __sock_create(struct net *net, int family, int type, int protocol,
>  	struct socket *sock;
>  	const struct net_proto_family *pf;
>  
> +	if (!kern) {
> +		err = BPF_CGROUP_RUN_PROG_SOCKINIT(&family, &type, &protocol);
> +		if (err)
> +			return err;
> +	}
> +
>  	/*
>  	 *      Check protocol is in range
>  	 */
> 
> Define BPF program in this newly added 'sockinit' SEC, so it will be
> hooked in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().
> 
> @@ -158,6 +158,11 @@  static struct sec_name_test tests[] = {
>  		{0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
>  		{0, BPF_CGROUP_SETSOCKOPT},
>  	},
> +	{
> +		"cgroup/sockinit",
> +		{0, BPF_PROG_TYPE_CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT},
> +		{0, BPF_CGROUP_SOCKINIT},
> +	},
>  };
> 
> +SEC("cgroup/sockinit")
> +int mptcpify(struct bpf_sockinit_ctx *ctx)
> +{
> +	if ((ctx->family == AF_INET || ctx->family == AF_INET6) &&
> +	    ctx->type == SOCK_STREAM &&
> +	    (!ctx->protocol || ctx->protocol == IPPROTO_TCP)) {
> +		ctx->protocol = IPPROTO_MPTCP;
> +	}
> +
> +	return 1;
> +}
> 
> This version is the best solution we have found so far.
> 
> [1]
> https://github.com/multipath-tcp/mptcp_net-next/wiki
> [2]
> https://github.com/multipath-tcp/mptcp_net-next/issues/79
> [3]
> https://patchwork.kernel.org/project/mptcp/cover/cover.1688215769.git.geliang.tang@suse.com/
> [4]
> https://patchwork.kernel.org/project/mptcp/cover/cover.1688366249.git.geliang.tang@suse.com/

cgroup/sock_create being weird and triggering late and only for af_inet4/6
was the reason we added ability to attach to lsm hooks on a per-cgroup
basis:
https://lore.kernel.org/bpf/20220628174314.1216643-1-sdf@google.com/

Unfortunately, using it here won't help either :-( The following
hook triggers early enough but doesn't allow changing the arguments (I was
interested only in filtering based on the arguments, not changing them):

LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)

So maybe another alternative might be to change its definition to int *family,
int *type, int *protocol and use lsm_cgroup/socket_create to rewrite the
protocol? (some verifier changes might needed to make those writable)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC bpf-next 0/8] BPF 'force to MPTCP'
  2023-07-06 17:02 ` [RFC bpf-next 0/8] BPF 'force to MPTCP' Stanislav Fomichev
@ 2023-07-13  5:47   ` Geliang Tang
  2023-07-13 17:14     ` Stanislav Fomichev
  0 siblings, 1 reply; 15+ messages in thread
From: Geliang Tang @ 2023-07-13  5:47 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Hao Luo, Jiri Olsa, bpf, mptcp

[-- Attachment #1: Type: text/plain, Size: 8205 bytes --]

On Thu, Jul 06, 2023 at 10:02:43AM -0700, Stanislav Fomichev wrote:
> On 07/06, Geliang Tang wrote:
> > As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:
> > 
> > "Your app can create sockets with IPPROTO_MPTCP as the proto:
> > ( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
> > forced to create and use MPTCP sockets instead of TCP ones via the
> > mptcpize command bundled with the mptcpd daemon."
> > 
> > But the mptcpize (LD_PRELOAD technique) command has some limitations
> > [2]:
> > 
> >  - it doesn't work if the application is not using libc (e.g. GoLang
> > apps)
> >  - in some envs, it might not be easy to set env vars / change the way
> > apps are launched, e.g. on Android
> >  - mptcpize needs to be launched with all apps that want MPTCP: we could
> > have more control from BPF to enable MPTCP only for some apps or all the
> > ones of a netns or a cgroup, etc.
> >  - it is not in BPF, we cannot talk about it at netdev conf.
> > 
> > So this patchset attempts to use BPF to implement functions similer to
> > mptcpize.
> > 
> > The main idea is add a hook in sys_socket() to change the protocol id
> > from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.
> > 
> > 1. This first solution [3] is using "cgroup/sock_create":
> > 
> > Implement a new helper bpf_mptcpify() to change the protocol id:
> > 
> > +BPF_CALL_1(bpf_mptcpify, struct sock *, sk)
> > +{
> > +	if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) {
> > +		sk->sk_protocol = IPPROTO_MPTCP;
> > +		return (unsigned long)sk;
> > +	}
> > +
> > +	return (unsigned long)NULL;
> > +}
> > +
> > +const struct bpf_func_proto bpf_mptcpify_proto = {
> > +	.func		= bpf_mptcpify,
> > +	.gpl_only	= false,
> > +	.ret_type	= RET_PTR_TO_BTF_ID_OR_NULL,
> > +	.ret_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
> > +	.arg1_type	= ARG_PTR_TO_CTX,
> > +};
> > 
> > Add a hook in "cgroup/sock_create" section, invoking bpf_mptcpify()
> > helper in this hook:
> > 
> > +SEC("cgroup/sock_create")
> > +int sock(struct bpf_sock *ctx)
> > +{
> > +	struct sock *sk;
> > +
> > +	if (ctx->type != SOCK_STREAM)
> > +		return 1;
> > +
> > +	sk = bpf_mptcpify(ctx);
> > +	if (!sk)
> > +		return 1;
> > +
> > +	protocol = sk->sk_protocol;
> > +	return 1;
> > +}
> > 
> > But this solution doesn't work, because the sock_create section is
> > hooked at the end of inet_create(). It's too late to change the protocol
> > id.
> > 
> > 2. The second solution [4] is using "fentry":
> > 
> > Implement a bpf_mptcpify() helper:
> > 
> > +BPF_CALL_1(bpf_mptcpify, struct socket_args *, args)
> > +{
> > +	if (args->family == AF_INET &&
> > +	    args->type == SOCK_STREAM &&
> > +	    (!args->protocol || args->protocol == IPPROTO_TCP))
> > +		args->protocol = IPPROTO_MPTCP;
> > +
> > +	return 0;
> > +}
> > +
> > +BTF_ID_LIST(bpf_mptcpify_btf_ids)
> > +BTF_ID(struct, socket_args)
> > +
> > +static const struct bpf_func_proto bpf_mptcpify_proto = {
> > +	.func		= bpf_mptcpify,
> > +	.gpl_only	= false,
> > +	.ret_type	= RET_INTEGER,
> > +	.arg1_type	= ARG_PTR_TO_BTF_ID,
> > +	.arg1_btf_id	= &bpf_mptcpify_btf_ids[0],
> > +};
> > 
> > Add a new wrapper socket_create() in sys_socket() path, passing a
> > pointer of struct socket_args (int family; int type; int protocol) to
> > the wrapper.
> > 
> > +int socket_create(struct socket_args *args, struct socket **res)
> > +{
> > +	return sock_create(args->family, args->type, args->protocol, res);
> > +}
> > +EXPORT_SYMBOL(socket_create);
> > +
> >  /**
> >   *	sock_create_kern - creates a socket (kernel space)
> >   *	@net: net namespace
> > @@ -1608,6 +1614,7 @@  EXPORT_SYMBOL(sock_create_kern);
> >  
> >  static struct socket *__sys_socket_create(int family, int type, int protocol)
> >  {
> > +	struct socket_args args = { 0 };
> >  	struct socket *sock;
> >  	int retval;
> >  
> > @@ -1621,7 +1628,10 @@  static struct socket *__sys_socket_create(int family, int type, int protocol)
> >  		return ERR_PTR(-EINVAL);
> >  	type &= SOCK_TYPE_MASK;
> >  
> > -	retval = sock_create(family, type, protocol, &sock);
> > +	args.family = family;
> > +	args.type = type;
> > +	args.protocol = protocol;
> > +	retval = socket_create(&args, &sock);
> >  	if (retval < 0)
> >  		return ERR_PTR(retval);
> > 
> > Add "fentry" hook to the newly added wrapper, invoking bpf_mptcpify()
> > in the hook:
> > 
> > +SEC("fentry/socket_create")
> > +int BPF_PROG(trace_socket_create, void *args,
> > +		struct socket **res)
> > +{
> > +	bpf_mptcpify(args);
> > +	return 0;
> > +}
> > 
> > This version works. But it's just a work around version. Since the code
> > to add a wrapper to sys_socket() is not very elegant indeed, and it
> > shouldn't be accepted by upstream.
> > 
> > 3. The last solution is this patchset itself:
> > 
> > Introduce new program type BPF_PROG_TYPE_CGROUP_SOCKINIT and attach type
> > BPF_CGROUP_SOCKINIT on cgroup basis.
> > 
> > Define BPF_CGROUP_RUN_PROG_SOCKINIT() helper, and implement
> > __cgroup_bpf_run_sockinit() helper to run a sockinit program:
> > 
> > +#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol)		       \
> > +({									       \
> > +	int __ret = 0;							       \
> > +	if (cgroup_bpf_enabled(CGROUP_SOCKINIT))			       \
> > +		__ret = __cgroup_bpf_run_sockinit(family, type, protocol,      \
> > +						  CGROUP_SOCKINIT);	       \
> > +	__ret;								       \
> > +})
> > +
> > 
> > Invoke BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create() to change
> > the arguments:
> > 
> > @@ -1469,6 +1469,12 @@  int __sock_create(struct net *net, int family, int type, int protocol,
> >  	struct socket *sock;
> >  	const struct net_proto_family *pf;
> >  
> > +	if (!kern) {
> > +		err = BPF_CGROUP_RUN_PROG_SOCKINIT(&family, &type, &protocol);
> > +		if (err)
> > +			return err;
> > +	}
> > +
> >  	/*
> >  	 *      Check protocol is in range
> >  	 */
> > 
> > Define BPF program in this newly added 'sockinit' SEC, so it will be
> > hooked in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().
> > 
> > @@ -158,6 +158,11 @@  static struct sec_name_test tests[] = {
> >  		{0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
> >  		{0, BPF_CGROUP_SETSOCKOPT},
> >  	},
> > +	{
> > +		"cgroup/sockinit",
> > +		{0, BPF_PROG_TYPE_CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT},
> > +		{0, BPF_CGROUP_SOCKINIT},
> > +	},
> >  };
> > 
> > +SEC("cgroup/sockinit")
> > +int mptcpify(struct bpf_sockinit_ctx *ctx)
> > +{
> > +	if ((ctx->family == AF_INET || ctx->family == AF_INET6) &&
> > +	    ctx->type == SOCK_STREAM &&
> > +	    (!ctx->protocol || ctx->protocol == IPPROTO_TCP)) {
> > +		ctx->protocol = IPPROTO_MPTCP;
> > +	}
> > +
> > +	return 1;
> > +}
> > 
> > This version is the best solution we have found so far.
> > 
> > [1]
> > https://github.com/multipath-tcp/mptcp_net-next/wiki
> > [2]
> > https://github.com/multipath-tcp/mptcp_net-next/issues/79
> > [3]
> > https://patchwork.kernel.org/project/mptcp/cover/cover.1688215769.git.geliang.tang@suse.com/
> > [4]
> > https://patchwork.kernel.org/project/mptcp/cover/cover.1688366249.git.geliang.tang@suse.com/
> 
> cgroup/sock_create being weird and triggering late and only for af_inet4/6
> was the reason we added ability to attach to lsm hooks on a per-cgroup
> basis:
> https://lore.kernel.org/bpf/20220628174314.1216643-1-sdf@google.com/
> 
> Unfortunately, using it here won't help either :-( The following
> hook triggers early enough but doesn't allow changing the arguments (I was
> interested only in filtering based on the arguments, not changing them):
> 
> LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
> 
> So maybe another alternative might be to change its definition to int *family,
> int *type, int *protocol and use lsm_cgroup/socket_create to rewrite the

Thanks Stanislav, this lsm_cgroup/socket_create works. The test patch
is attached.

> protocol? (some verifier changes might needed to make those writable)

But I got some verification errors here:

   invalid mem access 'scalar'.

I tried many times but couldn't solve it, so I simply skipped the
verifier in the test patch (I marked TODO in front of this code).
Could you please give me some suggestions for verification?

Thanks,
-Geliang

[-- Attachment #2: v4-0001-bpf-Force-to-MPTCP.patch --]
[-- Type: text/x-patch, Size: 13164 bytes --]

From dd3c0ee4202e01c52534e8359450b42498cd4e40 Mon Sep 17 00:00:00 2001
Message-Id: <dd3c0ee4202e01c52534e8359450b42498cd4e40.1689226263.git.geliang.tang@suse.com>
From: Geliang Tang <geliang.tang@suse.com>
Date: Sun, 2 Jul 2023 16:48:59 +0800
Subject: [RFC bpf-next v4] bpf: Force to MPTCP

selftests/bpf: Add mptcpify program

This patch implements a new test program mptcpify: if the family is
AF_INET or AF_INET6, the type is SOCK_STREAM, and the protocol ID is
0 or IPPROTO_TCP, set it to IPPROTO_MPTCP.

This is defined in a newly added 'sockinit' SEC, so it will be hooked
in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().

Signed-off-by: Geliang Tang <geliang.tang@suse.com>

selftests/bpf: use random netns name for mptcp

Use rand() to generate a random netns name instead of using the fixed
name "mptcp_ns" for every test.

By doing that, we can re-launch the test even if there was an issue
removing the previous netns or if by accident, a netns with this generic
name already existed on the system.

Note that using a different name each will also help adding more
subtests in future commits.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>

selftests/bpf: add two mptcp netns helpers

Add two netns helpers for mptcp tests: create_netns() and
cleanup_netns(). Use them in test_base().

These new helpers will be re-used in the following commits introducing
new tests.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>

selftests/bpf: Add mptcpify selftest

This patch extends the MPTCP test base, add a selftest test_mptcpify()
for the mptcpify case.

Open and load the mptcpify test prog to mptcpify the TCP sockets
dynamically, then use start_server() and connect_to_fd() to create a
TCP socket, but actually what's created is an MPTCP socket, which can
be verified through the outputs of 'ss' and 'nstat' commands.

Signed-off-by: Geliang Tang <geliang.tang@suse.com>
---
 include/linux/lsm_hook_defs.h                 |   2 +-
 include/linux/security.h                      |   4 +-
 kernel/bpf/verifier.c                         |   5 +
 net/socket.c                                  |   4 +-
 security/apparmor/lsm.c                       |   6 +-
 security/security.c                           |   2 +-
 security/selinux/hooks.c                      |   4 +-
 .../testing/selftests/bpf/prog_tests/mptcp.c  | 128 ++++++++++++++++--
 tools/testing/selftests/bpf/progs/mptcpify.c  |  41 ++++++
 9 files changed, 176 insertions(+), 20 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/mptcpify.c

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 7308a1a7599b..c2c178dfb06d 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -288,7 +288,7 @@ LSM_HOOK(int, 0, watch_key, struct key *key)
 LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other,
 	 struct sock *newsk)
 LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other)
-LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
+LSM_HOOK(int, 0, socket_create, int family, int type, int *protocol, int kern)
 LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type,
 	 int protocol, int kern)
 LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb)
diff --git a/include/linux/security.h b/include/linux/security.h
index 32828502f09e..67fd5bb91b72 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1416,7 +1416,7 @@ static inline int security_watch_key(struct key *key)
 
 int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk);
 int security_unix_may_send(struct socket *sock,  struct socket *other);
-int security_socket_create(int family, int type, int protocol, int kern);
+int security_socket_create(int family, int type, int *protocol, int kern);
 int security_socket_post_create(struct socket *sock, int family,
 				int type, int protocol, int kern);
 int security_socket_socketpair(struct socket *socka, struct socket *sockb);
@@ -1482,7 +1482,7 @@ static inline int security_unix_may_send(struct socket *sock,
 }
 
 static inline int security_socket_create(int family, int type,
-					 int protocol, int kern)
+					 int *protocol, int kern)
 {
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 81a93eeac7a0..d6503ac62f6d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6471,6 +6471,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 		if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (base_type(reg->type) == SCALAR_VALUE) {
+		/* TODO
+		 * skip it to let mptcpify test run */
+		pr_info("%s R%d invalid mem access '%s'\n",
+			__func__, regno, reg_type_str(env, reg->type));
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str(env, reg->type));
diff --git a/net/socket.c b/net/socket.c
index 2b0e54b2405c..cb1df106de4a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1328,7 +1328,7 @@ int sock_create_lite(int family, int type, int protocol, struct socket **res)
 	int err;
 	struct socket *sock = NULL;
 
-	err = security_socket_create(family, type, protocol, 1);
+	err = security_socket_create(family, type, &protocol, 1);
 	if (err)
 		goto out;
 
@@ -1488,7 +1488,7 @@ int __sock_create(struct net *net, int family, int type, int protocol,
 		family = PF_PACKET;
 	}
 
-	err = security_socket_create(family, type, protocol, kern);
+	err = security_socket_create(family, type, &protocol, kern);
 	if (err)
 		return err;
 
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index f431251ffb91..ba1ba86771bc 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -868,7 +868,7 @@ static void apparmor_sk_clone_security(const struct sock *sk,
 /**
  * apparmor_socket_create - check perms before creating a new socket
  */
-static int apparmor_socket_create(int family, int type, int protocol, int kern)
+static int apparmor_socket_create(int family, int type, int *protocol, int kern)
 {
 	struct aa_label *label;
 	int error = 0;
@@ -878,9 +878,9 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern)
 	label = begin_current_label_crit_section();
 	if (!(kern || unconfined(label)))
 		error = af_select(family,
-				  create_perm(label, family, type, protocol),
+				  create_perm(label, family, type, *protocol),
 				  aa_af_perm(label, OP_CREATE, AA_MAY_CREATE,
-					     family, type, protocol));
+					     family, type, *protocol));
 	end_current_label_crit_section(label);
 
 	return error;
diff --git a/security/security.c b/security/security.c
index b720424ca37d..4a8ef5d0304a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -4078,7 +4078,7 @@ EXPORT_SYMBOL(security_unix_may_send);
  *
  * Return: Returns 0 if permission is granted.
  */
-int security_socket_create(int family, int type, int protocol, int kern)
+int security_socket_create(int family, int type, int *protocol, int kern)
 {
 	return call_int_hook(socket_create, 0, family, type, protocol, kern);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d06e350fedee..4a1d65210faa 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4513,7 +4513,7 @@ static int sock_has_perm(struct sock *sk, u32 perms)
 }
 
 static int selinux_socket_create(int family, int type,
-				 int protocol, int kern)
+				 int *protocol, int kern)
 {
 	const struct task_security_struct *tsec = selinux_cred(current_cred());
 	u32 newsid;
@@ -4523,7 +4523,7 @@ static int selinux_socket_create(int family, int type,
 	if (kern)
 		return 0;
 
-	secclass = socket_type_to_security_class(family, type, protocol);
+	secclass = socket_type_to_security_class(family, type, *protocol);
 	rc = socket_sockcreate_sid(tsec, secclass, &newsid);
 	if (rc)
 		return rc;
diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index cd0c42fff7c0..93767e441e17 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -6,8 +6,9 @@
 #include "cgroup_helpers.h"
 #include "network_helpers.h"
 #include "mptcp_sock.skel.h"
+#include "mptcpify.skel.h"
 
-#define NS_TEST "mptcp_ns"
+char NS_TEST[32];
 
 #ifndef TCP_CA_NAME_MAX
 #define TCP_CA_NAME_MAX	16
@@ -22,6 +23,26 @@ struct mptcp_storage {
 	char ca_name[TCP_CA_NAME_MAX];
 };
 
+static struct nstoken *create_netns(void)
+{
+	srand(time(NULL));
+	snprintf(NS_TEST, sizeof(NS_TEST), "mptcp_ns_%d", rand());
+	SYS(fail, "ip netns add %s", NS_TEST);
+	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
+
+	return open_netns(NS_TEST);
+fail:
+	return NULL;
+}
+
+static void cleanup_netns(struct nstoken *nstoken)
+{
+	if (nstoken)
+		close_netns(nstoken);
+
+	SYS_NOFAIL("ip netns del %s &> /dev/null", NS_TEST);
+}
+
 static int verify_tsk(int map_fd, int client_fd)
 {
 	int err, cfd = client_fd;
@@ -147,11 +168,8 @@ static void test_base(void)
 	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
 		return;
 
-	SYS(fail, "ip netns add %s", NS_TEST);
-	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
-
-	nstoken = open_netns(NS_TEST);
-	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
+	nstoken = create_netns();
+	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
 		goto fail;
 
 	/* without MPTCP */
@@ -174,11 +192,101 @@ static void test_base(void)
 	close(server_fd);
 
 fail:
-	if (nstoken)
-		close_netns(nstoken);
+	cleanup_netns(nstoken);
+
+	close(cgroup_fd);
+}
+
+static void send_byte(int fd)
+{
+	char b = 0x55;
+
+	ASSERT_EQ(write(fd, &b, sizeof(b)), 1, "send single byte");
+}
+
+static int verify_mptcpify(void)
+{
+	char cmd[256];
+	int err = 0;
+
+	snprintf(cmd, sizeof(cmd),
+		 "ip netns exec %s ss -tOni | grep -q '%s'",
+		 NS_TEST, "tcp-ulp-mptcp");
+	if (!ASSERT_OK(system(cmd), "No tcp-ulp-mptcp found!"))
+		err++;
+
+	snprintf(cmd, sizeof(cmd),
+		 "ip netns exec %s nstat -asz %s | awk '%s' | grep -q '%s'",
+		 NS_TEST, "MPTcpExtMPCapableSYNACKRX",
+		 "NR==1 {next} {print $2}", "1");
+	if (!ASSERT_OK(system(cmd), "No MPTcpExtMPCapableSYNACKRX found!"))
+		err++;
+
+	return err;
+}
+
+static int run_mptcpify(int cgroup_fd)
+{
+	int server_fd, client_fd, prog_fd, err = 0;
+	struct mptcpify *mptcpify_skel;
+
+	mptcpify_skel = mptcpify__open_and_load();
+	if (!ASSERT_OK_PTR(mptcpify_skel, "mptcpify__open_and_load"))
+		return -EIO;
+
+	prog_fd = bpf_program__fd(mptcpify_skel->progs.mptcpify);
+	if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd")) {
+		err = -EIO;
+		goto out;
+	}
 
-	SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null");
+	err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0);
+	if (!ASSERT_OK(err, "attach alloc_prog_fd")) {
+		err = -EIO;
+		goto out;
+	}
 
+	/* without MPTCP */
+	server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (!ASSERT_GE(server_fd, 0, "start_server")) {
+		err = -EIO;
+		goto out;
+	}
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (!ASSERT_GE(client_fd, 0, "connect to fd")) {
+		err = -EIO;
+		goto close_server;
+	}
+
+	send_byte(client_fd);
+	err += verify_mptcpify();
+
+	close(client_fd);
+close_server:
+	close(server_fd);
+out:
+	mptcpify__destroy(mptcpify_skel);
+	return err;
+}
+
+static void test_mptcpify(void)
+{
+	struct nstoken *nstoken = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup("/mptcpify");
+	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
+		return;
+
+	nstoken = create_netns();
+	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
+		goto fail;
+
+	ASSERT_OK(run_mptcpify(cgroup_fd), "run_mptcpify");
+
+fail:
+	cleanup_netns(nstoken);
 	close(cgroup_fd);
 }
 
@@ -186,4 +294,6 @@ void test_mptcp(void)
 {
 	if (test__start_subtest("base"))
 		test_base();
+	if (test__start_subtest("mptcpify"))
+		test_mptcpify();
 }
diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c
new file mode 100644
index 000000000000..94aef62016fa
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcpify.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023, SUSE. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+char fmt[] = "tcp=%u\n";
+
+#define	AF_INET		2
+#define	AF_INET6	10
+#define	SOCK_STREAM	1
+#define	IPPROTO_TCP	6
+#define	IPPROTO_MPTCP	262
+
+static __always_inline bool is_tcp(int family, int type, int protocol)
+{
+	if ((family == AF_INET || family == AF_INET6) &&
+	    type == SOCK_STREAM &&
+	    (!protocol || protocol == IPPROTO_TCP))
+		return true;
+
+	return false;
+}
+
+SEC("lsm_cgroup/socket_create")
+int BPF_PROG(mptcpify, int family, int type, int *protocol, int kern)
+{
+	bool tcp;
+
+	if (kern)
+		goto out;
+
+	tcp = is_tcp(family, type, *protocol);
+	bpf_trace_printk(fmt, sizeof(fmt), tcp);
+	if (tcp)
+		*protocol = IPPROTO_MPTCP;
+out:
+	return 1;
+}
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [RFC bpf-next 0/8] BPF 'force to MPTCP'
  2023-07-13  5:47   ` Geliang Tang
@ 2023-07-13 17:14     ` Stanislav Fomichev
  2023-07-13 22:46       ` Alexei Starovoitov
  0 siblings, 1 reply; 15+ messages in thread
From: Stanislav Fomichev @ 2023-07-13 17:14 UTC (permalink / raw)
  To: Geliang Tang
  Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Hao Luo, Jiri Olsa, bpf, mptcp

On 07/13, Geliang Tang wrote:
> On Thu, Jul 06, 2023 at 10:02:43AM -0700, Stanislav Fomichev wrote:
> > On 07/06, Geliang Tang wrote:
> > > As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:
> > > 
> > > "Your app can create sockets with IPPROTO_MPTCP as the proto:
> > > ( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
> > > forced to create and use MPTCP sockets instead of TCP ones via the
> > > mptcpize command bundled with the mptcpd daemon."
> > > 
> > > But the mptcpize (LD_PRELOAD technique) command has some limitations
> > > [2]:
> > > 
> > >  - it doesn't work if the application is not using libc (e.g. GoLang
> > > apps)
> > >  - in some envs, it might not be easy to set env vars / change the way
> > > apps are launched, e.g. on Android
> > >  - mptcpize needs to be launched with all apps that want MPTCP: we could
> > > have more control from BPF to enable MPTCP only for some apps or all the
> > > ones of a netns or a cgroup, etc.
> > >  - it is not in BPF, we cannot talk about it at netdev conf.
> > > 
> > > So this patchset attempts to use BPF to implement functions similer to
> > > mptcpize.
> > > 
> > > The main idea is add a hook in sys_socket() to change the protocol id
> > > from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.
> > > 
> > > 1. This first solution [3] is using "cgroup/sock_create":
> > > 
> > > Implement a new helper bpf_mptcpify() to change the protocol id:
> > > 
> > > +BPF_CALL_1(bpf_mptcpify, struct sock *, sk)
> > > +{
> > > +	if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) {
> > > +		sk->sk_protocol = IPPROTO_MPTCP;
> > > +		return (unsigned long)sk;
> > > +	}
> > > +
> > > +	return (unsigned long)NULL;
> > > +}
> > > +
> > > +const struct bpf_func_proto bpf_mptcpify_proto = {
> > > +	.func		= bpf_mptcpify,
> > > +	.gpl_only	= false,
> > > +	.ret_type	= RET_PTR_TO_BTF_ID_OR_NULL,
> > > +	.ret_btf_id	= &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
> > > +	.arg1_type	= ARG_PTR_TO_CTX,
> > > +};
> > > 
> > > Add a hook in "cgroup/sock_create" section, invoking bpf_mptcpify()
> > > helper in this hook:
> > > 
> > > +SEC("cgroup/sock_create")
> > > +int sock(struct bpf_sock *ctx)
> > > +{
> > > +	struct sock *sk;
> > > +
> > > +	if (ctx->type != SOCK_STREAM)
> > > +		return 1;
> > > +
> > > +	sk = bpf_mptcpify(ctx);
> > > +	if (!sk)
> > > +		return 1;
> > > +
> > > +	protocol = sk->sk_protocol;
> > > +	return 1;
> > > +}
> > > 
> > > But this solution doesn't work, because the sock_create section is
> > > hooked at the end of inet_create(). It's too late to change the protocol
> > > id.
> > > 
> > > 2. The second solution [4] is using "fentry":
> > > 
> > > Implement a bpf_mptcpify() helper:
> > > 
> > > +BPF_CALL_1(bpf_mptcpify, struct socket_args *, args)
> > > +{
> > > +	if (args->family == AF_INET &&
> > > +	    args->type == SOCK_STREAM &&
> > > +	    (!args->protocol || args->protocol == IPPROTO_TCP))
> > > +		args->protocol = IPPROTO_MPTCP;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +BTF_ID_LIST(bpf_mptcpify_btf_ids)
> > > +BTF_ID(struct, socket_args)
> > > +
> > > +static const struct bpf_func_proto bpf_mptcpify_proto = {
> > > +	.func		= bpf_mptcpify,
> > > +	.gpl_only	= false,
> > > +	.ret_type	= RET_INTEGER,
> > > +	.arg1_type	= ARG_PTR_TO_BTF_ID,
> > > +	.arg1_btf_id	= &bpf_mptcpify_btf_ids[0],
> > > +};
> > > 
> > > Add a new wrapper socket_create() in sys_socket() path, passing a
> > > pointer of struct socket_args (int family; int type; int protocol) to
> > > the wrapper.
> > > 
> > > +int socket_create(struct socket_args *args, struct socket **res)
> > > +{
> > > +	return sock_create(args->family, args->type, args->protocol, res);
> > > +}
> > > +EXPORT_SYMBOL(socket_create);
> > > +
> > >  /**
> > >   *	sock_create_kern - creates a socket (kernel space)
> > >   *	@net: net namespace
> > > @@ -1608,6 +1614,7 @@  EXPORT_SYMBOL(sock_create_kern);
> > >  
> > >  static struct socket *__sys_socket_create(int family, int type, int protocol)
> > >  {
> > > +	struct socket_args args = { 0 };
> > >  	struct socket *sock;
> > >  	int retval;
> > >  
> > > @@ -1621,7 +1628,10 @@  static struct socket *__sys_socket_create(int family, int type, int protocol)
> > >  		return ERR_PTR(-EINVAL);
> > >  	type &= SOCK_TYPE_MASK;
> > >  
> > > -	retval = sock_create(family, type, protocol, &sock);
> > > +	args.family = family;
> > > +	args.type = type;
> > > +	args.protocol = protocol;
> > > +	retval = socket_create(&args, &sock);
> > >  	if (retval < 0)
> > >  		return ERR_PTR(retval);
> > > 
> > > Add "fentry" hook to the newly added wrapper, invoking bpf_mptcpify()
> > > in the hook:
> > > 
> > > +SEC("fentry/socket_create")
> > > +int BPF_PROG(trace_socket_create, void *args,
> > > +		struct socket **res)
> > > +{
> > > +	bpf_mptcpify(args);
> > > +	return 0;
> > > +}
> > > 
> > > This version works. But it's just a work around version. Since the code
> > > to add a wrapper to sys_socket() is not very elegant indeed, and it
> > > shouldn't be accepted by upstream.
> > > 
> > > 3. The last solution is this patchset itself:
> > > 
> > > Introduce new program type BPF_PROG_TYPE_CGROUP_SOCKINIT and attach type
> > > BPF_CGROUP_SOCKINIT on cgroup basis.
> > > 
> > > Define BPF_CGROUP_RUN_PROG_SOCKINIT() helper, and implement
> > > __cgroup_bpf_run_sockinit() helper to run a sockinit program:
> > > 
> > > +#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol)		       \
> > > +({									       \
> > > +	int __ret = 0;							       \
> > > +	if (cgroup_bpf_enabled(CGROUP_SOCKINIT))			       \
> > > +		__ret = __cgroup_bpf_run_sockinit(family, type, protocol,      \
> > > +						  CGROUP_SOCKINIT);	       \
> > > +	__ret;								       \
> > > +})
> > > +
> > > 
> > > Invoke BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create() to change
> > > the arguments:
> > > 
> > > @@ -1469,6 +1469,12 @@  int __sock_create(struct net *net, int family, int type, int protocol,
> > >  	struct socket *sock;
> > >  	const struct net_proto_family *pf;
> > >  
> > > +	if (!kern) {
> > > +		err = BPF_CGROUP_RUN_PROG_SOCKINIT(&family, &type, &protocol);
> > > +		if (err)
> > > +			return err;
> > > +	}
> > > +
> > >  	/*
> > >  	 *      Check protocol is in range
> > >  	 */
> > > 
> > > Define BPF program in this newly added 'sockinit' SEC, so it will be
> > > hooked in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().
> > > 
> > > @@ -158,6 +158,11 @@  static struct sec_name_test tests[] = {
> > >  		{0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
> > >  		{0, BPF_CGROUP_SETSOCKOPT},
> > >  	},
> > > +	{
> > > +		"cgroup/sockinit",
> > > +		{0, BPF_PROG_TYPE_CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT},
> > > +		{0, BPF_CGROUP_SOCKINIT},
> > > +	},
> > >  };
> > > 
> > > +SEC("cgroup/sockinit")
> > > +int mptcpify(struct bpf_sockinit_ctx *ctx)
> > > +{
> > > +	if ((ctx->family == AF_INET || ctx->family == AF_INET6) &&
> > > +	    ctx->type == SOCK_STREAM &&
> > > +	    (!ctx->protocol || ctx->protocol == IPPROTO_TCP)) {
> > > +		ctx->protocol = IPPROTO_MPTCP;
> > > +	}
> > > +
> > > +	return 1;
> > > +}
> > > 
> > > This version is the best solution we have found so far.
> > > 
> > > [1]
> > > https://github.com/multipath-tcp/mptcp_net-next/wiki
> > > [2]
> > > https://github.com/multipath-tcp/mptcp_net-next/issues/79
> > > [3]
> > > https://patchwork.kernel.org/project/mptcp/cover/cover.1688215769.git.geliang.tang@suse.com/
> > > [4]
> > > https://patchwork.kernel.org/project/mptcp/cover/cover.1688366249.git.geliang.tang@suse.com/
> > 
> > cgroup/sock_create being weird and triggering late and only for af_inet4/6
> > was the reason we added ability to attach to lsm hooks on a per-cgroup
> > basis:
> > https://lore.kernel.org/bpf/20220628174314.1216643-1-sdf@google.com/
> > 
> > Unfortunately, using it here won't help either :-( The following
> > hook triggers early enough but doesn't allow changing the arguments (I was
> > interested only in filtering based on the arguments, not changing them):
> > 
> > LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
> > 
> > So maybe another alternative might be to change its definition to int *family,
> > int *type, int *protocol and use lsm_cgroup/socket_create to rewrite the
> 
> Thanks Stanislav, this lsm_cgroup/socket_create works. The test patch
> is attached.
> 
> > protocol? (some verifier changes might needed to make those writable)
> 
> But I got some verification errors here:
> 
>    invalid mem access 'scalar'.
> 
> I tried many times but couldn't solve it, so I simply skipped the
> verifier in the test patch (I marked TODO in front of this code).
> Could you please give me some suggestions for verification?

Right, that's the core issue here, to add support to the verifier
to let it write into scalar pointer arguments. I don't have a good suggestion,
but we've discussed it multiple times elsewhere that maybe we
should do it :-)

Can we use some new btf_type_tag("allow_write") to annotate places
where we know that it's safe to write to them? Then we can extend
the verifier to check for this condition hopefully. Maybe other BPF
folks have better suggestions here?

We should also CC LSM people to make sure it's not a no-no to
change LSM_HOOK(s) in a way to allow changing the arguments. I'm
not sure how rigid those definitions are :-(

> Thanks,
> -Geliang

> From dd3c0ee4202e01c52534e8359450b42498cd4e40 Mon Sep 17 00:00:00 2001
> Message-Id: <dd3c0ee4202e01c52534e8359450b42498cd4e40.1689226263.git.geliang.tang@suse.com>
> From: Geliang Tang <geliang.tang@suse.com>
> Date: Sun, 2 Jul 2023 16:48:59 +0800
> Subject: [RFC bpf-next v4] bpf: Force to MPTCP
> 
> selftests/bpf: Add mptcpify program
> 
> This patch implements a new test program mptcpify: if the family is
> AF_INET or AF_INET6, the type is SOCK_STREAM, and the protocol ID is
> 0 or IPPROTO_TCP, set it to IPPROTO_MPTCP.
> 
> This is defined in a newly added 'sockinit' SEC, so it will be hooked
> in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().
> 
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> 
> selftests/bpf: use random netns name for mptcp
> 
> Use rand() to generate a random netns name instead of using the fixed
> name "mptcp_ns" for every test.
> 
> By doing that, we can re-launch the test even if there was an issue
> removing the previous netns or if by accident, a netns with this generic
> name already existed on the system.
> 
> Note that using a different name each will also help adding more
> subtests in future commits.
> 
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> 
> selftests/bpf: add two mptcp netns helpers
> 
> Add two netns helpers for mptcp tests: create_netns() and
> cleanup_netns(). Use them in test_base().
> 
> These new helpers will be re-used in the following commits introducing
> new tests.
> 
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> 
> selftests/bpf: Add mptcpify selftest
> 
> This patch extends the MPTCP test base, add a selftest test_mptcpify()
> for the mptcpify case.
> 
> Open and load the mptcpify test prog to mptcpify the TCP sockets
> dynamically, then use start_server() and connect_to_fd() to create a
> TCP socket, but actually what's created is an MPTCP socket, which can
> be verified through the outputs of 'ss' and 'nstat' commands.
> 
> Signed-off-by: Geliang Tang <geliang.tang@suse.com>
> ---
>  include/linux/lsm_hook_defs.h                 |   2 +-
>  include/linux/security.h                      |   4 +-
>  kernel/bpf/verifier.c                         |   5 +
>  net/socket.c                                  |   4 +-
>  security/apparmor/lsm.c                       |   6 +-
>  security/security.c                           |   2 +-
>  security/selinux/hooks.c                      |   4 +-
>  .../testing/selftests/bpf/prog_tests/mptcp.c  | 128 ++++++++++++++++--
>  tools/testing/selftests/bpf/progs/mptcpify.c  |  41 ++++++
>  9 files changed, 176 insertions(+), 20 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/progs/mptcpify.c
> 
> diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
> index 7308a1a7599b..c2c178dfb06d 100644
> --- a/include/linux/lsm_hook_defs.h
> +++ b/include/linux/lsm_hook_defs.h
> @@ -288,7 +288,7 @@ LSM_HOOK(int, 0, watch_key, struct key *key)
>  LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other,
>  	 struct sock *newsk)
>  LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other)
> -LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
> +LSM_HOOK(int, 0, socket_create, int family, int type, int *protocol, int kern)
>  LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type,
>  	 int protocol, int kern)
>  LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb)
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 32828502f09e..67fd5bb91b72 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -1416,7 +1416,7 @@ static inline int security_watch_key(struct key *key)
>  
>  int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk);
>  int security_unix_may_send(struct socket *sock,  struct socket *other);
> -int security_socket_create(int family, int type, int protocol, int kern);
> +int security_socket_create(int family, int type, int *protocol, int kern);
>  int security_socket_post_create(struct socket *sock, int family,
>  				int type, int protocol, int kern);
>  int security_socket_socketpair(struct socket *socka, struct socket *sockb);
> @@ -1482,7 +1482,7 @@ static inline int security_unix_may_send(struct socket *sock,
>  }
>  
>  static inline int security_socket_create(int family, int type,
> -					 int protocol, int kern)
> +					 int *protocol, int kern)
>  {
>  	return 0;
>  }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 81a93eeac7a0..d6503ac62f6d 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6471,6 +6471,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>  
>  		if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
>  			mark_reg_unknown(env, regs, value_regno);
> +	} else if (base_type(reg->type) == SCALAR_VALUE) {
> +		/* TODO
> +		 * skip it to let mptcpify test run */
> +		pr_info("%s R%d invalid mem access '%s'\n",
> +			__func__, regno, reg_type_str(env, reg->type));
>  	} else {
>  		verbose(env, "R%d invalid mem access '%s'\n", regno,
>  			reg_type_str(env, reg->type));
> diff --git a/net/socket.c b/net/socket.c
> index 2b0e54b2405c..cb1df106de4a 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -1328,7 +1328,7 @@ int sock_create_lite(int family, int type, int protocol, struct socket **res)
>  	int err;
>  	struct socket *sock = NULL;
>  
> -	err = security_socket_create(family, type, protocol, 1);
> +	err = security_socket_create(family, type, &protocol, 1);
>  	if (err)
>  		goto out;
>  
> @@ -1488,7 +1488,7 @@ int __sock_create(struct net *net, int family, int type, int protocol,
>  		family = PF_PACKET;
>  	}
>  
> -	err = security_socket_create(family, type, protocol, kern);
> +	err = security_socket_create(family, type, &protocol, kern);
>  	if (err)
>  		return err;
>  
> diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
> index f431251ffb91..ba1ba86771bc 100644
> --- a/security/apparmor/lsm.c
> +++ b/security/apparmor/lsm.c
> @@ -868,7 +868,7 @@ static void apparmor_sk_clone_security(const struct sock *sk,
>  /**
>   * apparmor_socket_create - check perms before creating a new socket
>   */
> -static int apparmor_socket_create(int family, int type, int protocol, int kern)
> +static int apparmor_socket_create(int family, int type, int *protocol, int kern)
>  {
>  	struct aa_label *label;
>  	int error = 0;
> @@ -878,9 +878,9 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern)
>  	label = begin_current_label_crit_section();
>  	if (!(kern || unconfined(label)))
>  		error = af_select(family,
> -				  create_perm(label, family, type, protocol),
> +				  create_perm(label, family, type, *protocol),
>  				  aa_af_perm(label, OP_CREATE, AA_MAY_CREATE,
> -					     family, type, protocol));
> +					     family, type, *protocol));
>  	end_current_label_crit_section(label);
>  
>  	return error;
> diff --git a/security/security.c b/security/security.c
> index b720424ca37d..4a8ef5d0304a 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -4078,7 +4078,7 @@ EXPORT_SYMBOL(security_unix_may_send);
>   *
>   * Return: Returns 0 if permission is granted.
>   */
> -int security_socket_create(int family, int type, int protocol, int kern)
> +int security_socket_create(int family, int type, int *protocol, int kern)
>  {
>  	return call_int_hook(socket_create, 0, family, type, protocol, kern);
>  }
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index d06e350fedee..4a1d65210faa 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -4513,7 +4513,7 @@ static int sock_has_perm(struct sock *sk, u32 perms)
>  }
>  
>  static int selinux_socket_create(int family, int type,
> -				 int protocol, int kern)
> +				 int *protocol, int kern)
>  {
>  	const struct task_security_struct *tsec = selinux_cred(current_cred());
>  	u32 newsid;
> @@ -4523,7 +4523,7 @@ static int selinux_socket_create(int family, int type,
>  	if (kern)
>  		return 0;
>  
> -	secclass = socket_type_to_security_class(family, type, protocol);
> +	secclass = socket_type_to_security_class(family, type, *protocol);
>  	rc = socket_sockcreate_sid(tsec, secclass, &newsid);
>  	if (rc)
>  		return rc;
> diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
> index cd0c42fff7c0..93767e441e17 100644
> --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
> +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
> @@ -6,8 +6,9 @@
>  #include "cgroup_helpers.h"
>  #include "network_helpers.h"
>  #include "mptcp_sock.skel.h"
> +#include "mptcpify.skel.h"
>  
> -#define NS_TEST "mptcp_ns"
> +char NS_TEST[32];
>  
>  #ifndef TCP_CA_NAME_MAX
>  #define TCP_CA_NAME_MAX	16
> @@ -22,6 +23,26 @@ struct mptcp_storage {
>  	char ca_name[TCP_CA_NAME_MAX];
>  };
>  
> +static struct nstoken *create_netns(void)
> +{
> +	srand(time(NULL));
> +	snprintf(NS_TEST, sizeof(NS_TEST), "mptcp_ns_%d", rand());
> +	SYS(fail, "ip netns add %s", NS_TEST);
> +	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
> +
> +	return open_netns(NS_TEST);
> +fail:
> +	return NULL;
> +}
> +
> +static void cleanup_netns(struct nstoken *nstoken)
> +{
> +	if (nstoken)
> +		close_netns(nstoken);
> +
> +	SYS_NOFAIL("ip netns del %s &> /dev/null", NS_TEST);
> +}
> +
>  static int verify_tsk(int map_fd, int client_fd)
>  {
>  	int err, cfd = client_fd;
> @@ -147,11 +168,8 @@ static void test_base(void)
>  	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
>  		return;
>  
> -	SYS(fail, "ip netns add %s", NS_TEST);
> -	SYS(fail, "ip -net %s link set dev lo up", NS_TEST);
> -
> -	nstoken = open_netns(NS_TEST);
> -	if (!ASSERT_OK_PTR(nstoken, "open_netns"))
> +	nstoken = create_netns();
> +	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
>  		goto fail;
>  
>  	/* without MPTCP */
> @@ -174,11 +192,101 @@ static void test_base(void)
>  	close(server_fd);
>  
>  fail:
> -	if (nstoken)
> -		close_netns(nstoken);
> +	cleanup_netns(nstoken);
> +
> +	close(cgroup_fd);
> +}
> +
> +static void send_byte(int fd)
> +{
> +	char b = 0x55;
> +
> +	ASSERT_EQ(write(fd, &b, sizeof(b)), 1, "send single byte");
> +}
> +
> +static int verify_mptcpify(void)
> +{
> +	char cmd[256];
> +	int err = 0;
> +
> +	snprintf(cmd, sizeof(cmd),
> +		 "ip netns exec %s ss -tOni | grep -q '%s'",
> +		 NS_TEST, "tcp-ulp-mptcp");
> +	if (!ASSERT_OK(system(cmd), "No tcp-ulp-mptcp found!"))
> +		err++;
> +
> +	snprintf(cmd, sizeof(cmd),
> +		 "ip netns exec %s nstat -asz %s | awk '%s' | grep -q '%s'",
> +		 NS_TEST, "MPTcpExtMPCapableSYNACKRX",
> +		 "NR==1 {next} {print $2}", "1");
> +	if (!ASSERT_OK(system(cmd), "No MPTcpExtMPCapableSYNACKRX found!"))
> +		err++;
> +
> +	return err;
> +}
> +
> +static int run_mptcpify(int cgroup_fd)
> +{
> +	int server_fd, client_fd, prog_fd, err = 0;
> +	struct mptcpify *mptcpify_skel;
> +
> +	mptcpify_skel = mptcpify__open_and_load();
> +	if (!ASSERT_OK_PTR(mptcpify_skel, "mptcpify__open_and_load"))
> +		return -EIO;
> +
> +	prog_fd = bpf_program__fd(mptcpify_skel->progs.mptcpify);
> +	if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd")) {
> +		err = -EIO;
> +		goto out;
> +	}
>  
> -	SYS_NOFAIL("ip netns del " NS_TEST " &> /dev/null");
> +	err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_LSM_CGROUP, 0);
> +	if (!ASSERT_OK(err, "attach alloc_prog_fd")) {
> +		err = -EIO;
> +		goto out;
> +	}
>  
> +	/* without MPTCP */
> +	server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
> +	if (!ASSERT_GE(server_fd, 0, "start_server")) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	client_fd = connect_to_fd(server_fd, 0);
> +	if (!ASSERT_GE(client_fd, 0, "connect to fd")) {
> +		err = -EIO;
> +		goto close_server;
> +	}
> +
> +	send_byte(client_fd);
> +	err += verify_mptcpify();
> +
> +	close(client_fd);
> +close_server:
> +	close(server_fd);
> +out:
> +	mptcpify__destroy(mptcpify_skel);
> +	return err;
> +}
> +
> +static void test_mptcpify(void)
> +{
> +	struct nstoken *nstoken = NULL;
> +	int cgroup_fd;
> +
> +	cgroup_fd = test__join_cgroup("/mptcpify");
> +	if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
> +		return;
> +
> +	nstoken = create_netns();
> +	if (!ASSERT_OK_PTR(nstoken, "create_netns"))
> +		goto fail;
> +
> +	ASSERT_OK(run_mptcpify(cgroup_fd), "run_mptcpify");
> +
> +fail:
> +	cleanup_netns(nstoken);
>  	close(cgroup_fd);
>  }
>  
> @@ -186,4 +294,6 @@ void test_mptcp(void)
>  {
>  	if (test__start_subtest("base"))
>  		test_base();
> +	if (test__start_subtest("mptcpify"))
> +		test_mptcpify();
>  }
> diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c
> new file mode 100644
> index 000000000000..94aef62016fa
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/mptcpify.c
> @@ -0,0 +1,41 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2023, SUSE. */
> +
> +#include <linux/bpf.h>
> +#include <bpf/bpf_helpers.h>
> +#include "bpf_tcp_helpers.h"
> +
> +char _license[] SEC("license") = "GPL";
> +char fmt[] = "tcp=%u\n";
> +
> +#define	AF_INET		2
> +#define	AF_INET6	10
> +#define	SOCK_STREAM	1
> +#define	IPPROTO_TCP	6
> +#define	IPPROTO_MPTCP	262
> +
> +static __always_inline bool is_tcp(int family, int type, int protocol)
> +{
> +	if ((family == AF_INET || family == AF_INET6) &&
> +	    type == SOCK_STREAM &&
> +	    (!protocol || protocol == IPPROTO_TCP))
> +		return true;
> +
> +	return false;
> +}
> +
> +SEC("lsm_cgroup/socket_create")
> +int BPF_PROG(mptcpify, int family, int type, int *protocol, int kern)
> +{
> +	bool tcp;
> +
> +	if (kern)
> +		goto out;
> +
> +	tcp = is_tcp(family, type, *protocol);
> +	bpf_trace_printk(fmt, sizeof(fmt), tcp);
> +	if (tcp)
> +		*protocol = IPPROTO_MPTCP;
> +out:
> +	return 1;
> +}
> -- 
> 2.35.3
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC bpf-next 0/8] BPF 'force to MPTCP'
  2023-07-13 17:14     ` Stanislav Fomichev
@ 2023-07-13 22:46       ` Alexei Starovoitov
  2023-07-13 23:09         ` Stanislav Fomichev
  0 siblings, 1 reply; 15+ messages in thread
From: Alexei Starovoitov @ 2023-07-13 22:46 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Geliang Tang, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Hao Luo, Jiri Olsa, bpf, MPTCP Upstream

On Thu, Jul 13, 2023 at 10:14 AM Stanislav Fomichev <sdf@google.com> wrote:
>
> On 07/13, Geliang Tang wrote:
> > On Thu, Jul 06, 2023 at 10:02:43AM -0700, Stanislav Fomichev wrote:
> > > On 07/06, Geliang Tang wrote:
> > > > As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:
> > > >
> > > > "Your app can create sockets with IPPROTO_MPTCP as the proto:
> > > > ( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
> > > > forced to create and use MPTCP sockets instead of TCP ones via the
> > > > mptcpize command bundled with the mptcpd daemon."
> > > >
> > > > But the mptcpize (LD_PRELOAD technique) command has some limitations
> > > > [2]:
> > > >
> > > >  - it doesn't work if the application is not using libc (e.g. GoLang
> > > > apps)
> > > >  - in some envs, it might not be easy to set env vars / change the way
> > > > apps are launched, e.g. on Android
> > > >  - mptcpize needs to be launched with all apps that want MPTCP: we could
> > > > have more control from BPF to enable MPTCP only for some apps or all the
> > > > ones of a netns or a cgroup, etc.
> > > >  - it is not in BPF, we cannot talk about it at netdev conf.
> > > >
> > > > So this patchset attempts to use BPF to implement functions similer to
> > > > mptcpize.
> > > >
> > > > The main idea is add a hook in sys_socket() to change the protocol id
> > > > from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.
> > > >
> > > > 1. This first solution [3] is using "cgroup/sock_create":
> > > >
> > > > Implement a new helper bpf_mptcpify() to change the protocol id:
> > > >
> > > > +BPF_CALL_1(bpf_mptcpify, struct sock *, sk)
> > > > +{
> > > > + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) {
> > > > +         sk->sk_protocol = IPPROTO_MPTCP;
> > > > +         return (unsigned long)sk;
> > > > + }
> > > > +
> > > > + return (unsigned long)NULL;
> > > > +}
> > > > +
> > > > +const struct bpf_func_proto bpf_mptcpify_proto = {
> > > > + .func           = bpf_mptcpify,
> > > > + .gpl_only       = false,
> > > > + .ret_type       = RET_PTR_TO_BTF_ID_OR_NULL,
> > > > + .ret_btf_id     = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
> > > > + .arg1_type      = ARG_PTR_TO_CTX,
> > > > +};
> > > >
> > > > Add a hook in "cgroup/sock_create" section, invoking bpf_mptcpify()
> > > > helper in this hook:
> > > >
> > > > +SEC("cgroup/sock_create")
> > > > +int sock(struct bpf_sock *ctx)
> > > > +{
> > > > + struct sock *sk;
> > > > +
> > > > + if (ctx->type != SOCK_STREAM)
> > > > +         return 1;
> > > > +
> > > > + sk = bpf_mptcpify(ctx);
> > > > + if (!sk)
> > > > +         return 1;
> > > > +
> > > > + protocol = sk->sk_protocol;
> > > > + return 1;
> > > > +}
> > > >
> > > > But this solution doesn't work, because the sock_create section is
> > > > hooked at the end of inet_create(). It's too late to change the protocol
> > > > id.
> > > >
> > > > 2. The second solution [4] is using "fentry":
> > > >
> > > > Implement a bpf_mptcpify() helper:
> > > >
> > > > +BPF_CALL_1(bpf_mptcpify, struct socket_args *, args)
> > > > +{
> > > > + if (args->family == AF_INET &&
> > > > +     args->type == SOCK_STREAM &&
> > > > +     (!args->protocol || args->protocol == IPPROTO_TCP))
> > > > +         args->protocol = IPPROTO_MPTCP;
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +BTF_ID_LIST(bpf_mptcpify_btf_ids)
> > > > +BTF_ID(struct, socket_args)
> > > > +
> > > > +static const struct bpf_func_proto bpf_mptcpify_proto = {
> > > > + .func           = bpf_mptcpify,
> > > > + .gpl_only       = false,
> > > > + .ret_type       = RET_INTEGER,
> > > > + .arg1_type      = ARG_PTR_TO_BTF_ID,
> > > > + .arg1_btf_id    = &bpf_mptcpify_btf_ids[0],
> > > > +};
> > > >
> > > > Add a new wrapper socket_create() in sys_socket() path, passing a
> > > > pointer of struct socket_args (int family; int type; int protocol) to
> > > > the wrapper.
> > > >
> > > > +int socket_create(struct socket_args *args, struct socket **res)
> > > > +{
> > > > + return sock_create(args->family, args->type, args->protocol, res);
> > > > +}
> > > > +EXPORT_SYMBOL(socket_create);
> > > > +
> > > >  /**
> > > >   *       sock_create_kern - creates a socket (kernel space)
> > > >   *       @net: net namespace
> > > > @@ -1608,6 +1614,7 @@  EXPORT_SYMBOL(sock_create_kern);
> > > >
> > > >  static struct socket *__sys_socket_create(int family, int type, int protocol)
> > > >  {
> > > > + struct socket_args args = { 0 };
> > > >   struct socket *sock;
> > > >   int retval;
> > > >
> > > > @@ -1621,7 +1628,10 @@  static struct socket *__sys_socket_create(int family, int type, int protocol)
> > > >           return ERR_PTR(-EINVAL);
> > > >   type &= SOCK_TYPE_MASK;
> > > >
> > > > - retval = sock_create(family, type, protocol, &sock);
> > > > + args.family = family;
> > > > + args.type = type;
> > > > + args.protocol = protocol;
> > > > + retval = socket_create(&args, &sock);
> > > >   if (retval < 0)
> > > >           return ERR_PTR(retval);
> > > >
> > > > Add "fentry" hook to the newly added wrapper, invoking bpf_mptcpify()
> > > > in the hook:
> > > >
> > > > +SEC("fentry/socket_create")
> > > > +int BPF_PROG(trace_socket_create, void *args,
> > > > +         struct socket **res)
> > > > +{
> > > > + bpf_mptcpify(args);
> > > > + return 0;
> > > > +}
> > > >
> > > > This version works. But it's just a work around version. Since the code
> > > > to add a wrapper to sys_socket() is not very elegant indeed, and it
> > > > shouldn't be accepted by upstream.
> > > >
> > > > 3. The last solution is this patchset itself:
> > > >
> > > > Introduce new program type BPF_PROG_TYPE_CGROUP_SOCKINIT and attach type
> > > > BPF_CGROUP_SOCKINIT on cgroup basis.
> > > >
> > > > Define BPF_CGROUP_RUN_PROG_SOCKINIT() helper, and implement
> > > > __cgroup_bpf_run_sockinit() helper to run a sockinit program:
> > > >
> > > > +#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol)                    \
> > > > +({                                                                              \
> > > > + int __ret = 0;                                                         \
> > > > + if (cgroup_bpf_enabled(CGROUP_SOCKINIT))                               \
> > > > +         __ret = __cgroup_bpf_run_sockinit(family, type, protocol,      \
> > > > +                                           CGROUP_SOCKINIT);            \
> > > > + __ret;                                                                 \
> > > > +})
> > > > +
> > > >
> > > > Invoke BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create() to change
> > > > the arguments:
> > > >
> > > > @@ -1469,6 +1469,12 @@  int __sock_create(struct net *net, int family, int type, int protocol,
> > > >   struct socket *sock;
> > > >   const struct net_proto_family *pf;
> > > >
> > > > + if (!kern) {
> > > > +         err = BPF_CGROUP_RUN_PROG_SOCKINIT(&family, &type, &protocol);
> > > > +         if (err)
> > > > +                 return err;
> > > > + }
> > > > +
> > > >   /*
> > > >    *      Check protocol is in range
> > > >    */
> > > >
> > > > Define BPF program in this newly added 'sockinit' SEC, so it will be
> > > > hooked in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().
> > > >
> > > > @@ -158,6 +158,11 @@  static struct sec_name_test tests[] = {
> > > >           {0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
> > > >           {0, BPF_CGROUP_SETSOCKOPT},
> > > >   },
> > > > + {
> > > > +         "cgroup/sockinit",
> > > > +         {0, BPF_PROG_TYPE_CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT},
> > > > +         {0, BPF_CGROUP_SOCKINIT},
> > > > + },
> > > >  };
> > > >
> > > > +SEC("cgroup/sockinit")
> > > > +int mptcpify(struct bpf_sockinit_ctx *ctx)
> > > > +{
> > > > + if ((ctx->family == AF_INET || ctx->family == AF_INET6) &&
> > > > +     ctx->type == SOCK_STREAM &&
> > > > +     (!ctx->protocol || ctx->protocol == IPPROTO_TCP)) {
> > > > +         ctx->protocol = IPPROTO_MPTCP;
> > > > + }
> > > > +
> > > > + return 1;
> > > > +}
> > > >
> > > > This version is the best solution we have found so far.
> > > >
> > > > [1]
> > > > https://github.com/multipath-tcp/mptcp_net-next/wiki
> > > > [2]
> > > > https://github.com/multipath-tcp/mptcp_net-next/issues/79
> > > > [3]
> > > > https://patchwork.kernel.org/project/mptcp/cover/cover.1688215769.git.geliang.tang@suse.com/
> > > > [4]
> > > > https://patchwork.kernel.org/project/mptcp/cover/cover.1688366249.git.geliang.tang@suse.com/
> > >
> > > cgroup/sock_create being weird and triggering late and only for af_inet4/6
> > > was the reason we added ability to attach to lsm hooks on a per-cgroup
> > > basis:
> > > https://lore.kernel.org/bpf/20220628174314.1216643-1-sdf@google.com/
> > >
> > > Unfortunately, using it here won't help either :-( The following
> > > hook triggers early enough but doesn't allow changing the arguments (I was
> > > interested only in filtering based on the arguments, not changing them):
> > >
> > > LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
> > >
> > > So maybe another alternative might be to change its definition to int *family,
> > > int *type, int *protocol and use lsm_cgroup/socket_create to rewrite the
> >
> > Thanks Stanislav, this lsm_cgroup/socket_create works. The test patch
> > is attached.
> >
> > > protocol? (some verifier changes might needed to make those writable)
> >
> > But I got some verification errors here:
> >
> >    invalid mem access 'scalar'.
> >
> > I tried many times but couldn't solve it, so I simply skipped the
> > verifier in the test patch (I marked TODO in front of this code).
> > Could you please give me some suggestions for verification?
>
> Right, that's the core issue here, to add support to the verifier
> to let it write into scalar pointer arguments. I don't have a good suggestion,
> but we've discussed it multiple times elsewhere that maybe we
> should do it :-)
>
> Can we use some new btf_type_tag("allow_write") to annotate places
> where we know that it's safe to write to them? Then we can extend
> the verifier to check for this condition hopefully. Maybe other BPF
> folks have better suggestions here?
>
> We should also CC LSM people to make sure it's not a no-no to
> change LSM_HOOK(s) in a way to allow changing the arguments. I'm
> not sure how rigid those definitions are :-(

imo all 3 options including this 4th one are too hacky.
I understand ld_preload limitations and desire to have it per-cgroup,
but messing this much with user space feels a little bit too much.
What side effects will it cause?
Meaning is this enough to just change the proto?
Nothing in user space later on needs to be aware the protocol is so different?
I feel the consequences are too drastic to support such thing
through an official/stable hook.
We can consider an fmod_ret unstable hook somewhere in the kernel
that bpf prog can attach to and tweak the ret value and/or args,
but the production environment won't be using it.
It will be a temporary gap until user space is properly converted to mptcp.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC bpf-next 0/8] BPF 'force to MPTCP'
  2023-07-13 22:46       ` Alexei Starovoitov
@ 2023-07-13 23:09         ` Stanislav Fomichev
  2023-07-14  7:56           ` Paolo Abeni
  0 siblings, 1 reply; 15+ messages in thread
From: Stanislav Fomichev @ 2023-07-13 23:09 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Geliang Tang, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Hao Luo, Jiri Olsa, bpf, MPTCP Upstream, netdev

On 07/13, Alexei Starovoitov wrote:
> On Thu, Jul 13, 2023 at 10:14 AM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > On 07/13, Geliang Tang wrote:
> > > On Thu, Jul 06, 2023 at 10:02:43AM -0700, Stanislav Fomichev wrote:
> > > > On 07/06, Geliang Tang wrote:
> > > > > As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:
> > > > >
> > > > > "Your app can create sockets with IPPROTO_MPTCP as the proto:
> > > > > ( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
> > > > > forced to create and use MPTCP sockets instead of TCP ones via the
> > > > > mptcpize command bundled with the mptcpd daemon."
> > > > >
> > > > > But the mptcpize (LD_PRELOAD technique) command has some limitations
> > > > > [2]:
> > > > >
> > > > >  - it doesn't work if the application is not using libc (e.g. GoLang
> > > > > apps)
> > > > >  - in some envs, it might not be easy to set env vars / change the way
> > > > > apps are launched, e.g. on Android
> > > > >  - mptcpize needs to be launched with all apps that want MPTCP: we could
> > > > > have more control from BPF to enable MPTCP only for some apps or all the
> > > > > ones of a netns or a cgroup, etc.
> > > > >  - it is not in BPF, we cannot talk about it at netdev conf.
> > > > >
> > > > > So this patchset attempts to use BPF to implement functions similer to
> > > > > mptcpize.
> > > > >
> > > > > The main idea is add a hook in sys_socket() to change the protocol id
> > > > > from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.
> > > > >
> > > > > 1. This first solution [3] is using "cgroup/sock_create":
> > > > >
> > > > > Implement a new helper bpf_mptcpify() to change the protocol id:
> > > > >
> > > > > +BPF_CALL_1(bpf_mptcpify, struct sock *, sk)
> > > > > +{
> > > > > + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) {
> > > > > +         sk->sk_protocol = IPPROTO_MPTCP;
> > > > > +         return (unsigned long)sk;
> > > > > + }
> > > > > +
> > > > > + return (unsigned long)NULL;
> > > > > +}
> > > > > +
> > > > > +const struct bpf_func_proto bpf_mptcpify_proto = {
> > > > > + .func           = bpf_mptcpify,
> > > > > + .gpl_only       = false,
> > > > > + .ret_type       = RET_PTR_TO_BTF_ID_OR_NULL,
> > > > > + .ret_btf_id     = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
> > > > > + .arg1_type      = ARG_PTR_TO_CTX,
> > > > > +};
> > > > >
> > > > > Add a hook in "cgroup/sock_create" section, invoking bpf_mptcpify()
> > > > > helper in this hook:
> > > > >
> > > > > +SEC("cgroup/sock_create")
> > > > > +int sock(struct bpf_sock *ctx)
> > > > > +{
> > > > > + struct sock *sk;
> > > > > +
> > > > > + if (ctx->type != SOCK_STREAM)
> > > > > +         return 1;
> > > > > +
> > > > > + sk = bpf_mptcpify(ctx);
> > > > > + if (!sk)
> > > > > +         return 1;
> > > > > +
> > > > > + protocol = sk->sk_protocol;
> > > > > + return 1;
> > > > > +}
> > > > >
> > > > > But this solution doesn't work, because the sock_create section is
> > > > > hooked at the end of inet_create(). It's too late to change the protocol
> > > > > id.
> > > > >
> > > > > 2. The second solution [4] is using "fentry":
> > > > >
> > > > > Implement a bpf_mptcpify() helper:
> > > > >
> > > > > +BPF_CALL_1(bpf_mptcpify, struct socket_args *, args)
> > > > > +{
> > > > > + if (args->family == AF_INET &&
> > > > > +     args->type == SOCK_STREAM &&
> > > > > +     (!args->protocol || args->protocol == IPPROTO_TCP))
> > > > > +         args->protocol = IPPROTO_MPTCP;
> > > > > +
> > > > > + return 0;
> > > > > +}
> > > > > +
> > > > > +BTF_ID_LIST(bpf_mptcpify_btf_ids)
> > > > > +BTF_ID(struct, socket_args)
> > > > > +
> > > > > +static const struct bpf_func_proto bpf_mptcpify_proto = {
> > > > > + .func           = bpf_mptcpify,
> > > > > + .gpl_only       = false,
> > > > > + .ret_type       = RET_INTEGER,
> > > > > + .arg1_type      = ARG_PTR_TO_BTF_ID,
> > > > > + .arg1_btf_id    = &bpf_mptcpify_btf_ids[0],
> > > > > +};
> > > > >
> > > > > Add a new wrapper socket_create() in sys_socket() path, passing a
> > > > > pointer of struct socket_args (int family; int type; int protocol) to
> > > > > the wrapper.
> > > > >
> > > > > +int socket_create(struct socket_args *args, struct socket **res)
> > > > > +{
> > > > > + return sock_create(args->family, args->type, args->protocol, res);
> > > > > +}
> > > > > +EXPORT_SYMBOL(socket_create);
> > > > > +
> > > > >  /**
> > > > >   *       sock_create_kern - creates a socket (kernel space)
> > > > >   *       @net: net namespace
> > > > > @@ -1608,6 +1614,7 @@  EXPORT_SYMBOL(sock_create_kern);
> > > > >
> > > > >  static struct socket *__sys_socket_create(int family, int type, int protocol)
> > > > >  {
> > > > > + struct socket_args args = { 0 };
> > > > >   struct socket *sock;
> > > > >   int retval;
> > > > >
> > > > > @@ -1621,7 +1628,10 @@  static struct socket *__sys_socket_create(int family, int type, int protocol)
> > > > >           return ERR_PTR(-EINVAL);
> > > > >   type &= SOCK_TYPE_MASK;
> > > > >
> > > > > - retval = sock_create(family, type, protocol, &sock);
> > > > > + args.family = family;
> > > > > + args.type = type;
> > > > > + args.protocol = protocol;
> > > > > + retval = socket_create(&args, &sock);
> > > > >   if (retval < 0)
> > > > >           return ERR_PTR(retval);
> > > > >
> > > > > Add "fentry" hook to the newly added wrapper, invoking bpf_mptcpify()
> > > > > in the hook:
> > > > >
> > > > > +SEC("fentry/socket_create")
> > > > > +int BPF_PROG(trace_socket_create, void *args,
> > > > > +         struct socket **res)
> > > > > +{
> > > > > + bpf_mptcpify(args);
> > > > > + return 0;
> > > > > +}
> > > > >
> > > > > This version works. But it's just a work around version. Since the code
> > > > > to add a wrapper to sys_socket() is not very elegant indeed, and it
> > > > > shouldn't be accepted by upstream.
> > > > >
> > > > > 3. The last solution is this patchset itself:
> > > > >
> > > > > Introduce new program type BPF_PROG_TYPE_CGROUP_SOCKINIT and attach type
> > > > > BPF_CGROUP_SOCKINIT on cgroup basis.
> > > > >
> > > > > Define BPF_CGROUP_RUN_PROG_SOCKINIT() helper, and implement
> > > > > __cgroup_bpf_run_sockinit() helper to run a sockinit program:
> > > > >
> > > > > +#define BPF_CGROUP_RUN_PROG_SOCKINIT(family, type, protocol)                    \
> > > > > +({                                                                              \
> > > > > + int __ret = 0;                                                         \
> > > > > + if (cgroup_bpf_enabled(CGROUP_SOCKINIT))                               \
> > > > > +         __ret = __cgroup_bpf_run_sockinit(family, type, protocol,      \
> > > > > +                                           CGROUP_SOCKINIT);            \
> > > > > + __ret;                                                                 \
> > > > > +})
> > > > > +
> > > > >
> > > > > Invoke BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create() to change
> > > > > the arguments:
> > > > >
> > > > > @@ -1469,6 +1469,12 @@  int __sock_create(struct net *net, int family, int type, int protocol,
> > > > >   struct socket *sock;
> > > > >   const struct net_proto_family *pf;
> > > > >
> > > > > + if (!kern) {
> > > > > +         err = BPF_CGROUP_RUN_PROG_SOCKINIT(&family, &type, &protocol);
> > > > > +         if (err)
> > > > > +                 return err;
> > > > > + }
> > > > > +
> > > > >   /*
> > > > >    *      Check protocol is in range
> > > > >    */
> > > > >
> > > > > Define BPF program in this newly added 'sockinit' SEC, so it will be
> > > > > hooked in BPF_CGROUP_RUN_PROG_SOCKINIT() in __socket_create().
> > > > >
> > > > > @@ -158,6 +158,11 @@  static struct sec_name_test tests[] = {
> > > > >           {0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
> > > > >           {0, BPF_CGROUP_SETSOCKOPT},
> > > > >   },
> > > > > + {
> > > > > +         "cgroup/sockinit",
> > > > > +         {0, BPF_PROG_TYPE_CGROUP_SOCKINIT, BPF_CGROUP_SOCKINIT},
> > > > > +         {0, BPF_CGROUP_SOCKINIT},
> > > > > + },
> > > > >  };
> > > > >
> > > > > +SEC("cgroup/sockinit")
> > > > > +int mptcpify(struct bpf_sockinit_ctx *ctx)
> > > > > +{
> > > > > + if ((ctx->family == AF_INET || ctx->family == AF_INET6) &&
> > > > > +     ctx->type == SOCK_STREAM &&
> > > > > +     (!ctx->protocol || ctx->protocol == IPPROTO_TCP)) {
> > > > > +         ctx->protocol = IPPROTO_MPTCP;
> > > > > + }
> > > > > +
> > > > > + return 1;
> > > > > +}
> > > > >
> > > > > This version is the best solution we have found so far.
> > > > >
> > > > > [1]
> > > > > https://github.com/multipath-tcp/mptcp_net-next/wiki
> > > > > [2]
> > > > > https://github.com/multipath-tcp/mptcp_net-next/issues/79
> > > > > [3]
> > > > > https://patchwork.kernel.org/project/mptcp/cover/cover.1688215769.git.geliang.tang@suse.com/
> > > > > [4]
> > > > > https://patchwork.kernel.org/project/mptcp/cover/cover.1688366249.git.geliang.tang@suse.com/
> > > >
> > > > cgroup/sock_create being weird and triggering late and only for af_inet4/6
> > > > was the reason we added ability to attach to lsm hooks on a per-cgroup
> > > > basis:
> > > > https://lore.kernel.org/bpf/20220628174314.1216643-1-sdf@google.com/
> > > >
> > > > Unfortunately, using it here won't help either :-( The following
> > > > hook triggers early enough but doesn't allow changing the arguments (I was
> > > > interested only in filtering based on the arguments, not changing them):
> > > >
> > > > LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
> > > >
> > > > So maybe another alternative might be to change its definition to int *family,
> > > > int *type, int *protocol and use lsm_cgroup/socket_create to rewrite the
> > >
> > > Thanks Stanislav, this lsm_cgroup/socket_create works. The test patch
> > > is attached.
> > >
> > > > protocol? (some verifier changes might needed to make those writable)
> > >
> > > But I got some verification errors here:
> > >
> > >    invalid mem access 'scalar'.
> > >
> > > I tried many times but couldn't solve it, so I simply skipped the
> > > verifier in the test patch (I marked TODO in front of this code).
> > > Could you please give me some suggestions for verification?
> >
> > Right, that's the core issue here, to add support to the verifier
> > to let it write into scalar pointer arguments. I don't have a good suggestion,
> > but we've discussed it multiple times elsewhere that maybe we
> > should do it :-)
> >
> > Can we use some new btf_type_tag("allow_write") to annotate places
> > where we know that it's safe to write to them? Then we can extend
> > the verifier to check for this condition hopefully. Maybe other BPF
> > folks have better suggestions here?
> >
> > We should also CC LSM people to make sure it's not a no-no to
> > change LSM_HOOK(s) in a way to allow changing the arguments. I'm
> > not sure how rigid those definitions are :-(
> 
> imo all 3 options including this 4th one are too hacky.
> I understand ld_preload limitations and desire to have it per-cgroup,
> but messing this much with user space feels a little bit too much.
> What side effects will it cause?

Maybe all that is really needed is some new per-netns sysctl to automatically
upgrade from IPPROTO_TCP to IPPROTO_MPTCP? Or is it too broad of a
brush?

I've also CC'd netdev for visibility...

> Meaning is this enough to just change the proto?
> Nothing in user space later on needs to be aware the protocol is so different?

IIUC, if you use IPPROTO_MPTCP, you just get regular TCP until you start
adding extra routes (via netlink). That's why their current
unconditional IPPROTO_TCP->IPPROTO_MPTCP rewrite via ld_preload also somewhat
works.

> I feel the consequences are too drastic to support such thing
> through an official/stable hook.
> We can consider an fmod_ret unstable hook somewhere in the kernel
> that bpf prog can attach to and tweak the ret value and/or args,
> but the production environment won't be using it.
> It will be a temporary gap until user space is properly converted to mptcp.

Asking every app to do s/IPPROTO_TCP/IPPROTO_MPTCP/ might be annoying
though? (don't have a horse in this race, but have some v4->v6 migration
vibes from this)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC bpf-next 0/8] BPF 'force to MPTCP'
  2023-07-13 23:09         ` Stanislav Fomichev
@ 2023-07-14  7:56           ` Paolo Abeni
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Abeni @ 2023-07-14  7:56 UTC (permalink / raw)
  To: Stanislav Fomichev, Alexei Starovoitov
  Cc: Geliang Tang, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	KP Singh, Hao Luo, Jiri Olsa, bpf, MPTCP Upstream, netdev

On Thu, 2023-07-13 at 16:09 -0700, Stanislav Fomichev wrote:
> On 07/13, Alexei Starovoitov wrote:
> > imo all 3 options including this 4th one are too hacky.
> > I understand ld_preload limitations and desire to have it per-cgroup,
> > but messing this much with user space feels a little bit too much.
> > What side effects will it cause?
> 
> Maybe all that is really needed is some new per-netns sysctl to automatically
> upgrade from IPPROTO_TCP to IPPROTO_MPTCP? Or is it too broad of a
> brush?

I think it would be actually too broad, see below...

> I've also CC'd netdev for visibility...
> 
> > Meaning is this enough to just change the proto?
> > Nothing in user space later on needs to be aware the protocol is so different?
> 
> IIUC, if you use IPPROTO_MPTCP, you just get regular TCP until you start
> adding extra routes (via netlink). That's why their current
> unconditional IPPROTO_TCP->IPPROTO_MPTCP rewrite via ld_preload also somewhat
> works.

FTR, it the other way around: when using IPPROTO_MPTCP you always get
MPTCP protocol handshake that downgrade gracefully to TCP if the peer
does not support it. Then multiple paths can be added/enabled by
different means, but that is another matter - a quite orthogonal one.

The transition to TCP in currently not completely for free: active
(client) MPTCP sockets fallen-back to TCP will keep some overhead vs
plain TCP ones.

Being able to control the IPPROTO_TCP->IPPROTO_MPTCP change on per
socket basis do offer some advantages e.g. constraining the change to
the sockets that are likely to complete successfully the MPTCP
handshake. 

> > I feel the consequences are too drastic to support such thing
> > through an official/stable hook.
> > We can consider an fmod_ret unstable hook somewhere in the kernel
> > that bpf prog can attach to and tweak the ret value and/or args,
> > but the production environment won't be using it.
> > It will be a temporary gap until user space is properly converted to mptcp.
> 
> Asking every app to do s/IPPROTO_TCP/IPPROTO_MPTCP/ might be annoying
> though? (don't have a horse in this race, but have some v4->v6 migration
> vibes from this)

I can do only wild guesses, but I also expect such "transition" to be
extremely long and/or incomplete.

Cheers,

Paolo


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2023-07-14  7:57 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-07-06  4:08 [RFC bpf-next 0/8] BPF 'force to MPTCP' Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 1/8] bpf: Add new prog type sockinit Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 2/8] bpf: Run a sockinit program Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 3/8] net: socket: run sockinit hooks Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 4/8] libbpf: Support sockinit hook Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 5/8] selftests/bpf: Add mptcpify program Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 6/8] selftests/bpf: use random netns name for mptcp Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 7/8] selftests/bpf: add two mptcp netns helpers Geliang Tang
2023-07-06  4:08 ` [RFC bpf-next 8/8] selftests/bpf: Add mptcpify selftest Geliang Tang
2023-07-06 17:02 ` [RFC bpf-next 0/8] BPF 'force to MPTCP' Stanislav Fomichev
2023-07-13  5:47   ` Geliang Tang
2023-07-13 17:14     ` Stanislav Fomichev
2023-07-13 22:46       ` Alexei Starovoitov
2023-07-13 23:09         ` Stanislav Fomichev
2023-07-14  7:56           ` Paolo Abeni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.