From: David Carlier <devnexen@gmail.com>
To: mptcp@lists.linux.dev
Cc: matttbe@kernel.org, martineau@kernel.org, geliang@kernel.org,
pabeni@redhat.com, David Carlier <devnexen@gmail.com>
Subject: [PATCH mptcp-next v10 3/4] mptcp: support MSG_ERRQUEUE on the parent socket
Date: Fri, 29 May 2026 18:45:21 +0100 [thread overview]
Message-ID: <20260529174524.260199-4-devnexen@gmail.com> (raw)
In-Reply-To: <20260529174524.260199-1-devnexen@gmail.com>
Splice pending error skbs from each subflow's error queue onto the parent
msk's error queue at error-report time, so that poll() and
recvmsg(MSG_ERRQUEUE) on the parent socket observe TX timestamps and
MSG_ZEROCOPY completion notifications through the standard inet ABI.
The splice filters by SO_EE_ORIGIN: TIMESTAMPING / ZEROCOPY / LOCAL
events forward to the parent because they are tied to user-handed data,
not to a specific path; subflow-level ICMP errors are dropped because
the legacy RECVERR ABI cannot meaningfully convey their per-subflow peer
identity to single-path-aware userspace. Such events will be carried by
a future MPTCP_RECERR channel.
MSG_ZEROCOPY completions carry an unpin/free obligation for userspace, so
they are queued to the parent unconditionally and are never dropped,
mirroring TCP's __msg_zerocopy_callback(), which likewise bypasses
sk_rcvbuf. Timestamping and local events instead go through
sock_queue_err_skb() and are dropped when the parent's receive budget is
exhausted (sk_rmem_alloc + truesize >= sk_rcvbuf), matching TCP's
sk_rcvbuf-gated TX-timestamp path and ip_icmp_error() /
ipv6_icmp_error(). The MSG_ERRQUEUE branch of mptcp_recvmsg() forwards to
inet_recv_error() directly, and poll() advertises EPOLLERR based solely on
the parent's sk_err / sk_error_queue, matching tcp_poll().
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David Carlier <devnexen@gmail.com>
---
net/mptcp/protocol.c | 68 ++++++++++++++++++++++++++++++++++++++------
1 file changed, 59 insertions(+), 9 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1d67728d4233..64c0e841b9b7 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -11,6 +11,7 @@
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
+#include <linux/errqueue.h>
#include <net/aligned_data.h>
#include <net/rps.h>
#include <net/sock.h>
@@ -829,21 +830,66 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
return moved;
}
+/* Decide whether an error-queue skb taken from a subflow may be handed
+ * over to the parent msk.
+ *
+ * The decision is based only on the SO_EE_ORIGIN stored in the skb's
+ * extended error control block: TIMESTAMPING, ZEROCOPY and LOCAL are
+ * accepted, every other origin is rejected.
+ */
+static bool mptcp_errqueue_skb_forwardable(const struct sk_buff *skb)
+{
+	switch (SKB_EXT_ERR(skb)->ee.ee_origin) {
+	case SO_EE_ORIGIN_TIMESTAMPING:
+	case SO_EE_ORIGIN_ZEROCOPY:
+	case SO_EE_ORIGIN_LOCAL:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Drain @ssk's error queue and move the forwardable entries to @sk.
+ *
+ * Every skb is dequeued from the subflow; those rejected by
+ * mptcp_errqueue_skb_forwardable() are freed, the others are queued on
+ * the parent's sk_error_queue.
+ *
+ * Returns true if at least one skb was placed on @sk's error queue.
+ */
+static bool __mptcp_subflow_splice_errqueue(struct sock *sk, struct sock *ssk)
+{
+	struct sk_buff_head *src = &ssk->sk_error_queue;
+	bool spliced = false;
+	struct sk_buff *eskb;
+
+	for (eskb = skb_dequeue(src); eskb; eskb = skb_dequeue(src)) {
+		if (!mptcp_errqueue_skb_forwardable(eskb)) {
+			/* subflow-specific origin (e.g. ICMP): not forwarded */
+			kfree_skb(eskb);
+			continue;
+		}
+
+		if (SKB_EXT_ERR(eskb)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+			/* Timestamping and local events use the regular
+			 * helper; when it refuses the skb (non-zero return)
+			 * the event is discarded.
+			 */
+			if (sock_queue_err_skb(sk, eskb) == 0)
+				spliced = true;
+			else
+				kfree_skb(eskb);
+			continue;
+		}
+
+		/* Zerocopy completions tell userspace it may release its
+		 * buffers, so they are appended to the parent queue
+		 * directly instead of going through sock_queue_err_skb(),
+		 * which may refuse the skb.
+		 *
+		 * NOTE(review): the skb ends up with ->sk set but no
+		 * destructor after skb_orphan(); confirm no later path
+		 * depends on the two being paired.
+		 */
+		skb_orphan(eskb);
+		eskb->sk = sk;
+		skb_queue_tail(&sk->sk_error_queue, eskb);
+		spliced = true;
+	}
+
+	return spliced;
+}
+
static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
{
+ bool propagated = false;
int ssk_state;
+ bool report;
int err;
+ report = __mptcp_subflow_splice_errqueue(sk, ssk);
+
/* only propagate errors on fallen-back sockets or
* on MPC connect
*/
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
- return false;
+ goto out;
err = sock_error(ssk);
if (!err)
- return false;
-
+ goto out;
/* We need to propagate only transition to CLOSE state.
* Orphaned socket will see such state change via
* subflow_sched_work_if_closed() and that path will properly
@@ -853,11 +899,15 @@ static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
mptcp_set_state(sk, ssk_state);
WRITE_ONCE(sk->sk_err, -err);
+ report = propagated = true;
- /* This barrier is coupled with smp_rmb() in mptcp_poll() */
- smp_wmb();
- sk_error_report(sk);
- return true;
+out:
+ if (report) {
+ /* This barrier is coupled with smp_rmb() in mptcp_poll() */
+ smp_wmb();
+ sk_error_report(sk);
+ }
+ return propagated;
}
void __mptcp_error_report(struct sock *sk)
@@ -2313,7 +2363,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int target;
long timeo;
- /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len);
@@ -4363,7 +4412,8 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
- if (READ_ONCE(sk->sk_err))
+ if (READ_ONCE(sk->sk_err) ||
+ !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR;
return mask;
--
2.53.0
next prev parent reply other threads:[~2026-05-29 17:45 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-29 17:45 [PATCH mptcp-next v10 0/4] mptcp: MSG_ERRQUEUE support on the parent socket David Carlier
2026-05-29 17:45 ` [PATCH mptcp-next v10 1/4] mptcp: sockopt: factor inet_flags propagation into a mask David Carlier
2026-05-29 17:45 ` [PATCH mptcp-next v10 2/4] mptcp: propagate RECVERR sockopts to subflows David Carlier
2026-05-29 17:45 ` David Carlier [this message]
2026-05-29 17:45 ` [PATCH mptcp-next v10 4/4] selftests: mptcp: cover IP_RECVERR sockopt propagation David Carlier
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260529174524.260199-4-devnexen@gmail.com \
--to=devnexen@gmail.com \
--cc=geliang@kernel.org \
--cc=martineau@kernel.org \
--cc=matttbe@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox