From: David Carlier <devnexen@gmail.com>
To: mptcp@lists.linux.dev
Cc: matttbe@kernel.org, martineau@kernel.org, geliang@kernel.org,
pabeni@redhat.com, David Carlier <devnexen@gmail.com>
Subject: [PATCH mptcp-next v7 3/4] mptcp: support MSG_ERRQUEUE on the parent socket
Date: Sat, 9 May 2026 22:16:46 +0100 [thread overview]
Message-ID: <20260509211651.104934-4-devnexen@gmail.com> (raw)
In-Reply-To: <20260509211651.104934-1-devnexen@gmail.com>
Splice pending err skbs from each subflow's error queue onto the parent
msk's error queue at error-report time, so poll() and recvmsg(MSG_ERRQUEUE)
on the parent socket observe TX timestamps and MSG_ZEROCOPY completion
notifications through the standard inet ABI.
The splice filters by SO_EE_ORIGIN: TIMESTAMPING / ZEROCOPY / LOCAL
events forward to the parent because they are tied to user-handed data,
not to a specific path; subflow-level ICMP errors are dropped because
the legacy RECVERR ABI cannot meaningfully convey their per-subflow peer
identity to single-path-aware userspace. Such events will be carried by
a future MPTCP_RECERR channel.
mptcp_recv_error() retries the splice on the pull side: if
sock_queue_err_skb() previously failed under rmem pressure, the skb
stays on the subflow queue, and the next recvmsg(MSG_ERRQUEUE) splices
it once the parent's queue has been drained.
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David Carlier <devnexen@gmail.com>
---
net/mptcp/protocol.c | 92 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 86 insertions(+), 6 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 93e7a42fc65c..53abb8dc2c0f 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -11,6 +11,7 @@
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
+#include <linux/errqueue.h>
#include <net/aligned_data.h>
#include <net/rps.h>
#include <net/sock.h>
@@ -815,21 +816,52 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
return moved;
}
+static bool mptcp_errqueue_skb_forwardable(const struct sk_buff *skb)
+{
+ u8 origin = SKB_EXT_ERR(skb)->ee.ee_origin; /* SO_EE_ORIGIN_* recorded on this err skb */
+
+ return origin == SO_EE_ORIGIN_TIMESTAMPING || /* TX timestamp */
+ origin == SO_EE_ORIGIN_ZEROCOPY || /* MSG_ZEROCOPY completion */
+ origin == SO_EE_ORIGIN_LOCAL; /* every other origin is not forwarded to the parent */
+}
+
+static bool __mptcp_subflow_splice_errqueue(struct sock *sk, struct sock *ssk)
+{
+ struct sk_buff *skb;
+ bool moved = false; /* true once at least one skb was queued on sk */
+
+ while ((skb = skb_dequeue(&ssk->sk_error_queue))) {
+ if (!mptcp_errqueue_skb_forwardable(skb)) {
+ kfree_skb(skb); /* path-specific (ICMP): dropped, left to a future MPTCP_RECERR */
+ continue;
+ }
+ if (sock_queue_err_skb(sk, skb)) {
+ skb_queue_head(&ssk->sk_error_queue, skb); /* put back at the head; mptcp_recv_error() retries */
+ break;
+ }
+ moved = true;
+ }
+
+ return moved;
+}
+
static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
{
int ssk_state;
+ bool report;
int err;
+ report = __mptcp_subflow_splice_errqueue(sk, ssk);
+
/* only propagate errors on fallen-back sockets or
* on MPC connect
*/
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
- return false;
+ goto out;
err = sock_error(ssk);
if (!err)
- return false;
-
+ goto out;
/* We need to propagate only transition to CLOSE state.
* Orphaned socket will see such state change via
* subflow_sched_work_if_closed() and that path will properly
@@ -839,6 +871,11 @@ static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
mptcp_set_state(sk, ssk_state);
WRITE_ONCE(sk->sk_err, -err);
+ report = true;
+
+out:
+ if (!report)
+ return false;
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
smp_wmb();
@@ -2286,6 +2323,35 @@ static unsigned int mptcp_inq_hint(const struct sock *sk)
return 0;
}
+static int mptcp_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ bool moved = false; /* true if any subflow skb reached the parent queue below */
+ int ret;
+
+ /* Drain the parent first: a previous splice may have failed under
+ * rmem pressure and the skb stayed on a subflow. Freeing space here
+ * lets the splice below succeed; sock_queue_err_skb() then re-asserts
+ * EPOLLERR so userspace knows to drain again on the next poll.
+ */
+ ret = inet_recv_error(sk, msg, len);
+
+ lock_sock(sk); /* parent socket lock is held across the subflow walk */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!skb_queue_empty_lockless(&ssk->sk_error_queue)) /* skip subflows with nothing pending */
+ moved |= __mptcp_subflow_splice_errqueue(sk, ssk);
+ }
+ release_sock(sk);
+
+ if (ret == -EAGAIN && moved) /* first read returned -EAGAIN but the splice moved skbs: retry once */
+ ret = inet_recv_error(sk, msg, len);
+
+ return ret; /* result of the first read, or of the retry when one was made */
+}
+
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int flags)
{
@@ -2295,9 +2361,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int target;
long timeo;
- /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len);
+ return mptcp_recv_error(sk, msg, len);
lock_sock(sk);
if (unlikely(sk->sk_state == TCP_LISTEN)) {
@@ -4298,6 +4363,19 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
return 0;
}
+static bool mptcp_subflow_errqueue_pending(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ mptcp_for_each_subflow(msk, subflow) { /* NOTE(review): called from mptcp_poll(); confirm this list walk is safe there */
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!skb_queue_empty_lockless(&ssk->sk_error_queue)) /* peek only, nothing is dequeued */
+ return true; /* NOTE(review): also true for skbs the splice would drop (non-forwardable origin) */
+ }
+ return false; /* no subflow has a queued err skb */
+}
+
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
@@ -4341,7 +4419,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
- if (READ_ONCE(sk->sk_err))
+ if (READ_ONCE(sk->sk_err) ||
+ !skb_queue_empty_lockless(&sk->sk_error_queue) ||
+ mptcp_subflow_errqueue_pending(msk))
mask |= EPOLLERR;
return mask;
--
2.53.0
next prev parent reply other threads:[~2026-05-09 21:17 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-09 21:16 [PATCH mptcp-next v7 0/4] mptcp: MSG_ERRQUEUE support on the parent socket David Carlier
2026-05-09 21:16 ` [PATCH mptcp-next v7 1/4] mptcp: sockopt: factor inet_flags propagation into a mask David Carlier
2026-05-09 21:16 ` [PATCH mptcp-next v7 2/4] mptcp: propagate RECVERR sockopts to subflows David Carlier
2026-05-09 21:16 ` David Carlier [this message]
2026-05-09 21:16 ` [PATCH mptcp-next v7 4/4] selftests: mptcp: cover IP_RECVERR sockopt propagation David Carlier
2026-05-09 22:24 ` [PATCH mptcp-next v7 0/4] mptcp: MSG_ERRQUEUE support on the parent socket MPTCP CI
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260509211651.104934-4-devnexen@gmail.com \
--to=devnexen@gmail.com \
--cc=geliang@kernel.org \
--cc=martineau@kernel.org \
--cc=matttbe@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.