From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: stable@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
patches@lists.linux.dev, Mat Martineau <martineau@kernel.org>,
Paolo Abeni <pabeni@redhat.com>, Jakub Kicinski <kuba@kernel.org>,
Sasha Levin <sashal@kernel.org>
Subject: [PATCH 6.6 28/49] mptcp: refactor sndbuf auto-tuning
Date: Wed, 3 Jan 2024 17:55:48 +0100 [thread overview]
Message-ID: <20240103164839.340039748@linuxfoundation.org> (raw)
In-Reply-To: <20240103164834.970234661@linuxfoundation.org>
6.6-stable review patch. If anyone has any objections, please let me know.
------------------
From: Paolo Abeni <pabeni@redhat.com>
[ Upstream commit 8005184fd1ca6aeb3fea36f4eb9463fc1b90c114 ]
The MPTCP protocol accounts for the data enqueued on all the subflows
to the main socket send buffer, while the send buffer auto-tuning
algorithm sets the main socket send buffer size as the max size among
the subflows.
That causes bad performance when at least one subflow is sndbuf
limited, e.g. due to very high latency, as the MPTCP scheduler can't
even fill such a buffer.
Change the send-buffer auto-tuning algorithm to compute the main socket
send buffer size as the sum of all the subflows' buffer sizes.
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <martineau@kernel.org>
Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-9-9dc60939d371@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Stable-dep-of: 4fd19a307016 ("mptcp: fix inconsistent state on fastopen race")
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
net/mptcp/protocol.c | 18 +++++++++++++--
net/mptcp/protocol.h | 54 ++++++++++++++++++++++++++++++++++++++++----
net/mptcp/sockopt.c | 5 +++-
net/mptcp/subflow.c | 3 +--
4 files changed, 70 insertions(+), 10 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c1527f520dce3..44499e49d76e6 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -893,6 +893,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
mptcp_sockopt_sync_locked(msk, ssk);
mptcp_subflow_joined(msk, ssk);
mptcp_stop_tout_timer(sk);
+ __mptcp_propagate_sndbuf(sk, ssk);
return true;
}
@@ -1079,15 +1080,16 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
bool first = true;
- sk_stream_moderate_sndbuf(sk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (first)
tcp_enter_memory_pressure(ssk);
sk_stream_moderate_sndbuf(ssk);
+
first = false;
}
+ __mptcp_sync_sndbuf(sk);
}
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
@@ -2452,6 +2454,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
WRITE_ONCE(msk->first, NULL);
out:
+ __mptcp_sync_sndbuf(sk);
if (need_push)
__mptcp_push_pending(sk, 0);
@@ -3223,7 +3226,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
* uses the correct data
*/
mptcp_copy_inaddrs(nsk, ssk);
- mptcp_propagate_sndbuf(nsk, ssk);
+ __mptcp_propagate_sndbuf(nsk, ssk);
mptcp_rcv_space_init(msk, ssk);
bh_unlock_sock(nsk);
@@ -3401,6 +3404,8 @@ static void mptcp_release_cb(struct sock *sk)
__mptcp_set_connected(sk);
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk);
+ if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
+ __mptcp_sync_sndbuf(sk);
}
__mptcp_update_rmem(sk);
@@ -3445,6 +3450,14 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status)
__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
}
+ if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_sync_sndbuf(sk);
+ else
+ __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
+ mptcp_data_unlock(sk);
+ }
if (status & BIT(MPTCP_DELEGATE_ACK))
schedule_3rdack_retransmission(ssk);
}
@@ -3529,6 +3542,7 @@ bool mptcp_finish_join(struct sock *ssk)
/* active subflow, already present inside the conn_list */
if (!list_empty(&subflow->node)) {
mptcp_subflow_joined(msk, ssk);
+ mptcp_propagate_sndbuf(parent, ssk);
return true;
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 3612545fa62e0..40866acd91ad5 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -123,6 +123,7 @@
#define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5
#define MPTCP_CONNECTED 6
+#define MPTCP_SYNC_SNDBUF 7
struct mptcp_skb_cb {
u64 map_seq;
@@ -447,6 +448,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
#define MPTCP_DELEGATE_SCHEDULED 0
#define MPTCP_DELEGATE_SEND 1
#define MPTCP_DELEGATE_ACK 2
+#define MPTCP_DELEGATE_SNDBUF 3
#define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED))
/* MPTCP subflow context */
@@ -520,6 +522,9 @@ struct mptcp_subflow_context {
u32 setsockopt_seq;
u32 stale_rcv_tstamp;
+ int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf,
+ * protected by the msk socket lock
+ */
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
@@ -762,13 +767,52 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}
-static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+static inline void __mptcp_sync_sndbuf(struct sock *sk)
{
- if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf))
- return false;
+ struct mptcp_subflow_context *subflow;
+ int ssk_sndbuf, new_sndbuf;
+
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return;
+
+ new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0];
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);
+
+ subflow->cached_sndbuf = ssk_sndbuf;
+ new_sndbuf += ssk_sndbuf;
+ }
+
+ /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
+ WRITE_ONCE(sk->sk_sndbuf, new_sndbuf);
+}
+
+/* The caller holds both the msk socket and the subflow socket locks,
+ * possibly under BH
+ */
+static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf)
+ __mptcp_sync_sndbuf(sk);
+}
+
+/* the caller holds only the subflow socket lock, either in process or
+ * BH context. Additionally this can be called under the msk data lock,
+ * so we can't acquire such lock here: let the delegated action acquire
+ * the needed locks in suitable order.
+ */
+static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf))
+ return;
- WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
- return true;
+ local_bh_disable();
+ mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
+ local_bh_enable();
}
static inline void mptcp_write_space(struct sock *sk)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 7539b9c8c2fb4..116e3008231bd 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -95,6 +95,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
case SO_SNDBUFFORCE:
ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+ mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
break;
case SO_RCVBUF:
case SO_RCVBUFFORCE:
@@ -1418,8 +1419,10 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
if (sk->sk_userlocks & tx_rx_locks) {
ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
- if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+ mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
+ }
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 9c1f8d1d63d24..d8827427ffc84 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -421,6 +421,7 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc
void __mptcp_set_connected(struct sock *sk)
{
+ __mptcp_propagate_sndbuf(sk, mptcp_sk(sk)->first);
if (sk->sk_state == TCP_SYN_SENT) {
inet_sk_state_store(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
@@ -472,7 +473,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
return;
msk = mptcp_sk(parent);
- mptcp_propagate_sndbuf(parent, sk);
subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
@@ -1728,7 +1728,6 @@ static void subflow_state_change(struct sock *sk)
msk = mptcp_sk(parent);
if (subflow_simultaneous_connect(sk)) {
- mptcp_propagate_sndbuf(parent, sk);
mptcp_do_fallback(sk);
mptcp_rcv_space_init(msk, sk);
pr_fallback(msk);
--
2.43.0
next prev parent reply other threads:[~2024-01-03 17:14 UTC|newest]
Thread overview: 72+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-03 16:55 [PATCH 6.6 00/49] 6.6.10-rc1 review Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 01/49] ksmbd: Remove unused field in ksmbd_user struct Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 02/49] ksmbd: reorganize ksmbd_iov_pin_rsp() Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 03/49] ksmbd: fix kernel-doc comment of ksmbd_vfs_setxattr() Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 04/49] ksmbd: fix missing RDMA-capable flag for IPoIB device in ksmbd_rdma_capable_netdev() Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 05/49] ksmbd: add support for surrogate pair conversion Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 06/49] ksmbd: no need to wait for binded connection termination at logoff Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 07/49] ksmbd: fix kernel-doc comment of ksmbd_vfs_kern_path_locked() Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 08/49] ksmbd: prevent memory leak on error return Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 09/49] ksmbd: separately allocate ci per dentry Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 10/49] ksmbd: move oplock handling after unlock parent dir Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 11/49] ksmbd: release interim response after sending status pending response Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 12/49] ksmbd: move setting SMB2_FLAGS_ASYNC_COMMAND and AsyncId Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 13/49] ksmbd: dont update ->op_state as OPLOCK_STATE_NONE on error Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 14/49] ksmbd: set epoch in create context v2 lease Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 15/49] ksmbd: set v2 lease capability Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 16/49] ksmbd: downgrade RWH lease caching state to RH for directory Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 17/49] ksmbd: send v2 lease break notification " Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 18/49] ksmbd: lazy v2 lease break on smb2_write() Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 19/49] ksmbd: avoid duplicate opinfo_put() call on error of smb21_lease_break_ack() Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 20/49] fs: new accessor methods for atime and mtime Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 21/49] client: convert to new timestamp accessors Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 22/49] fs: cifs: Fix atime update check Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 23/49] virtio_ring: fix syncs DMA memory with different direction Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 24/49] kexec: fix KEXEC_FILE dependencies Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 25/49] kexec: select CRYPTO from KEXEC_FILE instead of depending on it Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 26/49] linux/export: Fix alignment for 64-bit ksymtab entries Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 27/49] linux/export: Ensure natural alignment of kcrctab array Greg Kroah-Hartman
2024-01-03 16:55 ` Greg Kroah-Hartman [this message]
2024-01-03 16:55 ` [PATCH 6.6 29/49] mptcp: fix possible NULL pointer dereference on close Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 30/49] mptcp: fix inconsistent state on fastopen race Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 31/49] block: renumber QUEUE_FLAG_HW_WC Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 32/49] platform/x86/intel/pmc: Add suspend callback Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 33/49] platform/x86/intel/pmc: Allow reenabling LTRs Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 34/49] platform/x86/intel/pmc: Move GBE LTR ignore to suspend callback Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 35/49] ksmbd: fix slab-out-of-bounds in smb_strndup_from_utf16() Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 36/49] platform/x86: p2sb: Allow p2sb_bar() calls during PCI device probe Greg Kroah-Hartman
2024-01-04 9:01 ` Shinichiro Kawasaki
2024-01-03 16:55 ` [PATCH 6.6 37/49] maple_tree: do not preallocate nodes for slot stores Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 38/49] selftests: secretmem: floor the memory size to the multiple of page_size Greg Kroah-Hartman
2024-01-03 16:55 ` [PATCH 6.6 39/49] mm/filemap: avoid buffered read/write race to read inconsistent data Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 40/49] mm: migrate high-order folios in swap cache correctly Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 41/49] mm/memory-failure: cast index to loff_t before shifting it Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 42/49] mm/memory-failure: check the mapcount of the precise page Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 43/49] Revert "nvme-fc: fix race between error recovery and creating association" Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 44/49] ring-buffer: Fix wake ups when buffer_percent is set to 100 Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 45/49] ftrace: Fix modification of direct_function hash while in use Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 46/49] tracing: Fix blocked reader of snapshot buffer Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 47/49] wifi: cfg80211: fix CQM for non-range use Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 48/49] wifi: nl80211: fix deadlock in nl80211_set_cqm_rssi (6.6.x) Greg Kroah-Hartman
2024-01-03 16:56 ` [PATCH 6.6 49/49] netfilter: nf_tables: skip set commit for deleted/destroyed sets Greg Kroah-Hartman
2024-01-03 17:44 ` [PATCH 6.6 00/49] 6.6.10-rc1 review Nam Cao
2024-01-03 18:57 ` SeongJae Park
2024-01-03 22:04 ` Florian Fainelli
2024-01-03 23:35 ` Kelsey Steele
2024-01-04 0:18 ` Shuah Khan
2024-01-04 2:24 ` Takeshi Ogasawara
2024-01-04 4:10 ` Daniel Díaz
2024-01-04 7:15 ` Daniel Díaz
2024-01-04 7:58 ` Greg Kroah-Hartman
2024-01-04 8:21 ` Johannes Berg
2024-01-04 12:39 ` Naresh Kamboju
2024-01-04 12:58 ` Greg Kroah-Hartman
2024-01-04 5:20 ` Bagas Sanjaya
2024-01-04 7:55 ` Luna Jernberg
2024-01-04 7:57 ` Greg Kroah-Hartman
2024-01-04 10:26 ` Ron Economos
2024-01-04 11:53 ` Harshit Mogalapalli
2024-01-04 16:52 ` Jon Hunter
2024-01-04 17:12 ` Allen
2024-01-05 1:04 ` Guenter Roeck
2024-01-05 2:43 ` Namjae Jeon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240103164839.340039748@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=kuba@kernel.org \
--cc=martineau@kernel.org \
--cc=pabeni@redhat.com \
--cc=patches@lists.linux.dev \
--cc=sashal@kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox