* [PATCH v2] net/tls: support maximum record size limit
@ 2025-09-02 3:38 Wilfred Mallawa
2025-09-02 11:40 ` Simon Horman
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Wilfred Mallawa @ 2025-09-02 3:38 UTC (permalink / raw)
To: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, John Fastabend
Cc: Simon Horman, netdev, linux-doc, linux-kernel, Alistair Francis,
Damien Le'Moal, Wilfred Mallawa
From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.
This patch adds support for retrieving the negotiated record size limit
during a handshake, and enforcing it at the TLS layer such that outgoing
records are no larger than the size negotiated. This patch depends on
the respective userspace support in tlshd and GnuTLS [2].
[1] https://www.rfc-editor.org/rfc/rfc8449
[2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
---
Documentation/networking/tls.rst | 7 ++++++
include/net/tls.h | 1 +
include/uapi/linux/tls.h | 2 ++
net/tls/tls_main.c | 39 ++++++++++++++++++++++++++++++--
net/tls/tls_sw.c | 4 ++++
5 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..0232df902320 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,13 @@ If the record decrypted turns out to had been padded or is not a data
record it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_RECORD_SIZE_LIM
+~~~~~~~~~~~~~~~~~~~~~~
+
+During a TLS handshake, an endpoint may use the record size limit extension
+to specify a maximum record size. This allows enforcing the specified record
+size limit, such that outgoing records do not exceed the limit specified.
+
Statistics
==========
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..c9a3759f27ca 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -226,6 +226,7 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 record_size_limit;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..3add266d5916 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
+#define TLS_TX_RECORD_SIZE_LIM 5 /* Maximum record size */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
TLS_INFO_RX_NO_PAD,
+ TLS_INFO_TX_RECORD_SIZE_LIM,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a3ccb3135e51..1098c01f2749 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
return rc;
}
+static int do_tls_setsockopt_record_size(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u16 value;
+
+ if (sockptr_is_null(optval) || optlen != sizeof(value))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+
+ if (ctx->prot_info.version == TLS_1_2_VERSION &&
+ value > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ if (ctx->prot_info.version == TLS_1_3_VERSION &&
+ value > TLS_MAX_PAYLOAD_SIZE + 1)
+ return -EINVAL;
+
+ ctx->record_size_limit = value;
+
+ return 0;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_RECORD_SIZE_LIM:
+ rc = do_tls_setsockopt_record_size(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -1065,7 +1093,7 @@ static u16 tls_user_config(struct tls_context *ctx, bool tx)
static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
{
- u16 version, cipher_type;
+ u16 version, cipher_type, record_size_limit;
struct tls_context *ctx;
struct nlattr *start;
int err;
@@ -1110,7 +1138,13 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
if (err)
goto nla_failure;
}
-
+ record_size_limit = ctx->record_size_limit;
+ if (record_size_limit) {
+ err = nla_put_u16(skb, TLS_INFO_TX_RECORD_SIZE_LIM,
+ record_size_limit);
+ if (err)
+ goto nla_failure;
+ }
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
@@ -1132,6 +1166,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_RECORD_SIZE_LIM */
0;
return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bac65d0d4e3e..9f9359f591d3 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
unsigned char record_type = TLS_RECORD_TYPE_DATA;
bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
bool eor = !(msg->msg_flags & MSG_MORE);
+ u16 record_size_limit;
size_t try_to_copy;
ssize_t copied = 0;
struct sk_msg *msg_pl, *msg_en;
@@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
}
}
+ record_size_limit = tls_ctx->record_size_limit ?
+ tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
+
while (msg_data_left(msg)) {
if (sk->sk_err) {
ret = -sk->sk_err;
--
2.51.0
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH v2] net/tls: support maximum record size limit
2025-09-02 3:38 [PATCH v2] net/tls: support maximum record size limit Wilfred Mallawa
@ 2025-09-02 11:40 ` Simon Horman
2025-09-02 22:05 ` Wilfred Mallawa
2025-09-02 16:07 ` Sabrina Dubroca
2025-09-02 21:24 ` kernel test robot
2 siblings, 1 reply; 7+ messages in thread
From: Simon Horman @ 2025-09-02 11:40 UTC (permalink / raw)
To: Wilfred Mallawa
Cc: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, John Fastabend, netdev, linux-doc, linux-kernel,
Alistair Francis, Damien Le'Moal, Wilfred Mallawa
On Tue, Sep 02, 2025 at 01:38:10PM +1000, Wilfred Mallawa wrote:
> From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
>
> During a handshake, an endpoint may specify a maximum record size limit.
> Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
> maximum record size. Meaning that, the outgoing records from the kernel
> can exceed a lower size negotiated during the handshake. In such a case,
> the TLS endpoint must send a fatal "record_overflow" alert [1], and
> thus the record is discarded.
>
> Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
> support. For these devices, supporting TLS record size negotiation is
> necessary because the maximum TLS record size supported by the controller
> is less than the default 16KB currently used by the kernel.
>
> This patch adds support for retrieving the negotiated record size limit
> during a handshake, and enforcing it at the TLS layer such that outgoing
> records are no larger than the size negotiated. This patch depends on
> the respective userspace support in tlshd and GnuTLS [2].
>
> [1] https://www.rfc-editor.org/rfc/rfc8449
> [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
>
> Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
Hi Wilfred,
I'll leave review of this approach to others.
But in the meantime I wanted to pass on a minor problem I noticed in the code
> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> index bac65d0d4e3e..9f9359f591d3 100644
> --- a/net/tls/tls_sw.c
> +++ b/net/tls/tls_sw.c
> @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
> unsigned char record_type = TLS_RECORD_TYPE_DATA;
> bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
> bool eor = !(msg->msg_flags & MSG_MORE);
> + u16 record_size_limit;
> size_t try_to_copy;
> ssize_t copied = 0;
> struct sk_msg *msg_pl, *msg_en;
> @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
> }
> }
>
> + record_size_limit = tls_ctx->record_size_limit ?
> + tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
> +
> while (msg_data_left(msg)) {
> if (sk->sk_err) {
> ret = -sk->sk_err;
record_size_limit is set but otherwise unused.
Did you forget to add something here?
If not, please remove record_size_limit from this function.
--
pw-bot: changes-requested
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] net/tls: support maximum record size limit
2025-09-02 3:38 [PATCH v2] net/tls: support maximum record size limit Wilfred Mallawa
2025-09-02 11:40 ` Simon Horman
@ 2025-09-02 16:07 ` Sabrina Dubroca
2025-09-02 22:50 ` Wilfred Mallawa
2025-09-02 21:24 ` kernel test robot
2 siblings, 1 reply; 7+ messages in thread
From: Sabrina Dubroca @ 2025-09-02 16:07 UTC (permalink / raw)
To: Wilfred Mallawa
Cc: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Jonathan Corbet, John Fastabend, Simon Horman, netdev, linux-doc,
linux-kernel, Alistair Francis, Damien Le'Moal,
Wilfred Mallawa
2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
>
> During a handshake, an endpoint may specify a maximum record size limit.
> Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
> maximum record size. Meaning that, the outgoing records from the kernel
> can exceed a lower size negotiated during the handshake. In such a case,
> the TLS endpoint must send a fatal "record_overflow" alert [1], and
> thus the record is discarded.
>
> Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
> support. For these devices, supporting TLS record size negotiation is
> necessary because the maximum TLS record size supported by the controller
> is less than the default 16KB currently used by the kernel.
>
> This patch adds support for retrieving the negotiated record size limit
> during a handshake, and enforcing it at the TLS layer such that outgoing
> records are no larger than the size negotiated. This patch depends on
> the respective userspace support in tlshd and GnuTLS [2].
>
> [1] https://www.rfc-editor.org/rfc/rfc8449
> [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
>
> Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> ---
> Documentation/networking/tls.rst | 7 ++++++
> include/net/tls.h | 1 +
> include/uapi/linux/tls.h | 2 ++
> net/tls/tls_main.c | 39 ++++++++++++++++++++++++++++++--
> net/tls/tls_sw.c | 4 ++++
> 5 files changed, 51 insertions(+), 2 deletions(-)
A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm
not sure what we could do on the "RX" side to check that we are
respecting the size restriction. Use a basic TCP socket and try to
parse (and then discard without decrypting) records manually out of
the stream and see if we got the length we wanted?
> diff --git a/include/net/tls.h b/include/net/tls.h
> index 857340338b69..c9a3759f27ca 100644
> --- a/include/net/tls.h
> +++ b/include/net/tls.h
> @@ -226,6 +226,7 @@ struct tls_context {
> u8 rx_conf:3;
> u8 zerocopy_sendfile:1;
> u8 rx_no_pad:1;
> + u16 record_size_limit;
Maybe "tx_record_size_limit", since it's not intended for RX?
I don't know if the kernel will ever have a need to enforce the RX
record size, but it would maybe avoid future head-scratching "why is
this not used on the RX path?"
> diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
> index a3ccb3135e51..1098c01f2749 100644
> --- a/net/tls/tls_main.c
> +++ b/net/tls/tls_main.c
> @@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
> return rc;
> }
>
> +static int do_tls_setsockopt_record_size(struct sock *sk, sockptr_t optval,
> + unsigned int optlen)
> +{
> + struct tls_context *ctx = tls_get_ctx(sk);
> + u16 value;
> +
> + if (sockptr_is_null(optval) || optlen != sizeof(value))
> + return -EINVAL;
> +
> + if (copy_from_sockptr(&value, optval, sizeof(value)))
> + return -EFAULT;
> +
> + if (ctx->prot_info.version == TLS_1_2_VERSION &&
> + value > TLS_MAX_PAYLOAD_SIZE)
> + return -EINVAL;
> +
> + if (ctx->prot_info.version == TLS_1_3_VERSION &&
> + value > TLS_MAX_PAYLOAD_SIZE + 1)
> + return -EINVAL;
> +
> + ctx->record_size_limit = value;
> +
> + return 0;
> +}
> +
> static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
> unsigned int optlen)
> {
> @@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
> case TLS_RX_EXPECT_NO_PAD:
> rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
> break;
> + case TLS_TX_RECORD_SIZE_LIM:
> + rc = do_tls_setsockopt_record_size(sk, optval, optlen);
> + break;
Adding the corresponding changes to do_tls_getsockopt would also be good.
> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> index bac65d0d4e3e..9f9359f591d3 100644
> --- a/net/tls/tls_sw.c
> +++ b/net/tls/tls_sw.c
> @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
> unsigned char record_type = TLS_RECORD_TYPE_DATA;
> bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
> bool eor = !(msg->msg_flags & MSG_MORE);
> + u16 record_size_limit;
> size_t try_to_copy;
> ssize_t copied = 0;
> struct sk_msg *msg_pl, *msg_en;
> @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
> }
> }
>
> + record_size_limit = tls_ctx->record_size_limit ?
> + tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
As Simon said (good catch Simon :)), this isn't used anywhere. Are you
sure this patch works? The previous version had a hunk in
tls_sw_sendmsg_locked that looks like what I would expect.
And the the offloaded TX path (in net/tls/tls_device.c) would also
need similar changes.
I'm wondering if it's better to add this conditional, or just
initialize record_size_limit to TLS_MAX_PAYLOAD_SIZE as we set up the
tls_context. Then we only have to replace TLS_MAX_PAYLOAD_SIZE with
tls_ctx->record_size_limit in a few places?
--
Sabrina
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] net/tls: support maximum record size limit
2025-09-02 3:38 [PATCH v2] net/tls: support maximum record size limit Wilfred Mallawa
2025-09-02 11:40 ` Simon Horman
2025-09-02 16:07 ` Sabrina Dubroca
@ 2025-09-02 21:24 ` kernel test robot
2 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2025-09-02 21:24 UTC (permalink / raw)
To: Wilfred Mallawa, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Jonathan Corbet, John Fastabend
Cc: oe-kbuild-all, Simon Horman, netdev, linux-doc, linux-kernel,
Alistair Francis, Damien Le'Moal, Wilfred Mallawa
Hi Wilfred,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
[also build test WARNING on net/main linus/master horms-ipvs/master v6.17-rc4 next-20250902]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Wilfred-Mallawa/net-tls-support-maximum-record-size-limit/20250902-114005
base: net-next/main
patch link: https://lore.kernel.org/r/20250902033809.177182-2-wilfred.opensource%40gmail.com
patch subject: [PATCH v2] net/tls: support maximum record size limit
config: i386-buildonly-randconfig-001-20250903 (https://download.01.org/0day-ci/archive/20250903/202509030542.ZW9r1S1c-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250903/202509030542.ZW9r1S1c-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509030542.ZW9r1S1c-lkp@intel.com/
All warnings (new ones prefixed by >>):
net/tls/tls_sw.c: In function 'tls_sw_sendmsg_locked':
>> net/tls/tls_sw.c:1036:13: warning: variable 'record_size_limit' set but not used [-Wunused-but-set-variable]
1036 | u16 record_size_limit;
| ^~~~~~~~~~~~~~~~~
vim +/record_size_limit +1036 net/tls/tls_sw.c
1024
1025 static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
1026 size_t size)
1027 {
1028 long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1029 struct tls_context *tls_ctx = tls_get_ctx(sk);
1030 struct tls_prot_info *prot = &tls_ctx->prot_info;
1031 struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
1032 bool async_capable = ctx->async_capable;
1033 unsigned char record_type = TLS_RECORD_TYPE_DATA;
1034 bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
1035 bool eor = !(msg->msg_flags & MSG_MORE);
> 1036 u16 record_size_limit;
1037 size_t try_to_copy;
1038 ssize_t copied = 0;
1039 struct sk_msg *msg_pl, *msg_en;
1040 struct tls_rec *rec;
1041 int required_size;
1042 int num_async = 0;
1043 bool full_record;
1044 int record_room;
1045 int num_zc = 0;
1046 int orig_size;
1047 int ret = 0;
1048
1049 if (!eor && (msg->msg_flags & MSG_EOR))
1050 return -EINVAL;
1051
1052 if (unlikely(msg->msg_controllen)) {
1053 ret = tls_process_cmsg(sk, msg, &record_type);
1054 if (ret) {
1055 if (ret == -EINPROGRESS)
1056 num_async++;
1057 else if (ret != -EAGAIN)
1058 goto send_end;
1059 }
1060 }
1061
1062 record_size_limit = tls_ctx->record_size_limit ?
1063 tls_ctx->record_size_limit : TLS_MAX_PAYLOAD_SIZE;
1064
1065 while (msg_data_left(msg)) {
1066 if (sk->sk_err) {
1067 ret = -sk->sk_err;
1068 goto send_end;
1069 }
1070
1071 if (ctx->open_rec)
1072 rec = ctx->open_rec;
1073 else
1074 rec = ctx->open_rec = tls_get_rec(sk);
1075 if (!rec) {
1076 ret = -ENOMEM;
1077 goto send_end;
1078 }
1079
1080 msg_pl = &rec->msg_plaintext;
1081 msg_en = &rec->msg_encrypted;
1082
1083 orig_size = msg_pl->sg.size;
1084 full_record = false;
1085 try_to_copy = msg_data_left(msg);
1086 record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
1087 if (try_to_copy >= record_room) {
1088 try_to_copy = record_room;
1089 full_record = true;
1090 }
1091
1092 required_size = msg_pl->sg.size + try_to_copy +
1093 prot->overhead_size;
1094
1095 if (!sk_stream_memory_free(sk))
1096 goto wait_for_sndbuf;
1097
1098 alloc_encrypted:
1099 ret = tls_alloc_encrypted_msg(sk, required_size);
1100 if (ret) {
1101 if (ret != -ENOSPC)
1102 goto wait_for_memory;
1103
1104 /* Adjust try_to_copy according to the amount that was
1105 * actually allocated. The difference is due
1106 * to max sg elements limit
1107 */
1108 try_to_copy -= required_size - msg_en->sg.size;
1109 full_record = true;
1110 }
1111
1112 if (try_to_copy && (msg->msg_flags & MSG_SPLICE_PAGES)) {
1113 ret = tls_sw_sendmsg_splice(sk, msg, msg_pl,
1114 try_to_copy, &copied);
1115 if (ret < 0)
1116 goto send_end;
1117 tls_ctx->pending_open_record_frags = true;
1118
1119 if (sk_msg_full(msg_pl))
1120 full_record = true;
1121
1122 if (full_record || eor)
1123 goto copied;
1124 continue;
1125 }
1126
1127 if (!is_kvec && (full_record || eor) && !async_capable) {
1128 u32 first = msg_pl->sg.end;
1129
1130 ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter,
1131 msg_pl, try_to_copy);
1132 if (ret)
1133 goto fallback_to_reg_send;
1134
1135 num_zc++;
1136 copied += try_to_copy;
1137
1138 sk_msg_sg_copy_set(msg_pl, first);
1139 ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
1140 record_type, &copied,
1141 msg->msg_flags);
1142 if (ret) {
1143 if (ret == -EINPROGRESS)
1144 num_async++;
1145 else if (ret == -ENOMEM)
1146 goto wait_for_memory;
1147 else if (ctx->open_rec && ret == -ENOSPC) {
1148 if (msg_pl->cork_bytes) {
1149 ret = 0;
1150 goto send_end;
1151 }
1152 goto rollback_iter;
1153 } else if (ret != -EAGAIN)
1154 goto send_end;
1155 }
1156 continue;
1157 rollback_iter:
1158 copied -= try_to_copy;
1159 sk_msg_sg_copy_clear(msg_pl, first);
1160 iov_iter_revert(&msg->msg_iter,
1161 msg_pl->sg.size - orig_size);
1162 fallback_to_reg_send:
1163 sk_msg_trim(sk, msg_pl, orig_size);
1164 }
1165
1166 required_size = msg_pl->sg.size + try_to_copy;
1167
1168 ret = tls_clone_plaintext_msg(sk, required_size);
1169 if (ret) {
1170 if (ret != -ENOSPC)
1171 goto send_end;
1172
1173 /* Adjust try_to_copy according to the amount that was
1174 * actually allocated. The difference is due
1175 * to max sg elements limit
1176 */
1177 try_to_copy -= required_size - msg_pl->sg.size;
1178 full_record = true;
1179 sk_msg_trim(sk, msg_en,
1180 msg_pl->sg.size + prot->overhead_size);
1181 }
1182
1183 if (try_to_copy) {
1184 ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter,
1185 msg_pl, try_to_copy);
1186 if (ret < 0)
1187 goto trim_sgl;
1188 }
1189
1190 /* Open records defined only if successfully copied, otherwise
1191 * we would trim the sg but not reset the open record frags.
1192 */
1193 tls_ctx->pending_open_record_frags = true;
1194 copied += try_to_copy;
1195 copied:
1196 if (full_record || eor) {
1197 ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
1198 record_type, &copied,
1199 msg->msg_flags);
1200 if (ret) {
1201 if (ret == -EINPROGRESS)
1202 num_async++;
1203 else if (ret == -ENOMEM)
1204 goto wait_for_memory;
1205 else if (ret != -EAGAIN) {
1206 if (ret == -ENOSPC)
1207 ret = 0;
1208 goto send_end;
1209 }
1210 }
1211 }
1212
1213 continue;
1214
1215 wait_for_sndbuf:
1216 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1217 wait_for_memory:
1218 ret = sk_stream_wait_memory(sk, &timeo);
1219 if (ret) {
1220 trim_sgl:
1221 if (ctx->open_rec)
1222 tls_trim_both_msgs(sk, orig_size);
1223 goto send_end;
1224 }
1225
1226 if (ctx->open_rec && msg_en->sg.size < required_size)
1227 goto alloc_encrypted;
1228 }
1229
1230 if (!num_async) {
1231 goto send_end;
1232 } else if (num_zc || eor) {
1233 int err;
1234
1235 /* Wait for pending encryptions to get completed */
1236 err = tls_encrypt_async_wait(ctx);
1237 if (err) {
1238 ret = err;
1239 copied = 0;
1240 }
1241 }
1242
1243 /* Transmit if any encryptions have completed */
1244 if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
1245 cancel_delayed_work(&ctx->tx_work.work);
1246 tls_tx_records(sk, msg->msg_flags);
1247 }
1248
1249 send_end:
1250 ret = sk_stream_error(sk, msg->msg_flags, ret);
1251 return copied > 0 ? copied : ret;
1252 }
1253
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] net/tls: support maximum record size limit
2025-09-02 11:40 ` Simon Horman
@ 2025-09-02 22:05 ` Wilfred Mallawa
0 siblings, 0 replies; 7+ messages in thread
From: Wilfred Mallawa @ 2025-09-02 22:05 UTC (permalink / raw)
To: horms@kernel.org
Cc: corbet@lwn.net, dlemoal@kernel.org, davem@davemloft.net,
Alistair Francis, john.fastabend@gmail.com,
linux-kernel@vger.kernel.org, kuba@kernel.org,
linux-doc@vger.kernel.org, edumazet@google.com, pabeni@redhat.com,
netdev@vger.kernel.org
On Tue, 2025-09-02 at 12:40 +0100, Simon Horman wrote:
>
[snip]
> Hi Wilfred,
>
> I'll leave review of this approach to others.
> But in the meantime I wanted to pass on a minor problem I noticed in
> the code
>
> > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> > index bac65d0d4e3e..9f9359f591d3 100644
> > --- a/net/tls/tls_sw.c
> > +++ b/net/tls/tls_sw.c
> > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> > unsigned char record_type = TLS_RECORD_TYPE_DATA;
> > bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
> > bool eor = !(msg->msg_flags & MSG_MORE);
> > + u16 record_size_limit;
> > size_t try_to_copy;
> > ssize_t copied = 0;
> > struct sk_msg *msg_pl, *msg_en;
> > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> > }
> > }
> >
> > + record_size_limit = tls_ctx->record_size_limit ?
> > + tls_ctx->record_size_limit :
> > TLS_MAX_PAYLOAD_SIZE;
> > +
> > while (msg_data_left(msg)) {
> > if (sk->sk_err) {
> > ret = -sk->sk_err;
>
> record_size_limit is set but otherwise unused.
> Did you forget to add something here?
>
> If not, please remove record_size_limit from this function.
Hey Simon,
Yes, this is an error. I will fixup. Thank you.
Regards,
Wilfred
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] net/tls: support maximum record size limit
2025-09-02 16:07 ` Sabrina Dubroca
@ 2025-09-02 22:50 ` Wilfred Mallawa
2025-09-03 8:21 ` Sabrina Dubroca
0 siblings, 1 reply; 7+ messages in thread
From: Wilfred Mallawa @ 2025-09-02 22:50 UTC (permalink / raw)
To: sd@queasysnail.net
Cc: corbet@lwn.net, dlemoal@kernel.org, davem@davemloft.net,
linux-doc@vger.kernel.org, john.fastabend@gmail.com,
linux-kernel@vger.kernel.org, Alistair Francis, kuba@kernel.org,
horms@kernel.org, edumazet@google.com, pabeni@redhat.com,
netdev@vger.kernel.org
On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote:
> 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> > From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> >
> > During a handshake, an endpoint may specify a maximum record size
> > limit.
> > Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for
> > the
> > maximum record size. Meaning that, the outgoing records from the
> > kernel
> > can exceed a lower size negotiated during the handshake. In such a
> > case,
> > the TLS endpoint must send a fatal "record_overflow" alert [1], and
> > thus the record is discarded.
> >
> > Upcoming Western Digital NVMe-TCP hardware controllers implement
> > TLS
> > support. For these devices, supporting TLS record size negotiation
> > is
> > necessary because the maximum TLS record size supported by the
> > controller
> > is less than the default 16KB currently used by the kernel.
> >
> > This patch adds support for retrieving the negotiated record size
> > limit
> > during a handshake, and enforcing it at the TLS layer such that
> > outgoing
> > records are no larger than the size negotiated. This patch depends
> > on
> > the respective userspace support in tlshd and GnuTLS [2].
> >
> > [1] https://www.rfc-editor.org/rfc/rfc8449
> > [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
> >
> > Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> > ---
> > Documentation/networking/tls.rst | 7 ++++++
> > include/net/tls.h | 1 +
> > include/uapi/linux/tls.h | 2 ++
> > net/tls/tls_main.c | 39
> > ++++++++++++++++++++++++++++++--
> > net/tls/tls_sw.c | 4 ++++
> > 5 files changed, 51 insertions(+), 2 deletions(-)
>
Hey Sabrina,
> A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm
> not sure what we could do on the "RX" side to check that we are
> respecting the size restriction. Use a basic TCP socket and try to
> parse (and then discard without decrypting) records manually out of
> the stream and see if we got the length we wanted?
>
So far I have just been using an NVMe TCP Target with TLS enabled and
checking that the targets RX record sizes are <= negotiated size in
tls_rx_one_record(). I didn't check for this patch and the bug below
got through...my bad!
Is it possible to get the exact record length into the testing layer?
Wouldn't the socket just return N bytes received which doesn't
necessarily correlate to a record size?
>
> > diff --git a/include/net/tls.h b/include/net/tls.h
> > index 857340338b69..c9a3759f27ca 100644
> > --- a/include/net/tls.h
> > +++ b/include/net/tls.h
> > @@ -226,6 +226,7 @@ struct tls_context {
> > u8 rx_conf:3;
> > u8 zerocopy_sendfile:1;
> > u8 rx_no_pad:1;
> > + u16 record_size_limit;
>
> Maybe "tx_record_size_limit", since it's not intended for RX?
>
> I don't know if the kernel will ever have a need to enforce the RX
> record size, but it would maybe avoid future head-scratching "why is
> this not used on the RX path?"
Ah good point, I think this makes sense.
>
>
>
> > diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
> > index a3ccb3135e51..1098c01f2749 100644
> > --- a/net/tls/tls_main.c
> > +++ b/net/tls/tls_main.c
> > @@ -812,6 +812,31 @@ static int do_tls_setsockopt_no_pad(struct
> > sock *sk, sockptr_t optval,
> > return rc;
> > }
> >
> > +static int do_tls_setsockopt_record_size(struct sock *sk,
> > sockptr_t optval,
> > + unsigned int optlen)
> > +{
> > + struct tls_context *ctx = tls_get_ctx(sk);
> > + u16 value;
> > +
> > + if (sockptr_is_null(optval) || optlen != sizeof(value))
> > + return -EINVAL;
> > +
> > + if (copy_from_sockptr(&value, optval, sizeof(value)))
> > + return -EFAULT;
> > +
> > + if (ctx->prot_info.version == TLS_1_2_VERSION &&
> > + value > TLS_MAX_PAYLOAD_SIZE)
> > + return -EINVAL;
> > +
> > + if (ctx->prot_info.version == TLS_1_3_VERSION &&
> > + value > TLS_MAX_PAYLOAD_SIZE + 1)
> > + return -EINVAL;
> > +
> > + ctx->record_size_limit = value;
> > +
> > + return 0;
> > +}
> > +
> > static int do_tls_setsockopt(struct sock *sk, int optname,
> > sockptr_t optval,
> > unsigned int optlen)
> > {
> > @@ -833,6 +858,9 @@ static int do_tls_setsockopt(struct sock *sk,
> > int optname, sockptr_t optval,
> > case TLS_RX_EXPECT_NO_PAD:
> > rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
> > break;
> > + case TLS_TX_RECORD_SIZE_LIM:
> > + rc = do_tls_setsockopt_record_size(sk, optval,
> > optlen);
> > + break;
>
> Adding the corresponding changes to do_tls_getsockopt would also be
> good.
>
okay, I will add that.
>
> > diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
> > index bac65d0d4e3e..9f9359f591d3 100644
> > --- a/net/tls/tls_sw.c
> > +++ b/net/tls/tls_sw.c
> > @@ -1033,6 +1033,7 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> > unsigned char record_type = TLS_RECORD_TYPE_DATA;
> > bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
> > bool eor = !(msg->msg_flags & MSG_MORE);
> > + u16 record_size_limit;
> > size_t try_to_copy;
> > ssize_t copied = 0;
> > struct sk_msg *msg_pl, *msg_en;
> > @@ -1058,6 +1059,9 @@ static int tls_sw_sendmsg_locked(struct sock
> > *sk, struct msghdr *msg,
> > }
> > }
> >
> > + record_size_limit = tls_ctx->record_size_limit ?
> > + tls_ctx->record_size_limit :
> > TLS_MAX_PAYLOAD_SIZE;
>
> As Simon said (good catch Simon :)), this isn't used anywhere. Are
> you
> sure this patch works? The previous version had a hunk in
> tls_sw_sendmsg_locked that looks like what I would expect.
>
This is a bug! I missed adding that hunk.
> And the the offloaded TX path (in net/tls/tls_device.c) would also
> need similar changes.
>
Okay, will add in V3.
>
> I'm wondering if it's better to add this conditional, or just
> initialize record_size_limit to TLS_MAX_PAYLOAD_SIZE as we set up the
> tls_context. Then we only have to replace TLS_MAX_PAYLOAD_SIZE with
> tls_ctx->record_size_limit in a few places?
Yeah I think sounds better, will add for V3.
Thanks for the feedback!
Regards,
Wilfred
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] net/tls: support maximum record size limit
2025-09-02 22:50 ` Wilfred Mallawa
@ 2025-09-03 8:21 ` Sabrina Dubroca
0 siblings, 0 replies; 7+ messages in thread
From: Sabrina Dubroca @ 2025-09-03 8:21 UTC (permalink / raw)
To: Wilfred Mallawa
Cc: corbet@lwn.net, dlemoal@kernel.org, davem@davemloft.net,
linux-doc@vger.kernel.org, john.fastabend@gmail.com,
linux-kernel@vger.kernel.org, Alistair Francis, kuba@kernel.org,
horms@kernel.org, edumazet@google.com, pabeni@redhat.com,
netdev@vger.kernel.org
2025-09-02, 22:50:53 +0000, Wilfred Mallawa wrote:
> On Tue, 2025-09-02 at 18:07 +0200, Sabrina Dubroca wrote:
> > 2025-09-02, 13:38:10 +1000, Wilfred Mallawa wrote:
> > > From: Wilfred Mallawa <wilfred.mallawa@wdc.com>
> Hey Sabrina,
> > A selftest would be nice (tools/testing/selftests/net/tls.c), but I'm
> > not sure what we could do on the "RX" side to check that we are
> > respecting the size restriction. Use a basic TCP socket and try to
> > parse (and then discard without decrypting) records manually out of
> > the stream and see if we got the length we wanted?
> >
> So far I have just been using an NVMe TCP Target with TLS enabled and
> checking that the targets RX record sizes are <= negotiated size in
> tls_rx_one_record(). I didn't check for this patch and the bug below
> got through...my bad!
>
> Is it possible to get the exact record length into the testing layer?
Not really, unless we come up with some mechanism using probes. I
wouldn't go that route unless we don't have any other choice.
> Wouldn't the socket just return N bytes received which doesn't
> necessarily correlate to a record size?
Yes. That's why I suggested only using ktls on one side of the test,
and parsing the records out of the raw stream of bytes on the RX side.
Actually, control records don't get aggregated on read, so sending a
large non-data buffer should result in separate limit-sized reads. But
this makes me wonder if this limit is supposed to apply to control
records, and how the userspace library/application is supposed to deal
with the possible splitting of those records?
Here's a rough example of what I had in mind. The hardcoded cipher
overhead is a bit ugly but I don't see a way around it. Sanity check
at the end is probably not needed. I didn't write the loop because I
haven't had enough coffee yet to get that right :)
TEST(tx_record_size)
{
struct tls_crypto_info_keys tls12;
int cfd, ret, fd, len, overhead;
char buf[1000], buf2[2000];
__u16 limit = 100;
bool notls;
tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128,
&tls12, 0);
ulp_sock_pair(_metadata, &fd, &cfd, ¬ls);
if (notls)
exit(KSFT_SKIP);
/* Don't install keys on fd, we'll parse raw records */
ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len);
ASSERT_EQ(ret, 0);
ret = setsockopt(cfd, SOL_TLS, TLS_TX_RECORD_SIZE_LIM, &limit, sizeof(limit));
ASSERT_EQ(ret, 0);
EXPECT_EQ(send(cfd, buf, sizeof(buf), 0), sizeof(buf));
close(cfd);
ret = recv(fd, buf2, sizeof(buf2), 0);
memcpy(&len, buf2 + 3, 2);
len = htons(len);
/* 16B tag + 8B IV -- record header (5B) is not counted but we'll need it to walk the record stream */
overhead = 16 + 8;
// TODO should be <= limit since we may not have filled every
// record (especially the last one), and loop over all the
// records we got
// next record starts at buf2 + (limit + overhead + 5)
ASSERT_EQ(len, limit + overhead);
/* sanity check that it's a TLS header for application data */
ASSERT_EQ(buf2[0], 23);
ASSERT_EQ(buf2[1], 0x3);
ASSERT_EQ(buf2[2], 0x3);
close(fd);
}
--
Sabrina
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2025-09-03 8:21 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-02 3:38 [PATCH v2] net/tls: support maximum record size limit Wilfred Mallawa
2025-09-02 11:40 ` Simon Horman
2025-09-02 22:05 ` Wilfred Mallawa
2025-09-02 16:07 ` Sabrina Dubroca
2025-09-02 22:50 ` Wilfred Mallawa
2025-09-03 8:21 ` Sabrina Dubroca
2025-09-02 21:24 ` kernel test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).