From: Boris Pismenny <borispismenny@gmail.com>
To: Eric Dumazet <edumazet@google.com>, Boris Pismenny <borisp@mellanox.com>
Cc: Jakub Kicinski <kuba@kernel.org>,
David Miller <davem@davemloft.net>,
Saeed Mahameed <saeedm@nvidia.com>,
Christoph Hellwig <hch@lst.de>,
sagi@grimberg.me, axboe@fb.com, kbusch@kernel.org,
Al Viro <viro@zeniv.linux.org.uk>,
David Ahern <dsahern@gmail.com>,
smalin@marvell.com, boris.pismenny@gmail.com,
linux-nvme@lists.infradead.org, netdev <netdev@vger.kernel.org>,
benishay@nvidia.com, ogerlitz@nvidia.com, yorayz@nvidia.com,
Ben Ben-Ishay <benishay@mellanox.com>,
Or Gerlitz <ogerlitz@mellanox.com>,
Yoray Zack <yorayz@mellanox.com>
Subject: Re: [PATCH v2 net-next 02/21] net: Introduce direct data placement tcp offload
Date: Thu, 14 Jan 2021 22:19:25 +0200 [thread overview]
Message-ID: <62d4606a-0a41-2b12-cf16-3523d0b73573@gmail.com> (raw)
In-Reply-To: <CANn89iJaFRFxVe-eV7hcwC_5Zp+HtWHxTQt+BNYcKOwZUriSDg@mail.gmail.com>
On 14/01/2021 17:57, Eric Dumazet wrote:
> On Thu, Jan 14, 2021 at 4:10 PM Boris Pismenny <borisp@mellanox.com> wrote:
>>
>> This commit introduces direct data placement offload for TCP.
>> This capability is accompanied by new net_device operations that
>> configure hardware contexts. There is a context per socket, and a context per DDP
>> operation. Additionally, a resynchronization routine is used to help the
>> hardware handle TCP OOO, and continue the offload.
>> Furthermore, we let the offloading driver advertise what is the max hw
>> sectors/segments.
>>
>> Using this interface, the NIC hardware will scatter TCP payload directly
>> to the BIO pages according to the command_id.
>> To maintain the correctness of the network stack, the driver is expected
>> to construct SKBs that point to the BIO pages.
>>
>> Thus, the SKB represents the data on the wire, while it is pointing
>> to data that is already placed in the destination buffer.
>> As a result, data from page frags should not be copied out to
>> the linear part.
>>
>> As SKBs that use DDP are already very memory efficient, we modify
>> skb_condense to avoid copying data from fragments to the linear
>> part of SKBs that belong to a socket that uses DDP offload.
>>
>> A follow-up patch will use this interface for DDP in NVMe-TCP.
>>
>> Signed-off-by: Boris Pismenny <borisp@mellanox.com>
>> Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
>> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
>> Signed-off-by: Yoray Zack <yorayz@mellanox.com>
>> ---
>> include/linux/netdev_features.h | 2 +
>> include/linux/netdevice.h | 5 ++
>> include/net/inet_connection_sock.h | 4 +
>> include/net/tcp_ddp.h | 136 +++++++++++++++++++++++++++++
>> net/Kconfig | 9 ++
>> net/core/skbuff.c | 9 +-
>> net/ethtool/common.c | 1 +
>> 7 files changed, 165 insertions(+), 1 deletion(-)
>> create mode 100644 include/net/tcp_ddp.h
>>
>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
>> index 934de56644e7..fb35dcac03d2 100644
>> --- a/include/linux/netdev_features.h
>> +++ b/include/linux/netdev_features.h
>> @@ -84,6 +84,7 @@ enum {
>> NETIF_F_GRO_FRAGLIST_BIT, /* Fraglist GRO */
>>
>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>> + NETIF_F_HW_TCP_DDP_BIT, /* TCP direct data placement offload */
>>
>> /*
>> * Add your fresh new feature above and remember to update
>> @@ -157,6 +158,7 @@ enum {
>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
>> +#define NETIF_F_HW_TCP_DDP __NETIF_F(HW_TCP_DDP)
>>
>> /* Finds the next feature with the highest number of the range of start till 0.
>> */
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 259be67644e3..3dd3cdf5dec3 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -941,6 +941,7 @@ struct dev_ifalias {
>>
>> struct devlink;
>> struct tlsdev_ops;
>> +struct tcp_ddp_dev_ops;
>>
>> struct netdev_name_node {
>> struct hlist_node hlist;
>> @@ -1937,6 +1938,10 @@ struct net_device {
>> const struct tlsdev_ops *tlsdev_ops;
>> #endif
>>
>> +#ifdef CONFIG_TCP_DDP
>> + const struct tcp_ddp_dev_ops *tcp_ddp_ops;
>> +#endif
>> +
>> const struct header_ops *header_ops;
>>
>> unsigned int flags;
>> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
>> index 7338b3865a2a..a08b85b53aa8 100644
>> --- a/include/net/inet_connection_sock.h
>> +++ b/include/net/inet_connection_sock.h
>> @@ -66,6 +66,8 @@ struct inet_connection_sock_af_ops {
>> * @icsk_ulp_ops Pluggable ULP control hook
>> * @icsk_ulp_data ULP private data
>> * @icsk_clean_acked Clean acked data hook
>> + * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook
>> + * @icsk_ulp_ddp_data ULP direct data placement private data
>> * @icsk_listen_portaddr_node hash to the portaddr listener hashtable
>> * @icsk_ca_state: Congestion control state
>> * @icsk_retransmits: Number of unrecovered [RTO] timeouts
>> @@ -94,6 +96,8 @@ struct inet_connection_sock {
>> const struct tcp_ulp_ops *icsk_ulp_ops;
>> void __rcu *icsk_ulp_data;
>> void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
>
> #ifdef CONFIG_TCP_DDP ?
>
>> + const struct tcp_ddp_ulp_ops *icsk_ulp_ddp_ops;
>> + void __rcu *icsk_ulp_ddp_data;
>> struct hlist_node icsk_listen_portaddr_node;
>> unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
>> __u8 icsk_ca_state:5,
>> diff --git a/include/net/tcp_ddp.h b/include/net/tcp_ddp.h
>> new file mode 100644
>> index 000000000000..31e5b1a16d0f
>> --- /dev/null
>> +++ b/include/net/tcp_ddp.h
>> @@ -0,0 +1,136 @@
>> +/* SPDX-License-Identifier: GPL-2.0
>> + *
>> + * tcp_ddp.h
>> + * Author: Boris Pismenny <borisp@mellanox.com>
>> + * Copyright (C) 2021 Mellanox Technologies.
>> + */
>> +#ifndef _TCP_DDP_H
>> +#define _TCP_DDP_H
>> +
>> +#include <linux/netdevice.h>
>> +#include <net/inet_connection_sock.h>
>> +#include <net/sock.h>
>> +
>> +/* limits returned by the offload driver, zero means don't care */
>> +struct tcp_ddp_limits {
>> + int max_ddp_sgl_len;
>> +};
>> +
>> +enum tcp_ddp_type {
>> + TCP_DDP_NVME = 1,
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_config - Generic tcp ddp configuration: tcp ddp IO queue
>> + * config implementations must use this as the first member.
>> + * Add new instances of tcp_ddp_config below (nvme-tcp, etc.).
>> + */
>> +struct tcp_ddp_config {
>> + enum tcp_ddp_type type;
>> + unsigned char buf[];
>> +};
>> +
>> +/**
>> + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
>> + *
>> + * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0)
>> + * @cpda: controller pdu data alignment (dwords, 0's based)
>> + * @dgst: digest types enabled.
>> + * The netdev will offload crc if ddp_crc is supported.
>> + * @queue_size: number of nvme-tcp IO queue elements
>> + * @queue_id: queue identifier
>> + * @io_cpu: cpu core running the IO thread for this queue
>> + */
>> +struct nvme_tcp_ddp_config {
>> + struct tcp_ddp_config cfg;
>> +
>> + u16 pfv;
>> + u8 cpda;
>> + u8 dgst;
>> + int queue_size;
>> + int queue_id;
>> + int io_cpu;
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_io - tcp ddp configuration for an IO request.
>> + *
>> + * @command_id: identifier on the wire associated with these buffers
>> + * @nents: number of entries in the sg_table
>> + * @sg_table: describing the buffers for this IO request
>> + * @first_sgl: first SGL in sg_table
>> + */
>> +struct tcp_ddp_io {
>> + u32 command_id;
>> + int nents;
>> + struct sg_table sg_table;
>> + struct scatterlist first_sgl[SG_CHUNK_SIZE];
>> +};
>> +
>> +/* struct tcp_ddp_dev_ops - operations used by an upper layer protocol to configure ddp offload
>> + *
>> + * @tcp_ddp_limits: limit the number of scatter gather entries per IO.
>> + * the device driver can use this to limit the resources allocated per queue.
>> + * @tcp_ddp_sk_add: add offload for the queue represented by the socket+config pair.
>> + * this function is used to configure either copy, crc or both offloads.
>> + * @tcp_ddp_sk_del: remove offload from the socket, and release any device related resources.
>> + * @tcp_ddp_setup: request copy offload for buffers associated with a command_id in tcp_ddp_io.
>> + * @tcp_ddp_teardown: release offload resources association between buffers and command_id in
>> + * tcp_ddp_io.
>> + * @tcp_ddp_resync: respond to the driver's resync_request. Called only if resync is successful.
>> + */
>> +struct tcp_ddp_dev_ops {
>> + int (*tcp_ddp_limits)(struct net_device *netdev,
>> + struct tcp_ddp_limits *limits);
>> + int (*tcp_ddp_sk_add)(struct net_device *netdev,
>> + struct sock *sk,
>> + struct tcp_ddp_config *config);
>> + void (*tcp_ddp_sk_del)(struct net_device *netdev,
>> + struct sock *sk);
>> + int (*tcp_ddp_setup)(struct net_device *netdev,
>> + struct sock *sk,
>> + struct tcp_ddp_io *io);
>> + int (*tcp_ddp_teardown)(struct net_device *netdev,
>> + struct sock *sk,
>> + struct tcp_ddp_io *io,
>> + void *ddp_ctx);
>> + void (*tcp_ddp_resync)(struct net_device *netdev,
>> + struct sock *sk, u32 seq);
>> +};
>> +
>> +#define TCP_DDP_RESYNC_REQ BIT(0)
>> +
>> +/**
>> + * struct tcp_ddp_ulp_ops - Interface to register upper layer Direct Data Placement (DDP) TCP offload
>> + */
>> +struct tcp_ddp_ulp_ops {
>> + /* NIC requests ulp to indicate if @seq is the start of a message */
>> + bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
>> + /* NIC driver informs the ulp that ddp teardown is done - used for async completions*/
>> + void (*ddp_teardown_done)(void *ddp_ctx);
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_ctx - Generic tcp ddp context: device driver per queue contexts must
>> + * use this as the first member.
>> + */
>> +struct tcp_ddp_ctx {
>> + enum tcp_ddp_type type;
>> + unsigned char buf[];
>> +};
>> +
>> +static inline struct tcp_ddp_ctx *tcp_ddp_get_ctx(const struct sock *sk)
>> +{
>> + struct inet_connection_sock *icsk = inet_csk(sk);
>> +
>> + return (__force struct tcp_ddp_ctx *)icsk->icsk_ulp_ddp_data;
>> +}
>> +
>> +static inline void tcp_ddp_set_ctx(struct sock *sk, void *ctx)
>> +{
>> + struct inet_connection_sock *icsk = inet_csk(sk);
>> +
>> + rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx);
>> +}
>> +
>> +#endif //_TCP_DDP_H
>> diff --git a/net/Kconfig b/net/Kconfig
>> index f4c32d982af6..3876861cdc90 100644
>> --- a/net/Kconfig
>> +++ b/net/Kconfig
>> @@ -457,6 +457,15 @@ config ETHTOOL_NETLINK
>> netlink. It provides better extensibility and some new features,
>> e.g. notification messages.
>>
>> +config TCP_DDP
>> + bool "TCP direct data placement offload"
>> + default n
>> + help
>> + Direct Data Placement (DDP) offload for TCP enables ULP, such as
>> + NVMe-TCP/iSCSI, to request the NIC to place TCP payload data
>> + of a command response directly into kernel pages.
>> +
>> +
>> endif # if NET
>>
>> # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f62cae3f75d8..791c1b6bc067 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -69,6 +69,7 @@
>> #include <net/xfrm.h>
>> #include <net/mpls.h>
>> #include <net/mptcp.h>
>> +#include <net/tcp_ddp.h>
>>
>> #include <linux/uaccess.h>
>> #include <trace/events/skb.h>
>> @@ -6140,9 +6141,15 @@ EXPORT_SYMBOL(pskb_extract);
>> */
>> void skb_condense(struct sk_buff *skb)
>> {
>> + bool is_ddp = false;
>> +
>> +#ifdef CONFIG_TCP_DDP
>
> This looks strange to me : TCP should call this helper while skb->sk is NULL
>
> Are you sure this is not dead code ?
>
Will verify again on Sunday. AFAICT, early demux sets skb->sk before this code
is called. Just to clarify, the purpose of this code is to avoid skb condensing
data that is already placed into destination buffers.
>> + is_ddp = skb->sk && inet_csk(skb->sk) &&
>> + inet_csk(skb->sk)->icsk_ulp_ddp_data;
>> +#endif
>> if (skb->data_len) {
>> if (skb->data_len > skb->end - skb->tail ||
>> - skb_cloned(skb))
>> + skb_cloned(skb) || is_ddp)
>> return;
>>
>> /* Nice, we can free page frag(s) right now */
>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
>> index 24036e3055a1..a2ff7a4a6bbf 100644
>> --- a/net/ethtool/common.c
>> +++ b/net/ethtool/common.c
>> @@ -68,6 +68,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
>> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
>> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
>> + [NETIF_F_HW_TCP_DDP_BIT] = "tcp-ddp-offload",
>> };
>>
>> const char
>> --
>> 2.24.1
>>
next prev parent reply other threads:[~2021-01-14 20:20 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-01-14 15:10 [PATCH v2 net-next 00/21] nvme-tcp receive offloads Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 01/21] iov_iter: Introduce new procedures for copy to iter/pages Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 02/21] net: Introduce direct data placement tcp offload Boris Pismenny
2021-01-14 15:57 ` Eric Dumazet
2021-01-14 20:19 ` Boris Pismenny [this message]
2021-01-14 20:43 ` Eric Dumazet
2021-01-31 10:40 ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 03/21] net: Introduce crc offload for tcp ddp ulp Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 04/21] net: SKB copy(+hash) iterators for DDP offloads Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 05/21] net/tls: expose get_netdev_for_sock Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 06/21] nvme-tcp: Add DDP offload control path Boris Pismenny
2021-01-19 3:47 ` David Ahern
2021-01-31 7:51 ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 07/21] nvme-tcp: Add DDP data-path Boris Pismenny
2021-01-19 4:18 ` David Ahern
2021-01-31 8:44 ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 08/21] nvme-tcp : Recalculate crc in the end of the capsule Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 09/21] nvme-tcp: Deal with netdevice DOWN events Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 10/21] net/mlx5: Header file changes for nvme-tcp offload Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 11/21] net/mlx5: Add 128B CQE for NVMEoTCP offload Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 12/21] net/mlx5e: TCP flow steering for nvme-tcp Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 13/21] net/mlx5e: NVMEoTCP offload initialization Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 14/21] net/mlx5e: KLM UMR helper macros Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 15/21] net/mlx5e: NVMEoTCP use KLM UMRs Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 16/21] net/mlx5e: NVMEoTCP queue init/teardown Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 17/21] net/mlx5e: NVMEoTCP async ddp invalidation Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 18/21] net/mlx5e: NVMEoTCP ddp setup and resync Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 19/21] net/mlx5e: NVMEoTCP, data-path for DDP offload Boris Pismenny
2021-01-16 4:57 ` David Ahern
2021-01-17 8:42 ` Boris Pismenny
2021-01-19 4:36 ` David Ahern
2021-01-31 9:27 ` Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 20/21] net/mlx5e: NVMEoTCP statistics Boris Pismenny
2021-01-14 15:10 ` [PATCH v2 net-next 21/21] Documentation: add TCP DDP offload documentation Boris Pismenny
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=62d4606a-0a41-2b12-cf16-3523d0b73573@gmail.com \
--to=borispismenny@gmail.com \
--cc=axboe@fb.com \
--cc=benishay@mellanox.com \
--cc=benishay@nvidia.com \
--cc=boris.pismenny@gmail.com \
--cc=borisp@mellanox.com \
--cc=davem@davemloft.net \
--cc=dsahern@gmail.com \
--cc=edumazet@google.com \
--cc=hch@lst.de \
--cc=kbusch@kernel.org \
--cc=kuba@kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=netdev@vger.kernel.org \
--cc=ogerlitz@mellanox.com \
--cc=ogerlitz@nvidia.com \
--cc=saeedm@nvidia.com \
--cc=sagi@grimberg.me \
--cc=smalin@marvell.com \
--cc=viro@zeniv.linux.org.uk \
--cc=yorayz@mellanox.com \
--cc=yorayz@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).