From: Arjun Roy <arjunroy.kdev@gmail.com>
To: davem@davemloft.net, netdev@vger.kernel.org
Cc: arjunroy@google.com, edumazet@google.com, soheil@google.com
Subject: [net-next v2 1/8] net-zerocopy: Copy straggler unaligned data for TCP Rx. zerocopy.
Date: Wed, 2 Dec 2020 14:09:38 -0800
Message-ID: <20201202220945.911116-2-arjunroy.kdev@gmail.com>
In-Reply-To: <20201202220945.911116-1-arjunroy.kdev@gmail.com>
From: Arjun Roy <arjunroy@google.com>
When TCP receive zerocopy does not successfully map the entire
requested space, it reports the leftover byte count as a 'hint'
(recv_skip_hint) that the caller should then recvmsg().
Augment zerocopy to accept a user-supplied buffer that it tries to
copy this remainder into; if the entire remainder fits, it is copied.
This elides a recvmsg() call for received traffic whose size is not
an exact multiple of the page size.
This was tested with RPC-style traffic of arbitrary sizes. Previously,
each received message required at least one getsockopt() call (for the
zerocopy receive itself) and one recvmsg() call for the remaining
unaligned data.
With this change, almost all of the recvmsg() calls are eliminated,
yielding a saving of roughly 25%-50% in the number of system calls
for RPC-style workloads.
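
For illustration, here is a rough sketch of how a caller might combine
the new copybuf fields with the existing getsockopt(TCP_ZEROCOPY_RECEIVE)
interface. The helper name, buffer sizes, and error handling are
placeholders, and it assumes uapi headers that already carry the new
fields:

  #include <string.h>
  #include <sys/types.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <linux/tcp.h>

  /*
   * 'map' must come from mmap(NULL, map_len, PROT_READ, MAP_SHARED, fd, 0)
   * on the same TCP socket, with map_len a multiple of the page size.
   */
  static ssize_t zc_read(int fd, void *map, size_t map_len,
                         void *copybuf, int copybuf_len)
  {
          struct tcp_zerocopy_receive zc;
          socklen_t zc_len = sizeof(zc);

          memset(&zc, 0, sizeof(zc));
          zc.address = (__u64)(unsigned long)map;
          zc.length = map_len;
          zc.copybuf_address = (__u64)(unsigned long)copybuf;
          zc.copybuf_len = copybuf_len;

          if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len))
                  return -1;

          /*
           * zc.length bytes are now readable at 'map' without copying;
           * zc.copybuf_len (if positive) trailing bytes were copied into
           * 'copybuf', so a follow-up recvmsg() is only needed while
           * zc.recv_skip_hint remains nonzero.
           */
          return (ssize_t)zc.length +
                 (zc.copybuf_len > 0 ? zc.copybuf_len : 0);
  }

As a concrete example, with 4 KB pages a 10,000-byte message would have
8,192 bytes mapped zero-copy and the trailing 1,808 bytes delivered via
the copy buffer, with no extra recvmsg().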
---
 include/uapi/linux/tcp.h |  2 +
 net/ipv4/tcp.c           | 84 ++++++++++++++++++++++++++++++++--------
 2 files changed, 70 insertions(+), 16 deletions(-)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index cfcb10b75483..62db78b9c1a0 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -349,5 +349,7 @@ struct tcp_zerocopy_receive {
__u32 recv_skip_hint; /* out: amount of bytes to skip */
__u32 inq; /* out: amount of bytes in read queue */
__s32 err; /* out: socket error */
+ __u64 copybuf_address; /* in: copybuf address (small reads) */
+ __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
};
#endif /* _UAPI_LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b2bc3d7fe9e8..887c6e986927 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1743,6 +1743,52 @@ int tcp_mmap(struct file *file, struct socket *sock,
}
EXPORT_SYMBOL(tcp_mmap);
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 copylen,
+ u32 *offset, u32 *seq)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct msghdr msg = {};
+ struct iovec iov;
+ int err;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_single_range(READ, (void __user *)copy_address,
+ copylen, &iov, &msg.msg_iter);
+ if (err)
+ return err;
+ err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+ if (err)
+ return err;
+ zc->recv_skip_hint -= copylen;
+ *offset += copylen;
+ *seq += copylen;
+ return (__s32)copylen;
+}
+
+static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
+ struct sock *sk,
+ struct sk_buff *skb,
+ u32 *seq,
+ s32 copybuf_len)
+{
+ u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+
+ if (!copylen)
+ return 0;
+ /* skb is null if inq < PAGE_SIZE. */
+ if (skb)
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ else
+ skb = tcp_recv_skb(sk, *seq, &offset);
+
+ zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+ seq);
+ return zc->copybuf_len < 0 ? 0 : copylen;
+}
+
static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
struct page **pages,
unsigned long pages_to_map,
@@ -1776,8 +1822,10 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc)
{
+ u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
unsigned long address = (unsigned long)zc->address;
- u32 length = 0, seq, offset, zap_len;
+ s32 copybuf_len = zc->copybuf_len;
+ struct tcp_sock *tp = tcp_sk(sk);
#define PAGE_BATCH_SIZE 8
struct page *pages[PAGE_BATCH_SIZE];
const skb_frag_t *frags = NULL;
@@ -1785,10 +1833,12 @@ static int tcp_zerocopy_receive(struct sock *sk,
struct sk_buff *skb = NULL;
unsigned long pg_idx = 0;
unsigned long curr_addr;
- struct tcp_sock *tp;
- int inq;
+ u32 seq = tp->copied_seq;
+ int inq = tcp_inq(sk);
int ret;
+ zc->copybuf_len = 0;
+
if (address & (PAGE_SIZE - 1) || address != zc->address)
return -EINVAL;
@@ -1797,8 +1847,6 @@ static int tcp_zerocopy_receive(struct sock *sk,
sock_rps_record_flow(sk);
- tp = tcp_sk(sk);
-
mmap_read_lock(current->mm);
vma = find_vma(current->mm, address);
@@ -1806,17 +1854,16 @@ static int tcp_zerocopy_receive(struct sock *sk,
mmap_read_unlock(current->mm);
return -EINVAL;
}
- zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
-
- seq = tp->copied_seq;
- inq = tcp_inq(sk);
- zc->length = min_t(u32, zc->length, inq);
- zap_len = zc->length & ~(PAGE_SIZE - 1);
- if (zap_len) {
- zap_page_range(vma, address, zap_len);
+ vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+ avail_len = min_t(u32, vma_len, inq);
+ aligned_len = avail_len & ~(PAGE_SIZE - 1);
+ if (aligned_len) {
+ zap_page_range(vma, address, aligned_len);
+ zc->length = aligned_len;
zc->recv_skip_hint = 0;
} else {
- zc->recv_skip_hint = zc->length;
+ zc->length = avail_len;
+ zc->recv_skip_hint = avail_len;
}
ret = 0;
curr_addr = address;
@@ -1885,13 +1932,18 @@ static int tcp_zerocopy_receive(struct sock *sk,
}
out:
mmap_read_unlock(current->mm);
- if (length) {
+ /* Try to copy straggler data. */
+ if (!ret)
+ copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
+ copybuf_len);
+
+ if (length + copylen) {
WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
tcp_recv_skb(sk, seq, &offset);
- tcp_cleanup_rbuf(sk, length);
+ tcp_cleanup_rbuf(sk, length + copylen);
ret = 0;
if (length == zc->length)
zc->recv_skip_hint = 0;
--
2.29.2.576.ga3fc446d84-goog
Thread overview: 18+ messages
2020-12-02 22:09 [net-next v2 0/8] Perf. optimizations for TCP Recv. Zerocopy Arjun Roy
2020-12-02 22:09 ` Arjun Roy [this message]
2020-12-03 0:15 ` [net-next v2 1/8] net-zerocopy: Copy straggler unaligned data for TCP Rx. zerocopy Stephen Hemminger
2020-12-03 0:24 ` Arjun Roy
2020-12-03 23:01 ` David Laight
2020-12-03 23:14 ` Eric Dumazet
2020-12-04 9:02 ` David Laight
2020-12-03 23:19 ` Arjun Roy
2020-12-03 23:24 ` Arjun Roy
2020-12-04 9:03 ` David Laight
2020-12-04 22:37 ` Arjun Roy
2020-12-02 22:09 ` [net-next v2 2/8] net-tcp: Introduce tcp_recvmsg_locked() Arjun Roy
2020-12-02 22:09 ` [net-next v2 3/8] net-zerocopy: Refactor skb frag fast-forward op Arjun Roy
2020-12-02 22:09 ` [net-next v2 4/8] net-zerocopy: Refactor frag-is-remappable test Arjun Roy
2020-12-02 22:09 ` [net-next v2 5/8] net-zerocopy: Fast return if inq < PAGE_SIZE Arjun Roy
2020-12-02 22:09 ` [net-next v2 6/8] net-zerocopy: Introduce short-circuit small reads Arjun Roy
2020-12-02 22:09 ` [net-next v2 7/8] net-zerocopy: Set zerocopy hint when data is copied Arjun Roy
2020-12-02 22:09 ` [net-next v2 8/8] net-zerocopy: Defer vm zap unless actually needed Arjun Roy