netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Willem de Bruijn <willemb@google.com>
To: netdev@vger.kernel.org
Cc: mst@redhat.com, jasowang@redhat.com,
	Willem de Bruijn <willemb@google.com>
Subject: [PATCH net-next RFC 04/10] sock: sendmsg zerocopy notification coalescing
Date: Thu, 20 Aug 2015 10:36:43 -0400	[thread overview]
Message-ID: <1440081408-12302-5-git-send-email-willemb@google.com> (raw)
In-Reply-To: <1440081408-12302-1-git-send-email-willemb@google.com>

From: Willem de Bruijn <willemb@google.com>

Support coalescing of zerocopy notifications.

In the simple case, each sendmsg() call generates data and eventually
a zerocopy ready notification N, where N indicates the Nth successful
invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket.

TCP and corked sockets can cause sendmsg() calls to append to a single
sk_buff and ubuf_info. Modify the notification path to return an
inclusive range of notifications [N..N+m].

Add sock_zerocopy_realloc() to reuse ubuf_info across sendmsg() calls.

Additionally, revise sock_zerocopy_callback() to coalesce consecutive
notifications: if an skb_uarg [1, 1] is freed while [0, 0] is on the
notification queue, modify the head of the queue to read [0, 1] and
drop the second separate notification.

For the case of reliable ordered transmission (TCP), only the upper
value of the range needs to be read, as the lower value is guaranteed
to be 1 above the last read notification.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/skbuff.h | 11 ++++++-
 net/core/skbuff.c      | 83 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 87 insertions(+), 7 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3372f1c..99de112 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -323,13 +323,21 @@ enum {
 struct ubuf_info {
 	void (*callback)(struct ubuf_info *, bool zerocopy_success);
 	void *ctx;
-	unsigned long desc;
+	union {
+		unsigned long desc;
+		struct {
+			u16 id;
+			u16 len;
+		};
+	};
 	atomic_t refcnt;
 };
 
 #define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
 
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+					struct ubuf_info *uarg);
 
 static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 {
@@ -337,6 +345,7 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 }
 
 void sock_zerocopy_put(struct ubuf_info *uarg);
+void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6ee7282..4ae60ee 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -854,7 +854,8 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg = (void *)skb->cb;
 
 	uarg->callback = sock_zerocopy_callback;
-	uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
+	uarg->id = ((u16)atomic_inc_return(&sk->sk_zckey)) - 1;
+	uarg->len = 1;
 	atomic_set(&uarg->refcnt, 0);
 
 	return uarg;
@@ -863,20 +864,79 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
 
 #define skb_from_uarg(skb)	container_of((void *)uarg, struct sk_buff, cb)
 
+struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
+					struct ubuf_info *uarg)
+{
+	if (uarg) {
+		u16 next;
+
+		/* realloc only when socket is locked (TCP, UDP cork),
+		 * so uarg->len and sk_zckey access is serialized
+		 */
+		BUG_ON(!sock_owned_by_user(sk));
+
+		if (unlikely(uarg->len == USHRT_MAX - 1))
+			return NULL;
+
+		next = atomic_read(&sk->sk_zckey);
+		if ((u16)(uarg->id + uarg->len) == next) {
+			uarg->len++;
+			atomic_set(&sk->sk_zckey, ++next);
+			return uarg;
+		}
+	}
+
+	return sock_zerocopy_alloc(sk, size);
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+
+static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u16 lo, u16 len)
+{
+	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	long sum_len;
+	u16 old_lo, old_hi;
+
+	old_lo = serr->ee.ee_data & 0xFFFF;
+	old_hi = serr->ee.ee_data >> 16;
+	sum_len = old_hi - old_lo + 1 + len;
+	if (old_hi < old_lo)
+		sum_len += (1 << 16);
+
+	if (lo != old_hi + 1 || sum_len >= (1 << 16))
+		return false;
+
+	old_hi += len;
+	serr->ee.ee_data = (old_hi << 16) | old_lo;
+	return true;
+}
+
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
 {
 	struct sock_exterr_skb *serr;
-	struct sk_buff *skb = skb_from_uarg(skb);
+	struct sk_buff *head, *skb = skb_from_uarg(skb);
 	struct sock *sk = skb->sk;
-	u16 id = uarg->desc;
+	struct sk_buff_head *q = &sk->sk_error_queue;
+	unsigned long flags;
+	u16 len, lo, hi;
+
+	len = uarg->len;
+	lo = uarg->id;
+	hi = uarg->id + len - 1;
 
 	serr = SKB_EXT_ERR(skb);
 	memset(serr, 0, sizeof(*serr));
 	serr->ee.ee_errno = 0;
 	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
-	serr->ee.ee_data = id;
+	serr->ee.ee_data = (hi << 16) | lo;
 
-	skb_queue_tail(&sk->sk_error_queue, skb);
+	spin_lock_irqsave(&q->lock, flags);
+	head = skb_peek(q);
+	if (!head || !skb_zerocopy_notify_extend(head, lo, len)) {
+		__skb_queue_tail(q, skb);
+		skb = NULL;
+	}
+	spin_unlock_irqrestore(&q->lock, flags);
+	consume_skb(skb);
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_error_report(sk);
@@ -886,7 +946,8 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
 void sock_zerocopy_put(struct ubuf_info *uarg)
 {
 	if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
-		if (uarg->callback)
+		/* if !len, there was only 1 call, and it was aborted */
+		if (uarg->callback && uarg->len)
 			uarg->callback(uarg, true);
 		else
 			consume_skb(skb_from_uarg(uarg));
@@ -894,6 +955,16 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_put);
 
+/* only called when sendmsg returns with error; no notification for this call */
+void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+{
+	if (uarg) {
+		uarg->len--;
+		sock_zerocopy_put(uarg);
+	}
+}
+EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+
 bool skb_zerocopy_alloc(struct sk_buff *skb, size_t size)
 {
 	struct ubuf_info *uarg;
-- 
2.5.0.276.gf5e568e

  parent reply	other threads:[~2015-08-20 14:36 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-08-20 14:36 [PATCH net-next RFC 00/10] socket sendmsg MSG_ZEROCOPY Willem de Bruijn
2015-08-20 14:36 ` [PATCH net-next RFC 01/10] sock: skb_copy_ubufs support for compound pages Willem de Bruijn
2015-08-20 14:36 ` [PATCH net-next RFC 02/10] sock: add sendmsg zerocopy Willem de Bruijn
2015-08-20 14:36 ` [PATCH net-next RFC 03/10] sock: enable " Willem de Bruijn
2015-08-20 14:36 ` Willem de Bruijn [this message]
2015-08-20 14:36 ` [PATCH net-next RFC 05/10] tcp: " Willem de Bruijn
2015-08-20 14:36 ` [PATCH net-next RFC 06/10] udp: " Willem de Bruijn
2015-08-20 14:36 ` [PATCH net-next RFC 07/10] raw: enable sendmsg zerocopy with hdrincl Willem de Bruijn
2015-08-20 14:36 ` [PATCH net-next RFC 08/10] packet: enable sendmsg zerocopy Willem de Bruijn
2015-08-20 14:36 ` [PATCH net-next RFC 09/10] sock: sendmsg zerocopy ulimit Willem de Bruijn
2015-08-20 22:56 ` [PATCH net-next RFC 00/10] socket sendmsg MSG_ZEROCOPY David Miller
2015-08-21  2:49   ` Willem de Bruijn
2015-08-21  5:17     ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1440081408-12302-5-git-send-email-willemb@google.com \
    --to=willemb@google.com \
    --cc=jasowang@redhat.com \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).