From: Johannes Lundberg <jlundberg@llnw.com>
To: linux-kernel@vger.kernel.org
Cc: Johannes Lundberg <jlundberg@llnw.com>,
"David S. Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>,
Eric Dumazet <edumazet@google.com>,
Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>,
David Ahern <dsahern@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Florian Westphal <fw@strlen.de>,
Alexander Aring <aahringo@redhat.com>,
Tonghao Zhang <xiangxia.m.yue@gmail.com>,
Yangbo Lu <yangbo.lu@nxp.com>,
Thomas Gleixner <tglx@linutronix.de>,
netdev@vger.kernel.org
Subject: [PATCH] fs: eventpoll: add empty event
Date: Mon, 27 Sep 2021 13:29:17 -0700 [thread overview]
Message-ID: <20210927202923.7360-1-jlundberg@llnw.com> (raw)
The EPOLLEMPTY event will trigger when the TCP write buffer becomes
empty, i.e., when all outgoing data have been ACKed.
The need for this functionality comes from a business requirement
of measuring with higher precision how much time is spent
transmitting data to a client. For reference, similar functionality
was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.
Signed-off-by: Johannes Lundberg <jlundberg@llnw.com>
---
include/net/sock.h | 11 +++++++++++
include/uapi/linux/eventpoll.h | 1 +
net/core/sock.c | 5 +++++
net/core/stream.c | 14 ++++++++++++++
net/ipv4/tcp.c | 5 +++++
5 files changed, 36 insertions(+)
diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750e8..9047a9e225a9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -516,6 +516,7 @@ struct sock {
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk);
void (*sk_write_space)(struct sock *sk);
+ void (*sk_empty)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
@@ -965,6 +966,7 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val)
WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}
+void sk_stream_empty(struct sock *sk);
void sk_stream_write_space(struct sock *sk);
/* OOB backlog add */
@@ -1288,6 +1290,11 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
+static inline bool sk_stream_is_empty(const struct sock *sk)
+{
+	/* True when nothing is queued; pairs with WRITE_ONCE() in sk_wmem_queued_add(). */
+	return READ_ONCE(sk->sk_wmem_queued) == 0;
+
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1559,6 +1566,10 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk_wmem_queued_add(sk, -skb->truesize);
+
+ if (sk_stream_is_empty(sk))
+ sk->sk_empty(sk);
+
sk_mem_uncharge(sk, skb->truesize);
if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
!sk->sk_tx_skb_cache && !skb_cloned(skb)) {
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..aab9f1f624d0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -39,6 +39,7 @@
#define EPOLLWRNORM (__force __poll_t)0x00000100
#define EPOLLWRBAND (__force __poll_t)0x00000200
#define EPOLLMSG (__force __poll_t)0x00000400
+#define EPOLLEMPTY (__force __poll_t)0x00000800
#define EPOLLRDHUP (__force __poll_t)0x00002000
/* Set exclusive wakeup mode for the target file descriptor */
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f9780..f917791d8149 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3062,6 +3062,10 @@ static void sock_def_write_space(struct sock *sk)
rcu_read_unlock();
}
+static void sock_def_empty(struct sock *sk)
+{	/* No-op default for sk->sk_empty; installed by sock_init_data(). */
+}
+
static void sock_def_destruct(struct sock *sk)
{
}
@@ -3136,6 +3140,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
+ sk->sk_empty = sock_def_empty;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..c7e4135542a2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -21,6 +21,20 @@
#include <linux/wait.h>
#include <net/sock.h>
+void sk_stream_empty(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	/* Wake EPOLLEMPTY pollers only if nothing is queued and a socket is attached. */
+	if (!sk->sk_socket || !sk_stream_is_empty(sk))
+		return;
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (skwq_has_sleeper(wq))
+		wake_up_interruptible_poll(&wq->wait, EPOLLEMPTY);
+	rcu_read_unlock();
+}
+
/**
* sk_stream_write_space - stream socket write_space callback.
* @sk: socket
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..550bae79af06 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
+ sk->sk_empty = sk_stream_empty;
+
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
@@ -561,6 +563,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;
+ if (sk_stream_is_empty(sk))
+ mask |= EPOLLEMPTY;
+
if (tcp_stream_is_readable(sk, target))
mask |= EPOLLIN | EPOLLRDNORM;
--
2.17.1
next reply other threads:[~2021-09-27 20:30 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-27 20:29 Johannes Lundberg [this message]
2021-09-27 20:47 ` [PATCH] fs: eventpoll: add empty event Eric Dumazet
2021-09-27 21:17 ` Johannes Lundberg
2021-09-27 21:33 ` Eric Dumazet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210927202923.7360-1-jlundberg@llnw.com \
--to=jlundberg@llnw.com \
--cc=aahringo@redhat.com \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=fw@strlen.de \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=tglx@linutronix.de \
--cc=xiangxia.m.yue@gmail.com \
--cc=yangbo.lu@nxp.com \
--cc=yoshfuji@linux-ipv6.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).