netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] fs: eventpoll: add empty event
@ 2021-09-27 20:29 Johannes Lundberg
  2021-09-27 20:47 ` Eric Dumazet
  0 siblings, 1 reply; 4+ messages in thread
From: Johannes Lundberg @ 2021-09-27 20:29 UTC (permalink / raw)
  To: linux-kernel
  Cc: Johannes Lundberg, David S. Miller, Jakub Kicinski, Eric Dumazet,
	Hideaki YOSHIFUJI, David Ahern, Paolo Abeni, Florian Westphal,
	Alexander Aring, Tonghao Zhang, Yangbo Lu, Thomas Gleixner,
	netdev

The EPOLLEMPTY event will trigger when the TCP write buffer becomes
empty, i.e., when all outgoing data have been ACKed.

The need for this functionality comes from a business requirement
of measuring with higher precision how much time is spent
transmitting data to a client. For reference, similar functionality
was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.

Signed-off-by: Johannes Lundberg <jlundberg@llnw.com>
---
 include/net/sock.h             | 11 +++++++++++
 include/uapi/linux/eventpoll.h |  1 +
 net/core/sock.c                |  5 +++++
 net/core/stream.c              | 14 ++++++++++++++
 net/ipv4/tcp.c                 |  5 +++++
 5 files changed, 36 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750e8..9047a9e225a9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -516,6 +516,7 @@ struct sock {
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk);
 	void			(*sk_write_space)(struct sock *sk);
+	void			(*sk_empty)(struct sock *sk);
 	void			(*sk_error_report)(struct sock *sk);
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
@@ -965,6 +966,7 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val)
 	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
 }
 
+void sk_stream_empty(struct sock *sk);
 void sk_stream_write_space(struct sock *sk);
 
 /* OOB backlog add */
@@ -1288,6 +1290,11 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 
 INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
 
+static inline bool sk_stream_is_empty(const struct sock *sk)
+{
+	return (sk->sk_wmem_queued == 0);
+}
+
 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
 {
 	if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1559,6 +1566,10 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk_wmem_queued_add(sk, -skb->truesize);
+
+	if (sk_stream_is_empty(sk))
+		sk->sk_empty(sk);
+
 	sk_mem_uncharge(sk, skb->truesize);
 	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
 	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..aab9f1f624d0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -39,6 +39,7 @@
 #define EPOLLWRNORM	(__force __poll_t)0x00000100
 #define EPOLLWRBAND	(__force __poll_t)0x00000200
 #define EPOLLMSG	(__force __poll_t)0x00000400
+#define EPOLLEMPTY	(__force __poll_t)0x00000800
 #define EPOLLRDHUP	(__force __poll_t)0x00002000
 
 /* Set exclusive wakeup mode for the target file descriptor */
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f9780..f917791d8149 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3062,6 +3062,10 @@ static void sock_def_write_space(struct sock *sk)
 	rcu_read_unlock();
 }
 
+static void sock_def_empty(struct sock *sk)
+{
+}
+
 static void sock_def_destruct(struct sock *sk)
 {
 }
@@ -3136,6 +3140,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state_change	=	sock_def_wakeup;
 	sk->sk_data_ready	=	sock_def_readable;
 	sk->sk_write_space	=	sock_def_write_space;
+	sk->sk_empty		=	sock_def_empty;
 	sk->sk_error_report	=	sock_def_error_report;
 	sk->sk_destruct		=	sock_def_destruct;
 
diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..c7e4135542a2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -21,6 +21,20 @@
 #include <linux/wait.h>
 #include <net/sock.h>
 
+void sk_stream_empty(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+	struct socket_wq *wq;
+
+	if (sk_stream_is_empty(sk) && sock) {
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (skwq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait, EPOLLEMPTY);
+		rcu_read_unlock();
+	}
+}
+
 /**
  * sk_stream_write_space - stream socket write_space callback.
  * @sk: socket
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..550bae79af06 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
 	tp->tsoffset = 0;
 	tp->rack.reo_wnd_steps = 1;
 
+	sk->sk_empty = sk_stream_empty;
+
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
@@ -561,6 +563,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		    tp->urg_data)
 			target++;
 
+		if (sk_stream_is_empty(sk))
+			mask |= EPOLLEMPTY;
+
 		if (tcp_stream_is_readable(sk, target))
 			mask |= EPOLLIN | EPOLLRDNORM;
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-09-27 21:34 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-09-27 20:29 [PATCH] fs: eventpoll: add empty event Johannes Lundberg
2021-09-27 20:47 ` Eric Dumazet
2021-09-27 21:17   ` Johannes Lundberg
2021-09-27 21:33     ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).