[PATCH net-next 0/3] udp: refactor memory accounting

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net-next 0/3] udp: refactor memory accounting
@ 2016-09-21 17:23 Paolo Abeni
  2016-09-21 17:23 ` [PATCH net-next 1/3] net/socket: factor out helpers for memory and queue manipulation Paolo Abeni
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Paolo Abeni @ 2016-09-21 17:23 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: David S. Miller, James Morris, Trond Myklebust, Alexander Duyck,
	Daniel Borkmann, Eric Dumazet, Tom Herbert, Hannes Frederic Sowa,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

This patch series refactors the udp memory accounting, replacing the
generic implementation with a custom one, in order to remove the need for
locking the socket on the enqueue and dequeue operations. The socket backlog
usage is dropped, as well.

The first patch factors out core pieces of some queue and memory management
socket helpers, so that they can later be used by the udp memory accounting
functions.
The second patch adds the memory account helpers, without using them.
The third patch replaces the old rx memory accounting path for udp over ipv4 and
udp over ipv6. In-kernel UDP users are updated, as well.

The memory accounting schema is described in detail in the individual patch
commit message.

The performance gain depends on the specific scenario; with few flows (and
little contention in the original code) the differences are in the noise range,
while with several flows contending the same socket, the measured speed-up
is relevant (e.g. even over 100% in case of extreme contention)

Paolo Abeni (3):
  net/socket: factor out helpers for memory and queue manipulation
  udp: implement memory accounting helpers
  udp: use it's own memory accounting schema

 include/linux/skbuff.h |   2 +-
 include/linux/udp.h    |   2 +
 include/net/sock.h     |   5 ++
 include/net/udp.h      |   5 ++
 net/core/datagram.c    |  36 ++++++----
 net/core/skbuff.c      |   3 +-
 net/core/sock.c        |  96 ++++++++++++++++---------
 net/ipv4/udp.c         | 190 +++++++++++++++++++++++++++++++++++++++++--------
 net/ipv6/udp.c         |  28 +++-----
 net/sunrpc/svcsock.c   |  22 ++++--
 net/sunrpc/xprtsock.c  |   2 +-
 11 files changed, 290 insertions(+), 101 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH net-next 1/3] net/socket: factor out helpers for memory and queue manipulation
  2016-09-21 17:23 [PATCH net-next 0/3] udp: refactor memory accounting Paolo Abeni
@ 2016-09-21 17:23 ` Paolo Abeni
       [not found] ` <cover.1474477902.git.pabeni-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2016-09-21 17:23 ` [PATCH net-next 3/3] udp: use it's own memory accounting schema Paolo Abeni
  2 siblings, 0 replies; 13+ messages in thread
From: Paolo Abeni @ 2016-09-21 17:23 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, James Morris, Trond Myklebust, Alexander Duyck,
	Daniel Borkmann, Eric Dumazet, Tom Herbert, Hannes Frederic Sowa,
	linux-nfs

Factor out basic sock operations that udp code can use with its own
memory accounting schema. No functional change is introduced
in the existing APIs.

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h |  2 +-
 include/net/sock.h     |  5 +++
 net/core/datagram.c    | 36 +++++++++++--------
 net/core/skbuff.c      |  3 +-
 net/core/sock.c        | 96 +++++++++++++++++++++++++++++++++-----------------
 5 files changed, 94 insertions(+), 48 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cfb7219..49c489d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3016,7 +3016,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
 #define skb_walk_frags(skb, iter)	\
 	for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
 
-
+void sock_rmem_free(struct sk_buff *skb);
 int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 				const struct sk_buff *skb);
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
diff --git a/include/net/sock.h b/include/net/sock.h
index c797c57..a37362c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1274,7 +1274,9 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
 /*
  * Functions for memory accounting
  */
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
 int __sk_mem_schedule(struct sock *sk, int size, int kind);
+void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
 #define SK_MEM_QUANTUM ((int)PAGE_SIZE)
@@ -1940,6 +1942,9 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+			unsigned int flags);
+void __sock_enqueue_skb(struct sock *sk, struct sk_buff *skb);
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7de71f..bfb973a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -323,6 +323,27 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 }
 EXPORT_SYMBOL(__skb_free_datagram_locked);
 
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+			unsigned int flags)
+{
+	int err = 0;
+
+	if (flags & MSG_PEEK) {
+		err = -ENOENT;
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		if (skb == skb_peek(&sk->sk_receive_queue)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			atomic_dec(&skb->users);
+			err = 0;
+		}
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+	}
+
+	atomic_inc(&sk->sk_drops);
+	return err;
+}
+EXPORT_SYMBOL(__sk_queue_drop_skb);
+
 /**
  *	skb_kill_datagram - Free a datagram skbuff forcibly
  *	@sk: socket
@@ -346,23 +367,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked);
 
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 {
-	int err = 0;
-
-	if (flags & MSG_PEEK) {
-		err = -ENOENT;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (skb == skb_peek(&sk->sk_receive_queue)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			atomic_dec(&skb->users);
-			err = 0;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-	}
+	int err = __sk_queue_drop_skb(sk, skb, flags);
 
 	kfree_skb(skb);
-	atomic_inc(&sk->sk_drops);
 	sk_mem_reclaim_partial(sk);
-
 	return err;
 }
 EXPORT_SYMBOL(skb_kill_datagram);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b6..4dce605 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3657,12 +3657,13 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 }
 EXPORT_SYMBOL_GPL(skb_cow_data);
 
-static void sock_rmem_free(struct sk_buff *skb)
+void sock_rmem_free(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 }
+EXPORT_SYMBOL_GPL(sock_rmem_free);
 
 /*
  * Note: We dont mem charge error packets (no sk_forward_alloc changes)
diff --git a/net/core/sock.c b/net/core/sock.c
index 51a7304..752308d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -405,24 +405,12 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 }
 
 
-int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+void __sock_enqueue_skb(struct sock *sk, struct sk_buff *skb)
 {
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 
-	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
-		atomic_inc(&sk->sk_drops);
-		trace_sock_rcvqueue_full(sk, skb);
-		return -ENOMEM;
-	}
-
-	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
-		atomic_inc(&sk->sk_drops);
-		return -ENOBUFS;
-	}
-
 	skb->dev = NULL;
-	skb_set_owner_r(skb, sk);
 
 	/* we escape from rcu protected region, make sure we dont leak
 	 * a norefcounted dst
@@ -436,6 +424,24 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk);
+}
+EXPORT_SYMBOL(__sock_enqueue_skb);
+
+int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
+		atomic_inc(&sk->sk_drops);
+		trace_sock_rcvqueue_full(sk, skb);
+		return -ENOMEM;
+	}
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+		atomic_inc(&sk->sk_drops);
+		return -ENOBUFS;
+	}
+
+	skb_set_owner_r(skb, sk);
+	__sock_enqueue_skb(sk, skb);
 	return 0;
 }
 EXPORT_SYMBOL(__sock_queue_rcv_skb);
@@ -2088,24 +2094,18 @@ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
 EXPORT_SYMBOL(sk_wait_data);
 
 /**
- *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ *	__sk_mem_raise_allocated - increase memory_allocated
  *	@sk: socket
  *	@size: memory size to allocate
+ *	@amt: pages to allocate
  *	@kind: allocation type
  *
- *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
- *	rmem allocation. This function assumes that protocols which have
- *	memory_pressure use sk_wmem_queued as write buffer accounting.
+ *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
  */
-int __sk_mem_schedule(struct sock *sk, int size, int kind)
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
 	struct proto *prot = sk->sk_prot;
-	int amt = sk_mem_pages(size);
-	long allocated;
-
-	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-
-	allocated = sk_memory_allocated_add(sk, amt);
+	long allocated = sk_memory_allocated_add(sk, amt);
 
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
@@ -2166,9 +2166,6 @@ suppress_allocation:
 
 	trace_sock_exceed_buf_limit(sk, prot, allocated);
 
-	/* Alas. Undo changes. */
-	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-
 	sk_memory_allocated_sub(sk, amt);
 
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
@@ -2176,18 +2173,40 @@ suppress_allocation:
 
 	return 0;
 }
+EXPORT_SYMBOL(__sk_mem_raise_allocated);
+
+/**
+ *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ *	@sk: socket
+ *	@size: memory size to allocate
+ *	@kind: allocation type
+ *
+ *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ *	rmem allocation. This function assumes that protocols which have
+ *	memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+	int ret, amt = sk_mem_pages(size);
+
+	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
+	if (!ret)
+		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+	return ret;
+}
 EXPORT_SYMBOL(__sk_mem_schedule);
 
 /**
- *	__sk_mem_reclaim - reclaim memory_allocated
+ *	__sk_mem_reduce_allocated - reclaim memory_allocated
  *	@sk: socket
- *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ *	@amount: number of quanta
+ *
+ *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
  */
-void __sk_mem_reclaim(struct sock *sk, int amount)
+void __sk_mem_reduce_allocated(struct sock *sk, int amount)
 {
-	amount >>= SK_MEM_QUANTUM_SHIFT;
 	sk_memory_allocated_sub(sk, amount);
-	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
 
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
@@ -2196,6 +2215,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
 		sk_leave_memory_pressure(sk);
 }
+EXPORT_SYMBOL(__sk_mem_reduce_allocated);
+
+/**
+ *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
+ *	@sk: socket
+ *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ */
+void __sk_mem_reclaim(struct sock *sk, int amount)
+{
+	amount >>= SK_MEM_QUANTUM_SHIFT;
+	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+	__sk_mem_reduce_allocated(sk, amount);
+}
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
 int sk_set_peek_off(struct sock *sk, int val)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

[parent not found: <cover.1474477902.git.pabeni-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]

* [PATCH net-next 2/3] udp: implement memory accounting helpers
       [not found] ` <cover.1474477902.git.pabeni-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2016-09-21 17:23   ` Paolo Abeni
  2016-09-21 23:31     ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Paolo Abeni @ 2016-09-21 17:23 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: David S. Miller, James Morris, Trond Myklebust, Alexander Duyck,
	Daniel Borkmann, Eric Dumazet, Tom Herbert, Hannes Frederic Sowa,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

Avoid usage of common memory accounting functions, since
the logic is pretty much different.

To account for forward allocation, a couple of new atomic_t
members are added to udp_sock: 'mem_allocated' and 'mem_freed'.
The current forward allocation is estimated as 'mem_allocated'
 minus 'mem_freed' minus 'sk_rmem_alloc'.

When the forward allocation can't cope with the packet to be
enqueued, 'mem_allocated' is incremented by the packet size
rounded-up to the next SK_MEM_QUANTUM.
After a dequeue, we try to partially reclaim the forward
allocated memory rounded down to an SK_MEM_QUANTUM and
'mem_freed' is increased by that amount.
sk->sk_forward_alloc is set after each allocated/freed memory
update, to the currently estimated forward allocation, without
any lock or protection.
This value is updated/maintained only to expose some
semi-reasonable value to the eventual reader, and is guaranteed
to be 0 at socket destruction time.

The above needs custom memory reclaiming on shutdown, provided
by the udp_destruct_sock() helper, which completely reclaims
the allocated forward memory.

Helpers are provided for skb free, consume and purge, respecting
the above constraints.

The socket lock is still used to protect the updates to sk_peek_off,
but is acquired only if peeking with offset is enabled.

As a consequence of the above schema, enqueue to sk_error_queue
will cause larger forward allocation on following normal data
(due to sk_rmem_alloc growth), but this allows amortizing the cost
of the atomic operation on SK_MEM_QUANTUM/skb->truesize packets.
The use of separate atomics for 'mem_allocated' and 'mem_freed'
allows the use of a single atomic operation to protect against
concurrent dequeue.

Acked-by: Hannes Frederic Sowa <hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r@public.gmane.org>
Signed-off-by: Paolo Abeni <pabeni-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 include/linux/udp.h |   2 +
 include/net/udp.h   |   5 ++
 net/ipv4/udp.c      | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index d1fd8cd..cd72645 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -42,6 +42,8 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
 struct udp_sock {
 	/* inet_sock has to be the first member */
 	struct inet_sock inet;
+	atomic_t	 mem_allocated;
+	atomic_t	 mem_freed;
 #define udp_port_hash		inet.sk.__sk_common.skc_u16hashes[0]
 #define udp_portaddr_hash	inet.sk.__sk_common.skc_u16hashes[1]
 #define udp_portaddr_node	inet.sk.__sk_common.skc_portaddr_node
diff --git a/include/net/udp.h b/include/net/udp.h
index ea53a87..86307a4 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -246,6 +246,10 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 }
 
 /* net/ipv4/udp.c */
+void skb_free_udp(struct sock *sk, struct sk_buff *skb);
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
+int udp_rmem_schedule(struct sock *sk, struct sk_buff *skb);
+
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
@@ -258,6 +262,7 @@ void udp_flush_pending_frames(struct sock *sk);
 void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 int udp_rcv(struct sk_buff *skb);
 int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+int udp_init_sock(struct sock *sk);
 int udp_disconnect(struct sock *sk, int flags);
 unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait);
 struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 058c312..98480af 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1178,6 +1178,157 @@ out:
 	return ret;
 }
 
+static inline int __udp_forward(struct udp_sock *up, int freed, int rmem)
+{
+	return atomic_read(&up->mem_allocated) - freed - rmem;
+}
+
+static int skb_unref(struct sk_buff *skb)
+{
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return 0;
+
+	return skb->truesize;
+}
+
+static inline int udp_try_release(struct sock *sk, int *fwd, int partial)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int freed_old, freed_new, amt;
+
+	freed_old = atomic_read(&up->mem_freed);
+	*fwd = __udp_forward(up, freed_old, atomic_read(&sk->sk_rmem_alloc));
+	if (*fwd < SK_MEM_QUANTUM + partial)
+		return 0;
+
+	/* we can have concurrent release; if we catch any conflict
+	 * via atomic_cmpxchg, let only one of them release the memory
+	 */
+	amt = sk_mem_pages(*fwd - partial) << SK_MEM_QUANTUM_SHIFT;
+	freed_new = atomic_cmpxchg(&up->mem_freed, freed_old, freed_old + amt);
+	return (freed_new == freed_old) ? amt : 0;
+}
+
+/* reclaim the allocated forward memory, except 'partial' quanta */
+static void skb_release_mem_udp(struct sock *sk, int partial)
+{
+	int fwd, delta = udp_try_release(sk, &fwd, partial);
+
+	if (delta)
+		__sk_mem_reduce_allocated(sk, delta >> SK_MEM_QUANTUM_SHIFT);
+	sk->sk_forward_alloc = fwd - delta;
+}
+
+void skb_free_udp(struct sock *sk, struct sk_buff *skb)
+{
+	int size = skb_unref(skb);
+
+	if (!size)
+		return;
+
+	trace_kfree_skb(skb, __builtin_return_address(0));
+	__kfree_skb(skb);
+	skb_release_mem_udp(sk, 1);
+}
+EXPORT_SYMBOL_GPL(skb_free_udp);
+
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+{
+	int size = skb_unref(skb);
+
+	if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
+		bool slow = lock_sock_fast(sk);
+
+		sk_peek_offset_bwd(sk, len);
+		unlock_sock_fast(sk, slow);
+	}
+	if (!size)
+		return;
+
+	__kfree_skb(skb);
+	skb_release_mem_udp(sk, 1);
+}
+EXPORT_SYMBOL_GPL(skb_consume_udp);
+
+static void udp_queue_purge(struct sock *sk, struct sk_buff_head *list,
+			    int partial)
+{
+	struct sk_buff *skb;
+	int size;
+
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		size = skb_unref(skb);
+		if (size) {
+			trace_kfree_skb(skb, udp_queue_purge);
+			__kfree_skb(skb);
+		}
+	}
+	skb_release_mem_udp(sk, partial);
+}
+
+int udp_rmem_schedule(struct sock *sk, struct sk_buff *skb)
+{
+	int alloc, freed, fwd, amt, delta, rmem, err = -ENOMEM;
+	struct udp_sock *up = udp_sk(sk);
+
+	rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
+	if (rmem > sk->sk_rcvbuf)
+		goto drop;
+
+	freed = atomic_read(&up->mem_freed);
+	fwd = __udp_forward(up, freed, rmem);
+	if (fwd > 0)
+		goto no_alloc;
+
+	amt = sk_mem_pages(skb->truesize);
+	delta = amt << SK_MEM_QUANTUM_SHIFT;
+	if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
+		err = -ENOBUFS;
+		goto drop;
+	}
+
+	/* if we have some skbs in the error queue, the forward allocation could
+	 * be underestimated, even below 0; avoid exporting such values
+	 */
+	alloc = atomic_add_return(delta, &up->mem_allocated);
+	fwd = alloc - freed - rmem;
+	if (fwd < 0)
+		fwd = SK_MEM_QUANTUM;
+
+no_alloc:
+	sk->sk_forward_alloc = fwd;
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = sock_rmem_free;
+	return 0;
+
+drop:
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+	atomic_inc(&sk->sk_drops);
+	return err;
+}
+EXPORT_SYMBOL_GPL(udp_rmem_schedule);
+
+static void udp_destruct_sock(struct sock *sk)
+{
+	/* reclaim completely the forward allocated memory */
+	udp_queue_purge(sk, &sk->sk_receive_queue, 0);
+	inet_sock_destruct(sk);
+}
+
+int udp_init_sock(struct sock *sk)
+{
+	struct udp_sock *up = udp_sk(sk);
+
+	atomic_set(&up->mem_allocated, 0);
+	atomic_set(&up->mem_freed, 0);
+	sk->sk_destruct = udp_destruct_sock;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(udp_init_sock);
+
 /**
  *	first_packet_length	- return length of first packet in receive queue
  *	@sk: socket
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-21 17:23   ` [PATCH net-next 2/3] udp: implement memory accounting helpers Paolo Abeni
@ 2016-09-21 23:31     ` Eric Dumazet
  2016-09-22 10:33       ` Paolo Abeni
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2016-09-21 23:31 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

On Wed, 2016-09-21 at 19:23 +0200, Paolo Abeni wrote:
> Avoid usage of common memory accounting functions, since
> the logic is pretty much different.
> 
> To account for forward allocation, a couple of new atomic_t
> members are added to udp_sock: 'mem_alloced' and 'mem_freed'.
> The current forward allocation is estimated as 'mem_alloced'
>  minus 'mem_freed' minus 'sk_rmem_alloc'.
> 
> When the forward allocation can't cope with the packet to be
> enqueued, 'mem_alloced' is incremented by the packet size
> rounded-up to the next SK_MEM_QUANTUM.
> After a dequeue, we try to partially reclaim of the forward
> allocated memory rounded down to an SK_MEM_QUANTUM and
> 'mem_freed' is increased by that amount.
> sk->sk_forward_alloc is set after each allocated/freed memory
> update, to the currently estimated forward allocation, without
> any lock or protection.
> This value is updated/maintained only to expose some
> semi-reasonable value to the eventual reader, and is guaranteed
> to be 0 at socket destruction time.
> 
> The above needs custom memory reclaiming on shutdown, provided
> by the udp_destruct_sock() helper, which completely reclaim
> the allocated forward memory.
> 
> Helpers are provided for skb free, consume and purge, respecting
> the above constraints.
> 
> The socket lock is still used to protect the updates to sk_peek_off,
> but is acquired only if peeking with offset is enabled.
> 
> As a consequence of the above schema, enqueue to sk_error_queue
> will cause larger forward allocation on following normal data
> (due to sk_rmem_alloc grow), but this allows amortizing the cost
> of the atomic operation on SK_MEM_QUANTUM/skb->truesize packets.
> The use of separate atomics for 'mem_alloced' and 'mem_freed'
> allows the use of a single atomic operation to protect against
> concurrent dequeue.
> 
> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  include/linux/udp.h |   2 +
>  include/net/udp.h   |   5 ++
>  net/ipv4/udp.c      | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 158 insertions(+)
> 
> diff --git a/include/linux/udp.h b/include/linux/udp.h
> index d1fd8cd..cd72645 100644
> --- a/include/linux/udp.h
> +++ b/include/linux/udp.h
> @@ -42,6 +42,8 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
>  struct udp_sock {
>  	/* inet_sock has to be the first member */
>  	struct inet_sock inet;
> +	atomic_t	 mem_allocated;
> +	atomic_t	 mem_freed;

Hi Paolo, thanks for working on this.

All this code looks quite invasive to me ?

Also does inet_diag properly give the forward_alloc to user ?

$ ss -mua
State      Recv-Q Send-Q Local Address:Port                 Peer Addres
s:Port
UNCONN     51584  0          *:52460      *:*                    
	 skmem:(r51584,rb327680,t0,tb327680,f1664,w0,o0,bl0,d575)


Couldn't we instead use an union of an atomic_t and int for
sk->sk_forward_alloc ?

All udp queues/dequeues would manipulate the atomic_t using regular
atomic ops and use a special skb destructor (instead of sock_rfree())

Also I would not bother 'reclaiming' forward_alloc at dequeue, unless
udp is under memory pressure.

Please share your performance numbers, thanks !

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-21 23:31     ` Eric Dumazet
@ 2016-09-22 10:33       ` Paolo Abeni
  2016-09-22 15:21         ` Edward Cree
  0 siblings, 1 reply; 13+ messages in thread
From: Paolo Abeni @ 2016-09-22 10:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

Hi Eric,

On Wed, 2016-09-21 at 16:31 -0700, Eric Dumazet wrote:
> Also does inet_diag properly give the forward_alloc to user ?
> 
> $ ss -mua
> State      Recv-Q Send-Q Local Address:Port                 Peer Addres
> s:Port
> UNCONN     51584  0          *:52460      *:*                    
> 	 skmem:(r51584,rb327680,t0,tb327680,f1664,w0,o0,bl0,d575)

Thank you very much for reviewing this! 

My bad, there is still a race which leads to temporary negative values
of fwd. I feel the fix is trivial but it needs some investigation.

> Couldn't we instead use an union of an atomic_t and int for
> sk->sk_forward_alloc ?

That was our first attempt, but we had some issue on mem scheduling; if
we use:

   if (atomic_sub_return(size, &sk->sk_forward_alloc_atomic) < 0) {
        // fwd alloc
   }

that leads to inescapable, temporary, negative value for
sk->sk_forward_alloc.

Another option would be:

again:
        fwd = atomic_read(&sk->sk_forward_alloc_atomic);
        if (fwd > size) {
		if (atomic_cmpxchg(&sk->sk_forward_alloc_atomic, fwd, fwd - size) != fwd)
			goto again;
        } else 
            // fwd alloc

which would be bad under high contention.

I think that using a single atomic to track both scheduling and reclaim
requires a similar loop to resolve reclaim contention. They should be
very rare, especially if we can avoid reclaiming on dequeue, but still
will complicate the code.

Finally, pahole shows that the number of cachelines used by a udp socket
on x86_64 does not change when adding the two new atomic fields.

> All udp queues/dequeues would manipulate the atomic_t using regular
> atomic ops and use a special skb destructor (instead of sock_rfree())

I tried the special skb destructor and discarded it, but thinking again,
now I feel it can simplify the code, regardless of the used schema.

We will still need to replace the current in-kernel
skb_free_datagram_locked() for udp with something else, so a bit of
noise in the sunrpc subsystem will remain.

> Also I would not bother 'reclaiming' forward_alloc at dequeue, unless
> udp is under memory pressure.

We discarded that option because it would change significantly the
system behavior, but if there is agreement on it, I think that it would
make a really relevant (positive) difference!

> Please share your performance numbers, thanks !

Tested using pktgen with random src port, 64-byte packets, wire-speed on
a 10G link as sender and udp_sink by Jesper
(https://github.com/netoptimizer/network-testing/blob/master/src/udp_sink.c) as receiver, using l4 tuple rxhash to stress the contention, and one or more udp_sink instances with reuseport.

nr readers	Kpps (vanilla)	Kpps (patched)
1		241		600
3		1800		2200
6		3450		3950
9		5100		5400
12		6400		6650

Thank you for the very nice suggestions!

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-22 10:33       ` Paolo Abeni
@ 2016-09-22 15:21         ` Edward Cree
       [not found]           ` <589839b3-5930-2527-b0a3-315be254a175-s/n/eUQHGBpZroRs9YW3xA@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: Edward Cree @ 2016-09-22 15:21 UTC (permalink / raw)
  To: Paolo Abeni, Eric Dumazet
  Cc: netdev, David S. Miller, James Morris, Trond Myklebust,
	Alexander Duyck, Daniel Borkmann, Eric Dumazet, Tom Herbert,
	Hannes Frederic Sowa, linux-nfs

On 22/09/16 11:33, Paolo Abeni wrote:
> Hi Eric,
>
> On Wed, 2016-09-21 at 16:31 -0700, Eric Dumazet wrote:
>> Also does inet_diag properly give the forward_alloc to user ?
>>
>> $ ss -mua
>> State      Recv-Q Send-Q Local Address:Port                 Peer Addres
>> s:Port
>> UNCONN     51584  0          *:52460      *:*                    
>> 	 skmem:(r51584,rb327680,t0,tb327680,f1664,w0,o0,bl0,d575)
> Thank you very much for reviewing this! 
>
> My bad, there is still a race which leads to temporary negative values
> of fwd. I feel the fix is trivial but it needs some investigation.
>
>> Couldn't we instead use an union of an atomic_t and int for
>> sk->sk_forward_alloc ?
> That was our first attempt, but we had some issue on mem scheduling; if
> we use:
>
>    if (atomic_sub_return(size, &sk->sk_forward_alloc_atomic) < 0) {
>         // fwd alloc
>    }
>
> that leads to inescapable, temporary, negative value for
> sk->sk_forward_alloc.
>
> Another option would be:
>
> again:
>         fwd = atomic_read(&sk->sk_forward_alloc_atomic);
>         if (fwd > size) {
> 		if (atomic_cmpxchg(&sk->sk_forward_alloc_atomic, fwd, fwd - size) != fwd)
> 			goto again;
>         } else 
>             // fwd alloc
>
> which would be bad under high contention.
Apologies if I'm misunderstanding the problem, but couldn't you have two
atomic_t fields, 'internal' and 'external' forward_alloc.  Then
    if (atomic_sub_return(size, &sk->sk_forward_alloc_internal) < 0) {
        atomic_sub(size, &sk->sk_forward_alloc);
        // fwd alloc
    } else {
        atomic_add(size, &sk->sk_forward_alloc_internal);
    }
or something like that.  Then sk->sk_forward_alloc never sees a negative
value, and is always >= sk->sk_forward_alloc_internal.  Of course places
that go the other way would have to add to sk->sk_forward_alloc first,
before adding to sk->sk_forward_alloc_internal, to maintain that invariant.

Would that help matters at all?

-Ed

^ permalink raw reply	[flat|nested] 13+ messages in thread

[parent not found: <589839b3-5930-2527-b0a3-315be254a175-s/n/eUQHGBpZroRs9YW3xA@public.gmane.org>]

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
       [not found]           ` <589839b3-5930-2527-b0a3-315be254a175-s/n/eUQHGBpZroRs9YW3xA@public.gmane.org>
@ 2016-09-22 16:14             ` Paolo Abeni
  2016-09-22 16:30               ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Paolo Abeni @ 2016-09-22 16:14 UTC (permalink / raw)
  To: Edward Cree
  Cc: Eric Dumazet, netdev-u79uwXL29TY76Z2rM5mHXA, David S. Miller,
	James Morris, Trond Myklebust, Alexander Duyck, Daniel Borkmann,
	Eric Dumazet, Tom Herbert, Hannes Frederic Sowa,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

On Thu, 2016-09-22 at 16:21 +0100, Edward Cree wrote:
> On 22/09/16 11:33, Paolo Abeni wrote:
> > Hi Eric,
> >
> > On Wed, 2016-09-21 at 16:31 -0700, Eric Dumazet wrote:
> >> Also does inet_diag properly give the forward_alloc to user ?
> >>
> >> $ ss -mua
> >> State      Recv-Q Send-Q Local Address:Port                 Peer Addres
> >> s:Port
> >> UNCONN     51584  0          *:52460      *:*                    
> >> 	 skmem:(r51584,rb327680,t0,tb327680,f1664,w0,o0,bl0,d575)
> > Thank you very much for reviewing this! 
> >
> > My bad, there is still a race which leads to temporary negative values
> > of fwd. I feel the fix is trivial but it needs some investigation.
> >
> >> Couldn't we instead use an union of an atomic_t and int for
> >> sk->sk_forward_alloc ?
> > That was our first attempt, but we had some issue on mem scheduling; if
> > we use:
> >
> >    if (atomic_sub_return(size, &sk->sk_forward_alloc_atomic) < 0) {
> >         // fwd alloc
> >    }
> >
> > that leads to inescapable, temporary, negative value for
> > sk->sk_forward_alloc.
> >
> > Another option would be:
> >
> > again:
> >         fwd = atomic_read(&sk->sk_forward_alloc_atomic);
> >         if (fwd > size) {
> > 		if (atomic_cmpxchg(&sk->sk_forward_alloc_atomic, fwd, fwd - size) != fwd)
> > 			goto again;
> >         } else 
> >             // fwd alloc
> >
> > which would be bad under high contention.
> Apologies if I'm misunderstanding the problem, but couldn't you have two
> atomic_t fields, 'internal' and 'external' forward_alloc.  Then
>     if (atomic_sub_return(size, &sk->sk_forward_alloc_internal) < 0) {
>         atomic_sub(size, &sk->sk_forward_alloc);
>         // fwd alloc
>     } else {
>         atomic_add(size, &sk->sk_forward_alloc_internal);
>     }
> or something like that.  Then sk->sk_forward_alloc never sees a negative
> value, and is always >= sk->sk_forward_alloc_internal.  Of course places
> that go the other way would have to add to sk->sk_forward_alloc first,
> before adding to sk->sk_forward_alloc_internal, to maintain that invariant.

I think that the idea behind using atomic ops directly on
sk_forward_alloc is to avoid adding other fields to the udp_socket. 

If we can add some fields to the udp_sock structure, the schema proposed
in this patch should fit better (modulo bugs ;-), always requiring a
single atomic operation at memory reclaiming time and at memory
allocation time.

Paolo

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-22 16:14             ` Paolo Abeni
@ 2016-09-22 16:30               ` Eric Dumazet
  2016-09-22 20:27                 ` Paolo Abeni
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2016-09-22 16:30 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Edward Cree, netdev, David S. Miller, James Morris,
	Trond Myklebust, Alexander Duyck, Daniel Borkmann, Eric Dumazet,
	Tom Herbert, Hannes Frederic Sowa, linux-nfs

On Thu, 2016-09-22 at 18:14 +0200, Paolo Abeni wrote:

> I think that the idea behind using atomic ops directly on
> sk_forward_alloc is to avoid adding other fields to the udp_socket. 
> 
> If we can add some fields to the udp_sock structure, the schema proposed
> in this patch should fit better (modulo bugs ;-), always requiring a
> single atomic operation at memory reclaiming time and at memory
> allocation time.

But do we want any additional atomic to begin with ?

Given typical number of UDP sockets on a host, we could reserve/forward
alloc at socket creation time, and when SO_RCVBUF is changed.

I totally understand that this scheme does not work with TCP, because TCP
BDP mandates xx MB of buffers per socket, and we really handle millions
of TCP sockets per host.

But for UDP, most applications are working well with a default limit,
and we hardly have more than 1000 UDP sockets.

cat /proc/sys/net/core/rmem_default
212992

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-22 16:30               ` Eric Dumazet
@ 2016-09-22 20:27                 ` Paolo Abeni
  2016-09-22 20:34                   ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Paolo Abeni @ 2016-09-22 20:27 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Edward Cree, netdev, David S. Miller, James Morris,
	Trond Myklebust, Alexander Duyck, Daniel Borkmann, Eric Dumazet,
	Tom Herbert, Hannes Frederic Sowa, linux-nfs

On Thu, 2016-09-22 at 09:30 -0700, Eric Dumazet wrote:
> On Thu, 2016-09-22 at 18:14 +0200, Paolo Abeni wrote:
> 
> > I think that the idea behind using atomic ops directly on
> > sk_forward_alloc is to avoid adding other fields to the udp_socket. 
> > 
> > If we can add some fields to the udp_sock structure, the schema proposed
> > in this patch should fit better (modulo bugs ;-), always requiring a
> > single atomic operation at memory reclaiming time and at memory
> > allocation time.
> 
> But do we want any additional atomic to begin with ?
> 
> Given typical number of UDP sockets on a host, we could reserve/forward
> alloc at socket creation time, and when SO_RCVBUF is changed.

That would be very efficient and would probably work in most scenarios,
but if/when the system reaches udp memory pressure things will be
very bad: forward allocation on open() will fail and nobody will be able
to create any new udp socket, right ?

We are working on a v2 incorporating the feedback from your previous email
- still keeping the new udp_sock fields.
It looks quite a bit simpler than v1, will work reasonably well in memory
pressure scenarios, and its performance is measurably better than v1, most
probably comparable with the above solution, since usually no additional
atomic operations (beyond sk_rmem_alloc updating) are performed on
enqueue/dequeue.

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-22 20:27                 ` Paolo Abeni
@ 2016-09-22 20:34                   ` Eric Dumazet
  2016-09-22 20:37                     ` Eric Dumazet
  2016-09-23  9:56                     ` Paolo Abeni
  0 siblings, 2 replies; 13+ messages in thread
From: Eric Dumazet @ 2016-09-22 20:34 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Edward Cree, netdev, David S. Miller, James Morris,
	Trond Myklebust, Alexander Duyck, Daniel Borkmann, Eric Dumazet,
	Tom Herbert, Hannes Frederic Sowa, linux-nfs

On Thu, 2016-09-22 at 22:27 +0200, Paolo Abeni wrote:
> On Thu, 2016-09-22 at 09:30 -0700, Eric Dumazet wrote:
> > On Thu, 2016-09-22 at 18:14 +0200, Paolo Abeni wrote:
> > 
> > > I think that the idea behind using atomic ops directly on
> > > sk_forward_alloc is to avoid adding other fields to the udp_socket. 
> > > 
> > > If we can add some fields to the udp_sock structure, the schema proposed
> > > in this patch should fit better (modulo bugs ;-), always requiring a
> > > single atomic operation at memory reclaiming time and at memory
> > > allocation time.
> > 
> > But do we want any additional atomic to begin with ?
> > 
> > Given typical number of UDP sockets on a host, we could reserve/forward
> > alloc at socket creation time, and when SO_RCVBUF is changed.
> 
> That would be very efficient and would probably work on most scenario,
> but if/when the system will reach udp memory pressure things will be
> very bad: forward allocation on open() will fail and nobody will be able
> to create any new udp socket, right ?
> 

No, we could allow one page per socket (udp_mem[0]) and applications
would still work.

TCP has the notion of memory pressure, and behaves roughly the same in
this case (one skb is allowed to be received)

The other (fat) sockets could notice udp_memory_pressure is set and
start reclaiming their forward allocations for other sockets.

We have a counter of UDP sockets, so probably doable to compute
udp_mem[2]/number 

Anyway, just an idea.

> We are working on a v2 incorporating the feedback of your previous email
> - still keeping the new udp_sock fields.
> It looks quite simpler than v1, will work reasonably well in memory
> pressure scenario, and performance are measurably better than v1, most
> probably comparable with the above solution, since usually no additional
> atomic operations  (beyond sk_rmem_alloc updating) are performed on
> enqueue/dequeue.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-22 20:34                   ` Eric Dumazet
@ 2016-09-22 20:37                     ` Eric Dumazet
  2016-09-23  9:56                     ` Paolo Abeni
  1 sibling, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2016-09-22 20:37 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Edward Cree, netdev, David S. Miller, James Morris,
	Trond Myklebust, Alexander Duyck, Daniel Borkmann, Eric Dumazet,
	Tom Herbert, Hannes Frederic Sowa, linux-nfs

On Thu, 2016-09-22 at 13:34 -0700, Eric Dumazet wrote:
> On Thu, 2016-09-22 at 22:27 +0200, Paolo Abeni wrote:
> > On Thu, 2016-09-22 at 09:30 -0700, Eric Dumazet wrote:
> > > On Thu, 2016-09-22 at 18:14 +0200, Paolo Abeni wrote:
> > > 
> > > > I think that the idea behind using atomic ops directly on
> > > > sk_forward_alloc is to avoid adding other fields to the udp_socket. 
> > > > 
> > > > If we can add some fields to the udp_sock structure, the schema proposed
> > > > in this patch should fit better (modulo bugs ;-), always requiring a
> > > > single atomic operation at memory reclaiming time and at memory
> > > > allocation time.
> > > 
> > > But do we want any additional atomic to begin with ?
> > > 
> > > Given typical number of UDP sockets on a host, we could reserve/forward
> > > alloc at socket creation time, and when SO_RCVBUF is changed.
> > 
> > That would be very efficient and would probably work on most scenario,
> > but if/when the system will reach udp memory pressure things will be
> > very bad: forward allocation on open() will fail and nobody will be able
> > to create any new udp socket, right ?
> > 
> 
> No, we could allow one page per socket (udp_mem[0]) and applications
> would still work.

I meant udp_rmem_min, not udp_mem[0]

udp_rmem_min - INTEGER
        Minimal size of receive buffer used by UDP sockets in moderation.
        Each UDP socket is able to use the size for receiving data, even if
        total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
        Default: 1 page

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
  2016-09-22 20:34                   ` Eric Dumazet
  2016-09-22 20:37                     ` Eric Dumazet
@ 2016-09-23  9:56                     ` Paolo Abeni
  1 sibling, 0 replies; 13+ messages in thread
From: Paolo Abeni @ 2016-09-23  9:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Edward Cree, netdev, David S. Miller, James Morris,
	Trond Myklebust, Alexander Duyck, Daniel Borkmann, Eric Dumazet,
	Tom Herbert, Hannes Frederic Sowa, linux-nfs

On Thu, 2016-09-22 at 13:34 -0700, Eric Dumazet wrote:
> On Thu, 2016-09-22 at 22:27 +0200, Paolo Abeni wrote:
> > On Thu, 2016-09-22 at 09:30 -0700, Eric Dumazet wrote:
> > > On Thu, 2016-09-22 at 18:14 +0200, Paolo Abeni wrote:
> > > 
> > > > I think that the idea behind using atomic ops directly on
> > > > sk_forward_alloc is to avoid adding other fields to the udp_socket. 
> > > > 
> > > > If we can add some fields to the udp_sock structure, the schema proposed
> > > > in this patch should fit better (modulo bugs ;-), always requiring a
> > > > single atomic operation at memory reclaiming time and at memory
> > > > allocation time.
> > > 
> > > But do we want any additional atomic to begin with ?
> > > 
> > > Given typical number of UDP sockets on a host, we could reserve/forward
> > > alloc at socket creation time, and when SO_RCVBUF is changed.
> > 
> > That would be very efficient and would probably work on most scenario,
> > but if/when the system will reach udp memory pressure things will be
> > very bad: forward allocation on open() will fail and nobody will be able
> > to create any new udp socket, right ?
> > 
> 
> No, we could allow one page per socket (udp_mem[0]) and applications
> would still work.
> 
> TCP has the notion of memory pressure, and behaves roughly the same in
> this case (one skb is allowed to be received)
> 
> The other (fat) sockets could notice udp_memory_pressure is set and
> start reclaiming their forward allocations for other sockets.

I agree, that would work. Anyway, I think it will be more invasive than the
proposed code, after the currently ongoing rework. With the above, under
memory pressure, reclaim will be needed on dequeue, and scheduling will
still be needed even on enqueue.

The overall behavior of the two approaches will be similar: lazy
reclaiming under memory pressure or on sk closing, and rare memory
scheduling. 

> Anyway, just an idea.

Any comments are always very appreciated!

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH net-next 3/3] udp: use it's own memory accounting schema
  2016-09-21 17:23 [PATCH net-next 0/3] udp: refactor memory accounting Paolo Abeni
  2016-09-21 17:23 ` [PATCH net-next 1/3] net/socket: factor out helpers for memory and queue manipulation Paolo Abeni
       [not found] ` <cover.1474477902.git.pabeni-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2016-09-21 17:23 ` Paolo Abeni
  2 siblings, 0 replies; 13+ messages in thread
From: Paolo Abeni @ 2016-09-21 17:23 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, James Morris, Trond Myklebust, Alexander Duyck,
	Daniel Borkmann, Eric Dumazet, Tom Herbert, Hannes Frederic Sowa,
	linux-nfs

Completely avoid default sock memory accounting and replace it
with udp-specific accounting.

Since the new memory accounting model does not require socket
locking, remove the lock on enqueue and free and avoid using the
backlog on enqueue.

Be sure to clean up rx queue memory on socket destruction, using
udp's own sk_destruct.

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv4/udp.c        | 39 ++++++++++-----------------------------
 net/ipv6/udp.c        | 28 +++++++++-------------------
 net/sunrpc/svcsock.c  | 22 ++++++++++++++++++----
 net/sunrpc/xprtsock.c |  2 +-
 4 files changed, 38 insertions(+), 53 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 98480af..cb617ee 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1358,13 +1358,8 @@ static int first_packet_length(struct sock *sk)
 	res = skb ? skb->len : -1;
 	spin_unlock_bh(&rcvq->lock);
 
-	if (!skb_queue_empty(&list_kill)) {
-		bool slow = lock_sock_fast(sk);
-
-		__skb_queue_purge(&list_kill);
-		sk_mem_reclaim_partial(sk);
-		unlock_sock_fast(sk, slow);
-	}
+	if (!skb_queue_empty(&list_kill))
+		udp_queue_purge(sk, &list_kill, 1);
 	return res;
 }
 
@@ -1413,7 +1408,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 	int err;
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
-	bool slow;
 
 	if (flags & MSG_ERRQUEUE)
 		return ip_recv_error(sk, msg, len, addr_len);
@@ -1454,13 +1448,12 @@ try_again:
 	}
 
 	if (unlikely(err)) {
-		trace_kfree_skb(skb, udp_recvmsg);
 		if (!peeked) {
 			atomic_inc(&sk->sk_drops);
 			UDP_INC_STATS(sock_net(sk),
 				      UDP_MIB_INERRORS, is_udplite);
 		}
-		skb_free_datagram_locked(sk, skb);
+		skb_free_udp(sk, skb);
 		return err;
 	}
 
@@ -1485,16 +1478,15 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
+	skb_consume_udp(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
-	slow = lock_sock_fast(sk);
-	if (!skb_kill_datagram(sk, skb, flags)) {
+	if (!__sk_queue_drop_skb(sk, skb, flags)) {
 		UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
 		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
 	}
-	unlock_sock_fast(sk, slow);
+	skb_free_udp(sk, skb);
 
 	/* starting over for a new packet, but check if we need to yield */
 	cond_resched();
@@ -1613,7 +1605,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk_incoming_cpu_update(sk);
 	}
 
-	rc = __sock_queue_rcv_skb(sk, skb);
+	rc = udp_rmem_schedule(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
 
@@ -1627,8 +1619,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		return -1;
 	}
 
+	__sock_enqueue_skb(sk, skb);
 	return 0;
-
 }
 
 static struct static_key udp_encap_needed __read_mostly;
@@ -1650,7 +1642,6 @@ EXPORT_SYMBOL(udp_encap_enable);
 int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
-	int rc;
 	int is_udplite = IS_UDPLITE(sk);
 
 	/*
@@ -1743,19 +1734,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 	}
 
-	rc = 0;
-
 	ipv4_pktinfo_prepare(sk, skb);
-	bh_lock_sock(sk);
-	if (!sock_owned_by_user(sk))
-		rc = __udp_queue_rcv_skb(sk, skb);
-	else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
-		bh_unlock_sock(sk);
-		goto drop;
-	}
-	bh_unlock_sock(sk);
-
-	return rc;
+	return __udp_queue_rcv_skb(sk, skb);
 
 csum_error:
 	__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -2365,6 +2345,7 @@ struct proto udp_prot = {
 	.connect	   = ip4_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = udp_ioctl,
+	.init		   = udp_init_sock,
 	.destroy	   = udp_destroy_sock,
 	.setsockopt	   = udp_setsockopt,
 	.getsockopt	   = udp_getsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9aa7c1c..6f8e160 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -334,7 +334,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	int is_udplite = IS_UDPLITE(sk);
 	bool checksum_valid = false;
 	int is_udp4;
-	bool slow;
 
 	if (flags & MSG_ERRQUEUE)
 		return ipv6_recv_error(sk, msg, len, addr_len);
@@ -388,7 +387,7 @@ try_again:
 				UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
 					       is_udplite);
 		}
-		skb_free_datagram_locked(sk, skb);
+		skb_free_udp(sk, skb);
 		return err;
 	}
 	if (!peeked) {
@@ -437,12 +436,11 @@ try_again:
 	if (flags & MSG_TRUNC)
 		err = ulen;
 
-	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
+	skb_consume_udp(sk, skb, peeking ? -err : err);
 	return err;
 
 csum_copy_err:
-	slow = lock_sock_fast(sk);
-	if (!skb_kill_datagram(sk, skb, flags)) {
+	if (!__sk_queue_drop_skb(sk, skb, flags)) {
 		if (is_udp4) {
 			UDP_INC_STATS(sock_net(sk),
 				      UDP_MIB_CSUMERRORS, is_udplite);
@@ -455,7 +453,7 @@ csum_copy_err:
 				       UDP_MIB_INERRORS, is_udplite);
 		}
 	}
-	unlock_sock_fast(sk, slow);
+	skb_free_udp(sk, skb);
 
 	/* starting over for a new packet, but check if we need to yield */
 	cond_resched();
@@ -523,7 +521,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		sk_incoming_cpu_update(sk);
 	}
 
-	rc = __sock_queue_rcv_skb(sk, skb);
+	rc = udp_rmem_schedule(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
 
@@ -535,6 +533,8 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		kfree_skb(skb);
 		return -1;
 	}
+
+	__sock_enqueue_skb(sk, skb);
 	return 0;
 }
 
@@ -556,7 +556,6 @@ EXPORT_SYMBOL(udpv6_encap_enable);
 int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
-	int rc;
 	int is_udplite = IS_UDPLITE(sk);
 
 	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
@@ -630,17 +629,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	skb_dst_drop(skb);
 
-	bh_lock_sock(sk);
-	rc = 0;
-	if (!sock_owned_by_user(sk))
-		rc = __udpv6_queue_rcv_skb(sk, skb);
-	else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
-		bh_unlock_sock(sk);
-		goto drop;
-	}
-	bh_unlock_sock(sk);
-
-	return rc;
+	return __udpv6_queue_rcv_skb(sk, skb);
 
 csum_error:
 	__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
@@ -1433,6 +1422,7 @@ struct proto udpv6_prot = {
 	.connect	   = ip6_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = udp_ioctl,
+	.init		   = udp_init_sock,
 	.destroy	   = udpv6_destroy_sock,
 	.setsockopt	   = udpv6_setsockopt,
 	.getsockopt	   = udpv6_getsockopt,
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 57625f6..b5739c7 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -39,6 +39,7 @@
 #include <net/checksum.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/udp.h>
 #include <net/tcp.h>
 #include <net/tcp_states.h>
 #include <asm/uaccess.h>
@@ -129,6 +130,20 @@ static void svc_release_skb(struct svc_rqst *rqstp)
 	}
 }
 
+static void svc_release_udp_skb(struct svc_rqst *rqstp)
+{
+	struct sk_buff *skb = rqstp->rq_xprt_ctxt;
+
+	if (skb) {
+		struct svc_sock *svsk =
+			container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+		rqstp->rq_xprt_ctxt = NULL;
+
+		dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+		skb_consume_udp(svsk->sk_sk, skb, 0);
+	}
+}
+
 union svc_pktinfo_u {
 	struct in_pktinfo pkti;
 	struct in6_pktinfo pkti6;
@@ -575,7 +590,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 			goto out_free;
 		}
 		local_bh_enable();
-		skb_free_datagram_locked(svsk->sk_sk, skb);
+		skb_consume_udp(svsk->sk_sk, skb, 0);
 	} else {
 		/* we can use it in-place */
 		rqstp->rq_arg.head[0].iov_base = skb->data;
@@ -602,8 +617,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 
 	return len;
 out_free:
-	trace_kfree_skb(skb, svc_udp_recvfrom);
-	skb_free_datagram_locked(svsk->sk_sk, skb);
+	skb_free_udp(svsk->sk_sk, skb);
 	return 0;
 }
 
@@ -660,7 +674,7 @@ static struct svc_xprt_ops svc_udp_ops = {
 	.xpo_create = svc_udp_create,
 	.xpo_recvfrom = svc_udp_recvfrom,
 	.xpo_sendto = svc_udp_sendto,
-	.xpo_release_rqst = svc_release_skb,
+	.xpo_release_rqst = svc_release_udp_skb,
 	.xpo_detach = svc_sock_detach,
 	.xpo_free = svc_sock_free,
 	.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8ede3bc..b75c2c3 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1074,7 +1074,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
 		skb = skb_recv_datagram(sk, 0, 1, &err);
 		if (skb != NULL) {
 			xs_udp_data_read_skb(&transport->xprt, sk, skb);
-			skb_free_datagram(sk, skb);
+			skb_consume_udp(sk, skb, 0);
 			continue;
 		}
 		if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2016-09-23  9:56 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-09-21 17:23 [PATCH net-next 0/3] udp: refactor memory accounting Paolo Abeni
2016-09-21 17:23 ` [PATCH net-next 1/3] net/socket: factor out helpers for memory and queue manipulation Paolo Abeni
     [not found] ` <cover.1474477902.git.pabeni-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2016-09-21 17:23   ` [PATCH net-next 2/3] udp: implement memory accounting helpers Paolo Abeni
2016-09-21 23:31     ` Eric Dumazet
2016-09-22 10:33       ` Paolo Abeni
2016-09-22 15:21         ` Edward Cree
     [not found]           ` <589839b3-5930-2527-b0a3-315be254a175-s/n/eUQHGBpZroRs9YW3xA@public.gmane.org>
2016-09-22 16:14             ` Paolo Abeni
2016-09-22 16:30               ` Eric Dumazet
2016-09-22 20:27                 ` Paolo Abeni
2016-09-22 20:34                   ` Eric Dumazet
2016-09-22 20:37                     ` Eric Dumazet
2016-09-23  9:56                     ` Paolo Abeni
2016-09-21 17:23 ` [PATCH net-next 3/3] udp: use it's own memory accounting schema Paolo Abeni

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).