[PATCH] [NET] Size listen hash tables using backlog hint

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Eric Dumazet <dada1@cosmosbay.com>
To: David Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Subject: [PATCH] [NET] Size listen hash tables using backlog hint
Date: Thu, 19 Oct 2006 11:27:50 +0200	[thread overview]
Message-ID: <200610191127.50831.dada1@cosmosbay.com> (raw)
In-Reply-To: <200610171458.37636.dada1@cosmosbay.com>

[-- Attachment #1: Type: text/plain, Size: 1562 bytes --]

Hi David

Here is the second try for this patch. Many thanks for your feedback.

[PATCH] [NET] Size listen hash tables using backlog hint

We currently allocate a fixed-size hash table of 512 (TCP_SYNQ_HSIZE) slots
for each LISTEN socket, regardless of various parameters (the listen backlog,
for example).

On x86_64, this means order-1 allocations (which might fail), even for 'small'
sockets expecting few connections. Conversely, a huge server wanting a
backlog of 50000 is slowed down a bit because of this fixed limit.

This patch makes the sizing of the listen hash table a dynamic parameter,
depending on:
- net.core.somaxconn tunable (default is 128)
- net.ipv4.tcp_max_syn_backlog tunable (default : 256, 1024 or 128)
- backlog value given by user application  (2nd parameter of listen())

For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of 
kmalloc().

We still limit memory allocation with the two existing tunables (somaxconn & 
tcp_max_syn_backlog).

 include/net/request_sock.h      |    8 ++++----
 include/net/tcp.h               |    1 -
 net/core/request_sock.c         |   38 +++++++++++++++++++++++++++++---------
 net/dccp/ipv4.c                 |    2 +-
 net/dccp/proto.c                |    6 +++---
 net/ipv4/af_inet.c              |    2 +-
 net/ipv4/inet_connection_sock.c |    2 +-
 net/ipv4/tcp_ipv4.c             |    6 +++---
 net/ipv6/tcp_ipv6.c             |    2 +-
 9 files changed, 43 insertions(+), 24 deletions(-)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: size_listen_hash_table.patch --]
[-- Type: text/plain, Size: 7933 bytes --]

--- linux-2.6.19-rc2/net/core/request_sock.c	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/net/core/request_sock.c	2006-10-19 11:05:56.000000000 +0200
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/vmalloc.h>
 
 #include <net/request_sock.h>
 
@@ -29,22 +30,31 @@
  * it is absolutely not enough even at 100conn/sec. 256 cures most
  * of problems. This value is adjusted to 128 for very small machines
  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
+ * Note : Dont forget somaxconn that may limit backlog too.
  */
 int sysctl_max_syn_backlog = 256;
 
 int reqsk_queue_alloc(struct request_sock_queue *queue,
-		      const int nr_table_entries)
+		      unsigned int nr_table_entries)
 {
-	const int lopt_size = sizeof(struct listen_sock) +
-			      nr_table_entries * sizeof(struct request_sock *);
-	struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
+	size_t lopt_size = sizeof(struct listen_sock);
+	struct listen_sock *lopt;
 
+	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+	nr_table_entries = max_t(u32, nr_table_entries, 8);
+	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+	lopt_size += nr_table_entries * sizeof(struct request_sock *);
+	if (lopt_size > PAGE_SIZE)
+		lopt = __vmalloc(lopt_size,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
+	else
+		lopt = kzalloc(lopt_size, GFP_KERNEL);
 	if (lopt == NULL)
 		return -ENOMEM;
 
-	for (lopt->max_qlen_log = 6;
-	     (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+	for (lopt->max_qlen_log = 3;
+	     (1 << lopt->max_qlen_log) < nr_table_entries;
 	     lopt->max_qlen_log++);
 
 	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
@@ -52,6 +62,11 @@
 	queue->rskq_accept_head = NULL;
 	lopt->nr_table_entries = nr_table_entries;
 
+	/*
+	 * This write_lock_bh()/write_unlock_bh() pair forces this CPU to commit
+	 * its memory changes and let readers (which acquire syn_wait_lock in
+	 * reader mode) operate without seeing random content.
+	 */
 	write_lock_bh(&queue->syn_wait_lock);
 	queue->listen_opt = lopt;
 	write_unlock_bh(&queue->syn_wait_lock);
@@ -65,9 +80,11 @@
 {
 	/* make all the listen_opt local to us */
 	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+	size_t lopt_size = sizeof(struct listen_sock) +
+		lopt->nr_table_entries * sizeof(struct request_sock *);
 
 	if (lopt->qlen != 0) {
-		int i;
+		unsigned int i;
 
 		for (i = 0; i < lopt->nr_table_entries; i++) {
 			struct request_sock *req;
@@ -81,7 +98,10 @@
 	}
 
 	BUG_TRAP(lopt->qlen == 0);
-	kfree(lopt);
+	if (lopt_size > PAGE_SIZE)
+		vfree(lopt);
+	else
+		kfree(lopt);
 }
 
 EXPORT_SYMBOL(reqsk_queue_destroy);
--- linux-2.6.19-rc2/net/ipv4/af_inet.c	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/net/ipv4/af_inet.c	2006-10-17 10:32:22.000000000 +0200
@@ -204,7 +204,7 @@
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
-		err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
 	}
--- linux-2.6.19-rc2/net/ipv4/tcp_ipv4.c	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/net/ipv4/tcp_ipv4.c	2006-10-17 12:19:38.000000000 +0200
@@ -715,7 +715,7 @@
 	return dopt;
 }
 
-struct request_sock_ops tcp_request_sock_ops = {
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
 	.rtx_syn_ack	=	tcp_v4_send_synack,
@@ -1385,7 +1385,7 @@
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
 		struct request_sock *req = cur;
 
-	       	icsk = inet_csk(st->syn_wait_sk);
+		icsk = inet_csk(st->syn_wait_sk);
 		req = req->dl_next;
 		while (1) {
 			while (req) {
@@ -1395,7 +1395,7 @@
 				}
 				req = req->dl_next;
 			}
-			if (++st->sbucket >= TCP_SYNQ_HSIZE)
+			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
 				break;
 get_req:
 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
--- linux-2.6.19-rc2/net/dccp/proto.c	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/net/dccp/proto.c	2006-10-17 10:32:22.000000000 +0200
@@ -262,12 +262,12 @@
 
 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
 
-static inline int dccp_listen_start(struct sock *sk)
+static inline int dccp_listen_start(struct sock *sk, int backlog)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	dp->dccps_role = DCCP_ROLE_LISTEN;
-	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+	return inet_csk_listen_start(sk, backlog);
 }
 
 int dccp_disconnect(struct sock *sk, int flags)
@@ -788,7 +788,7 @@
 		 * FIXME: here it probably should be sk->sk_prot->listen_start
 		 * see tcp_listen_start
 		 */
-		err = dccp_listen_start(sk);
+		err = dccp_listen_start(sk, backlog);
 		if (err)
 			goto out;
 	}
--- linux-2.6.19-rc2/net/dccp/ipv4.c	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/net/dccp/ipv4.c	2006-10-17 10:44:21.000000000 +0200
@@ -1020,7 +1020,7 @@
 	kfree(inet_rsk(req)->opt);
 }
 
-static struct request_sock_ops dccp_request_sock_ops = {
+static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
 	.family		= PF_INET,
 	.obj_size	= sizeof(struct dccp_request_sock),
 	.rtx_syn_ack	= dccp_v4_send_response,
--- linux-2.6.19-rc2/net/ipv6/tcp_ipv6.c	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/net/ipv6/tcp_ipv6.c	2006-10-17 10:44:21.000000000 +0200
@@ -526,7 +526,7 @@
 		kfree_skb(inet6_rsk(req)->pktopts);
 }
 
-static struct request_sock_ops tcp6_request_sock_ops = {
+static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
 	.rtx_syn_ack	=	tcp_v6_send_synack,
--- linux-2.6.19-rc2/net/ipv4/inet_connection_sock.c	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/net/ipv4/inet_connection_sock.c	2006-10-19 10:51:26.000000000 +0200
@@ -343,7 +343,7 @@
 EXPORT_SYMBOL_GPL(inet_csk_route_req);
 
 static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
-				 const u32 rnd, const u16 synq_hsize)
+				 const u32 rnd, const u32 synq_hsize)
 {
 	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
 }
--- linux-2.6.19-rc2/include/net/tcp.h	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/include/net/tcp.h	2006-10-17 10:51:51.000000000 +0200
@@ -138,7 +138,6 @@
 #define MAX_TCP_SYNCNT		127
 
 #define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */
-#define TCP_SYNQ_HSIZE		512	/* Size of SYNACK hash table */
 
 #define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
 #define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
--- linux-2.6.19-rc2/include/net/request_sock.h	2006-10-13 18:25:04.000000000 +0200
+++ linux-2.6.19-rc2-ed/include/net/request_sock.h	2006-10-17 12:33:18.000000000 +0200
@@ -28,8 +28,8 @@
 
 struct request_sock_ops {
 	int		family;
-	kmem_cache_t	*slab;
 	int		obj_size;
+	kmem_cache_t	*slab;
 	int		(*rtx_syn_ack)(struct sock *sk,
 				       struct request_sock *req,
 				       struct dst_entry *dst);
@@ -51,12 +51,12 @@
 	u32				rcv_wnd;	  /* rcv_wnd offered first time */
 	u32				ts_recent;
 	unsigned long			expires;
-	struct request_sock_ops		*rsk_ops;
+	const struct request_sock_ops		*rsk_ops;
 	struct sock			*sk;
 	u32				secid;
 };
 
-static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
+static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
 {
 	struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
 
@@ -120,7 +120,7 @@
 };
 
 extern int reqsk_queue_alloc(struct request_sock_queue *queue,
-			     const int nr_table_entries);
+			     unsigned int nr_table_entries);
 
 static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
 {

next prev parent reply	other threads:[~2006-10-19  9:27 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-10-17  0:53 [PATCH] Bound TSO defer time (resend) John Heffner
2006-10-17  3:20 ` Stephen Hemminger
2006-10-17  4:18   ` John Heffner
2006-10-17  5:35     ` David Miller
2006-10-17 12:22       ` John Heffner
2006-10-19  3:39         ` David Miller
2006-10-17 12:58       ` [PATCH] [NET] Size listen hash tables using backlog hint Eric Dumazet Hi
2006-10-18  7:38         ` [PATCH] [NET] inet_peer : group together avl_left, avl_right, v4daddr to speedup lookups on some CPUS Eric Dumazet
2006-10-18 16:35           ` [PATCH] [NET] reduce per cpu ram used for loopback stats Eric Dumazet
2006-10-18 17:00             ` [PATCH, resent] " Eric Dumazet
2006-10-19  3:53               ` David Miller
2006-10-19  3:53             ` [PATCH] " David Miller
2006-10-19  3:44           ` [PATCH] [NET] inet_peer : group together avl_left, avl_right, v4daddr to speedup lookups on some CPUS David Miller
2006-10-19 10:57           ` Eric Dumazet
2006-10-19 15:45             ` [PATCH] [NET] One NET_INC_STATS() could be NET_INC_STATS_BH in tcp_v4_err() Eric Dumazet
2006-10-20  7:22               ` David Miller
2006-10-20 14:21                 ` Arnaldo Carvalho de Melo
2006-10-20  7:28             ` [PATCH] [NET] inet_peer : group together avl_left, avl_right, v4daddr to speedup lookups on some CPUS David Miller
2006-10-19  3:31         ` [PATCH] [NET] Size listen hash tables using backlog hint David Miller
2006-10-19  4:54           ` Stephen Hemminger
2006-10-19  5:08             ` David Miller
2006-10-19  5:12           ` Eric Dumazet
2006-10-19  6:12             ` David Miller
2006-10-19  6:34               ` Eric Dumazet
2006-10-19  6:57                 ` David Miller
2006-10-19  8:29                   ` Eric Dumazet
2006-10-19  8:41                     ` David Miller
2006-10-19  9:11                       ` Eric Dumazet
2006-10-19  9:27         ` Eric Dumazet [this message]
2006-10-20  7:27           ` David Miller
2006-10-18 15:37     ` [PATCH] Bound TSO defer time (resend) Andi Kleen
2006-10-18 16:40       ` Stephen Hemminger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200610191127.50831.dada1@cosmosbay.com \
    --to=dada1@cosmosbay.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).