* [PATCH] IPVS: Allow boot time change of hash size. @ 2008-11-26 13:36 Catalin(ux) M. BOIE 2008-11-26 14:40 ` Joseph Mack NA3T 0 siblings, 1 reply; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2008-11-26 13:36 UTC (permalink / raw) To: netdev; +Cc: lvs-devel I was very frustrated about the fact that I have to recompile the kernel to change the hash size. So, I created this patch. If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel command line, or, if you built IPVS as modules, you can add options ip_vs conn_tab_bits=??. To keep everything backward compatible, you still can select the size at compile time, and that will be used as default. Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> --- include/net/ip_vs.h | 16 ++++---------- net/netfilter/ipvs/Kconfig | 4 +++ net/netfilter/ipvs/ip_vs_conn.c | 41 ++++++++++++++++++++++++++++---------- net/netfilter/ipvs/ip_vs_ctl.c | 8 +++--- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index fe9fcf7..5a788a4 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -26,6 +26,11 @@ #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> /* for ipv6_addr_copy */ + +/* Connections' size value needed by ip_vs_ctl.c */ +extern int ip_vs_conn_tab_size; + + struct ip_vs_iphdr { int len; __u8 protocol; @@ -599,17 +604,6 @@ extern void ip_vs_init_hash_table(struct list_head *table, int rows); * (from ip_vs_conn.c) */ -/* - * IPVS connection entry hash table - */ -#ifndef CONFIG_IP_VS_TAB_BITS -#define CONFIG_IP_VS_TAB_BITS 12 -#endif - -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS -#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) -#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) - enum { IP_VS_DIR_INPUT = 0, IP_VS_DIR_OUTPUT, diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 79a6980..c71e543 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. + You can overwrite this number setting conn_tab_bits module parameter + or by appending ip_vs.conn_tab_bits=? to the kernel command line + if IP VS was compiled built-in. + comment "IPVS transport protocol load balancing support" config IP_VS_PROTO_TCP diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9a24332..b1462f1 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -37,6 +37,21 @@ #include <net/ip_vs.h> +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif + +/* + * Connection hash size. Default is what was selected at compile time. +*/ +int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size; +int ip_vs_conn_tab_mask; + /* * Connection hash table: for input and output packets lookups of IPVS */ @@ -122,11 +137,11 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, if (af == AF_INET6) return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; + & ip_vs_conn_tab_mask; #endif return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; + & ip_vs_conn_tab_mask; } @@ -752,7 +767,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) int idx; struct ip_vs_conn *cp; - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + for(idx = 0; idx < ip_vs_conn_tab_size; idx++) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { @@ -789,7 +804,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) idx = l - ip_vs_conn_tab; ct_read_unlock_bh(idx); - while (++idx < IP_VS_CONN_TAB_SIZE) { + while (++idx < ip_vs_conn_tab_size) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { seq->private = &ip_vs_conn_tab[idx]; @@ -972,8 +987,8 @@ void ip_vs_random_dropentry(void) /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned hash = net_random() & ip_vs_conn_tab_mask; /* * Lock is actually needed in this loop. @@ -1025,7 +1040,7 @@ static void ip_vs_conn_flush(void) struct ip_vs_conn *cp; flush_again: - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { + for (idx=0; idx<ip_vs_conn_tab_size; idx++) { /* * Lock is actually needed in this loop. */ @@ -1056,10 +1071,14 @@ int __init ip_vs_conn_init(void) { int idx; + /* Compute size and mask */ + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; + /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(struct list_head)); if (!ip_vs_conn_tab) return -ENOMEM; @@ -1074,12 +1093,12 @@ int __init ip_vs_conn_init(void) IP_VS_INFO("Connection hash table configured " "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + ip_vs_conn_tab_size, + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0302cf3..6dcadc2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1854,7 +1854,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, @@ -2385,7 +2385,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; @@ -2398,7 +2398,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; + info.size = ip_vs_conn_tab_size; info.num_services = ip_vs_num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; @@ -3238,7 +3238,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) case IPVS_CMD_GET_INFO: NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - IP_VS_CONN_TAB_SIZE); + ip_vs_conn_tab_size); break; } -- 1.5.6.5 ^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-26 13:36 [PATCH] IPVS: Allow boot time change of hash size Catalin(ux) M. BOIE @ 2008-11-26 14:40 ` Joseph Mack NA3T 2008-11-26 23:27 ` David Miller 2008-11-27 6:58 ` Catalin(ux) M. BOIE 0 siblings, 2 replies; 23+ messages in thread From: Joseph Mack NA3T @ 2008-11-26 14:40 UTC (permalink / raw) To: Catalin(ux) M. BOIE; +Cc: netdev, lvs-devel On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote: > I was very frustrated about the fact that I have to recompile the kernel > to change the hash size. So, I created this patch. thanks for sending us the code. Why do you need to change the hash size? We really don't recommend anyone do this under normal circumstances Thanks Joe > If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel > command line, or, if you built IPVS as modules, you can add > options ip_vs conn_tab_bits=??. > To keep everything backward compatible, you still can select the size at > compile time, and that will be used as default. > > Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> > --- > include/net/ip_vs.h | 16 ++++---------- > net/netfilter/ipvs/Kconfig | 4 +++ > net/netfilter/ipvs/ip_vs_conn.c | 41 ++++++++++++++++++++++++++++---------- > net/netfilter/ipvs/ip_vs_ctl.c | 8 +++--- > 4 files changed, 43 insertions(+), 26 deletions(-) > > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h > index fe9fcf7..5a788a4 100644 > --- a/include/net/ip_vs.h > +++ b/include/net/ip_vs.h > @@ -26,6 +26,11 @@ > #include <linux/ipv6.h> /* for struct ipv6hdr */ > #include <net/ipv6.h> /* for ipv6_addr_copy */ > > + > +/* Connections' size value needed by ip_vs_ctl.c */ > +extern int ip_vs_conn_tab_size; > + > + > struct ip_vs_iphdr { > int len; > __u8 protocol; > @@ -599,17 +604,6 @@ extern void ip_vs_init_hash_table(struct list_head *table, int rows); > * (from ip_vs_conn.c) > */ > > -/* > - * IPVS connection entry hash table > - */ > -#ifndef CONFIG_IP_VS_TAB_BITS > -#define CONFIG_IP_VS_TAB_BITS 12 > -#endif > - > -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS > -#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) > -#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) > - > enum { > IP_VS_DIR_INPUT = 0, > IP_VS_DIR_OUTPUT, > diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig > index 79a6980..c71e543 100644 > --- a/net/netfilter/ipvs/Kconfig > +++ b/net/netfilter/ipvs/Kconfig > @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS > each hash entry uses 8 bytes, so you can estimate how much memory is > needed for your box. > > + You can overwrite this number setting conn_tab_bits module parameter > + or by appending ip_vs.conn_tab_bits=? to the kernel command line > + if IP VS was compiled built-in. > + > comment "IPVS transport protocol load balancing support" > > config IP_VS_PROTO_TCP > diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c > index 9a24332..b1462f1 100644 > --- a/net/netfilter/ipvs/ip_vs_conn.c > +++ b/net/netfilter/ipvs/ip_vs_conn.c > @@ -37,6 +37,21 @@ > #include <net/ip_vs.h> > > > +#ifndef CONFIG_IP_VS_TAB_BITS > +#define CONFIG_IP_VS_TAB_BITS 12 > +#endif > + > +/* > + * Connection hash size. Default is what was selected at compile time. > +*/ > +int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; > +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); > +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); > + > +/* size and mask values */ > +int ip_vs_conn_tab_size; > +int ip_vs_conn_tab_mask; > + > /* > * Connection hash table: for input and output packets lookups of IPVS > */ > @@ -122,11 +137,11 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, > if (af == AF_INET6) > return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), > (__force u32)port, proto, ip_vs_conn_rnd) > - & IP_VS_CONN_TAB_MASK; > + & ip_vs_conn_tab_mask; > #endif > return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, > ip_vs_conn_rnd) > - & IP_VS_CONN_TAB_MASK; > + & ip_vs_conn_tab_mask; > } > > > @@ -752,7 +767,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) > int idx; > struct ip_vs_conn *cp; > > - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { > + for(idx = 0; idx < ip_vs_conn_tab_size; idx++) { > ct_read_lock_bh(idx); > list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { > if (pos-- == 0) { > @@ -789,7 +804,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) > idx = l - ip_vs_conn_tab; > ct_read_unlock_bh(idx); > > - while (++idx < IP_VS_CONN_TAB_SIZE) { > + while (++idx < ip_vs_conn_tab_size) { > ct_read_lock_bh(idx); > list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { > seq->private = &ip_vs_conn_tab[idx]; > @@ -972,8 +987,8 @@ void ip_vs_random_dropentry(void) > /* > * Randomly scan 1/32 of the whole table every second > */ > - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { > - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; > + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { > + unsigned hash = net_random() & ip_vs_conn_tab_mask; > > /* > * Lock is actually needed in this loop. > @@ -1025,7 +1040,7 @@ static void ip_vs_conn_flush(void) > struct ip_vs_conn *cp; > > flush_again: > - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { > + for (idx=0; idx<ip_vs_conn_tab_size; idx++) { > /* > * Lock is actually needed in this loop. > */ > @@ -1056,10 +1071,14 @@ int __init ip_vs_conn_init(void) > { > int idx; > > + /* Compute size and mask */ > + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; > + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; > + > /* > * Allocate the connection hash table and initialize its list heads > */ > - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); > + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(struct list_head)); > if (!ip_vs_conn_tab) > return -ENOMEM; > > @@ -1074,12 +1093,12 @@ int __init ip_vs_conn_init(void) > > IP_VS_INFO("Connection hash table configured " > "(size=%d, memory=%ldKbytes)\n", > - IP_VS_CONN_TAB_SIZE, > - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); > + ip_vs_conn_tab_size, > + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); > IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", > sizeof(struct ip_vs_conn)); > > - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { > + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { > INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); > } > > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c > index 0302cf3..6dcadc2 100644 > --- a/net/netfilter/ipvs/ip_vs_ctl.c > +++ b/net/netfilter/ipvs/ip_vs_ctl.c > @@ -1854,7 +1854,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) > if (v == SEQ_START_TOKEN) { > seq_printf(seq, > "IP Virtual Server version %d.%d.%d (size=%d)\n", > - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); > + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); > seq_puts(seq, > "Prot LocalAddress:Port Scheduler Flags\n"); > seq_puts(seq, > @@ -2385,7 +2385,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) > char buf[64]; > > sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", > - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); > + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); > if (copy_to_user(user, buf, strlen(buf)+1) != 0) { > ret = -EFAULT; > goto out; > @@ -2398,7 +2398,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) > { > struct ip_vs_getinfo info; > info.version = IP_VS_VERSION_CODE; > - info.size = IP_VS_CONN_TAB_SIZE; > + info.size = ip_vs_conn_tab_size; > info.num_services = ip_vs_num_services; > if (copy_to_user(user, &info, sizeof(info)) != 0) > ret = -EFAULT; > @@ -3238,7 +3238,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) > case IPVS_CMD_GET_INFO: > NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); > NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, > - IP_VS_CONN_TAB_SIZE); > + ip_vs_conn_tab_size); > break; > } > > -- Joseph Mack NA3T EME(B,D), FM05lw North Carolina jmack (at) wm7d (dot) net - azimuthal equidistant map generator at http://www.wm7d.net/azproj.shtml Homepage http://www.austintek.com/ It's GNU/Linux! ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-26 14:40 ` Joseph Mack NA3T @ 2008-11-26 23:27 ` David Miller 2008-11-27 7:05 ` Catalin(ux) M. BOIE 2008-11-27 6:58 ` Catalin(ux) M. BOIE 1 sibling, 1 reply; 23+ messages in thread From: David Miller @ 2008-11-26 23:27 UTC (permalink / raw) To: jmack; +Cc: catab, netdev, lvs-devel From: Joseph Mack NA3T <jmack@wm7d.net> Date: Wed, 26 Nov 2008 06:40:02 -0800 (PST) > On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote: > > > I was very frustrated about the fact that I have to recompile the kernel > > to change the hash size. So, I created this patch. > > thanks for sending us the code. > > Why do you need to change the hash size? We really don't recommend anyone do this under normal circumstances In any event, this should be dynamically sized at run time. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-26 23:27 ` David Miller @ 2008-11-27 7:05 ` Catalin(ux) M. BOIE 2008-11-27 7:37 ` David Miller 0 siblings, 1 reply; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2008-11-27 7:05 UTC (permalink / raw) To: David Miller; +Cc: jmack, netdev, lvs-devel Hello! > From: Joseph Mack NA3T <jmack@wm7d.net> > Date: Wed, 26 Nov 2008 06:40:02 -0800 (PST) > >> On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote: >> >> > I was very frustrated about the fact that I have to recompile the >> kernel >> > to change the hash size. So, I created this patch. >> >> thanks for sending us the code. >> >> Why do you need to change the hash size? We really don't recommend >> anyone do this under normal circumstances > > In any event, this should be dynamically sized at run time. David, thank you very much for feedback. Do you mean that it should be computed based on memory size or you mean that it should be changeable on-the-fly, even when we have some connections in the tables already? Because I really need this now, can we, pretty please, apply it as it is because is an improvement over what we have now? And, I promise that I will make it changeable at runtime. Deal? :) Or you do not want to have another legacy around? Do you have some hints (where to look for a similar thing)? The thing that worries me a little is the locking around the move and maybe the latency involved. Yes, of course you will not change it several times per minute, but... Thank you very much! -- Catalin(ux) M. BOIE http://kernel.embedromix.ro/ ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-27 7:05 ` Catalin(ux) M. BOIE @ 2008-11-27 7:37 ` David Miller 0 siblings, 0 replies; 23+ messages in thread From: David Miller @ 2008-11-27 7:37 UTC (permalink / raw) To: catab; +Cc: jmack, netdev, lvs-devel From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> Date: Thu, 27 Nov 2008 00:05:50 -0700 (MST) > Do you mean that it should be computed based on memory size or you mean > that it should be changeable on-the-fly, even when we have some > connections in the tables already? I mean the latter. We do this in other subsystems, such as IPSEC, for various hashes. > Do you have some hints (where to look for a similar thing)? > The thing that worries me a little is the locking around the move and > maybe the latency involved. Yes, of course you will not change it several > times per minute, but... You only increase, never decrease. A lock is held during every access to these hash tables, so the locking should be very similar. Have a look at xfrm_hash_resize() in net/xfrm/xfrm_state.c to get a good idea on how this stuff can be done. And hey, that code resizes 3 tables at once, you only need to handle one :) ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-26 14:40 ` Joseph Mack NA3T 2008-11-26 23:27 ` David Miller @ 2008-11-27 6:58 ` Catalin(ux) M. BOIE 2008-11-27 15:58 ` Joseph Mack NA3T 1 sibling, 1 reply; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2008-11-27 6:58 UTC (permalink / raw) To: Joseph Mack NA3T; +Cc: netdev, lvs-devel Hello! > On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote: > >> I was very frustrated about the fact that I have to recompile the kernel >> to change the hash size. So, I created this patch. > > thanks for sending us the code. > > Why do you need to change the hash size? We really don't > recommend anyone do this under normal circumstances As it is written in the help, to lower the collisions in the case of a lot of concurrent connections. Or am I missing something? > Thanks Joe > >> If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel >> command line, or, if you built IPVS as modules, you can add >> options ip_vs conn_tab_bits=??. >> To keep everything backward compatible, you still can select the size at >> compile time, and that will be used as default. >> >> Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> >> --- >> include/net/ip_vs.h | 16 ++++---------- >> net/netfilter/ipvs/Kconfig | 4 +++ >> net/netfilter/ipvs/ip_vs_conn.c | 41 >> ++++++++++++++++++++++++++++---------- >> net/netfilter/ipvs/ip_vs_ctl.c | 8 +++--- >> 4 files changed, 43 insertions(+), 26 deletions(-) >> >> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h >> index fe9fcf7..5a788a4 100644 >> --- a/include/net/ip_vs.h >> +++ b/include/net/ip_vs.h >> @@ -26,6 +26,11 @@ >> #include <linux/ipv6.h> /* for struct ipv6hdr */ >> #include <net/ipv6.h> /* for ipv6_addr_copy */ >> >> + >> +/* Connections' size value needed by ip_vs_ctl.c */ >> +extern int ip_vs_conn_tab_size; >> + >> + >> struct ip_vs_iphdr { >> int len; >> __u8 protocol; >> @@ -599,17 +604,6 @@ extern void ip_vs_init_hash_table(struct list_head >> *table, int rows); >> * (from ip_vs_conn.c) >> */ >> >> -/* >> - * IPVS connection entry hash table >> - */ >> -#ifndef CONFIG_IP_VS_TAB_BITS >> -#define CONFIG_IP_VS_TAB_BITS 12 >> -#endif >> - >> -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS >> -#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) >> -#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) >> - >> enum { >> IP_VS_DIR_INPUT = 0, >> IP_VS_DIR_OUTPUT, >> diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig >> index 79a6980..c71e543 100644 >> --- a/net/netfilter/ipvs/Kconfig >> +++ b/net/netfilter/ipvs/Kconfig >> @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS >> each hash entry uses 8 bytes, so you can estimate how much memory is >> needed for your box. >> >> + You can overwrite this number setting conn_tab_bits module parameter >> + or by appending ip_vs.conn_tab_bits=? to the kernel command line >> + if IP VS was compiled built-in. >> + >> comment "IPVS transport protocol load balancing support" >> >> config IP_VS_PROTO_TCP >> diff --git a/net/netfilter/ipvs/ip_vs_conn.c >> b/net/netfilter/ipvs/ip_vs_conn.c >> index 9a24332..b1462f1 100644 >> --- a/net/netfilter/ipvs/ip_vs_conn.c >> +++ b/net/netfilter/ipvs/ip_vs_conn.c >> @@ -37,6 +37,21 @@ >> #include <net/ip_vs.h> >> >> >> +#ifndef CONFIG_IP_VS_TAB_BITS >> +#define CONFIG_IP_VS_TAB_BITS 12 >> +#endif >> + >> +/* >> + * Connection hash size. Default is what was selected at compile time. >> +*/ >> +int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; >> +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); >> +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); >> + >> +/* size and mask values */ >> +int ip_vs_conn_tab_size; >> +int ip_vs_conn_tab_mask; >> + >> /* >> * Connection hash table: for input and output packets lookups of IPVS >> */ >> @@ -122,11 +137,11 @@ static unsigned int ip_vs_conn_hashkey(int af, >> unsigned proto, >> if (af == AF_INET6) >> return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), >> (__force u32)port, proto, ip_vs_conn_rnd) >> - & IP_VS_CONN_TAB_MASK; >> + & ip_vs_conn_tab_mask; >> #endif >> return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, >> ip_vs_conn_rnd) >> - & IP_VS_CONN_TAB_MASK; >> + & ip_vs_conn_tab_mask; >> } >> >> >> @@ -752,7 +767,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, >> loff_t pos) >> int idx; >> struct ip_vs_conn *cp; >> >> - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { >> + for(idx = 0; idx < ip_vs_conn_tab_size; idx++) { >> ct_read_lock_bh(idx); >> list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { >> if (pos-- == 0) { >> @@ -789,7 +804,7 @@ static void *ip_vs_conn_seq_next(struct seq_file >> *seq, void *v, loff_t *pos) >> idx = l - ip_vs_conn_tab; >> ct_read_unlock_bh(idx); >> >> - while (++idx < IP_VS_CONN_TAB_SIZE) { >> + while (++idx < ip_vs_conn_tab_size) { >> ct_read_lock_bh(idx); >> list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { >> seq->private = &ip_vs_conn_tab[idx]; >> @@ -972,8 +987,8 @@ void ip_vs_random_dropentry(void) >> /* >> * Randomly scan 1/32 of the whole table every second >> */ >> - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { >> - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; >> + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { >> + unsigned hash = net_random() & ip_vs_conn_tab_mask; >> >> /* >> * Lock is actually needed in this loop. >> @@ -1025,7 +1040,7 @@ static void ip_vs_conn_flush(void) >> struct ip_vs_conn *cp; >> >> flush_again: >> - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { >> + for (idx=0; idx<ip_vs_conn_tab_size; idx++) { >> /* >> * Lock is actually needed in this loop. >> */ >> @@ -1056,10 +1071,14 @@ int __init ip_vs_conn_init(void) >> { >> int idx; >> >> + /* Compute size and mask */ >> + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; >> + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; >> + >> /* >> * Allocate the connection hash table and initialize its list heads >> */ >> - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct >> list_head)); >> + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(struct >> list_head)); >> if (!ip_vs_conn_tab) >> return -ENOMEM; >> >> @@ -1074,12 +1093,12 @@ int __init ip_vs_conn_init(void) >> >> IP_VS_INFO("Connection hash table configured " >> "(size=%d, memory=%ldKbytes)\n", >> - IP_VS_CONN_TAB_SIZE, >> - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); >> + ip_vs_conn_tab_size, >> + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); >> IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", >> sizeof(struct ip_vs_conn)); >> >> - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { >> + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { >> INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); >> } >> >> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c >> b/net/netfilter/ipvs/ip_vs_ctl.c >> index 0302cf3..6dcadc2 100644 >> --- a/net/netfilter/ipvs/ip_vs_ctl.c >> +++ b/net/netfilter/ipvs/ip_vs_ctl.c >> @@ -1854,7 +1854,7 @@ static int ip_vs_info_seq_show(struct seq_file >> *seq, void *v) >> if (v == SEQ_START_TOKEN) { >> seq_printf(seq, >> "IP Virtual Server version %d.%d.%d (size=%d)\n", >> - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); >> + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); >> seq_puts(seq, >> "Prot LocalAddress:Port Scheduler Flags\n"); >> seq_puts(seq, >> @@ -2385,7 +2385,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void >> __user *user, int *len) >> char buf[64]; >> >> sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", >> - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); >> + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); >> if (copy_to_user(user, buf, strlen(buf)+1) != 0) { >> ret = -EFAULT; >> goto out; >> @@ -2398,7 +2398,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void >> __user *user, int *len) >> { >> struct ip_vs_getinfo info; >> info.version = IP_VS_VERSION_CODE; >> - info.size = IP_VS_CONN_TAB_SIZE; >> + info.size = ip_vs_conn_tab_size; >> info.num_services = ip_vs_num_services; >> if (copy_to_user(user, &info, sizeof(info)) != 0) >> ret = -EFAULT; >> @@ -3238,7 +3238,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, >> struct genl_info *info) >> case IPVS_CMD_GET_INFO: >> NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); >> NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, >> - IP_VS_CONN_TAB_SIZE); >> + ip_vs_conn_tab_size); >> break; >> } >> >> > > -- > Joseph Mack NA3T EME(B,D), FM05lw North Carolina > jmack (at) wm7d (dot) net - azimuthal equidistant map > generator at http://www.wm7d.net/azproj.shtml > Homepage http://www.austintek.com/ It's GNU/Linux! > -- Catalin(ux) M. BOIE http://kernel.embedromix.ro/ ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-27 6:58 ` Catalin(ux) M. BOIE @ 2008-11-27 15:58 ` Joseph Mack NA3T 2008-11-28 8:49 ` Catalin(ux) M. BOIE 0 siblings, 1 reply; 23+ messages in thread From: Joseph Mack NA3T @ 2008-11-27 15:58 UTC (permalink / raw) To: Catalin(ux) M. BOIE; +Cc: netdev, lvs-devel On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote: >> Why do you need to change the hash size? We really don't >> recommend anyone do this under normal circumstances > > As it is written in the help, to lower the collisions in > the case of a lot of concurrent connections. Or am I > missing something? not sure how big the collision problem is (have you measured an effect?) but Ratz put out the results of a back of the envelope calculation that the current value for the hash table size would work on the currect fastest machines with 3G of memory Joe -- Joseph Mack NA3T EME(B,D), FM05lw North Carolina jmack (at) wm7d (dot) net - azimuthal equidistant map generator at http://www.wm7d.net/azproj.shtml Homepage http://www.austintek.com/ It's GNU/Linux! ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-27 15:58 ` Joseph Mack NA3T @ 2008-11-28 8:49 ` Catalin(ux) M. BOIE 2008-11-28 14:55 ` Joseph Mack NA3T 0 siblings, 1 reply; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2008-11-28 8:49 UTC (permalink / raw) To: Joseph Mack NA3T; +Cc: Catalin M. BOIE, netdev, lvs-devel > On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote: > >>> Why do you need to change the hash size? We really don't >>> recommend anyone do this under normal circumstances >> >> As it is written in the help, to lower the collisions in >> the case of a lot of concurrent connections. Or am I >> missing something? > > not sure how big the collision problem is (have you measured > an effect?) but Ratz put out the results of a back of the > envelope calculation that the current value for the hash > table size would work on the currect fastest machines with > 3G of memory Hello, Joe! No, I did not measure it, but, I read the help text: Note the table size must be power of 2. The table size will be the value of 2 to the your input number power. The number to choose is from 8 to 20, the default number is 12, which means the table size is 4096. Don't input the number too small, otherwise you will lose performance on it. You can adapt the table size yourself, according to your virtual server application. It is good to set the table size not far less than the number of connections per second multiplying average lasting time of connection in the table. For example, your virtual server gets 200 connections per second, the connection lasts for 200 seconds in average in the connection table, the table size should be not far less than 200x200, it is good to set the table size 32768 (2**15). The help is wrong or I am missing something? > Joe > > -- > Joseph Mack NA3T EME(B,D), FM05lw North Carolina > jmack (at) wm7d (dot) net - azimuthal equidistant map > generator at http://www.wm7d.net/azproj.shtml > Homepage http://www.austintek.com/ It's GNU/Linux! > -- Catalin(ux) M. BOIE http://kernel.embedromix.ro/ ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-28 8:49 ` Catalin(ux) M. BOIE @ 2008-11-28 14:55 ` Joseph Mack NA3T 2008-12-02 15:34 ` Catalin(ux) M. BOIE 0 siblings, 1 reply; 23+ messages in thread From: Joseph Mack NA3T @ 2008-11-28 14:55 UTC (permalink / raw) To: Catalin(ux) M. BOIE; +Cc: netdev, lvs-devel On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote: > Hello, Joe! > > No, I did not measure it, but, I read the help text: . . > The help is wrong or I am missing something? there's a bit somewhere else in the same section saying not to change the hash size unless you know more than we do :-) Joe -- Joseph Mack NA3T EME(B,D), FM05lw North Carolina jmack (at) wm7d (dot) net - azimuthal equidistant map generator at http://www.wm7d.net/azproj.shtml Homepage http://www.austintek.com/ It's GNU/Linux! ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-11-28 14:55 ` Joseph Mack NA3T @ 2008-12-02 15:34 ` Catalin(ux) M. BOIE 2008-12-02 22:51 ` David Miller 0 siblings, 1 reply; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2008-12-02 15:34 UTC (permalink / raw) To: Joseph Mack NA3T; +Cc: netdev, lvs-devel Hello, Joe! > On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote: > >> Hello, Joe! >> >> No, I did not measure it, but, I read the help text: > . > . >> The help is wrong or I am missing something? > > there's a bit somewhere else in the same section saying not > to change the hash size unless you know more than we do :-) Where, exactly? So, should I not change it at all even if I have a great number of simultaneously connections? > Joe > > -- > Joseph Mack NA3T EME(B,D), FM05lw North Carolina > jmack (at) wm7d (dot) net - azimuthal equidistant map > generator at http://www.wm7d.net/azproj.shtml > Homepage http://www.austintek.com/ It's GNU/Linux! > -- Catalin(ux) M. BOIE http://kernel.embedromix.ro/ ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-12-02 15:34 ` Catalin(ux) M. BOIE @ 2008-12-02 22:51 ` David Miller 2008-12-02 23:16 ` Catalin(ux) M. BOIE 0 siblings, 1 reply; 23+ messages in thread From: David Miller @ 2008-12-02 22:51 UTC (permalink / raw) To: catab; +Cc: jmack, netdev, lvs-devel From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> Date: Tue, 2 Dec 2008 08:34:22 -0700 (MST) > Hello, Joe! > > > On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote: > > > >> Hello, Joe! > >> > >> No, I did not measure it, but, I read the help text: > > . > > . > >> The help is wrong or I am missing something? > > > > there's a bit somewhere else in the same section saying not > > to change the hash size unless you know more than we do :-) > > Where, exactly? > > So, should I not change it at all even if I have a great number of > simultaneously connections? You should only ever change something like this if you actually observe a specific performance problem. This is the part that is driving everybody crazy about your report. You seem to be changing this without having observed a measurable performance problem first, and then tracked it down specifically to this hash table's size. You seem to want to change it because it seems to you like that is what should be done. You don't really know if it even matters or not for your workload. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-12-02 22:51 ` David Miller @ 2008-12-02 23:16 ` Catalin(ux) M. BOIE 2008-12-03 0:37 ` David Miller 2008-12-03 21:11 ` Graeme Fowler 0 siblings, 2 replies; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2008-12-02 23:16 UTC (permalink / raw) To: David Miller; +Cc: jmack, netdev, lvs-devel > From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> > Date: Tue, 2 Dec 2008 08:34:22 -0700 (MST) > >> Hello, Joe! >> >> > On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote: >> > >> >> Hello, Joe! >> >> >> >> No, I did not measure it, but, I read the help text: >> > . >> > . >> >> The help is wrong or I am missing something? >> > >> > there's a bit somewhere else in the same section saying not >> > to change the hash size unless you know more than we do :-) >> >> Where, exactly? >> >> So, should I not change it at all even if I have a great number of >> simultaneously connections? Hello, David! > You should only ever change something like this if you actually > observe a specific performance problem. I have a need, I didn't wake up some day and I dreamed to change this. I have a gateway with LVS and I have 4 web servers behind. I saw the help text at that option and I saw that I could raise the limit. > This is the part that is driving everybody crazy about your > report. You seem to be changing this without having observed > a measurable performance problem first, and then tracked it down > specifically to this hash table's size. I was looking for anything that could get me past of 88.000 request per seconds. The help text told me to raise that value if I have big number of connections. I just needed an easy way to test. > You seem to want to change it because it seems to you like that is > what should be done. You don't really know if it even matters or not > for your workload. Without easy testing (boot time change not compile time change) is hard to tell if it helps me or not. Anyway, if the help text is wrong, let's correct it. If it is correct, let's allow changing that value at runtime, so people can easy juggle with it. Do you agree, David? Thank you. -- Catalin(ux) M. BOIE http://kernel.embedromix.ro/ ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-12-02 23:16 ` Catalin(ux) M. BOIE @ 2008-12-03 0:37 ` David Miller 2009-12-28 18:49 ` Mark Bergsma 2008-12-03 21:11 ` Graeme Fowler 1 sibling, 1 reply; 23+ messages in thread From: David Miller @ 2008-12-03 0:37 UTC (permalink / raw) To: catab; +Cc: jmack, netdev, lvs-devel From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST) > > From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> > > Date: Tue, 2 Dec 2008 08:34:22 -0700 (MST) > > > > This is the part that is driving everybody crazy about your > > report. You seem to be changing this without having observed > > a measurable performance problem first, and then tracked it down > > specifically to this hash table's size. > > I was looking for anything that could get me past of 88.000 request per > seconds. > The help text told me to raise that value if I have big number of > connections. I just needed an easy way to test. You're just repeating what I said, you "think" it should be changed and as a result you are wasting everyones time. You don't actually "know", you're just guessing using random snippets from documentation rather than good hard evidence of a need. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-12-03 0:37 ` David Miller @ 2009-12-28 18:49 ` Mark Bergsma 2009-12-29 1:34 ` Simon Horman 2010-01-05 0:20 ` Simon Horman 0 siblings, 2 replies; 23+ messages in thread From: Mark Bergsma @ 2009-12-28 18:49 UTC (permalink / raw) To: netdev On 03-12-08 01:37, David Miller wrote: > From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> > Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST) >> I was looking for anything that could get me past of 88.000 request per >> seconds. >> The help text told me to raise that value if I have big number of >> connections. I just needed an easy way to test. > > You're just repeating what I said, you "think" it should be > changed and as a result you are wasting everyones time. > > You don't actually "know", you're just guessing using random > snippets from documentation rather than good hard evidence of > a need. Hello, I just found this year-old thread about a patch allowing the IPVS connection hash table size to be set at load time by a module parameter. Apparently the conclusion reached was that allowing this configuration setting to be changed would be useless, and that the poster's performance problems would likely lie elsewhere, since he had no evidence it was caused by the hash table size. We do however run into the same problem with the default setting (2^12 = 4096 entries), as most of our LVS balancers handle around a million connections/SLAB entries at any point in time (around 100-150 kpps load). With only 4096 hash table entries this implies that each entry consists of a linked list of 256 connections *on average*. To provide some statistics, I did an oprofile run on an 2.6.31 kernel, with both the default 4096 table size, and the same kernel recompiled with IP_VS_CONN_TAB_BITS set to 18 (2^18 = 262144 entries). I built a quick test setup with a part of Wikimedia/Wikipedia's live traffic mirrored by the switch to the test host. With the default setting, at ~ 120 kpps packet load we saw a typical %si CPU usage of around 30-35%, and oprofile reported a hot spot in ip_vs_conn_in_get: samples % image name app name symbol name 1719761 42.3741 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 302577 7.4554 bnx2 bnx2 /bnx2 181984 4.4840 vmlinux vmlinux __ticket_spin_lock 128636 3.1695 vmlinux vmlinux ip_route_input 74345 1.8318 ip_vs.ko ip_vs.ko ip_vs_conn_out_get 68482 1.6874 vmlinux vmlinux mwait_idle After loading the recompiled kernel with 2^18 entries, %si CPU usage dropped in half to around 12-18%, and oprofile looks much healthier, with only 7% spent in ip_vs_conn_in_get: samples % image name app name symbol name 265641 14.4616 bnx2 bnx2 /bnx2 143251 7.7986 vmlinux vmlinux __ticket_spin_lock 140661 7.6576 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 94364 5.1372 vmlinux vmlinux mwait_idle 86267 4.6964 vmlinux vmlinux ip_route_input So yes, having the table size as an ip_vs module parameter would be *very* welcome. Perhaps not as convenient as a dynamically resizing table, but it would be a lot less work and much more maintainable in production than compiling a kernel with every security update... -- Mark Bergsma <mark@wikimedia.org> Operations Engineer, Wikimedia Foundation ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2009-12-28 18:49 ` Mark Bergsma @ 2009-12-29 1:34 ` Simon Horman 2010-01-04 13:57 ` Patrick McHardy 2010-01-05 0:20 ` Simon Horman 1 sibling, 1 reply; 23+ messages in thread From: Simon Horman @ 2009-12-29 1:34 UTC (permalink / raw) To: Mark Bergsma Cc: netdev, lvs-devel, Catalin(ux) M. BOIE, Joseph Mack NA3T, Graeme Fowler, David Miller, Patrick McHardy On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote: > On 03-12-08 01:37, David Miller wrote: > > From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> > > Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST) > >> I was looking for anything that could get me past of 88.000 request per > >> seconds. > >> The help text told me to raise that value if I have big number of > >> connections. I just needed an easy way to test. > > > > You're just repeating what I said, you "think" it should be > > changed and as a result you are wasting everyones time. > > > > You don't actually "know", you're just guessing using random > > snippets from documentation rather than good hard evidence of > > a need. > > Hello, > > I just found this year-old thread about a patch allowing the IPVS > connection hash table size to be set at load time by a module parameter. > Apparently the conclusion reached was that allowing this configuration > setting to be changed would be useless, and that the poster's > performance problems would likely lie elsewhere, since he had no > evidence it was caused by the hash table size. Hi Mark, thanks for your test results. I have added them to the patch. Feel free to edit the text. From: Catalin(ux) M. BOIE <catab@embedromix.ro> IPVS: Allow boot time change of hash size. I was very frustrated about the fact that I have to recompile the kernel to change the hash size. So, I created this patch. If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel command line, or, if you built IPVS as modules, you can add options ip_vs conn_tab_bits=??. To keep everything backward compatible, you still can select the size at compile time, and that will be used as default. [ horms@verge.net.au: trivial up-port and minor style fixes ] Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> Cc: Mark Bergsma <mark@wikimedia.org> Signed-off-by: Simon Horman <horms@verge.net.au> --- It has been about a year since this patch was originally posted and subsequently dropped on the basis of insufficient test data. Mark Bergsma has provided the following test results which seem to strongly support the need for larger hash table sizes: We do however run into the same problem with the default setting (2^12 = 4096 entries), as most of our LVS balancers handle around a million connections/SLAB entries at any point in time (around 100-150 kpps load). With only 4096 hash table entries this implies that each entry consists of a linked list of 256 connections *on average*. To provide some statistics, I did an oprofile run on an 2.6.31 kernel, with both the default 4096 table size, and the same kernel recompiled with IP_VS_CONN_TAB_BITS set to 18 (2^18 = 262144 entries). I built a quick test setup with a part of Wikimedia/Wikipedia's live traffic mirrored by the switch to the test host. With the default setting, at ~ 120 kpps packet load we saw a typical %si CPU usage of around 30-35%, and oprofile reported a hot spot in ip_vs_conn_in_get: samples % image name app name symbol name 1719761 42.3741 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 302577 7.4554 bnx2 bnx2 /bnx2 181984 4.4840 vmlinux vmlinux __ticket_spin_lock 128636 3.1695 vmlinux vmlinux ip_route_input 74345 1.8318 ip_vs.ko ip_vs.ko ip_vs_conn_out_get 68482 1.6874 vmlinux vmlinux mwait_idle After loading the recompiled kernel with 2^18 entries, %si CPU usage dropped in half to around 12-18%, and oprofile looks much healthier, with only 7% spent in ip_vs_conn_in_get: samples % image name app name symbol name 265641 14.4616 bnx2 bnx2 /bnx2 143251 7.7986 vmlinux vmlinux __ticket_spin_lock 140661 7.6576 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 94364 5.1372 vmlinux vmlinux mwait_idle 86267 4.6964 vmlinux vmlinux ip_route_input Index: net-next-2.6/include/net/ip_vs.h =================================================================== --- net-next-2.6.orig/include/net/ip_vs.h 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/include/net/ip_vs.h 2009-12-29 10:19:13.000000000 +0900 @@ -26,6 +26,11 @@ #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> /* for ipv6_addr_copy */ + +/* Connections' size value needed by ip_vs_ctl.c */ +extern int ip_vs_conn_tab_size; + + struct ip_vs_iphdr { int len; __u8 protocol; @@ -592,17 +597,6 @@ extern void ip_vs_init_hash_table(struct * (from ip_vs_conn.c) */ -/* - * IPVS connection entry hash table - */ -#ifndef CONFIG_IP_VS_TAB_BITS -#define CONFIG_IP_VS_TAB_BITS 12 -#endif - -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS -#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) -#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) - enum { IP_VS_DIR_INPUT = 0, IP_VS_DIR_OUTPUT, Index: net-next-2.6/net/netfilter/ipvs/Kconfig =================================================================== --- net-next-2.6.orig/net/netfilter/ipvs/Kconfig 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/net/netfilter/ipvs/Kconfig 2009-12-29 10:19:13.000000000 +0900 @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. + You can overwrite this number setting conn_tab_bits module parameter + or by appending ip_vs.conn_tab_bits=? to the kernel command line + if IP VS was compiled built-in. + comment "IPVS transport protocol load balancing support" config IP_VS_PROTO_TCP Index: net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c =================================================================== --- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:19:13.000000000 +0900 @@ -40,6 +40,21 @@ #include <net/ip_vs.h> +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif + +/* + * Connection hash size. Default is what was selected at compile time. +*/ +int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size; +int ip_vs_conn_tab_mask; + /* * Connection hash table: for input and output packets lookups of IPVS */ @@ -125,11 +140,11 @@ static unsigned int ip_vs_conn_hashkey(i if (af == AF_INET6) return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; + & ip_vs_conn_tab_mask; #endif return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; + & ip_vs_conn_tab_mask; } @@ -760,7 +775,7 @@ static void *ip_vs_conn_array(struct seq int idx; struct ip_vs_conn *cp; - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { @@ -797,7 +812,7 @@ static void *ip_vs_conn_seq_next(struct idx = l - ip_vs_conn_tab; ct_read_unlock_bh(idx); - while (++idx < IP_VS_CONN_TAB_SIZE) { + while (++idx < ip_vs_conn_tab_size) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { seq->private = &ip_vs_conn_tab[idx]; @@ -976,8 +991,8 @@ void ip_vs_random_dropentry(void) /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned hash = net_random() & ip_vs_conn_tab_mask; /* * Lock is actually needed in this loop. @@ -1029,7 +1044,7 @@ static void ip_vs_conn_flush(void) struct ip_vs_conn *cp; flush_again: - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { /* * Lock is actually needed in this loop. */ @@ -1060,10 +1075,15 @@ int __init ip_vs_conn_init(void) { int idx; + /* Compute size and mask */ + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; + /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * + sizeof(struct list_head)); if (!ip_vs_conn_tab) return -ENOMEM; @@ -1078,12 +1098,12 @@ int __init ip_vs_conn_init(void) pr_info("Connection hash table configured " "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + ip_vs_conn_tab_size, + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); } Index: net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c =================================================================== --- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:19:13.000000000 +0900 @@ -1843,7 +1843,7 @@ static int ip_vs_info_seq_show(struct se if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, @@ -2374,7 +2374,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; @@ -2387,7 +2387,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; + info.size = ip_vs_conn_tab_size; info.num_services = ip_vs_num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; @@ -3231,7 +3231,7 @@ static int ip_vs_genl_get_cmd(struct sk_ case IPVS_CMD_GET_INFO: NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - IP_VS_CONN_TAB_SIZE); + ip_vs_conn_tab_size); break; } ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2009-12-29 1:34 ` Simon Horman @ 2010-01-04 13:57 ` Patrick McHardy 2010-01-04 23:24 ` Simon Horman 0 siblings, 1 reply; 23+ messages in thread From: Patrick McHardy @ 2010-01-04 13:57 UTC (permalink / raw) To: Simon Horman Cc: Mark Bergsma, netdev, lvs-devel, Catalin(ux) M. BOIE, Joseph Mack NA3T, Graeme Fowler, David Miller Simon Horman wrote: > On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote: >> On 03-12-08 01:37, David Miller wrote: >>> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> >>> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST) >>>> I was looking for anything that could get me past of 88.000 request per >>>> seconds. >>>> The help text told me to raise that value if I have big number of >>>> connections. I just needed an easy way to test. >>> You're just repeating what I said, you "think" it should be >>> changed and as a result you are wasting everyones time. >>> >>> You don't actually "know", you're just guessing using random >>> snippets from documentation rather than good hard evidence of >>> a need. >> Hello, >> >> I just found this year-old thread about a patch allowing the IPVS >> connection hash table size to be set at load time by a module parameter. >> Apparently the conclusion reached was that allowing this configuration >> setting to be changed would be useless, and that the poster's >> performance problems would likely lie elsewhere, since he had no >> evidence it was caused by the hash table size. > > Hi Mark, > > thanks for your test results. I have added them to the patch. > Feel free to edit the text. Just wondering because of this comment - do you want me to apply this patch? ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2010-01-04 13:57 ` Patrick McHardy @ 2010-01-04 23:24 ` Simon Horman 2010-01-05 11:02 ` Mark Bergsma 0 siblings, 1 reply; 23+ messages in thread From: Simon Horman @ 2010-01-04 23:24 UTC (permalink / raw) To: Patrick McHardy Cc: Mark Bergsma, netdev, lvs-devel, Catalin(ux) M. BOIE, Joseph Mack NA3T, Graeme Fowler, David Miller On Mon, Jan 04, 2010 at 02:57:22PM +0100, Patrick McHardy wrote: > Simon Horman wrote: > > On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote: > >> On 03-12-08 01:37, David Miller wrote: > >>> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> > >>> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST) > >>>> I was looking for anything that could get me past of 88.000 request per > >>>> seconds. > >>>> The help text told me to raise that value if I have big number of > >>>> connections. I just needed an easy way to test. > >>> You're just repeating what I said, you "think" it should be > >>> changed and as a result you are wasting everyones time. > >>> > >>> You don't actually "know", you're just guessing using random > >>> snippets from documentation rather than good hard evidence of > >>> a need. > >> Hello, > >> > >> I just found this year-old thread about a patch allowing the IPVS > >> connection hash table size to be set at load time by a module parameter. > >> Apparently the conclusion reached was that allowing this configuration > >> setting to be changed would be useless, and that the poster's > >> performance problems would likely lie elsewhere, since he had no > >> evidence it was caused by the hash table size. > > > > Hi Mark, > > > > thanks for your test results. I have added them to the patch. > > Feel free to edit the text. > > Just wondering because of this comment - do you want me to apply this > patch? I was fishing for an response from Mark. I'll resubmit it properly as that doesn't seem to be forthcoming. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2010-01-04 23:24 ` Simon Horman @ 2010-01-05 11:02 ` Mark Bergsma 2010-01-06 15:25 ` Catalin(ux) M. BOIE 0 siblings, 1 reply; 23+ messages in thread From: Mark Bergsma @ 2010-01-05 11:02 UTC (permalink / raw) To: Simon Horman Cc: Patrick McHardy, netdev, lvs-devel, Catalin(ux) M. BOIE, Joseph Mack NA3T, Graeme Fowler, David Miller On 05-01-10 00:24, Simon Horman wrote: > On Mon, Jan 04, 2010 at 02:57:22PM +0100, Patrick McHardy wrote: >> Simon Horman wrote: >>> On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote: >>>> On 03-12-08 01:37, David Miller wrote: >>>>> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro> >>>>> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST) >>>>>> I was looking for anything that could get me past of 88.000 request per >>>>>> seconds. >>>>>> The help text told me to raise that value if I have big number of >>>>>> connections. I just needed an easy way to test. >>>>> You're just repeating what I said, you "think" it should be >>>>> changed and as a result you are wasting everyones time. >>>>> >>>>> You don't actually "know", you're just guessing using random >>>>> snippets from documentation rather than good hard evidence of >>>>> a need. >>>> Hello, >>>> >>>> I just found this year-old thread about a patch allowing the IPVS >>>> connection hash table size to be set at load time by a module parameter. >>>> Apparently the conclusion reached was that allowing this configuration >>>> setting to be changed would be useless, and that the poster's >>>> performance problems would likely lie elsewhere, since he had no >>>> evidence it was caused by the hash table size. >>> >>> Hi Mark, >>> >>> thanks for your test results. I have added them to the patch. >>> Feel free to edit the text. >> >> Just wondering because of this comment - do you want me to apply this >> patch? > > I was fishing for an response from Mark. > I'll resubmit it properly as that doesn't seem to be forthcoming. Hi Simon/Patrick, Sorry for the late reply, and yes, this is fine. Thanks for applying the patch! -- Mark Bergsma <mark@wikimedia.org> Operations Engineer, Wikimedia Foundation ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2010-01-05 11:02 ` Mark Bergsma @ 2010-01-06 15:25 ` Catalin(ux) M. BOIE 0 siblings, 0 replies; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2010-01-06 15:25 UTC (permalink / raw) To: Mark Bergsma Cc: Simon Horman, Patrick McHardy, netdev, lvs-devel, Joseph Mack NA3T, Graeme Fowler, David Miller > Sorry for the late reply, and yes, this is fine. Thanks for applying the > patch! > > -- > Mark Bergsma <mark@wikimedia.org> > Operations Engineer, Wikimedia Foundation I also want to thank Mark for providing the data, allowing this patch to be accepted. Thanks, Mark! -- Catalin(ux) M. BOIE http://kernel.embedromix.ro/ ^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH] IPVS: Allow boot time change of hash size. 2009-12-28 18:49 ` Mark Bergsma 2009-12-29 1:34 ` Simon Horman @ 2010-01-05 0:20 ` Simon Horman 2010-01-05 4:56 ` Patrick McHardy 1 sibling, 1 reply; 23+ messages in thread From: Simon Horman @ 2010-01-05 0:20 UTC (permalink / raw) To: netdev, lvs-devel Cc: Catalin(ux) M. BOIE, Mark Bergsma, Joseph Mack NA3T, Graeme Fowler, David Miller, Patrick McHardy From: Catalin(ux) M. BOIE <catab@embedromix.ro> IPVS: Allow boot time change of hash size. I was very frustrated about the fact that I have to recompile the kernel to change the hash size. So, I created this patch. If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel command line, or, if you built IPVS as modules, you can add options ip_vs conn_tab_bits=??. To keep everything backward compatible, you still can select the size at compile time, and that will be used as default. [ horms@verge.net.au: trivial up-port and minor style fixes ] Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> Cc: Mark Bergsma <mark@wikimedia.org> Signed-off-by: Simon Horman <horms@verge.net.au> --- Patrick, please consider applying this. I'd like to do something dynamic. But this change is an obvious win in the mean time. It has been about a year since this patch was originally posted and subsequently dropped on the basis of insufficient test data. Mark Bergsma has provided the following test results which seem to strongly support the need for larger hash table sizes: We do however run into the same problem with the default setting (2^12 = 4096 entries), as most of our LVS balancers handle around a million connections/SLAB entries at any point in time (around 100-150 kpps load). With only 4096 hash table entries this implies that each entry consists of a linked list of 256 connections *on average*. To provide some statistics, I did an oprofile run on an 2.6.31 kernel, with both the default 4096 table size, and the same kernel recompiled with IP_VS_CONN_TAB_BITS set to 18 (2^18 = 262144 entries). I built a quick test setup with a part of Wikimedia/Wikipedia's live traffic mirrored by the switch to the test host. With the default setting, at ~ 120 kpps packet load we saw a typical %si CPU usage of around 30-35%, and oprofile reported a hot spot in ip_vs_conn_in_get: samples % image name app name symbol name 1719761 42.3741 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 302577 7.4554 bnx2 bnx2 /bnx2 181984 4.4840 vmlinux vmlinux __ticket_spin_lock 128636 3.1695 vmlinux vmlinux ip_route_input 74345 1.8318 ip_vs.ko ip_vs.ko ip_vs_conn_out_get 68482 1.6874 vmlinux vmlinux mwait_idle After loading the recompiled kernel with 2^18 entries, %si CPU usage dropped in half to around 12-18%, and oprofile looks much healthier, with only 7% spent in ip_vs_conn_in_get: samples % image name app name symbol name 265641 14.4616 bnx2 bnx2 /bnx2 143251 7.7986 vmlinux vmlinux __ticket_spin_lock 140661 7.6576 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 94364 5.1372 vmlinux vmlinux mwait_idle 86267 4.6964 vmlinux vmlinux ip_route_input Index: net-next-2.6/include/net/ip_vs.h =================================================================== --- net-next-2.6.orig/include/net/ip_vs.h 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/include/net/ip_vs.h 2009-12-29 10:19:13.000000000 +0900 @@ -26,6 +26,11 @@ #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> /* for ipv6_addr_copy */ + +/* Connections' size value needed by ip_vs_ctl.c */ +extern int ip_vs_conn_tab_size; + + struct ip_vs_iphdr { int len; __u8 protocol; @@ -592,17 +597,6 @@ extern void ip_vs_init_hash_table(struct * (from ip_vs_conn.c) */ -/* - * IPVS connection entry hash table - */ -#ifndef CONFIG_IP_VS_TAB_BITS -#define CONFIG_IP_VS_TAB_BITS 12 -#endif - -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS -#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) -#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) - enum { IP_VS_DIR_INPUT = 0, IP_VS_DIR_OUTPUT, Index: net-next-2.6/net/netfilter/ipvs/Kconfig =================================================================== --- net-next-2.6.orig/net/netfilter/ipvs/Kconfig 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/net/netfilter/ipvs/Kconfig 2009-12-29 10:19:13.000000000 +0900 @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS each hash entry uses 8 bytes, so you can estimate how much memory is needed for your box. + You can overwrite this number setting conn_tab_bits module parameter + or by appending ip_vs.conn_tab_bits=? to the kernel command line + if IP VS was compiled built-in. + comment "IPVS transport protocol load balancing support" config IP_VS_PROTO_TCP Index: net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c =================================================================== --- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:19:13.000000000 +0900 @@ -40,6 +40,21 @@ #include <net/ip_vs.h> +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif + +/* + * Connection hash size. Default is what was selected at compile time. +*/ +int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size; +int ip_vs_conn_tab_mask; + /* * Connection hash table: for input and output packets lookups of IPVS */ @@ -125,11 +140,11 @@ static unsigned int ip_vs_conn_hashkey(i if (af == AF_INET6) return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; + & ip_vs_conn_tab_mask; #endif return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; + & ip_vs_conn_tab_mask; } @@ -760,7 +775,7 @@ static void *ip_vs_conn_array(struct seq int idx; struct ip_vs_conn *cp; - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { @@ -797,7 +812,7 @@ static void *ip_vs_conn_seq_next(struct idx = l - ip_vs_conn_tab; ct_read_unlock_bh(idx); - while (++idx < IP_VS_CONN_TAB_SIZE) { + while (++idx < ip_vs_conn_tab_size) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { seq->private = &ip_vs_conn_tab[idx]; @@ -976,8 +991,8 @@ void ip_vs_random_dropentry(void) /* * Randomly scan 1/32 of the whole table every second */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned hash = net_random() & ip_vs_conn_tab_mask; /* * Lock is actually needed in this loop. @@ -1029,7 +1044,7 @@ static void ip_vs_conn_flush(void) struct ip_vs_conn *cp; flush_again: - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { /* * Lock is actually needed in this loop. */ @@ -1060,10 +1075,15 @@ int __init ip_vs_conn_init(void) { int idx; + /* Compute size and mask */ + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; + /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * + sizeof(struct list_head)); if (!ip_vs_conn_tab) return -ENOMEM; @@ -1078,12 +1098,12 @@ int __init ip_vs_conn_init(void) pr_info("Connection hash table configured " "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + ip_vs_conn_tab_size, + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); } Index: net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c =================================================================== --- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:18:56.000000000 +0900 +++ net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:19:13.000000000 +0900 @@ -1843,7 +1843,7 @@ static int ip_vs_info_seq_show(struct se if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, @@ -2374,7 +2374,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; @@ -2387,7 +2387,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; + info.size = ip_vs_conn_tab_size; info.num_services = ip_vs_num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; @@ -3231,7 +3231,7 @@ static int ip_vs_genl_get_cmd(struct sk_ case IPVS_CMD_GET_INFO: NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - IP_VS_CONN_TAB_SIZE); + ip_vs_conn_tab_size); break; } -- To unsubscribe from this list: send the line "unsubscribe lvs-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2010-01-05 0:20 ` Simon Horman @ 2010-01-05 4:56 ` Patrick McHardy 0 siblings, 0 replies; 23+ messages in thread From: Patrick McHardy @ 2010-01-05 4:56 UTC (permalink / raw) To: Simon Horman Cc: netdev, lvs-devel, Catalin(ux) M. BOIE, Mark Bergsma, Joseph Mack NA3T, Graeme Fowler, David Miller Simon Horman wrote: > From: Catalin(ux) M. BOIE <catab@embedromix.ro> > > IPVS: Allow boot time change of hash size. > > I was very frustrated about the fact that I have to recompile the kernel > to change the hash size. So, I created this patch. > > If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel > command line, or, if you built IPVS as modules, you can add > options ip_vs conn_tab_bits=??. > > To keep everything backward compatible, you still can select the size at > compile time, and that will be used as default. > > [ horms@verge.net.au: trivial up-port and minor style fixes ] > Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> > Cc: Mark Bergsma <mark@wikimedia.org> > Signed-off-by: Simon Horman <horms@verge.net.au> > > --- > > Patrick, please consider applying this. I'd like to do something dynamic. > But this change is an obvious win in the mean time. Applied to nf-next-2.6.git, thanks. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-12-02 23:16 ` Catalin(ux) M. BOIE 2008-12-03 0:37 ` David Miller @ 2008-12-03 21:11 ` Graeme Fowler 2008-12-04 7:47 ` Catalin(ux) M. BOIE 1 sibling, 1 reply; 23+ messages in thread From: Graeme Fowler @ 2008-12-03 21:11 UTC (permalink / raw) To: Catalin(ux) M. BOIE; +Cc: David Miller, jmack, netdev, lvs-devel On Tue, 2008-12-02 at 16:16 -0700, Catalin(ux) M. BOIE wrote: > I have a need, I didn't wake up some day and I dreamed to change this. > I have a gateway with LVS and I have 4 web servers behind. > I saw the help text at that option and I saw that I could raise the limit. OK, let's ask the same question a different way, after going a little further into your posting: > I was looking for anything that could get me past of 88.000 request per > seconds. > The help text told me to raise that value if I have big number of > connections. I just needed an easy way to test. OK, so from here: http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.operation.html#hash_table and onward... have you read all of that? It explains how the hash table size has been developed over the years, and everything that has changed with and relating to it. To pull out an example, a hash table size of 21 bits does not give a connection limit of 2^21 entries, since each part of the hash is a linked list which contains multiple entries, up to the RAM limit in the server. In the HOWTO, the example given shows that for a traffic pattern with 4 seconds session coherence and 1/8th of the traffic hitting the director being a SYN for the available config *appears* to require 2^21 bits. However, because each bit of the hash is a linked list, using 17 bits gets 16 entries in each list - this is not a problem, as it takes the CPU no time at all to search 16 entries. What you need to do for us, please, is to demonstrate that your problem (not exceeding 88k RPS) is categorically something to do with lookups in the hash table. I suspect (although I've been wrong before) that your problem is probably to do with the number of interrupts your hardware can process. There have been many posts of this type on lvs-users recently - search the archives to see what the solutions were. We're trying to help, but the collective wisdom here is that the hash table size is not the cause of your problem *which you haven't yet described fully*. Graeme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size. 2008-12-03 21:11 ` Graeme Fowler @ 2008-12-04 7:47 ` Catalin(ux) M. BOIE 0 siblings, 0 replies; 23+ messages in thread From: Catalin(ux) M. BOIE @ 2008-12-04 7:47 UTC (permalink / raw) To: Graeme Fowler; +Cc: David Miller, jmack, netdev, lvs-devel > On Tue, 2008-12-02 at 16:16 -0700, Catalin(ux) M. BOIE wrote: >> I have a need, I didn't wake up some day and I dreamed to change this. >> I have a gateway with LVS and I have 4 web servers behind. >> I saw the help text at that option and I saw that I could raise the >> limit. > > OK, let's ask the same question a different way, after going a little > further into your posting: > >> I was looking for anything that could get me past of 88.000 request per >> seconds. >> The help text told me to raise that value if I have big number of >> connections. I just needed an easy way to test. > > OK, so from here: > > http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.operation.html#hash_table > > and onward... have you read all of that? It explains how the hash table > size has been developed over the years, and everything that has changed > with and relating to it. > > To pull out an example, a hash table size of 21 bits does not give a > connection limit of 2^21 entries, since each part of the hash is a > linked list which contains multiple entries, up to the RAM limit in the > server. > > In the HOWTO, the example given shows that for a traffic pattern with 4 > seconds session coherence and 1/8th of the traffic hitting the director > being a SYN for the available config *appears* to require 2^21 bits. > However, because each bit of the hash is a linked list, using 17 bits > gets 16 entries in each list - this is not a problem, as it takes the > CPU no time at all to search 16 entries. > > What you need to do for us, please, is to demonstrate that your problem > (not exceeding 88k RPS) is categorically something to do with lookups in > the hash table. I suspect (although I've been wrong before) that your > problem is probably to do with the number of interrupts your hardware > can process. There have been many posts of this type on lvs-users > recently - search the archives to see what the solutions were. > > We're trying to help, but the collective wisdom here is that the hash > table size is not the cause of your problem *which you haven't yet > described fully*. > > Graeme > Hello! After some replys I understood that I explained very bad what was my intention. I was not complaining that hash size is too low or too high. I just provided a patch that I found useful: allow easy changing of a variable at boot time and not to recompile the kernel to change it. So, I didn't hit the stage when I do some tests to prove that the hash size is too low/high. After all you guys explained to me, it is pretty obvious that I will have bigger problems than the walking of a linked list too deep. So, I agree, my patch is useless. Again, sorry for eating your precious time with this! Thank you very much! -- Catalin(ux) M. BOIE http://kernel.embedromix.ro/ ^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2010-01-06 15:25 UTC | newest] Thread overview: 23+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-11-26 13:36 [PATCH] IPVS: Allow boot time change of hash size Catalin(ux) M. BOIE 2008-11-26 14:40 ` Joseph Mack NA3T 2008-11-26 23:27 ` David Miller 2008-11-27 7:05 ` Catalin(ux) M. BOIE 2008-11-27 7:37 ` David Miller 2008-11-27 6:58 ` Catalin(ux) M. BOIE 2008-11-27 15:58 ` Joseph Mack NA3T 2008-11-28 8:49 ` Catalin(ux) M. BOIE 2008-11-28 14:55 ` Joseph Mack NA3T 2008-12-02 15:34 ` Catalin(ux) M. BOIE 2008-12-02 22:51 ` David Miller 2008-12-02 23:16 ` Catalin(ux) M. BOIE 2008-12-03 0:37 ` David Miller 2009-12-28 18:49 ` Mark Bergsma 2009-12-29 1:34 ` Simon Horman 2010-01-04 13:57 ` Patrick McHardy 2010-01-04 23:24 ` Simon Horman 2010-01-05 11:02 ` Mark Bergsma 2010-01-06 15:25 ` Catalin(ux) M. BOIE 2010-01-05 0:20 ` Simon Horman 2010-01-05 4:56 ` Patrick McHardy 2008-12-03 21:11 ` Graeme Fowler 2008-12-04 7:47 ` Catalin(ux) M. BOIE
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).