* [PATCH] IPVS: Allow boot time change of hash size.
@ 2008-11-26 13:36 Catalin(ux) M. BOIE
2008-11-26 14:40 ` Joseph Mack NA3T
0 siblings, 1 reply; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2008-11-26 13:36 UTC (permalink / raw)
To: netdev; +Cc: lvs-devel
I was very frustrated about the fact that I have to recompile the kernel
to change the hash size. So, I created this patch.
If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel
command line, or, if you built IPVS as modules, you can add
options ip_vs conn_tab_bits=??.
To keep everything backward compatible, you still can select the size at
compile time, and that will be used as default.
Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro>
---
include/net/ip_vs.h | 16 ++++----------
net/netfilter/ipvs/Kconfig | 4 +++
net/netfilter/ipvs/ip_vs_conn.c | 41 ++++++++++++++++++++++++++++----------
net/netfilter/ipvs/ip_vs_ctl.c | 8 +++---
4 files changed, 43 insertions(+), 26 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index fe9fcf7..5a788a4 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -26,6 +26,11 @@
#include <linux/ipv6.h> /* for struct ipv6hdr */
#include <net/ipv6.h> /* for ipv6_addr_copy */
+
+/* Connections' size value needed by ip_vs_ctl.c */
+extern int ip_vs_conn_tab_size;
+
+
struct ip_vs_iphdr {
int len;
__u8 protocol;
@@ -599,17 +604,6 @@ extern void ip_vs_init_hash_table(struct list_head *table, int rows);
* (from ip_vs_conn.c)
*/
-/*
- * IPVS connection entry hash table
- */
-#ifndef CONFIG_IP_VS_TAB_BITS
-#define CONFIG_IP_VS_TAB_BITS 12
-#endif
-
-#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS
-#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS)
-#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1)
-
enum {
IP_VS_DIR_INPUT = 0,
IP_VS_DIR_OUTPUT,
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 79a6980..c71e543 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -68,6 +68,10 @@ config IP_VS_TAB_BITS
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.
+ You can overwrite this number setting conn_tab_bits module parameter
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line
+ if IP VS was compiled built-in.
+
comment "IPVS transport protocol load balancing support"
config IP_VS_PROTO_TCP
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 9a24332..b1462f1 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -37,6 +37,21 @@
#include <net/ip_vs.h>
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS 12
+#endif
+
+/*
+ * Connection hash size. Default is what was selected at compile time.
+*/
+int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
+MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
+
+/* size and mask values */
+int ip_vs_conn_tab_size;
+int ip_vs_conn_tab_mask;
+
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
@@ -122,11 +137,11 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
if (af == AF_INET6)
return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
(__force u32)port, proto, ip_vs_conn_rnd)
- & IP_VS_CONN_TAB_MASK;
+ & ip_vs_conn_tab_mask;
#endif
return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
ip_vs_conn_rnd)
- & IP_VS_CONN_TAB_MASK;
+ & ip_vs_conn_tab_mask;
}
@@ -752,7 +767,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
int idx;
struct ip_vs_conn *cp;
- for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ for(idx = 0; idx < ip_vs_conn_tab_size; idx++) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
@@ -789,7 +804,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
idx = l - ip_vs_conn_tab;
ct_read_unlock_bh(idx);
- while (++idx < IP_VS_CONN_TAB_SIZE) {
+ while (++idx < ip_vs_conn_tab_size) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
seq->private = &ip_vs_conn_tab[idx];
@@ -972,8 +987,8 @@ void ip_vs_random_dropentry(void)
/*
* Randomly scan 1/32 of the whole table every second
*/
- for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
- unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
+ unsigned hash = net_random() & ip_vs_conn_tab_mask;
/*
* Lock is actually needed in this loop.
@@ -1025,7 +1040,7 @@ static void ip_vs_conn_flush(void)
struct ip_vs_conn *cp;
flush_again:
- for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx=0; idx<ip_vs_conn_tab_size; idx++) {
/*
* Lock is actually needed in this loop.
*/
@@ -1056,10 +1071,14 @@ int __init ip_vs_conn_init(void)
{
int idx;
+ /* Compute size and mask */
+ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+ ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(struct list_head));
if (!ip_vs_conn_tab)
return -ENOMEM;
@@ -1074,12 +1093,12 @@ int __init ip_vs_conn_init(void)
IP_VS_INFO("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
- IP_VS_CONN_TAB_SIZE,
- (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+ ip_vs_conn_tab_size,
+ (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
- for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0302cf3..6dcadc2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1854,7 +1854,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN) {
seq_printf(seq,
"IP Virtual Server version %d.%d.%d (size=%d)\n",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
seq_puts(seq,
"Prot LocalAddress:Port Scheduler Flags\n");
seq_puts(seq,
@@ -2385,7 +2385,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
char buf[64];
sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
ret = -EFAULT;
goto out;
@@ -2398,7 +2398,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
- info.size = IP_VS_CONN_TAB_SIZE;
+ info.size = ip_vs_conn_tab_size;
info.num_services = ip_vs_num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
@@ -3238,7 +3238,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
case IPVS_CMD_GET_INFO:
NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
- IP_VS_CONN_TAB_SIZE);
+ ip_vs_conn_tab_size);
break;
}
--
1.5.6.5
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-26 13:36 [PATCH] IPVS: Allow boot time change of hash size Catalin(ux) M. BOIE
@ 2008-11-26 14:40 ` Joseph Mack NA3T
2008-11-26 23:27 ` David Miller
2008-11-27 6:58 ` Catalin(ux) M. BOIE
0 siblings, 2 replies; 23+ messages in thread
From: Joseph Mack NA3T @ 2008-11-26 14:40 UTC (permalink / raw)
To: Catalin(ux) M. BOIE; +Cc: netdev, lvs-devel
On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote:
> I was very frustrated about the fact that I have to recompile the kernel
> to change the hash size. So, I created this patch.
thanks for sending us the code.
Why do you need to change the hash size? We really don't
recommend anyone do this under normal circumstances
Thanks Joe
> If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel
> command line, or, if you built IPVS as modules, you can add
> options ip_vs conn_tab_bits=??.
> To keep everything backward compatible, you still can select the size at
> compile time, and that will be used as default.
>
> Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro>
> ---
> include/net/ip_vs.h | 16 ++++----------
> net/netfilter/ipvs/Kconfig | 4 +++
> net/netfilter/ipvs/ip_vs_conn.c | 41 ++++++++++++++++++++++++++++----------
> net/netfilter/ipvs/ip_vs_ctl.c | 8 +++---
> 4 files changed, 43 insertions(+), 26 deletions(-)
>
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index fe9fcf7..5a788a4 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -26,6 +26,11 @@
> #include <linux/ipv6.h> /* for struct ipv6hdr */
> #include <net/ipv6.h> /* for ipv6_addr_copy */
>
> +
> +/* Connections' size value needed by ip_vs_ctl.c */
> +extern int ip_vs_conn_tab_size;
> +
> +
> struct ip_vs_iphdr {
> int len;
> __u8 protocol;
> @@ -599,17 +604,6 @@ extern void ip_vs_init_hash_table(struct list_head *table, int rows);
> * (from ip_vs_conn.c)
> */
>
> -/*
> - * IPVS connection entry hash table
> - */
> -#ifndef CONFIG_IP_VS_TAB_BITS
> -#define CONFIG_IP_VS_TAB_BITS 12
> -#endif
> -
> -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS
> -#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS)
> -#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1)
> -
> enum {
> IP_VS_DIR_INPUT = 0,
> IP_VS_DIR_OUTPUT,
> diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
> index 79a6980..c71e543 100644
> --- a/net/netfilter/ipvs/Kconfig
> +++ b/net/netfilter/ipvs/Kconfig
> @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS
> each hash entry uses 8 bytes, so you can estimate how much memory is
> needed for your box.
>
> + You can overwrite this number setting conn_tab_bits module parameter
> + or by appending ip_vs.conn_tab_bits=? to the kernel command line
> + if IP VS was compiled built-in.
> +
> comment "IPVS transport protocol load balancing support"
>
> config IP_VS_PROTO_TCP
> diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
> index 9a24332..b1462f1 100644
> --- a/net/netfilter/ipvs/ip_vs_conn.c
> +++ b/net/netfilter/ipvs/ip_vs_conn.c
> @@ -37,6 +37,21 @@
> #include <net/ip_vs.h>
>
>
> +#ifndef CONFIG_IP_VS_TAB_BITS
> +#define CONFIG_IP_VS_TAB_BITS 12
> +#endif
> +
> +/*
> + * Connection hash size. Default is what was selected at compile time.
> +*/
> +int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
> +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
> +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
> +
> +/* size and mask values */
> +int ip_vs_conn_tab_size;
> +int ip_vs_conn_tab_mask;
> +
> /*
> * Connection hash table: for input and output packets lookups of IPVS
> */
> @@ -122,11 +137,11 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
> if (af == AF_INET6)
> return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
> (__force u32)port, proto, ip_vs_conn_rnd)
> - & IP_VS_CONN_TAB_MASK;
> + & ip_vs_conn_tab_mask;
> #endif
> return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
> ip_vs_conn_rnd)
> - & IP_VS_CONN_TAB_MASK;
> + & ip_vs_conn_tab_mask;
> }
>
>
> @@ -752,7 +767,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
> int idx;
> struct ip_vs_conn *cp;
>
> - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
> + for(idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> ct_read_lock_bh(idx);
> list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> if (pos-- == 0) {
> @@ -789,7 +804,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> idx = l - ip_vs_conn_tab;
> ct_read_unlock_bh(idx);
>
> - while (++idx < IP_VS_CONN_TAB_SIZE) {
> + while (++idx < ip_vs_conn_tab_size) {
> ct_read_lock_bh(idx);
> list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> seq->private = &ip_vs_conn_tab[idx];
> @@ -972,8 +987,8 @@ void ip_vs_random_dropentry(void)
> /*
> * Randomly scan 1/32 of the whole table every second
> */
> - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
> - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
> + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
> + unsigned hash = net_random() & ip_vs_conn_tab_mask;
>
> /*
> * Lock is actually needed in this loop.
> @@ -1025,7 +1040,7 @@ static void ip_vs_conn_flush(void)
> struct ip_vs_conn *cp;
>
> flush_again:
> - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
> + for (idx=0; idx<ip_vs_conn_tab_size; idx++) {
> /*
> * Lock is actually needed in this loop.
> */
> @@ -1056,10 +1071,14 @@ int __init ip_vs_conn_init(void)
> {
> int idx;
>
> + /* Compute size and mask */
> + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
> + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
> +
> /*
> * Allocate the connection hash table and initialize its list heads
> */
> - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
> + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(struct list_head));
> if (!ip_vs_conn_tab)
> return -ENOMEM;
>
> @@ -1074,12 +1093,12 @@ int __init ip_vs_conn_init(void)
>
> IP_VS_INFO("Connection hash table configured "
> "(size=%d, memory=%ldKbytes)\n",
> - IP_VS_CONN_TAB_SIZE,
> - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
> + ip_vs_conn_tab_size,
> + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
> IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
> sizeof(struct ip_vs_conn));
>
> - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
> + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
> }
>
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index 0302cf3..6dcadc2 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -1854,7 +1854,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
> if (v == SEQ_START_TOKEN) {
> seq_printf(seq,
> "IP Virtual Server version %d.%d.%d (size=%d)\n",
> - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
> + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
> seq_puts(seq,
> "Prot LocalAddress:Port Scheduler Flags\n");
> seq_puts(seq,
> @@ -2385,7 +2385,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
> char buf[64];
>
> sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
> - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
> + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
> if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
> ret = -EFAULT;
> goto out;
> @@ -2398,7 +2398,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
> {
> struct ip_vs_getinfo info;
> info.version = IP_VS_VERSION_CODE;
> - info.size = IP_VS_CONN_TAB_SIZE;
> + info.size = ip_vs_conn_tab_size;
> info.num_services = ip_vs_num_services;
> if (copy_to_user(user, &info, sizeof(info)) != 0)
> ret = -EFAULT;
> @@ -3238,7 +3238,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
> case IPVS_CMD_GET_INFO:
> NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
> NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
> - IP_VS_CONN_TAB_SIZE);
> + ip_vs_conn_tab_size);
> break;
> }
>
>
--
Joseph Mack NA3T EME(B,D), FM05lw North Carolina
jmack (at) wm7d (dot) net - azimuthal equidistant map
generator at http://www.wm7d.net/azproj.shtml
Homepage http://www.austintek.com/ It's GNU/Linux!
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-26 14:40 ` Joseph Mack NA3T
@ 2008-11-26 23:27 ` David Miller
2008-11-27 7:05 ` Catalin(ux) M. BOIE
2008-11-27 6:58 ` Catalin(ux) M. BOIE
1 sibling, 1 reply; 23+ messages in thread
From: David Miller @ 2008-11-26 23:27 UTC (permalink / raw)
To: jmack; +Cc: catab, netdev, lvs-devel
From: Joseph Mack NA3T <jmack@wm7d.net>
Date: Wed, 26 Nov 2008 06:40:02 -0800 (PST)
> On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote:
>
> > I was very frustrated about the fact that I have to recompile the kernel
> > to change the hash size. So, I created this patch.
>
> thanks for sending us the code.
>
> Why do you need to change the hash size? We really don't recommend anyone do this under normal circumstances
In any event, this should be dynamically sized at run time.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-26 14:40 ` Joseph Mack NA3T
2008-11-26 23:27 ` David Miller
@ 2008-11-27 6:58 ` Catalin(ux) M. BOIE
2008-11-27 15:58 ` Joseph Mack NA3T
1 sibling, 1 reply; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2008-11-27 6:58 UTC (permalink / raw)
To: Joseph Mack NA3T; +Cc: netdev, lvs-devel
Hello!
> On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote:
>
>> I was very frustrated about the fact that I have to recompile the kernel
>> to change the hash size. So, I created this patch.
>
> thanks for sending us the code.
>
> Why do you need to change the hash size? We really don't
> recommend anyone do this under normal circumstances
As it is written in the help, to lower the collisions in the case of a lot
of concurrent connections.
Or am I missing something?
> Thanks Joe
>
>> If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel
>> command line, or, if you built IPVS as modules, you can add
>> options ip_vs conn_tab_bits=??.
>> To keep everything backward compatible, you still can select the size at
>> compile time, and that will be used as default.
>>
>> Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro>
>> ---
>> include/net/ip_vs.h | 16 ++++----------
>> net/netfilter/ipvs/Kconfig | 4 +++
>> net/netfilter/ipvs/ip_vs_conn.c | 41
>> ++++++++++++++++++++++++++++----------
>> net/netfilter/ipvs/ip_vs_ctl.c | 8 +++---
>> 4 files changed, 43 insertions(+), 26 deletions(-)
>>
>> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
>> index fe9fcf7..5a788a4 100644
>> --- a/include/net/ip_vs.h
>> +++ b/include/net/ip_vs.h
>> @@ -26,6 +26,11 @@
>> #include <linux/ipv6.h> /* for struct ipv6hdr */
>> #include <net/ipv6.h> /* for ipv6_addr_copy */
>>
>> +
>> +/* Connections' size value needed by ip_vs_ctl.c */
>> +extern int ip_vs_conn_tab_size;
>> +
>> +
>> struct ip_vs_iphdr {
>> int len;
>> __u8 protocol;
>> @@ -599,17 +604,6 @@ extern void ip_vs_init_hash_table(struct list_head
>> *table, int rows);
>> * (from ip_vs_conn.c)
>> */
>>
>> -/*
>> - * IPVS connection entry hash table
>> - */
>> -#ifndef CONFIG_IP_VS_TAB_BITS
>> -#define CONFIG_IP_VS_TAB_BITS 12
>> -#endif
>> -
>> -#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS
>> -#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS)
>> -#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1)
>> -
>> enum {
>> IP_VS_DIR_INPUT = 0,
>> IP_VS_DIR_OUTPUT,
>> diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
>> index 79a6980..c71e543 100644
>> --- a/net/netfilter/ipvs/Kconfig
>> +++ b/net/netfilter/ipvs/Kconfig
>> @@ -68,6 +68,10 @@ config IP_VS_TAB_BITS
>> each hash entry uses 8 bytes, so you can estimate how much memory is
>> needed for your box.
>>
>> + You can overwrite this number setting conn_tab_bits module parameter
>> + or by appending ip_vs.conn_tab_bits=? to the kernel command line
>> + if IP VS was compiled built-in.
>> +
>> comment "IPVS transport protocol load balancing support"
>>
>> config IP_VS_PROTO_TCP
>> diff --git a/net/netfilter/ipvs/ip_vs_conn.c
>> b/net/netfilter/ipvs/ip_vs_conn.c
>> index 9a24332..b1462f1 100644
>> --- a/net/netfilter/ipvs/ip_vs_conn.c
>> +++ b/net/netfilter/ipvs/ip_vs_conn.c
>> @@ -37,6 +37,21 @@
>> #include <net/ip_vs.h>
>>
>>
>> +#ifndef CONFIG_IP_VS_TAB_BITS
>> +#define CONFIG_IP_VS_TAB_BITS 12
>> +#endif
>> +
>> +/*
>> + * Connection hash size. Default is what was selected at compile time.
>> +*/
>> +int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
>> +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
>> +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
>> +
>> +/* size and mask values */
>> +int ip_vs_conn_tab_size;
>> +int ip_vs_conn_tab_mask;
>> +
>> /*
>> * Connection hash table: for input and output packets lookups of IPVS
>> */
>> @@ -122,11 +137,11 @@ static unsigned int ip_vs_conn_hashkey(int af,
>> unsigned proto,
>> if (af == AF_INET6)
>> return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
>> (__force u32)port, proto, ip_vs_conn_rnd)
>> - & IP_VS_CONN_TAB_MASK;
>> + & ip_vs_conn_tab_mask;
>> #endif
>> return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
>> ip_vs_conn_rnd)
>> - & IP_VS_CONN_TAB_MASK;
>> + & ip_vs_conn_tab_mask;
>> }
>>
>>
>> @@ -752,7 +767,7 @@ static void *ip_vs_conn_array(struct seq_file *seq,
>> loff_t pos)
>> int idx;
>> struct ip_vs_conn *cp;
>>
>> - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
>> + for(idx = 0; idx < ip_vs_conn_tab_size; idx++) {
>> ct_read_lock_bh(idx);
>> list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
>> if (pos-- == 0) {
>> @@ -789,7 +804,7 @@ static void *ip_vs_conn_seq_next(struct seq_file
>> *seq, void *v, loff_t *pos)
>> idx = l - ip_vs_conn_tab;
>> ct_read_unlock_bh(idx);
>>
>> - while (++idx < IP_VS_CONN_TAB_SIZE) {
>> + while (++idx < ip_vs_conn_tab_size) {
>> ct_read_lock_bh(idx);
>> list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
>> seq->private = &ip_vs_conn_tab[idx];
>> @@ -972,8 +987,8 @@ void ip_vs_random_dropentry(void)
>> /*
>> * Randomly scan 1/32 of the whole table every second
>> */
>> - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
>> - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
>> + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
>> + unsigned hash = net_random() & ip_vs_conn_tab_mask;
>>
>> /*
>> * Lock is actually needed in this loop.
>> @@ -1025,7 +1040,7 @@ static void ip_vs_conn_flush(void)
>> struct ip_vs_conn *cp;
>>
>> flush_again:
>> - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
>> + for (idx=0; idx<ip_vs_conn_tab_size; idx++) {
>> /*
>> * Lock is actually needed in this loop.
>> */
>> @@ -1056,10 +1071,14 @@ int __init ip_vs_conn_init(void)
>> {
>> int idx;
>>
>> + /* Compute size and mask */
>> + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
>> + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
>> +
>> /*
>> * Allocate the connection hash table and initialize its list heads
>> */
>> - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct
>> list_head));
>> + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(struct
>> list_head));
>> if (!ip_vs_conn_tab)
>> return -ENOMEM;
>>
>> @@ -1074,12 +1093,12 @@ int __init ip_vs_conn_init(void)
>>
>> IP_VS_INFO("Connection hash table configured "
>> "(size=%d, memory=%ldKbytes)\n",
>> - IP_VS_CONN_TAB_SIZE,
>> - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
>> + ip_vs_conn_tab_size,
>> + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
>> IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
>> sizeof(struct ip_vs_conn));
>>
>> - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
>> + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
>> INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
>> }
>>
>> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c
>> b/net/netfilter/ipvs/ip_vs_ctl.c
>> index 0302cf3..6dcadc2 100644
>> --- a/net/netfilter/ipvs/ip_vs_ctl.c
>> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
>> @@ -1854,7 +1854,7 @@ static int ip_vs_info_seq_show(struct seq_file
>> *seq, void *v)
>> if (v == SEQ_START_TOKEN) {
>> seq_printf(seq,
>> "IP Virtual Server version %d.%d.%d (size=%d)\n",
>> - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
>> + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
>> seq_puts(seq,
>> "Prot LocalAddress:Port Scheduler Flags\n");
>> seq_puts(seq,
>> @@ -2385,7 +2385,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void
>> __user *user, int *len)
>> char buf[64];
>>
>> sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
>> - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
>> + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
>> if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
>> ret = -EFAULT;
>> goto out;
>> @@ -2398,7 +2398,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void
>> __user *user, int *len)
>> {
>> struct ip_vs_getinfo info;
>> info.version = IP_VS_VERSION_CODE;
>> - info.size = IP_VS_CONN_TAB_SIZE;
>> + info.size = ip_vs_conn_tab_size;
>> info.num_services = ip_vs_num_services;
>> if (copy_to_user(user, &info, sizeof(info)) != 0)
>> ret = -EFAULT;
>> @@ -3238,7 +3238,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb,
>> struct genl_info *info)
>> case IPVS_CMD_GET_INFO:
>> NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
>> NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
>> - IP_VS_CONN_TAB_SIZE);
>> + ip_vs_conn_tab_size);
>> break;
>> }
>>
>>
>
> --
> Joseph Mack NA3T EME(B,D), FM05lw North Carolina
> jmack (at) wm7d (dot) net - azimuthal equidistant map
> generator at http://www.wm7d.net/azproj.shtml
> Homepage http://www.austintek.com/ It's GNU/Linux!
>
--
Catalin(ux) M. BOIE
http://kernel.embedromix.ro/
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-26 23:27 ` David Miller
@ 2008-11-27 7:05 ` Catalin(ux) M. BOIE
2008-11-27 7:37 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2008-11-27 7:05 UTC (permalink / raw)
To: David Miller; +Cc: jmack, netdev, lvs-devel
Hello!
> From: Joseph Mack NA3T <jmack@wm7d.net>
> Date: Wed, 26 Nov 2008 06:40:02 -0800 (PST)
>
>> On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote:
>>
>> > I was very frustrated about the fact that I have to recompile the
>> kernel
>> > to change the hash size. So, I created this patch.
>>
>> thanks for sending us the code.
>>
>> Why do you need to change the hash size? We really don't recommend
>> anyone do this under normal circumstances
>
> In any event, this should be dynamically sized at run time.
David, thank you very much for feedback.
Do you mean that it should be computed based on memory size or you mean
that it should be changeable on-the-fly, even when we have some
connections in the tables already?
Because I really need this now, can we, pretty please, apply it as it is
because is an improvement over what we have now? And, I promise that I
will make it changeable at runtime.
Deal? :)
Or you do not want to have another legacy around?
Do you have some hints (where to look for a similar thing)?
The thing that worries me a little is the locking around the move and
maybe the latency involved. Yes, of course you will not change it several
times per minute, but...
Thank you very much!
--
Catalin(ux) M. BOIE
http://kernel.embedromix.ro/
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-27 7:05 ` Catalin(ux) M. BOIE
@ 2008-11-27 7:37 ` David Miller
0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2008-11-27 7:37 UTC (permalink / raw)
To: catab; +Cc: jmack, netdev, lvs-devel
From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
Date: Thu, 27 Nov 2008 00:05:50 -0700 (MST)
> Do you mean that it should be computed based on memory size or you mean
> that it should be changeable on-the-fly, even when we have some
> connections in the tables already?
I mean the latter. We do this in other subsystems, such as
IPSEC, for various hashes.
> Do you have some hints (where to look for a similar thing)?
> The thing that worries me a little is the locking around the move and
> maybe the latency involved. Yes, of course you will not change it several
> times per minute, but...
You only increase, never decrease.
A lock is held during every access to these hash tables, so
the locking should be very similar.
Have a look at xfrm_hash_resize() in net/xfrm/xfrm_state.c to get
a good idea on how this stuff can be done. And hey, that code
resizes 3 tables at once, you only need to handle one :)
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-27 6:58 ` Catalin(ux) M. BOIE
@ 2008-11-27 15:58 ` Joseph Mack NA3T
2008-11-28 8:49 ` Catalin(ux) M. BOIE
0 siblings, 1 reply; 23+ messages in thread
From: Joseph Mack NA3T @ 2008-11-27 15:58 UTC (permalink / raw)
To: Catalin(ux) M. BOIE; +Cc: netdev, lvs-devel
On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote:
>> Why do you need to change the hash size? We really don't
>> recommend anyone do this under normal circumstances
>
> As it is written in the help, to lower the collisions in
> the case of a lot of concurrent connections. Or am I
> missing something?
not sure how big the collision problem is (have you measured
an effect?) but Ratz put out the results of a back of the
envelope calculation that the current value for the hash
table size would work on the currect fastest machines with
3G of memory
Joe
--
Joseph Mack NA3T EME(B,D), FM05lw North Carolina
jmack (at) wm7d (dot) net - azimuthal equidistant map
generator at http://www.wm7d.net/azproj.shtml
Homepage http://www.austintek.com/ It's GNU/Linux!
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-27 15:58 ` Joseph Mack NA3T
@ 2008-11-28 8:49 ` Catalin(ux) M. BOIE
2008-11-28 14:55 ` Joseph Mack NA3T
0 siblings, 1 reply; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2008-11-28 8:49 UTC (permalink / raw)
To: Joseph Mack NA3T; +Cc: Catalin M. BOIE, netdev, lvs-devel
> On Wed, 26 Nov 2008, Catalin(ux) M. BOIE wrote:
>
>>> Why do you need to change the hash size? We really don't
>>> recommend anyone do this under normal circumstances
>>
>> As it is written in the help, to lower the collisions in
>> the case of a lot of concurrent connections. Or am I
>> missing something?
>
> not sure how big the collision problem is (have you measured
> an effect?) but Ratz put out the results of a back of the
> envelope calculation that the current value for the hash
> table size would work on the currect fastest machines with
> 3G of memory
Hello, Joe!
No, I did not measure it, but, I read the help text:
Note the table size must be power of 2. The table size will be the
value of 2 to the your input number power. The number to choose is
from 8 to 20, the default number is 12, which means the table size
is 4096. Don't input the number too small, otherwise you will lose
performance on it. You can adapt the table size yourself, according
to your virtual server application. It is good to set the table size
not far less than the number of connections per second multiplying
average lasting time of connection in the table. For example, your
virtual server gets 200 connections per second, the connection lasts
for 200 seconds in average in the connection table, the table size
should be not far less than 200x200, it is good to set the table
size 32768 (2**15).
The help is wrong or I am missing something?
> Joe
>
> --
> Joseph Mack NA3T EME(B,D), FM05lw North Carolina
> jmack (at) wm7d (dot) net - azimuthal equidistant map
> generator at http://www.wm7d.net/azproj.shtml
> Homepage http://www.austintek.com/ It's GNU/Linux!
>
--
Catalin(ux) M. BOIE
http://kernel.embedromix.ro/
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-28 8:49 ` Catalin(ux) M. BOIE
@ 2008-11-28 14:55 ` Joseph Mack NA3T
2008-12-02 15:34 ` Catalin(ux) M. BOIE
0 siblings, 1 reply; 23+ messages in thread
From: Joseph Mack NA3T @ 2008-11-28 14:55 UTC (permalink / raw)
To: Catalin(ux) M. BOIE; +Cc: netdev, lvs-devel
On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote:
> Hello, Joe!
>
> No, I did not measure it, but, I read the help text:
.
.
> The help is wrong or I am missing something?
there's a bit somewhere else in the same section saying not
to change the hash size unless you know more than we do :-)
Joe
--
Joseph Mack NA3T EME(B,D), FM05lw North Carolina
jmack (at) wm7d (dot) net - azimuthal equidistant map
generator at http://www.wm7d.net/azproj.shtml
Homepage http://www.austintek.com/ It's GNU/Linux!
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-11-28 14:55 ` Joseph Mack NA3T
@ 2008-12-02 15:34 ` Catalin(ux) M. BOIE
2008-12-02 22:51 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2008-12-02 15:34 UTC (permalink / raw)
To: Joseph Mack NA3T; +Cc: netdev, lvs-devel
Hello, Joe!
> On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote:
>
>> Hello, Joe!
>>
>> No, I did not measure it, but, I read the help text:
> .
> .
>> The help is wrong or I am missing something?
>
> there's a bit somewhere else in the same section saying not
> to change the hash size unless you know more than we do :-)
Where, exactly?
So, should I not change it at all even if I have a great number of
simultaneously connections?
> Joe
>
> --
> Joseph Mack NA3T EME(B,D), FM05lw North Carolina
> jmack (at) wm7d (dot) net - azimuthal equidistant map
> generator at http://www.wm7d.net/azproj.shtml
> Homepage http://www.austintek.com/ It's GNU/Linux!
>
--
Catalin(ux) M. BOIE
http://kernel.embedromix.ro/
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-12-02 15:34 ` Catalin(ux) M. BOIE
@ 2008-12-02 22:51 ` David Miller
2008-12-02 23:16 ` Catalin(ux) M. BOIE
0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2008-12-02 22:51 UTC (permalink / raw)
To: catab; +Cc: jmack, netdev, lvs-devel
From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
Date: Tue, 2 Dec 2008 08:34:22 -0700 (MST)
> Hello, Joe!
>
> > On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote:
> >
> >> Hello, Joe!
> >>
> >> No, I did not measure it, but, I read the help text:
> > .
> > .
> >> The help is wrong or I am missing something?
> >
> > there's a bit somewhere else in the same section saying not
> > to change the hash size unless you know more than we do :-)
>
> Where, exactly?
>
> So, should I not change it at all even if I have a great number of
> simultaneously connections?
You should only ever change something like this if you actually
observe a specific performance problem.
This is the part that is driving everybody crazy about your
report. You seem to be changing this without having observed
a measurable performance problem first, and then tracked it down
specifically to this hash table's size.
You seem to want to change it because it seems to you like that is
what should be done. You don't really know if it even matters or not
for your workload.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-12-02 22:51 ` David Miller
@ 2008-12-02 23:16 ` Catalin(ux) M. BOIE
2008-12-03 0:37 ` David Miller
2008-12-03 21:11 ` Graeme Fowler
0 siblings, 2 replies; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2008-12-02 23:16 UTC (permalink / raw)
To: David Miller; +Cc: jmack, netdev, lvs-devel
> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
> Date: Tue, 2 Dec 2008 08:34:22 -0700 (MST)
>
>> Hello, Joe!
>>
>> > On Fri, 28 Nov 2008, Catalin(ux) M. BOIE wrote:
>> >
>> >> Hello, Joe!
>> >>
>> >> No, I did not measure it, but, I read the help text:
>> > .
>> > .
>> >> The help is wrong or I am missing something?
>> >
>> > there's a bit somewhere else in the same section saying not
>> > to change the hash size unless you know more than we do :-)
>>
>> Where, exactly?
>>
>> So, should I not change it at all even if I have a great number of
>> simultaneously connections?
Hello, David!
> You should only ever change something like this if you actually
> observe a specific performance problem.
I have a need, I didn't wake up some day and I dreamed to change this.
I have a gateway with LVS and I have 4 web servers behind.
I saw the help text at that option and I saw that I could raise the limit.
> This is the part that is driving everybody crazy about your
> report. You seem to be changing this without having observed
> a measurable performance problem first, and then tracked it down
> specifically to this hash table's size.
I was looking for anything that could get me past of 88.000 request per
seconds.
The help text told me to raise that value if I have big number of
connections. I just needed an easy way to test.
> You seem to want to change it because it seems to you like that is
> what should be done. You don't really know if it even matters or not
> for your workload.
Without easy testing (boot time change not compile time change) is hard to
tell if it helps me or not.
Anyway, if the help text is wrong, let's correct it.
If it is correct, let's allow changing that value at runtime, so people
can easy juggle with it.
Do you agree, David?
Thank you.
--
Catalin(ux) M. BOIE
http://kernel.embedromix.ro/
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-12-02 23:16 ` Catalin(ux) M. BOIE
@ 2008-12-03 0:37 ` David Miller
2009-12-28 18:49 ` Mark Bergsma
2008-12-03 21:11 ` Graeme Fowler
1 sibling, 1 reply; 23+ messages in thread
From: David Miller @ 2008-12-03 0:37 UTC (permalink / raw)
To: catab; +Cc: jmack, netdev, lvs-devel
From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST)
> > From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
> > Date: Tue, 2 Dec 2008 08:34:22 -0700 (MST)
> >
> > This is the part that is driving everybody crazy about your
> > report. You seem to be changing this without having observed
> > a measurable performance problem first, and then tracked it down
> > specifically to this hash table's size.
>
> I was looking for anything that could get me past of 88.000 request per
> seconds.
> The help text told me to raise that value if I have big number of
> connections. I just needed an easy way to test.
You're just repeating what I said, you "think" it should be
changed and as a result you are wasting everyones time.
You don't actually "know", you're just guessing using random
snippets from documentation rather than good hard evidence of
a need.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-12-02 23:16 ` Catalin(ux) M. BOIE
2008-12-03 0:37 ` David Miller
@ 2008-12-03 21:11 ` Graeme Fowler
2008-12-04 7:47 ` Catalin(ux) M. BOIE
1 sibling, 1 reply; 23+ messages in thread
From: Graeme Fowler @ 2008-12-03 21:11 UTC (permalink / raw)
To: Catalin(ux) M. BOIE; +Cc: David Miller, jmack, netdev, lvs-devel
On Tue, 2008-12-02 at 16:16 -0700, Catalin(ux) M. BOIE wrote:
> I have a need, I didn't wake up some day and I dreamed to change this.
> I have a gateway with LVS and I have 4 web servers behind.
> I saw the help text at that option and I saw that I could raise the limit.
OK, let's ask the same question a different way, after going a little
further into your posting:
> I was looking for anything that could get me past of 88.000 request per
> seconds.
> The help text told me to raise that value if I have big number of
> connections. I just needed an easy way to test.
OK, so from here:
http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.operation.html#hash_table
and onward... have you read all of that? It explains how the hash table
size has been developed over the years, and everything that has changed
with and relating to it.
To pull out an example, a hash table size of 21 bits does not give a
connection limit of 2^21 entries, since each part of the hash is a
linked list which contains multiple entries, up to the RAM limit in the
server.
In the HOWTO, the example given shows that for a traffic pattern with 4
seconds session coherence and 1/8th of the traffic hitting the director
being a SYN for the available config *appears* to require 2^21 bits.
However, because each bit of the hash is a linked list, using 17 bits
gets 16 entries in each list - this is not a problem, as it takes the
CPU no time at all to search 16 entries.
What you need to do for us, please, is to demonstrate that your problem
(not exceeding 88k RPS) is categorically something to do with lookups in
the hash table. I suspect (although I've been wrong before) that your
problem is probably to do with the number of interrupts your hardware
can process. There have been many posts of this type on lvs-users
recently - search the archives to see what the solutions were.
We're trying to help, but the collective wisdom here is that the hash
table size is not the cause of your problem *which you haven't yet
described fully*.
Graeme
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-12-03 21:11 ` Graeme Fowler
@ 2008-12-04 7:47 ` Catalin(ux) M. BOIE
0 siblings, 0 replies; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2008-12-04 7:47 UTC (permalink / raw)
To: Graeme Fowler; +Cc: David Miller, jmack, netdev, lvs-devel
> On Tue, 2008-12-02 at 16:16 -0700, Catalin(ux) M. BOIE wrote:
>> I have a need, I didn't wake up some day and I dreamed to change this.
>> I have a gateway with LVS and I have 4 web servers behind.
>> I saw the help text at that option and I saw that I could raise the
>> limit.
>
> OK, let's ask the same question a different way, after going a little
> further into your posting:
>
>> I was looking for anything that could get me past of 88.000 request per
>> seconds.
>> The help text told me to raise that value if I have big number of
>> connections. I just needed an easy way to test.
>
> OK, so from here:
>
> http://www.austintek.com/LVS/LVS-HOWTO/HOWTO/LVS-HOWTO.operation.html#hash_table
>
> and onward... have you read all of that? It explains how the hash table
> size has been developed over the years, and everything that has changed
> with and relating to it.
>
> To pull out an example, a hash table size of 21 bits does not give a
> connection limit of 2^21 entries, since each part of the hash is a
> linked list which contains multiple entries, up to the RAM limit in the
> server.
>
> In the HOWTO, the example given shows that for a traffic pattern with 4
> seconds session coherence and 1/8th of the traffic hitting the director
> being a SYN for the available config *appears* to require 2^21 bits.
> However, because each bit of the hash is a linked list, using 17 bits
> gets 16 entries in each list - this is not a problem, as it takes the
> CPU no time at all to search 16 entries.
>
> What you need to do for us, please, is to demonstrate that your problem
> (not exceeding 88k RPS) is categorically something to do with lookups in
> the hash table. I suspect (although I've been wrong before) that your
> problem is probably to do with the number of interrupts your hardware
> can process. There have been many posts of this type on lvs-users
> recently - search the archives to see what the solutions were.
>
> We're trying to help, but the collective wisdom here is that the hash
> table size is not the cause of your problem *which you haven't yet
> described fully*.
>
> Graeme
>
Hello!
After some replys I understood that I explained very bad what was my
intention.
I was not complaining that hash size is too low or too high.
I just provided a patch that I found useful: allow easy changing of a
variable at boot time and not to recompile the kernel to change it.
So, I didn't hit the stage when I do some tests to prove that the hash
size is too low/high.
After all you guys explained to me, it is pretty obvious that I will have
bigger problems than the walking of a linked list too deep.
So, I agree, my patch is useless.
Again, sorry for eating your precious time with this!
Thank you very much!
--
Catalin(ux) M. BOIE
http://kernel.embedromix.ro/
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2008-12-03 0:37 ` David Miller
@ 2009-12-28 18:49 ` Mark Bergsma
2009-12-29 1:34 ` Simon Horman
2010-01-05 0:20 ` Simon Horman
0 siblings, 2 replies; 23+ messages in thread
From: Mark Bergsma @ 2009-12-28 18:49 UTC (permalink / raw)
To: netdev
On 03-12-08 01:37, David Miller wrote:
> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST)
>> I was looking for anything that could get me past of 88.000 request per
>> seconds.
>> The help text told me to raise that value if I have big number of
>> connections. I just needed an easy way to test.
>
> You're just repeating what I said, you "think" it should be
> changed and as a result you are wasting everyones time.
>
> You don't actually "know", you're just guessing using random
> snippets from documentation rather than good hard evidence of
> a need.
Hello,
I just found this year-old thread about a patch allowing the IPVS
connection hash table size to be set at load time by a module parameter.
Apparently the conclusion reached was that allowing this configuration
setting to be changed would be useless, and that the poster's
performance problems would likely lie elsewhere, since he had no
evidence it was caused by the hash table size.
We do however run into the same problem with the default setting (2^12 =
4096 entries), as most of our LVS balancers handle around a million
connections/SLAB entries at any point in time (around 100-150 kpps
load). With only 4096 hash table entries this implies that each entry
consists of a linked list of 256 connections *on average*.
To provide some statistics, I did an oprofile run on an 2.6.31 kernel,
with both the default 4096 table size, and the same kernel recompiled
with IP_VS_CONN_TAB_BITS set to 18 (2^18 = 262144 entries). I built a
quick test setup with a part of Wikimedia/Wikipedia's live traffic
mirrored by the switch to the test host.
With the default setting, at ~ 120 kpps packet load we saw a typical %si
CPU usage of around 30-35%, and oprofile reported a hot spot in
ip_vs_conn_in_get:
samples % image name app name
symbol name
1719761 42.3741 ip_vs.ko ip_vs.ko
ip_vs_conn_in_get
302577 7.4554 bnx2 bnx2 /bnx2
181984 4.4840 vmlinux vmlinux
__ticket_spin_lock
128636 3.1695 vmlinux vmlinux
ip_route_input
74345 1.8318 ip_vs.ko ip_vs.ko
ip_vs_conn_out_get
68482 1.6874 vmlinux vmlinux
mwait_idle
After loading the recompiled kernel with 2^18 entries, %si CPU usage
dropped in half to around 12-18%, and oprofile looks much healthier,
with only 7% spent in ip_vs_conn_in_get:
samples % image name app name
symbol name
265641 14.4616 bnx2 bnx2 /bnx2
143251 7.7986 vmlinux vmlinux
__ticket_spin_lock
140661 7.6576 ip_vs.ko ip_vs.ko
ip_vs_conn_in_get
94364 5.1372 vmlinux vmlinux
mwait_idle
86267 4.6964 vmlinux vmlinux
ip_route_input
So yes, having the table size as an ip_vs module parameter would be
*very* welcome. Perhaps not as convenient as a dynamically resizing
table, but it would be a lot less work and much more maintainable in
production than compiling a kernel with every security update...
--
Mark Bergsma <mark@wikimedia.org>
Operations Engineer, Wikimedia Foundation
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2009-12-28 18:49 ` Mark Bergsma
@ 2009-12-29 1:34 ` Simon Horman
2010-01-04 13:57 ` Patrick McHardy
2010-01-05 0:20 ` Simon Horman
1 sibling, 1 reply; 23+ messages in thread
From: Simon Horman @ 2009-12-29 1:34 UTC (permalink / raw)
To: Mark Bergsma
Cc: netdev, lvs-devel, Catalin(ux) M. BOIE, Joseph Mack NA3T,
Graeme Fowler, David Miller, Patrick McHardy
On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote:
> On 03-12-08 01:37, David Miller wrote:
> > From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
> > Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST)
> >> I was looking for anything that could get me past of 88.000 request per
> >> seconds.
> >> The help text told me to raise that value if I have big number of
> >> connections. I just needed an easy way to test.
> >
> > You're just repeating what I said, you "think" it should be
> > changed and as a result you are wasting everyones time.
> >
> > You don't actually "know", you're just guessing using random
> > snippets from documentation rather than good hard evidence of
> > a need.
>
> Hello,
>
> I just found this year-old thread about a patch allowing the IPVS
> connection hash table size to be set at load time by a module parameter.
> Apparently the conclusion reached was that allowing this configuration
> setting to be changed would be useless, and that the poster's
> performance problems would likely lie elsewhere, since he had no
> evidence it was caused by the hash table size.
Hi Mark,
thanks for your test results. I have added them to the patch.
Feel free to edit the text.
From: Catalin(ux) M. BOIE <catab@embedromix.ro>
IPVS: Allow boot time change of hash size.
I was very frustrated about the fact that I have to recompile the kernel
to change the hash size. So, I created this patch.
If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel
command line, or, if you built IPVS as modules, you can add
options ip_vs conn_tab_bits=??.
To keep everything backward compatible, you still can select the size at
compile time, and that will be used as default.
[ horms@verge.net.au: trivial up-port and minor style fixes ]
Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro>
Cc: Mark Bergsma <mark@wikimedia.org>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
It has been about a year since this patch was originally posted
and subsequently dropped on the basis of insufficient test data.
Mark Bergsma has provided the following test results which seem
to strongly support the need for larger hash table sizes:
We do however run into the same problem with the default setting (2^12 =
4096 entries), as most of our LVS balancers handle around a million
connections/SLAB entries at any point in time (around 100-150 kpps
load). With only 4096 hash table entries this implies that each entry
consists of a linked list of 256 connections *on average*.
To provide some statistics, I did an oprofile run on an 2.6.31 kernel,
with both the default 4096 table size, and the same kernel recompiled
with IP_VS_CONN_TAB_BITS set to 18 (2^18 = 262144 entries). I built a
quick test setup with a part of Wikimedia/Wikipedia's live traffic
mirrored by the switch to the test host.
With the default setting, at ~ 120 kpps packet load we saw a typical %si
CPU usage of around 30-35%, and oprofile reported a hot spot in
ip_vs_conn_in_get:
samples % image name app name
symbol name
1719761 42.3741 ip_vs.ko ip_vs.ko ip_vs_conn_in_get
302577 7.4554 bnx2 bnx2 /bnx2
181984 4.4840 vmlinux vmlinux __ticket_spin_lock
128636 3.1695 vmlinux vmlinux ip_route_input
74345 1.8318 ip_vs.ko ip_vs.ko ip_vs_conn_out_get
68482 1.6874 vmlinux vmlinux mwait_idle
After loading the recompiled kernel with 2^18 entries, %si CPU usage
dropped in half to around 12-18%, and oprofile looks much healthier,
with only 7% spent in ip_vs_conn_in_get:
samples % image name app name
symbol name
265641 14.4616 bnx2 bnx2 /bnx2
143251 7.7986 vmlinux vmlinux __ticket_spin_lock
140661 7.6576 ip_vs.ko ip_vs.ko ip_vs_conn_in_get
94364 5.1372 vmlinux vmlinux mwait_idle
86267 4.6964 vmlinux vmlinux ip_route_input
Index: net-next-2.6/include/net/ip_vs.h
===================================================================
--- net-next-2.6.orig/include/net/ip_vs.h 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/include/net/ip_vs.h 2009-12-29 10:19:13.000000000 +0900
@@ -26,6 +26,11 @@
#include <linux/ipv6.h> /* for struct ipv6hdr */
#include <net/ipv6.h> /* for ipv6_addr_copy */
+
+/* Connections' size value needed by ip_vs_ctl.c */
+extern int ip_vs_conn_tab_size;
+
+
struct ip_vs_iphdr {
int len;
__u8 protocol;
@@ -592,17 +597,6 @@ extern void ip_vs_init_hash_table(struct
* (from ip_vs_conn.c)
*/
-/*
- * IPVS connection entry hash table
- */
-#ifndef CONFIG_IP_VS_TAB_BITS
-#define CONFIG_IP_VS_TAB_BITS 12
-#endif
-
-#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS
-#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS)
-#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1)
-
enum {
IP_VS_DIR_INPUT = 0,
IP_VS_DIR_OUTPUT,
Index: net-next-2.6/net/netfilter/ipvs/Kconfig
===================================================================
--- net-next-2.6.orig/net/netfilter/ipvs/Kconfig 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/net/netfilter/ipvs/Kconfig 2009-12-29 10:19:13.000000000 +0900
@@ -68,6 +68,10 @@ config IP_VS_TAB_BITS
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.
+ You can overwrite this number setting conn_tab_bits module parameter
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line
+ if IP VS was compiled built-in.
+
comment "IPVS transport protocol load balancing support"
config IP_VS_PROTO_TCP
Index: net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c
===================================================================
--- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:19:13.000000000 +0900
@@ -40,6 +40,21 @@
#include <net/ip_vs.h>
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS 12
+#endif
+
+/*
+ * Connection hash size. Default is what was selected at compile time.
+*/
+int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
+MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
+
+/* size and mask values */
+int ip_vs_conn_tab_size;
+int ip_vs_conn_tab_mask;
+
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
@@ -125,11 +140,11 @@ static unsigned int ip_vs_conn_hashkey(i
if (af == AF_INET6)
return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
(__force u32)port, proto, ip_vs_conn_rnd)
- & IP_VS_CONN_TAB_MASK;
+ & ip_vs_conn_tab_mask;
#endif
return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
ip_vs_conn_rnd)
- & IP_VS_CONN_TAB_MASK;
+ & ip_vs_conn_tab_mask;
}
@@ -760,7 +775,7 @@ static void *ip_vs_conn_array(struct seq
int idx;
struct ip_vs_conn *cp;
- for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
@@ -797,7 +812,7 @@ static void *ip_vs_conn_seq_next(struct
idx = l - ip_vs_conn_tab;
ct_read_unlock_bh(idx);
- while (++idx < IP_VS_CONN_TAB_SIZE) {
+ while (++idx < ip_vs_conn_tab_size) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
seq->private = &ip_vs_conn_tab[idx];
@@ -976,8 +991,8 @@ void ip_vs_random_dropentry(void)
/*
* Randomly scan 1/32 of the whole table every second
*/
- for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
- unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
+ unsigned hash = net_random() & ip_vs_conn_tab_mask;
/*
* Lock is actually needed in this loop.
@@ -1029,7 +1044,7 @@ static void ip_vs_conn_flush(void)
struct ip_vs_conn *cp;
flush_again:
- for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
/*
* Lock is actually needed in this loop.
*/
@@ -1060,10 +1075,15 @@ int __init ip_vs_conn_init(void)
{
int idx;
+ /* Compute size and mask */
+ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+ ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
+ sizeof(struct list_head));
if (!ip_vs_conn_tab)
return -ENOMEM;
@@ -1078,12 +1098,12 @@ int __init ip_vs_conn_init(void)
pr_info("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
- IP_VS_CONN_TAB_SIZE,
- (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+ ip_vs_conn_tab_size,
+ (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
- for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
}
Index: net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c
===================================================================
--- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:19:13.000000000 +0900
@@ -1843,7 +1843,7 @@ static int ip_vs_info_seq_show(struct se
if (v == SEQ_START_TOKEN) {
seq_printf(seq,
"IP Virtual Server version %d.%d.%d (size=%d)\n",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
seq_puts(seq,
"Prot LocalAddress:Port Scheduler Flags\n");
seq_puts(seq,
@@ -2374,7 +2374,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm
char buf[64];
sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
ret = -EFAULT;
goto out;
@@ -2387,7 +2387,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm
{
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
- info.size = IP_VS_CONN_TAB_SIZE;
+ info.size = ip_vs_conn_tab_size;
info.num_services = ip_vs_num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
@@ -3231,7 +3231,7 @@ static int ip_vs_genl_get_cmd(struct sk_
case IPVS_CMD_GET_INFO:
NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
- IP_VS_CONN_TAB_SIZE);
+ ip_vs_conn_tab_size);
break;
}
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2009-12-29 1:34 ` Simon Horman
@ 2010-01-04 13:57 ` Patrick McHardy
2010-01-04 23:24 ` Simon Horman
0 siblings, 1 reply; 23+ messages in thread
From: Patrick McHardy @ 2010-01-04 13:57 UTC (permalink / raw)
To: Simon Horman
Cc: Mark Bergsma, netdev, lvs-devel, Catalin(ux) M. BOIE,
Joseph Mack NA3T, Graeme Fowler, David Miller
Simon Horman wrote:
> On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote:
>> On 03-12-08 01:37, David Miller wrote:
>>> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
>>> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST)
>>>> I was looking for anything that could get me past of 88.000 request per
>>>> seconds.
>>>> The help text told me to raise that value if I have big number of
>>>> connections. I just needed an easy way to test.
>>> You're just repeating what I said, you "think" it should be
>>> changed and as a result you are wasting everyones time.
>>>
>>> You don't actually "know", you're just guessing using random
>>> snippets from documentation rather than good hard evidence of
>>> a need.
>> Hello,
>>
>> I just found this year-old thread about a patch allowing the IPVS
>> connection hash table size to be set at load time by a module parameter.
>> Apparently the conclusion reached was that allowing this configuration
>> setting to be changed would be useless, and that the poster's
>> performance problems would likely lie elsewhere, since he had no
>> evidence it was caused by the hash table size.
>
> Hi Mark,
>
> thanks for your test results. I have added them to the patch.
> Feel free to edit the text.
Just wondering because of this comment - do you want me to apply this
patch?
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2010-01-04 13:57 ` Patrick McHardy
@ 2010-01-04 23:24 ` Simon Horman
2010-01-05 11:02 ` Mark Bergsma
0 siblings, 1 reply; 23+ messages in thread
From: Simon Horman @ 2010-01-04 23:24 UTC (permalink / raw)
To: Patrick McHardy
Cc: Mark Bergsma, netdev, lvs-devel, Catalin(ux) M. BOIE,
Joseph Mack NA3T, Graeme Fowler, David Miller
On Mon, Jan 04, 2010 at 02:57:22PM +0100, Patrick McHardy wrote:
> Simon Horman wrote:
> > On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote:
> >> On 03-12-08 01:37, David Miller wrote:
> >>> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
> >>> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST)
> >>>> I was looking for anything that could get me past of 88.000 request per
> >>>> seconds.
> >>>> The help text told me to raise that value if I have big number of
> >>>> connections. I just needed an easy way to test.
> >>> You're just repeating what I said, you "think" it should be
> >>> changed and as a result you are wasting everyones time.
> >>>
> >>> You don't actually "know", you're just guessing using random
> >>> snippets from documentation rather than good hard evidence of
> >>> a need.
> >> Hello,
> >>
> >> I just found this year-old thread about a patch allowing the IPVS
> >> connection hash table size to be set at load time by a module parameter.
> >> Apparently the conclusion reached was that allowing this configuration
> >> setting to be changed would be useless, and that the poster's
> >> performance problems would likely lie elsewhere, since he had no
> >> evidence it was caused by the hash table size.
> >
> > Hi Mark,
> >
> > thanks for your test results. I have added them to the patch.
> > Feel free to edit the text.
>
> Just wondering because of this comment - do you want me to apply this
> patch?
I was fishing for an response from Mark.
I'll resubmit it properly as that doesn't seem to be forthcoming.
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH] IPVS: Allow boot time change of hash size.
2009-12-28 18:49 ` Mark Bergsma
2009-12-29 1:34 ` Simon Horman
@ 2010-01-05 0:20 ` Simon Horman
2010-01-05 4:56 ` Patrick McHardy
1 sibling, 1 reply; 23+ messages in thread
From: Simon Horman @ 2010-01-05 0:20 UTC (permalink / raw)
To: netdev, lvs-devel
Cc: Catalin(ux) M. BOIE, Mark Bergsma, Joseph Mack NA3T,
Graeme Fowler, David Miller, Patrick McHardy
From: Catalin(ux) M. BOIE <catab@embedromix.ro>
IPVS: Allow boot time change of hash size.
I was very frustrated about the fact that I have to recompile the kernel
to change the hash size. So, I created this patch.
If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel
command line, or, if you built IPVS as modules, you can add
options ip_vs conn_tab_bits=??.
To keep everything backward compatible, you still can select the size at
compile time, and that will be used as default.
[ horms@verge.net.au: trivial up-port and minor style fixes ]
Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro>
Cc: Mark Bergsma <mark@wikimedia.org>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
Patrick, please consider applying this. I'd like to do something dynamic.
But this change is an obvious win in the mean time.
It has been about a year since this patch was originally posted
and subsequently dropped on the basis of insufficient test data.
Mark Bergsma has provided the following test results which seem
to strongly support the need for larger hash table sizes:
We do however run into the same problem with the default setting (2^12 =
4096 entries), as most of our LVS balancers handle around a million
connections/SLAB entries at any point in time (around 100-150 kpps
load). With only 4096 hash table entries this implies that each entry
consists of a linked list of 256 connections *on average*.
To provide some statistics, I did an oprofile run on an 2.6.31 kernel,
with both the default 4096 table size, and the same kernel recompiled
with IP_VS_CONN_TAB_BITS set to 18 (2^18 = 262144 entries). I built a
quick test setup with a part of Wikimedia/Wikipedia's live traffic
mirrored by the switch to the test host.
With the default setting, at ~ 120 kpps packet load we saw a typical %si
CPU usage of around 30-35%, and oprofile reported a hot spot in
ip_vs_conn_in_get:
samples % image name app name
symbol name
1719761 42.3741 ip_vs.ko ip_vs.ko ip_vs_conn_in_get
302577 7.4554 bnx2 bnx2 /bnx2
181984 4.4840 vmlinux vmlinux __ticket_spin_lock
128636 3.1695 vmlinux vmlinux ip_route_input
74345 1.8318 ip_vs.ko ip_vs.ko ip_vs_conn_out_get
68482 1.6874 vmlinux vmlinux mwait_idle
After loading the recompiled kernel with 2^18 entries, %si CPU usage
dropped in half to around 12-18%, and oprofile looks much healthier,
with only 7% spent in ip_vs_conn_in_get:
samples % image name app name
symbol name
265641 14.4616 bnx2 bnx2 /bnx2
143251 7.7986 vmlinux vmlinux __ticket_spin_lock
140661 7.6576 ip_vs.ko ip_vs.ko ip_vs_conn_in_get
94364 5.1372 vmlinux vmlinux mwait_idle
86267 4.6964 vmlinux vmlinux ip_route_input
Index: net-next-2.6/include/net/ip_vs.h
===================================================================
--- net-next-2.6.orig/include/net/ip_vs.h 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/include/net/ip_vs.h 2009-12-29 10:19:13.000000000 +0900
@@ -26,6 +26,11 @@
#include <linux/ipv6.h> /* for struct ipv6hdr */
#include <net/ipv6.h> /* for ipv6_addr_copy */
+
+/* Connections' size value needed by ip_vs_ctl.c */
+extern int ip_vs_conn_tab_size;
+
+
struct ip_vs_iphdr {
int len;
__u8 protocol;
@@ -592,17 +597,6 @@ extern void ip_vs_init_hash_table(struct
* (from ip_vs_conn.c)
*/
-/*
- * IPVS connection entry hash table
- */
-#ifndef CONFIG_IP_VS_TAB_BITS
-#define CONFIG_IP_VS_TAB_BITS 12
-#endif
-
-#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS
-#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS)
-#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1)
-
enum {
IP_VS_DIR_INPUT = 0,
IP_VS_DIR_OUTPUT,
Index: net-next-2.6/net/netfilter/ipvs/Kconfig
===================================================================
--- net-next-2.6.orig/net/netfilter/ipvs/Kconfig 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/net/netfilter/ipvs/Kconfig 2009-12-29 10:19:13.000000000 +0900
@@ -68,6 +68,10 @@ config IP_VS_TAB_BITS
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.
+ You can overwrite this number setting conn_tab_bits module parameter
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line
+ if IP VS was compiled built-in.
+
comment "IPVS transport protocol load balancing support"
config IP_VS_PROTO_TCP
Index: net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c
===================================================================
--- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c 2009-12-29 10:19:13.000000000 +0900
@@ -40,6 +40,21 @@
#include <net/ip_vs.h>
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS 12
+#endif
+
+/*
+ * Connection hash size. Default is what was selected at compile time.
+*/
+int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
+MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
+
+/* size and mask values */
+int ip_vs_conn_tab_size;
+int ip_vs_conn_tab_mask;
+
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
@@ -125,11 +140,11 @@ static unsigned int ip_vs_conn_hashkey(i
if (af == AF_INET6)
return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
(__force u32)port, proto, ip_vs_conn_rnd)
- & IP_VS_CONN_TAB_MASK;
+ & ip_vs_conn_tab_mask;
#endif
return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
ip_vs_conn_rnd)
- & IP_VS_CONN_TAB_MASK;
+ & ip_vs_conn_tab_mask;
}
@@ -760,7 +775,7 @@ static void *ip_vs_conn_array(struct seq
int idx;
struct ip_vs_conn *cp;
- for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
@@ -797,7 +812,7 @@ static void *ip_vs_conn_seq_next(struct
idx = l - ip_vs_conn_tab;
ct_read_unlock_bh(idx);
- while (++idx < IP_VS_CONN_TAB_SIZE) {
+ while (++idx < ip_vs_conn_tab_size) {
ct_read_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
seq->private = &ip_vs_conn_tab[idx];
@@ -976,8 +991,8 @@ void ip_vs_random_dropentry(void)
/*
* Randomly scan 1/32 of the whole table every second
*/
- for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
- unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
+ unsigned hash = net_random() & ip_vs_conn_tab_mask;
/*
* Lock is actually needed in this loop.
@@ -1029,7 +1044,7 @@ static void ip_vs_conn_flush(void)
struct ip_vs_conn *cp;
flush_again:
- for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
/*
* Lock is actually needed in this loop.
*/
@@ -1060,10 +1075,15 @@ int __init ip_vs_conn_init(void)
{
int idx;
+ /* Compute size and mask */
+ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+ ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
+ sizeof(struct list_head));
if (!ip_vs_conn_tab)
return -ENOMEM;
@@ -1078,12 +1098,12 @@ int __init ip_vs_conn_init(void)
pr_info("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
- IP_VS_CONN_TAB_SIZE,
- (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+ ip_vs_conn_tab_size,
+ (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
- for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
}
Index: net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c
===================================================================
--- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:18:56.000000000 +0900
+++ net-next-2.6/net/netfilter/ipvs/ip_vs_ctl.c 2009-12-29 10:19:13.000000000 +0900
@@ -1843,7 +1843,7 @@ static int ip_vs_info_seq_show(struct se
if (v == SEQ_START_TOKEN) {
seq_printf(seq,
"IP Virtual Server version %d.%d.%d (size=%d)\n",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
seq_puts(seq,
"Prot LocalAddress:Port Scheduler Flags\n");
seq_puts(seq,
@@ -2374,7 +2374,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm
char buf[64];
sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
- NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
ret = -EFAULT;
goto out;
@@ -2387,7 +2387,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cm
{
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
- info.size = IP_VS_CONN_TAB_SIZE;
+ info.size = ip_vs_conn_tab_size;
info.num_services = ip_vs_num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
@@ -3231,7 +3231,7 @@ static int ip_vs_genl_get_cmd(struct sk_
case IPVS_CMD_GET_INFO:
NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
- IP_VS_CONN_TAB_SIZE);
+ ip_vs_conn_tab_size);
break;
}
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2010-01-05 0:20 ` Simon Horman
@ 2010-01-05 4:56 ` Patrick McHardy
0 siblings, 0 replies; 23+ messages in thread
From: Patrick McHardy @ 2010-01-05 4:56 UTC (permalink / raw)
To: Simon Horman
Cc: netdev, lvs-devel, Catalin(ux) M. BOIE, Mark Bergsma,
Joseph Mack NA3T, Graeme Fowler, David Miller
Simon Horman wrote:
> From: Catalin(ux) M. BOIE <catab@embedromix.ro>
>
> IPVS: Allow boot time change of hash size.
>
> I was very frustrated about the fact that I have to recompile the kernel
> to change the hash size. So, I created this patch.
>
> If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel
> command line, or, if you built IPVS as modules, you can add
> options ip_vs conn_tab_bits=??.
>
> To keep everything backward compatible, you still can select the size at
> compile time, and that will be used as default.
>
> [ horms@verge.net.au: trivial up-port and minor style fixes ]
> Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro>
> Cc: Mark Bergsma <mark@wikimedia.org>
> Signed-off-by: Simon Horman <horms@verge.net.au>
>
> ---
>
> Patrick, please consider applying this. I'd like to do something dynamic.
> But this change is an obvious win in the mean time.
Applied to nf-next-2.6.git, thanks.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2010-01-04 23:24 ` Simon Horman
@ 2010-01-05 11:02 ` Mark Bergsma
2010-01-06 15:25 ` Catalin(ux) M. BOIE
0 siblings, 1 reply; 23+ messages in thread
From: Mark Bergsma @ 2010-01-05 11:02 UTC (permalink / raw)
To: Simon Horman
Cc: Patrick McHardy, netdev, lvs-devel, Catalin(ux) M. BOIE,
Joseph Mack NA3T, Graeme Fowler, David Miller
On 05-01-10 00:24, Simon Horman wrote:
> On Mon, Jan 04, 2010 at 02:57:22PM +0100, Patrick McHardy wrote:
>> Simon Horman wrote:
>>> On Mon, Dec 28, 2009 at 07:49:38PM +0100, Mark Bergsma wrote:
>>>> On 03-12-08 01:37, David Miller wrote:
>>>>> From: "Catalin(ux) M. BOIE" <catab@embedromix.ro>
>>>>> Date: Tue, 2 Dec 2008 16:16:04 -0700 (MST)
>>>>>> I was looking for anything that could get me past of 88.000 request per
>>>>>> seconds.
>>>>>> The help text told me to raise that value if I have big number of
>>>>>> connections. I just needed an easy way to test.
>>>>> You're just repeating what I said, you "think" it should be
>>>>> changed and as a result you are wasting everyones time.
>>>>>
>>>>> You don't actually "know", you're just guessing using random
>>>>> snippets from documentation rather than good hard evidence of
>>>>> a need.
>>>> Hello,
>>>>
>>>> I just found this year-old thread about a patch allowing the IPVS
>>>> connection hash table size to be set at load time by a module parameter.
>>>> Apparently the conclusion reached was that allowing this configuration
>>>> setting to be changed would be useless, and that the poster's
>>>> performance problems would likely lie elsewhere, since he had no
>>>> evidence it was caused by the hash table size.
>>>
>>> Hi Mark,
>>>
>>> thanks for your test results. I have added them to the patch.
>>> Feel free to edit the text.
>>
>> Just wondering because of this comment - do you want me to apply this
>> patch?
>
> I was fishing for an response from Mark.
> I'll resubmit it properly as that doesn't seem to be forthcoming.
Hi Simon/Patrick,
Sorry for the late reply, and yes, this is fine. Thanks for applying the
patch!
--
Mark Bergsma <mark@wikimedia.org>
Operations Engineer, Wikimedia Foundation
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH] IPVS: Allow boot time change of hash size.
2010-01-05 11:02 ` Mark Bergsma
@ 2010-01-06 15:25 ` Catalin(ux) M. BOIE
0 siblings, 0 replies; 23+ messages in thread
From: Catalin(ux) M. BOIE @ 2010-01-06 15:25 UTC (permalink / raw)
To: Mark Bergsma
Cc: Simon Horman, Patrick McHardy, netdev, lvs-devel,
Joseph Mack NA3T, Graeme Fowler, David Miller
> Sorry for the late reply, and yes, this is fine. Thanks for applying the
> patch!
>
> --
> Mark Bergsma <mark@wikimedia.org>
> Operations Engineer, Wikimedia Foundation
I also want to thank Mark for providing the data, allowing this patch to
be accepted. Thanks, Mark!
--
Catalin(ux) M. BOIE
http://kernel.embedromix.ro/
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2010-01-06 15:25 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-26 13:36 [PATCH] IPVS: Allow boot time change of hash size Catalin(ux) M. BOIE
2008-11-26 14:40 ` Joseph Mack NA3T
2008-11-26 23:27 ` David Miller
2008-11-27 7:05 ` Catalin(ux) M. BOIE
2008-11-27 7:37 ` David Miller
2008-11-27 6:58 ` Catalin(ux) M. BOIE
2008-11-27 15:58 ` Joseph Mack NA3T
2008-11-28 8:49 ` Catalin(ux) M. BOIE
2008-11-28 14:55 ` Joseph Mack NA3T
2008-12-02 15:34 ` Catalin(ux) M. BOIE
2008-12-02 22:51 ` David Miller
2008-12-02 23:16 ` Catalin(ux) M. BOIE
2008-12-03 0:37 ` David Miller
2009-12-28 18:49 ` Mark Bergsma
2009-12-29 1:34 ` Simon Horman
2010-01-04 13:57 ` Patrick McHardy
2010-01-04 23:24 ` Simon Horman
2010-01-05 11:02 ` Mark Bergsma
2010-01-06 15:25 ` Catalin(ux) M. BOIE
2010-01-05 0:20 ` Simon Horman
2010-01-05 4:56 ` Patrick McHardy
2008-12-03 21:11 ` Graeme Fowler
2008-12-04 7:47 ` Catalin(ux) M. BOIE
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).