* Re: [PATCH] Improve behaviour of Netlink Sockets
       [not found]         ` <20040827172736.543dbd54.davem@redhat.com>
@ 2004-08-30  0:37           ` Pablo Neira
  2004-08-31  0:20             ` David S. Miller
  0 siblings, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-08-30  0:37 UTC (permalink / raw)
  To: David S. Miller, netdev, linux-net

[-- Attachment #1: Type: text/plain, Size: 175 bytes --]

Hi Davem,

David S. Miller wrote:

>Is a 2.4.x version possible?
>  
>
Attached the 2.4.x version. It just spawns one kernel thread called 
netlink, is it ok?

regards,
Pablo

[-- Attachment #2: netlink-tqueue.patch --]
[-- Type: text/x-patch, Size: 4051 bytes --]

--- a/net/netlink/af_netlink.c	2004-08-28 05:35:35.000000000 +0200
+++ b/net/netlink/af_netlink.c	2004-08-29 16:00:46.000000000 +0200
@@ -42,6 +42,7 @@
 #include <linux/notifier.h>
 #include <net/sock.h>
 #include <net/scm.h>
+#include <linux/tqueue.h>
 
 #define Nprintk(a...)
 
@@ -63,6 +64,15 @@
 	void			(*data_ready)(struct sock *sk, int bytes);
 };
 
+struct netlink_work
+{
+	struct sock 		*sk;
+	int 			len;
+	struct tq_struct	work;
+};
+
+static DECLARE_TASK_QUEUE(tq_netlink);
+static DECLARE_WAIT_QUEUE_HEAD(netlink_thread_wait);
 static struct sock *nl_table[MAX_LINKS];
 static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
 static unsigned nl_nonroot[MAX_LINKS];
@@ -81,6 +91,15 @@
 
 static struct notifier_block *netlink_chain;
 
+void netlink_tq_handler(void *data)
+{
+	struct netlink_work *work = data;
+	
+	work->sk->data_ready(work->sk, work->len);
+	sock_put(work->sk);
+	kfree(work);
+}
+
 static void netlink_sock_destruct(struct sock *sk)
 {
 	skb_queue_purge(&sk->receive_queue);
@@ -437,6 +456,8 @@
 
 	if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
 	    test_bit(0, &sk->protinfo.af_netlink->state)) {
+		struct task_struct *client;
+		
 		if (!timeo) {
 			if (ssk->protinfo.af_netlink->pid == 0)
 				netlink_overrun(sk);
@@ -445,6 +466,19 @@
 			return -EAGAIN;
 		}
 
+		if (!sk->protinfo.af_netlink->pid) {
+			/* Kernel is sending information to user space
+			 * and socket buffer is full: Wake up user */
+			
+			client = find_task_by_pid(sk->protinfo.af_netlink->pid);
+			if (!client) {
+				sock_put(sk);
+				kfree_skb(skb);
+				return -EAGAIN;
+			}
+			wake_up_process(client);
+		}
+
 		__set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&sk->protinfo.af_netlink->wait, &wait);
 
@@ -467,8 +501,26 @@
 	skb_orphan(skb);
 	skb_set_owner_r(skb, sk);
 	skb_queue_tail(&sk->receive_queue, skb);
-	sk->data_ready(sk, len);
-	sock_put(sk);
+
+	if (!sk->protinfo.af_netlink->pid) {
+		struct netlink_work *nlwork = 
+			kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
+		
+		if  (!nlwork) {
+			sock_put(sk);
+			return -EAGAIN;
+		}
+		
+		INIT_TQUEUE(&nlwork->work, netlink_tq_handler, nlwork);
+		nlwork->sk = sk;
+		nlwork->len = len;
+		queue_task(&nlwork->work, &tq_netlink);
+		wake_up(&netlink_thread_wait);
+	} else {
+		sk->data_ready(sk, len);
+		sock_put(sk);
+	}
+	
 	return len;
 
 no_dst:
@@ -490,7 +542,22 @@
                 skb_orphan(skb);
 		skb_set_owner_r(skb, sk);
 		skb_queue_tail(&sk->receive_queue, skb);
-		sk->data_ready(sk, skb->len);
+
+		if (!sk->protinfo.af_netlink->pid) {
+			struct netlink_work *nlwork = 
+				kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
+		
+			if  (!nlwork) 
+				return -EAGAIN;
+		
+			INIT_TQUEUE(&nlwork->work, netlink_tq_handler, nlwork);
+			nlwork->sk = sk;
+			nlwork->len = skb->len;
+			queue_task(&nlwork->work, &tq_netlink);
+			wake_up(&netlink_thread_wait);
+		} else 
+			sk->data_ready(sk, skb->len);
+
 		return 0;
 	}
 	return -1;
@@ -534,11 +601,12 @@
 			netlink_overrun(sk);
 			/* Clone failed. Notify ALL listeners. */
 			failure = 1;
+			sock_put(sk);
 		} else if (netlink_broadcast_deliver(sk, skb2)) {
 			netlink_overrun(sk);
+			sock_put(sk);
 		} else
 			skb2 = NULL;
-		sock_put(sk);
 	}
 
 	netlink_unlock_table();
@@ -868,6 +936,26 @@
 	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
 }
 
+int netlink_thread(void *unused)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	daemonize();
+	strcpy(tsk->comm, "netlink");
+	sigfillset(&tsk->blocked);
+	mb();
+
+	for (;;) {
+		run_task_queue(&tq_netlink);
+		
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&netlink_thread_wait, &wait);
+		schedule();
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&netlink_thread_wait, &wait);
+	}
+}
 
 #ifdef NL_EMULATE_DEV
 
@@ -1027,6 +1115,8 @@
 #ifdef CONFIG_PROC_FS
 	create_proc_read_entry("net/netlink", 0, 0, netlink_read_proc, NULL);
 #endif
+	kernel_thread(netlink_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	
 	return 0;
 }
 


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-08-30  0:37           ` [PATCH] Improve behaviour of Netlink Sockets Pablo Neira
@ 2004-08-31  0:20             ` David S. Miller
  2004-08-31 16:37               ` Pablo Neira
  0 siblings, 1 reply; 96+ messages in thread
From: David S. Miller @ 2004-08-31  0:20 UTC (permalink / raw)
  To: Pablo Neira; +Cc: netdev, linux-net

On Mon, 30 Aug 2004 02:37:45 +0200
Pablo Neira <pablo@eurodev.net> wrote:

> Attached the 2.4.x version. It just spawns one kernel thread called 
> netlink, is it ok?

It's fine, I'm going to make 2 minor fixes:

1) Mark netlink_thread() function static.
2) Name the thread "knetlinkd" to be consistent with
   other kernel thread names.

Thanks Pablo.



* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-08-31  0:20             ` David S. Miller
@ 2004-08-31 16:37               ` Pablo Neira
  2004-08-31 20:16                 ` David S. Miller
  0 siblings, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-08-31 16:37 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-net, netdev

[-- Attachment #1: Type: text/plain, Size: 470 bytes --]

Hi Davem,

David S. Miller wrote:

>On Mon, 30 Aug 2004 02:37:45 +0200
>Pablo Neira <pablo@eurodev.net> wrote:
>
>  
>
>>Attached the 2.4.x version. It just spawns one kernel thread called 
>>netlink, is it ok?
>>    
>>
>
>It's fine, I'm going to make 2 minor fixes:
>
>1) Mark netlink_thread() function static.
>2) Name the thread "knetlinkd" to be consistent with
>   other kernel thread names.
>  
>

Attached is the version which corrects these issues.

regards,
Pablo

[-- Attachment #2: netlink-tqueue.patch --]
[-- Type: text/x-patch, Size: 4067 bytes --]

--- a/net/netlink/af_netlink.c	2004-08-28 05:35:35.000000000 +0200
+++ b/net/netlink/af_netlink.c	2004-08-29 16:00:46.000000000 +0200
@@ -42,6 +42,7 @@
 #include <linux/notifier.h>
 #include <net/sock.h>
 #include <net/scm.h>
+#include <linux/tqueue.h>
 
 #define Nprintk(a...)
 
@@ -63,6 +64,15 @@
 	void			(*data_ready)(struct sock *sk, int bytes);
 };
 
+struct netlink_work
+{
+	struct sock 		*sk;
+	int 			len;
+	struct tq_struct	work;
+};
+
+static DECLARE_TASK_QUEUE(tq_netlink);
+static DECLARE_WAIT_QUEUE_HEAD(netlink_thread_wait);
 static struct sock *nl_table[MAX_LINKS];
 static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
 static unsigned nl_nonroot[MAX_LINKS];
@@ -81,6 +91,15 @@
 
 static struct notifier_block *netlink_chain;
 
+static void netlink_tq_handler(void *data)
+{
+	struct netlink_work *work = data;
+	
+	work->sk->data_ready(work->sk, work->len);
+	sock_put(work->sk);
+	kfree(work);
+}
+
 static void netlink_sock_destruct(struct sock *sk)
 {
 	skb_queue_purge(&sk->receive_queue);
@@ -437,6 +456,8 @@
 
 	if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf ||
 	    test_bit(0, &sk->protinfo.af_netlink->state)) {
+		struct task_struct *client;
+		
 		if (!timeo) {
 			if (ssk->protinfo.af_netlink->pid == 0)
 				netlink_overrun(sk);
@@ -445,6 +466,19 @@
 			return -EAGAIN;
 		}
 
+		if (!sk->protinfo.af_netlink->pid) {
+			/* Kernel is sending information to user space
+			 * and socket buffer is full: Wake up user */
+			
+			client = find_task_by_pid(sk->protinfo.af_netlink->pid);
+			if (!client) {
+				sock_put(sk);
+				kfree_skb(skb);
+				return -EAGAIN;
+			}
+			wake_up_process(client);
+		}
+
 		__set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&sk->protinfo.af_netlink->wait, &wait);
 
@@ -467,8 +501,26 @@
 	skb_orphan(skb);
 	skb_set_owner_r(skb, sk);
 	skb_queue_tail(&sk->receive_queue, skb);
-	sk->data_ready(sk, len);
-	sock_put(sk);
+
+	if (!sk->protinfo.af_netlink->pid) {
+		struct netlink_work *nlwork = 
+			kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
+		
+		if  (!nlwork) {
+			sock_put(sk);
+			return -EAGAIN;
+		}
+		
+		INIT_TQUEUE(&nlwork->work, netlink_tq_handler, nlwork);
+		nlwork->sk = sk;
+		nlwork->len = len;
+		queue_task(&nlwork->work, &tq_netlink);
+		wake_up(&netlink_thread_wait);
+	} else {
+		sk->data_ready(sk, len);
+		sock_put(sk);
+	}
+	
 	return len;
 
 no_dst:
@@ -490,7 +542,22 @@
                 skb_orphan(skb);
 		skb_set_owner_r(skb, sk);
 		skb_queue_tail(&sk->receive_queue, skb);
-		sk->data_ready(sk, skb->len);
+
+		if (!sk->protinfo.af_netlink->pid) {
+			struct netlink_work *nlwork = 
+				kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
+		
+			if  (!nlwork) 
+				return -EAGAIN;
+		
+			INIT_TQUEUE(&nlwork->work, netlink_tq_handler, nlwork);
+			nlwork->sk = sk;
+			nlwork->len = skb->len;
+			queue_task(&nlwork->work, &tq_netlink);
+			wake_up(&netlink_thread_wait);
+		} else 
+			sk->data_ready(sk, skb->len);
+
 		return 0;
 	}
 	return -1;
@@ -534,11 +601,12 @@
 			netlink_overrun(sk);
 			/* Clone failed. Notify ALL listeners. */
 			failure = 1;
+			sock_put(sk);
 		} else if (netlink_broadcast_deliver(sk, skb2)) {
 			netlink_overrun(sk);
+			sock_put(sk);
 		} else
 			skb2 = NULL;
-		sock_put(sk);
 	}
 
 	netlink_unlock_table();
@@ -868,6 +936,26 @@
 	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
 }
 
+static int netlink_thread(void *unused)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	daemonize();
+	strcpy(tsk->comm, "knetlinkd");
+	sigfillset(&tsk->blocked);
+	mb();
+
+	for (;;) {
+		run_task_queue(&tq_netlink);
+		
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&netlink_thread_wait, &wait);
+		schedule();
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&netlink_thread_wait, &wait);
+	}
+}
 
 #ifdef NL_EMULATE_DEV
 
@@ -1027,6 +1115,8 @@
 #ifdef CONFIG_PROC_FS
 	create_proc_read_entry("net/netlink", 0, 0, netlink_read_proc, NULL);
 #endif
+	kernel_thread(netlink_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	
 	return 0;
 }
 


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-08-31 16:37               ` Pablo Neira
@ 2004-08-31 20:16                 ` David S. Miller
  0 siblings, 0 replies; 96+ messages in thread
From: David S. Miller @ 2004-08-31 20:16 UTC (permalink / raw)
  To: Pablo Neira; +Cc: linux-net, netdev

On Tue, 31 Aug 2004 18:37:53 +0200
Pablo Neira <pablo@eurodev.net> wrote:

> >It's fine, I'm going to make 2 minor fixes:
> >
> >1) Mark netlink_thread() function static.
> >2) Name the thread "knetlinkd" to be consistent with
> >   other kernel thread names.
> >  
> >
> 
> Attached the version which correct these issues.

You misunderstood, I made these fixes for you in my tree :)


* Re: [PATCH] Improve behaviour of Netlink Sockets
       [not found]       ` <412F1269.8090303@eurodev.net>
       [not found]         ` <20040827172736.543dbd54.davem@redhat.com>
@ 2004-09-18 10:25         ` Herbert Xu
  2004-09-19  4:36           ` Pablo Neira
  1 sibling, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-18 10:25 UTC (permalink / raw)
  To: Pablo Neira; +Cc: davem, linux-net, netdev, kaber

Pablo Neira <pablo@eurodev.net> wrote:
> 
> thanks, I fixed, I shouldn't type at 5 a.m. If any other problem, please 
> let me know.

Sorry I should've reviewed this patch sooner.  It finally caught my
attention as I went to update the 2.4 IPsec backport.

Firstly there is a sock leak there.  The following patch fixes it.
The white space damage is not mine :)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Secondly, I'm dubious about the patch as a whole.  For instance, what
exactly is the wake_up_process() bit trying to do? Surely that process
would've been woken up multiple times already if its queue is full.  If
it can't empty the queue fast enough, what is this extra wake up going
to achieve?

And what is it going to do with thread groups?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
===== net/netlink/af_netlink.c 1.52 vs edited =====
--- 1.52/net/netlink/af_netlink.c	2004-08-28 10:11:04 +10:00
+++ 2.6/net/netlink/af_netlink.c	2004-09-18 20:09:20 +10:00
@@ -630,7 +630,8 @@
 
 			if (!nlwork)
 				return -1;
-		
+
+			sock_hold(sk);
 			INIT_WORK(&nlwork->work, netlink_wq_handler, nlwork);
 			nlwork->sk = sk;
 			nlwork->len = skb->len;
@@ -683,14 +684,13 @@
 			netlink_overrun(sk);
 			/* Clone failed. Notify ALL listeners. */
 			failure = 1;
-			sock_put(sk);
 		} else if (netlink_broadcast_deliver(sk, skb2)) {
 			netlink_overrun(sk);
-			sock_put(sk);
 		} else {
 			delivered = 1;
 			skb2 = NULL;
 		}
+		sock_put(sk);
 	}
 
 	netlink_unlock_table();
--
===== net/netlink/af_netlink.c 1.15 vs edited =====
--- 1.15/net/netlink/af_netlink.c	2004-08-31 10:04:34 +10:00
+++ 2.4/net/netlink/af_netlink.c	2004-09-18 20:25:29 +10:00
@@ -549,7 +549,8 @@
 		
 			if  (!nlwork) 
 				return -EAGAIN;
-		
+
+			sock_hold(sk);
 			INIT_TQUEUE(&nlwork->work, netlink_tq_handler, nlwork);
 			nlwork->sk = sk;
 			nlwork->len = skb->len;
@@ -601,12 +602,11 @@
 			netlink_overrun(sk);
 			/* Clone failed. Notify ALL listeners. */
 			failure = 1;
-			sock_put(sk);
 		} else if (netlink_broadcast_deliver(sk, skb2)) {
 			netlink_overrun(sk);
-			sock_put(sk);
 		} else
 			skb2 = NULL;
+		sock_put(sk);
 	}
 
 	netlink_unlock_table();


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-18 10:25         ` Herbert Xu
@ 2004-09-19  4:36           ` Pablo Neira
  2004-09-19  5:18             ` Pablo Neira
  2004-09-19  7:58             ` Herbert Xu
  0 siblings, 2 replies; 96+ messages in thread
From: Pablo Neira @ 2004-09-19  4:36 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 1259 bytes --]

Hi Herbert,

Herbert Xu wrote:

>Firstly there is a sock leak there.  The following patch fixes it.
>The white space damage is not mine :)
>
>Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>  
>

You are right, I missed that, but I prefer the patches attached to this 
email. Now, if the netlink_broadcast_deliver function delivers the packet 
correctly, the sock refcount is decreased and the function returns 1. I think 
I got confused because netlink_broadcast_deliver returns 0/-1.

>Secondly, I'm dubious about the patch as a whole.  For instance, what
>exactly is the wake_up_process() bit trying to do? Surely that process
>would've been woken up multiple times already if its queue is full.
>

This is what I theoretically expected, but in practice, if you stress a 
netlink socket by sending a big bunch of information in a short period of 
time from kernel space to user space, the socket overruns easily. That's why 
I wake up the user process, to make it process the information stored in the 
queue. The socket doesn't overrun anymore with my patch.

>And what is it going to do with thread groups?
>  
>

Currently broadcast sockets can still overrun. I have a set of patches 
that I'll submit as soon as I have tested them and finished my boring exams.

regards,
Pablo

[-- Attachment #2: netlink-fix.patch --]
[-- Type: text/x-patch, Size: 920 bytes --]

diff -u -r1.1.1.2 af_netlink.c
--- a/net/netlink/af_netlink.c	4 Sep 2004 17:34:06 -0000	1.1.1.2
+++ b/net/netlink/af_netlink.c	19 Sep 2004 03:57:20 -0000
@@ -629,18 +629,20 @@
 				kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
 
 			if (!nlwork)
-				return -1;
+				return 0;
 		
 			INIT_WORK(&nlwork->work, netlink_wq_handler, nlwork);
 			nlwork->sk = sk;
 			nlwork->len = skb->len;
 			queue_work(netlink_wq, &nlwork->work);
-		} else 
+		} else {
 			sk->sk_data_ready(sk, skb->len);
+			sock_put(sock);
+		}
 
-		return 0;
+		return 1;
 	}
-	return -1;
+	return 0;
 }
 
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
@@ -685,11 +687,11 @@
 			failure = 1;
 			sock_put(sk);
 		} else if (netlink_broadcast_deliver(sk, skb2)) {
-			netlink_overrun(sk);
-			sock_put(sk);
-		} else {
 			delivered = 1;
 			skb2 = NULL;
+		} else {
+			netlink_overrun(sk);
+			sock_put(sk);
 		}
 	}
 

[-- Attachment #3: netlink-fix-2.4.patch --]
[-- Type: text/x-patch, Size: 1017 bytes --]

--- a/net/netlink/af_netlink.c	2004-09-19 06:23:22.000000000 +0200
+++ b/net/netlink/af_netlink.c	2004-09-19 06:24:48.000000000 +0200
@@ -548,19 +548,21 @@
 				kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
 		
 			if  (!nlwork) 
-				return -EAGAIN;
+				return 0;
 		
 			INIT_TQUEUE(&nlwork->work, netlink_tq_handler, nlwork);
 			nlwork->sk = sk;
 			nlwork->len = skb->len;
 			queue_task(&nlwork->work, &tq_netlink);
 			wake_up(&netlink_thread_wait);
-		} else 
+		} else {
 			sk->data_ready(sk, skb->len);
+			sock_put(sk);
+		}
 
-		return 0;
+		return 1;
 	}
-	return -1;
+	return 0;
 }
 
 void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
@@ -602,11 +604,12 @@
 			/* Clone failed. Notify ALL listeners. */
 			failure = 1;
 			sock_put(sk);
-		} else if (netlink_broadcast_deliver(sk, skb2)) {
+		} else if (netlink_broadcast_deliver(sk, skb2))
+			skb2 = NULL;
+		else {
 			netlink_overrun(sk);
 			sock_put(sk);
-		} else
-			skb2 = NULL;
+		}
 	}
 
 	netlink_unlock_table();


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19  4:36           ` Pablo Neira
@ 2004-09-19  5:18             ` Pablo Neira
  2004-09-19  7:58             ` Herbert Xu
  1 sibling, 0 replies; 96+ messages in thread
From: Pablo Neira @ 2004-09-19  5:18 UTC (permalink / raw)
  To: Pablo Neira; +Cc: Herbert Xu, David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 162 bytes --]

Pablo Neira wrote:

> you are right, I missed that, but I prefer the patches attached to 
> this email. 


Wrong 2.6 patch; consider this one instead.

regards,
Pablo

[-- Attachment #2: netlink-fix.patch --]
[-- Type: text/x-patch, Size: 918 bytes --]

diff -u -r1.1.1.2 af_netlink.c
--- a/net/netlink/af_netlink.c	4 Sep 2004 17:34:06 -0000	1.1.1.2
+++ b/net/netlink/af_netlink.c	19 Sep 2004 03:57:20 -0000
@@ -629,18 +629,20 @@
 				kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
 
 			if (!nlwork)
-				return -1;
+				return 0;
 		
 			INIT_WORK(&nlwork->work, netlink_wq_handler, nlwork);
 			nlwork->sk = sk;
 			nlwork->len = skb->len;
 			queue_work(netlink_wq, &nlwork->work);
-		} else 
+		} else {
 			sk->sk_data_ready(sk, skb->len);
+			sock_put(sk);
+		}
 
-		return 0;
+		return 1;
 	}
-	return -1;
+	return 0;
 }
 
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
@@ -685,11 +687,11 @@
 			failure = 1;
 			sock_put(sk);
 		} else if (netlink_broadcast_deliver(sk, skb2)) {
-			netlink_overrun(sk);
-			sock_put(sk);
-		} else {
 			delivered = 1;
 			skb2 = NULL;
+		} else {
+			netlink_overrun(sk);
+			sock_put(sk);
 		}
 	}
 


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19  4:36           ` Pablo Neira
  2004-09-19  5:18             ` Pablo Neira
@ 2004-09-19  7:58             ` Herbert Xu
  2004-09-19 12:02               ` Herbert Xu
  2004-09-19 20:47               ` Pablo Neira
  1 sibling, 2 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-19  7:58 UTC (permalink / raw)
  To: Pablo Neira; +Cc: herbert, davem, netdev

Pablo Neira <pablo@eurodev.net> wrote:
> 
> you are right, I missed that, but I prefer the patches attached to this 
> email. Now if netlink_broadcast_deliver function delivers correctly the 
> packet, you decrease sock refcount and function returns 1. I think that 
> I got confused because netlink_broadcast_deliver returns 0/-1.

Unfortunately it is still wrong.  You missed the exit path at the
very top.

And I think rather than adding all these new sock_put's, it's much
better to do what you mean and add a sock_hold on the new path that
you introduced.

>>Secondly, I'm dubious about the patch as a whole.  For instance, what
>>exactly is the wake_up_process() bit trying to do? Surely that process
>>would've been woken up multiple times already if its queue is full.
> 
> This is what I theorically expected, but in practice if you stress a 
> netlink socket sending a big bunch of information in a short period of 
> time from kernel space to user space, socket overruns easily. That's why 
> I wake up the user process to make it process information stored in the 
> queue. Socket doesn't overrun anymore with my patch.

Yes but your patch does a lot more than that wake_up_process.  Have you
reverted just the wake_up_process and seen a difference?

>>And what is it going to do with thread groups?
> 
> currently broadcast sockets can still overrun. I have a set of patches 
> that I'll submit as soon as I test them and I finish my boring exams.

That's not what I meant.  If you have a thread group where everybody's
got the same pid, your code will probably wake up the wrong thread.

Besides, for any netlink socket but the first for a process, they'll
all have negative pid's which have nothing to do with the real pid.
So I really think that the wake_up_process hunk is bogus.

BTW I apologise for the few bounces that you got from my mail server.
It should be fixed now.  If you're still getting bounces, please let
me know via the list.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
       [not found] ` <E1C90Da-0001V7-00@gondolin.me.apana.org.au>
@ 2004-09-19 12:00   ` Herbert Xu
  0 siblings, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 12:00 UTC (permalink / raw)
  To: Pablo Neira; +Cc: davem, linux-net, netdev

On Sun, Sep 19, 2004 at 09:50:58PM +1000, Herbert Xu wrote:
> 
> Previously netlink requests had overhead comparable to that of an
> ioctl.  Now you've added two full context switches to it.  There's
> gotta be a better way to fix this dead-lock.

Worse still, previously if a netlink request blocked indefinitely it
only did that in the context of the calling process.  Now it'll
dead-lock the entire system because you're using the generic work
queue.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19  7:58             ` Herbert Xu
@ 2004-09-19 12:02               ` Herbert Xu
  2004-09-19 12:07                 ` Herbert Xu
  2004-09-19 20:50                 ` Pablo Neira
  2004-09-19 20:47               ` Pablo Neira
  1 sibling, 2 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 12:02 UTC (permalink / raw)
  To: Pablo Neira; +Cc: davem, netdev

On Sun, Sep 19, 2004 at 05:58:52PM +1000, Herbert Xu wrote:
> 
> Besides, for any netlink socket but the first for a process, they'll
> all have negative pid's which have nothing to do with the real pid.
> So I really think that the wake_up_process hunk is bogus.

Another reason why this is bogus.

Most kernel users of unicast have a send timeout setting of 0.  Therefore
your wake_up_process() call will never happen when netlink_attachskb is
called by the kernel.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 12:02               ` Herbert Xu
@ 2004-09-19 12:07                 ` Herbert Xu
  2004-09-19 20:50                   ` Pablo Neira
  2004-09-19 20:50                 ` Pablo Neira
  1 sibling, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 12:07 UTC (permalink / raw)
  To: Pablo Neira; +Cc: davem, netdev

On Sun, Sep 19, 2004 at 10:02:49PM +1000, herbert wrote:
> 
> Another reason why this is bogus.
> 
> Most kernel users of unicast have a send timeout setting of 0.  Therefore
> your wake_up_process() call will never happen when netlink_attachskb is
> called by the kernel.

In fact this is the reason why this whole problem is non-existent.

Unless there is a kernel user that sends netlink messages with a timeout
value other than 0, this patch does not resolve any dead-locks at all
since there are none to start with.  The reason is that the kernel will
simply drop the message when the destination socket queue is full.

So it looks to me as if this entire patch is simply papering over bugs
in the user-space application where it either isn't processing the
messages fast enough or it needs to enlarge its queue size.

Here is a tip, use separate sockets for unicast and multicast messages.
That way your unicast socket is very unlikely to overrun.
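
As an illustration, something along these lines (a rough, untested sketch;
the protocol, group and values are just example choices, and error handling
is omitted):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl addr;

	/* Socket 1: unicast only, used for request/response commands. */
	int cmd_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;
	addr.nl_groups = 0;		/* no multicast subscriptions */
	bind(cmd_fd, (struct sockaddr *)&addr, sizeof(addr));

	/* Socket 2: multicast only, subscribed to an example group. */
	int mc_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;
	addr.nl_groups = RTMGRP_LINK;	/* async notifications land here */
	bind(mc_fd, (struct sockaddr *)&addr, sizeof(addr));

	/* A burst of notifications can now overrun mc_fd without disturbing
	 * the command/response traffic queued on cmd_fd. */
	printf("cmd_fd=%d mc_fd=%d\n", cmd_fd, mc_fd);
	return 0;
}

With the sockets split like this, a full multicast queue costs you
notifications but never the acknowledgement of a command.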

So I'd like to see the whole thing reverted.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19  7:58             ` Herbert Xu
  2004-09-19 12:02               ` Herbert Xu
@ 2004-09-19 20:47               ` Pablo Neira
  2004-09-19 21:20                 ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-19 20:47 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 2013 bytes --]

Hi Herbert,

Herbert Xu wrote:

>Unfortunately it is still wrong.  You missed the exit path at the
>very top.
>  
>

If netlink_broadcast_deliver returns 0, you call sock_put. On the 
other hand, if it returns 1, the refcount will be decreased by the workqueue 
handler or after calling the callback. I don't see that exit path.

>And I think rather than adding all these new sock_put's, it's much
>better to do what you mean and add a sock_hold on the new path that
>you introduced.
>  
>

Yes, it's true that all those sock_put calls pollute the source code; I like 
your idea of adding a sock_hold. Could you please have a look at the patch 
attached? It's basically your patch plus the revised netlink_broadcast_deliver 
return value. If you are ok with it, let me know.

>Yes but your patch does a lot more than that wake_up_process.  Have you
>reverted just the wake_up_process and seen a difference?
>  
>

Yes, it works as well, so we could remove it, but I got some extra 
throughput with the wake_up_process call when sending up to 500 
consecutive messages from kernel to user space. For more than 1000 I 
noticed that the throughput is similar without wake_up_process. I'll study 
the performance without wake_up_process if, as you think, using it is not 
a good idea.

>That's not what I meant.  If you have a thread group where everybody's
>got the same pid, your code will probably wake up the wrong thread.
>  
>

No, my patch doesn't wake up the user process with broadcast sockets; 
snipped from broadcast_deliver:

        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
            !test_bit(0, &nlk->state)) {
             [...]
       }
       return 0;

If it returns zero, then the socket overruns.

>Besides, for any netlink socket but the first for a process, they'll
>all have negative pid's which have nothing to do with the real pid.
>So I really think that the wake_up_process hunk is bogus.
>  
>

Same reply as above: we don't wake up the user process with broadcast 
sockets.

regards,
Pablo

[-- Attachment #2: nl-fix-2.6.patch --]
[-- Type: text/x-patch, Size: 969 bytes --]

diff -u -r1.3 af_netlink.c
--- a/net/netlink/af_netlink.c	19 Sep 2004 19:46:20 -0000	1.3
+++ b/net/netlink/af_netlink.c	19 Sep 2004 20:04:44 -0000
@@ -629,8 +629,9 @@
 				kmalloc(sizeof(struct netlink_work), GFP_KERNEL);
 
 			if (!nlwork)
-				return -1;
-		
+				return 0;
+
+			sock_hold(sk);
 			INIT_WORK(&nlwork->work, netlink_wq_handler, nlwork);
 			nlwork->sk = sk;
 			nlwork->len = skb->len;
@@ -638,9 +639,9 @@
 		} else 
 			sk->sk_data_ready(sk, skb->len);
 
-		return 0;
+		return 1;
 	}
-	return -1;
+	return 0;
 }
 
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
@@ -683,14 +684,13 @@
 			netlink_overrun(sk);
 			/* Clone failed. Notify ALL listeners. */
 			failure = 1;
-			sock_put(sk);
 		} else if (netlink_broadcast_deliver(sk, skb2)) {
-			netlink_overrun(sk);
-			sock_put(sk);
-		} else {
 			delivered = 1;
 			skb2 = NULL;
-		}
+		} else 
+			netlink_overrun(sk);
+
+		sock_put(sk);
 	}
 
 	netlink_unlock_table();


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 12:07                 ` Herbert Xu
@ 2004-09-19 20:50                   ` Pablo Neira
  2004-09-19 21:53                     ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-19 20:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev

Herbert Xu wrote:

>In fact this is the reason why this whole problem is non-existant.
>
>Unless there is a kernel user that sends netlink messages with a timeout
>value other than 0, this patch does not resolve any dead-locks at all
>since there is none to start with.
>

Yes, you are totally right, but as I told you before, I'm not focusing 
my efforts on improving netlink sockets for tools like iproute; for 
those, MSG_DONTWAIT is fairly ok.

>So it looks to me as if this entire patch is simply papering over bugs
>in the user-space application where it either isn't processing the
>messages fast enough or it needs to enlarge its queue size.
>  
>

I don't agree, enlarging the queue size for applications that send a lot 
of information in a short period of time is just a workaround.

>Here is a tip, use separate sockets for unicast and multicast messages.
>That way your unicast socket is very unlikely to overrun.
>
>So I'd like to see the whole thing reverted.
>  
>

Herbert, I don't agree; my patch isn't related to the stuff that you've 
mentioned. It's fairly true that most applications use a socket timeout of 
0, but my patch is not for those! :-) Well, do you still disagree?

regards,
Pablo


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 12:02               ` Herbert Xu
  2004-09-19 12:07                 ` Herbert Xu
@ 2004-09-19 20:50                 ` Pablo Neira
  2004-09-19 21:59                   ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-19 20:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev

Herbert Xu wrote:

>On Sun, Sep 19, 2004 at 05:58:52PM +1000, Herbert Xu wrote:
>  
>
>>Besides, for any netlink socket but the first for a process, they'll
>>all have negative pid's which have nothing to do with the real pid.
>>So I really think that the wake_up_process hunk is bogus.
>>    
>>
>
>Another reason why this is bogus.
>
>Most kernel users of unicast have a send timeout setting of 0.  Therefore
>your wake_up_process() call will never happen when netlink_attachskb is
>called by the kernel.
>  
>

Yes, I didn't modify the MSG_DONTWAIT + unicast behaviour and I never 
wanted to. Actually my patch is not addressed to that case; my intention 
is to improve netlink sockets when sending a lot of information from 
kernel to user space in a short time, since in that case the socket 
overruns easily.

Which are those applications? For example, netfilter modules like 
ip_queue and ipt_ULOG. Moreover, these run in interrupt 
context and, if I'm not wrong, there are some problems with 
using netlink sockets in this context; for example, netlink_ack 
allocates an skb with the GFP_KERNEL flag, which hits a slab bug.

>Worse still, previously if a netlink request blocked indefinitely it
>only did that in the context of the calling process.  Now it'll
>dead-lock the entire system because you're using the generic work
>queue.
>
Herbert, I don't see such a case; could you please detail it a bit more?

regards,
Pablo


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 20:47               ` Pablo Neira
@ 2004-09-19 21:20                 ` Herbert Xu
  2004-09-19 22:14                   ` Pablo Neira
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 21:20 UTC (permalink / raw)
  To: Pablo Neira; +Cc: David S. Miller, netdev

On Sun, Sep 19, 2004 at 10:47:26PM +0200, Pablo Neira wrote:
> 
> if netlink_broadcast_deliver returns 0, you decrease sock_put. On the 
> other hand if it returns 1, refcount will be decreased by the workqueue 
> handler or after calling the callback. I don't see that exit path.

I'm talking about the bits inside ifdef NL_EMULATE_DEV.
 
> yes, it's true that all those sock_put pollutes the source code, I like 
> your idea of add a sock_hold, please could you have a look at the patch 
> attached? it's basically your patch plus netlink_broadcast_deliver 
> revised return value. If you are ok with it, let me know.

It's OK apart from the fact that you missed the bits inside NL_EMULATE_DEV
again.
 
> Yes, it works as well so we could remove it, but I got some extra 
> throughput with the wake_up_process call if I send something up to 500 
> consecutive messages from kernel to user space. For more than 1000 I 
> notice that throughput is similar without wake_up_process, I'll study 
> the performance without wake_up_process if it's not a good idea using it 
> as you think.

I'm totally happy with the idea of improving the performance of netlink
applications.  However, let's do it properly rather than adding random
wake_up calls.

> >That's not what I meant.  If you have a thread group where everybody's
> >got the same pid, your code will probably wake up the wrong thread.
> 
> no, my patch doesn't wake up the user process with broadcast sockets, 
> snipped from broadcast_deliver:

You didn't understand me.  You're waking up a process based on its pid.
With thread groups this is simply broken.
 
> >Besides, for any netlink socket but the first for a process, they'll
> >all have negative pid's which have nothing to do with the real pid.
> >So I really think that the wake_up_process hunk is bogus.
> 
> same reply as above, we don't wake up the user process with broadcast 
> sockets.

I know, that's exactly the reason why it's bogus.  You're waking things
up on a completely arbitrary basis.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 20:50                   ` Pablo Neira
@ 2004-09-19 21:53                     ` Herbert Xu
  2004-09-19 22:49                       ` Pablo Neira
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 21:53 UTC (permalink / raw)
  To: Pablo Neira; +Cc: davem, netdev

On Sun, Sep 19, 2004 at 10:50:25PM +0200, Pablo Neira wrote:
>
> yes, you are totally right, but as I told you before, I'm not focusing 
> my efforts in improving netlink sockets for tools like iproute, for 
> those MSG_DONTWAIT is fairly ok.

No, I'm not talking about iproute; I'm talking about the kernel callers of
unicast/broadcast.  They never wait.
 
> I don't agree, enlarging the queue size for applications that send a lot 
> of information in a short period of time is just a workaround.

Workaround for what? Please define your problem.

Your original message mentioned a dead-lock, which is now obviously
non-existent.

> Herbert, I don't agree, my patch isn't related to stuff that you've 
> mentioned. It's fairly true that most of the applications use socket timeout 
> 0, but my patch is not for those! :-) well, still disagree?

Please reread my messages.  I'm not talking about user-space applications
setting timeout to 0.  Most of them don't.

I'm talking about every single kernel netlink sender.  They never wait.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 20:50                 ` Pablo Neira
@ 2004-09-19 21:59                   ` Herbert Xu
  2004-09-19 22:39                     ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 21:59 UTC (permalink / raw)
  To: Pablo Neira; +Cc: davem, netdev

On Sun, Sep 19, 2004 at 10:50:36PM +0200, Pablo Neira wrote:
>
> >Another reason why this is bogus.
> >
> >Most kernel users of unicast have a send timeout setting of 0.  Therefore
> >your wake_up_process() call will never happen when netlink_attachskb is
> >called by the kernel.
> 
> Yes, I didn't modify MSG_DONTWAIT + unicast behaviour and I've never 
> wanted to. Actually my patch is not addressed to that case, my intention 

I think this discussion might as well end here, we're obviously not
communicating at all.

I'm not talking about MSG_DONTWAIT, I'm talking about kernel netlink
message senders.  They never wait for obvious reasons.

> Which are those applications? for example, netfilter modules like 
> ip_queue and ipt_ULOG. Even more, these applications are in interrupt 
> context and, if I'm not wrong, I realized that there's some problems 
> using netlink sockets in this context, for example, see netlink_ack 
> allocating a skb with GFP_KERNEL flag, it hits a bug slab.

Well please define your problem more accurately.  A test-case would help.

I'd like to see the original patch reverted.  We can then sit down and work
out what the real problem is (if any).

> >Worse still, previously if a netlink request blocked indefinitely it
> >only did that in the context of the calling process.  Now it'll
> >dead-lock the entire system because you're using the generic work
> >queue.
>
> Hebert, I don't see such case, please could you detail it a bit more?

As it is, since the kernel processing is done in the context of the calling
process, it is allowed to sleep indefinitely (semaphore contention, etc.).

Now that you've put it into keventd, sleeping indefinitely will lock up
the system.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 21:20                 ` Herbert Xu
@ 2004-09-19 22:14                   ` Pablo Neira
  2004-09-19 23:31                     ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-19 22:14 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, netdev

Hi Herbert,

Herbert Xu wrote:

>However, let's do it properly rather than adding random wake_up calls.
>  
>

When the buffer is full, I wake up the user process and put the kernel 
thread to sleep to let the user process empty the queue.

>You're waking up a process based on its pid. With thread groups this is simply broken.
>  
>

I didn't know about netlink applications using thread groups; in that 
case, submit a patch to delete that wake-up call.

regards,
Pablo


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 21:59                   ` Herbert Xu
@ 2004-09-19 22:39                     ` jamal
  2004-09-19 22:55                       ` Pablo Neira
  2004-09-19 23:17                       ` Herbert Xu
  0 siblings, 2 replies; 96+ messages in thread
From: jamal @ 2004-09-19 22:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev


There's one valuable piece in his patch: the ability to congestion-control
the kernel so user space can clear the socket queue. The alternative is
an overrun. You have shot a hole in the workqueue approach. May I also
recommend we revert the patch until we have a better resolution, Dave?

Pablo, you really need to provide some good test cases to prove this.
I have my own netlink stress tests, but I haven't gotten around to
testing this issue. 

cheers,
jamal


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 21:53                     ` Herbert Xu
@ 2004-09-19 22:49                       ` Pablo Neira
  2004-09-19 23:44                         ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-19 22:49 UTC (permalink / raw)
  To: Herbert Xu; +Cc: netdev, David S. Miller

Hi Herbert,

Sorry if I repeat things; I don't mean to be annoying, just clarifying.

I'm talking all the time about sending information from kernel 
space to user space, which is where I found the problem. There's no problem 
the other way around; in that case the buffer never gets full.

Herbert Xu wrote:

>Workaround for what? Please define your problem.
>  
>
Two problems:

a) If you request information via nl sockets from user space and a 
kernel module doesn't use the dontwait flag (I know, *nobody* is doing 
this at the moment, so currently broadcast/unicast never wait, as you 
remarked).

When we request info from user space (sendmsg), the callback is executed 
in a module which usually replies with some information 
(netlink_unicast). The problem happens if the size of the information is 
too big for the buffer. In that case, before applying my patch, it 
hangs (in <2.6.9-rc1, it is put to sleep if the buffer gets full). I'm 
talking about using unicast/broadcast without the dontwait flag.

b) A module needs to send a lot of information from kernel space to user 
space; if the buffer fills up quickly, it overruns, and 
netlink_unicast/broadcast never wait, so they drop packets.

Why would someone call netlink_unicast/broadcast from a module telling 
them to wait if necessary? Because that module wants to care about possible 
message drops. This happens if you stress nl sockets. This way netlink 
sockets behave well under a heavy workload without dropping 
messages, so the user-space part doesn't get that annoying -ENOBUFS error.

>Your original message mentioned a dead-lock, which is now obviously non-existant.
>  
>

It exists if you use netlink_unicast/broadcast and tell them to wait, so 
it's not a real problem for current applications because they tell 
netlink_unicast/broadcast not to wait.

regards,
Pablo


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 22:39                     ` jamal
@ 2004-09-19 22:55                       ` Pablo Neira
  2004-09-19 23:04                         ` jamal
  2004-09-19 23:17                       ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-19 22:55 UTC (permalink / raw)
  To: hadi; +Cc: Herbert Xu, David S. Miller, netdev

Hi Jamal,

jamal wrote:

>Theres one valuable piece in his patch. Ability to congestion control
>the kernel so user space can clear the socket queue. The alternative is
>an overun. You have shot a hole in the workqueue approach. May i also
>recommend we revert the patch until we have a better resolution, Dave?
>  
>

Whatever you decide, I'll be happy to improve the solution.

>Pablo you really need to provide some good testcases to prove this.
>  
>

Jamal, I told you that I made a tool to stress netlink sockets; I think 
it could help. I'll post it to the mailing list in a few hours.

regards,
Pablo


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 22:55                       ` Pablo Neira
@ 2004-09-19 23:04                         ` jamal
  2004-09-19 23:10                           ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-19 23:04 UTC (permalink / raw)
  To: Pablo Neira; +Cc: Herbert Xu, David S. Miller, netdev

Pablo,

Please don't be discouraged by this. You have a valuable contribution;
it just needs to be put in the proper context.
I am sure Herbert will be more than happy to help you clean up the patch, as
will I when I get cycles.
Post the stress tool when you get the chance.
 
cheers,
jamal

On Sun, 2004-09-19 at 18:55, Pablo Neira wrote:
> Hi Jamal,
> 
> jamal wrote:
> 
> >Theres one valuable piece in his patch. Ability to congestion control
> >the kernel so user space can clear the socket queue. The alternative is
> >an overun. You have shot a hole in the workqueue approach. May i also
> >recommend we revert the patch until we have a better resolution, Dave?
> >  
> >
> 
> whatever you decided I'll be ok to improve the solution
> 
> >Pablo you really need to provide some good testcases to prove this.
> >  
> >
> 
> Jamal, I told you that I made a tool to stress netlink sockets, I think 
> that it could help. I'll post in some hours to the maillist.
> 
> regards,
> Pablo
> 


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 23:04                         ` jamal
@ 2004-09-19 23:10                           ` Herbert Xu
  0 siblings, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 23:10 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Sun, Sep 19, 2004 at 07:04:28PM -0400, jamal wrote:
> 
> Please dont be discouraged by this. You have some valuable contribution
> it just needs to be put in proper context.

Totally agreed.  Pablo, let me thank you for working on this.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 22:39                     ` jamal
  2004-09-19 22:55                       ` Pablo Neira
@ 2004-09-19 23:17                       ` Herbert Xu
  2004-09-20  2:39                         ` jamal
  1 sibling, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 23:17 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Sun, Sep 19, 2004 at 06:39:29PM -0400, jamal wrote:
> 
> Theres one valuable piece in his patch. Ability to congestion control
> the kernel so user space can clear the socket queue. The alternative is
> an overun. You have shot a hole in the workqueue approach. May i also

Congestion control would be a good alternative to overruns.  I too would
love to see netlink made more reliable if possible.  But hopefully without
sacrificing the good bits in it like performance.

However, in the scenarios that have been cited so far I can't see how it can
work.  For example, in the ip_queue context, the kernel really doesn't
have any alternative to dropping the packet, right?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 22:14                   ` Pablo Neira
@ 2004-09-19 23:31                     ` Herbert Xu
  0 siblings, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 23:31 UTC (permalink / raw)
  To: Pablo Neira; +Cc: David S. Miller, netdev

On Mon, Sep 20, 2004 at 12:14:46AM +0200, Pablo Neira wrote:
> 
> When the buffer is full, I wake up the user process and put the kernel 
> thread to sleep to let the user process empty the queue.

Well that sounds like a good idea but it isn't:

1) You wake up no-one or the wrong process half of the time.
2) Every time a packet is added to the queue the destination process is
woken up.  So by the time it's full it has been woken up many times
already.  If all those wake_up's didn't make a difference, then this
extra one here isn't likely to either.
3) All kernel callers of this function have timeout set to zero.  Therefore
they won't even encounter your code.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 22:49                       ` Pablo Neira
@ 2004-09-19 23:44                         ` Herbert Xu
  2004-09-20  0:31                           ` Pablo Neira
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-19 23:44 UTC (permalink / raw)
  To: Pablo Neira; +Cc: netdev, David S. Miller

On Mon, Sep 20, 2004 at 12:49:10AM +0200, Pablo Neira wrote:
> 
> Sorry, if i repeat things, I'm not willing to be annoying, just clarifying.

You're alright.  We're on the same side :)

> I'm always talking all the time about sending information from kernel 
> space to user space where I found the problem. There's no problem in 
> doing on the other way, in that case buffer never gets full.

Yes I understand.

> Two problems:
> 
> a) if you request information via nl sockets from user space and a 
> kernel module doesn't use the dontwait flag (I know, *nobody* is doing 
> this at the moment, so currently broadcast/unicast never wait as you 
> remarked).

And if anybody does that then they are buggy.  So let's not slow everybody
down for the sake of a non-existent buggy kernel implementation.

There is no point for the kernel to wait at all.  For unicast sockets
used for sending commands to the kernel, you should never get overruns
if you are reading your responses correctly and have set the queue size
correctly.
 
> b) A module needs to send a lot of information from kernel space to user 
> space, if buffer gets full quickly, buffer overruns and 
> netlink_unicast/broadcast never wait, so they drop packets.

I agree that this may be a problem for some, but I don't see how your
patch addresses it.  For the case of ip_queue the kernel simply cannot
wait since it just creates a queue through the backdoor.  You might as
well just increase your receive queue length.
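
For example, the receiving application can grow its own queue before the
burst arrives, something along these lines (an untested sketch; the 1 MB
value is arbitrary):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

/* Ask for a bigger netlink receive queue so short bursts from the
 * kernel fit without overrunning.  The value actually granted is
 * capped by the net.core.rmem_max sysctl, which may need raising too. */
static int enlarge_rcvbuf(int nlfd, int bytes)
{
	return setsockopt(nlfd, SOL_SOCKET, SO_RCVBUF, &bytes, sizeof(bytes));
}

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0 || enlarge_rcvbuf(fd, 1 << 20) < 0)
		perror("netlink rcvbuf");
	return 0;
}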

And if your application can't keep up with the flow of packets, then
whatever we do in the kernel is not going to help.

Congestion control that Jamal talked about is certainly a good idea in
many situations.  But it can't be applied in the situation you're in
because ip_queue can't back off.

> Why someone could call netlink_unicast/broadcast from a module telling 
> them to wait if necessary? if that module want to care about a possible 
> message drop. This happens if you stress nl sockets. This way netlink 
> sockets behave well with an important workload without dropping 
> messages. So the user space part doesn't get that annoying -enobufs error.

I don't understand this paragraph.

> >Your original message mentioned a dead-lock, which is now obviously 
> >non-existant.
> 
> it exists if you use netlink_unicast/broadcast telling them to wait, so 
> it's not a real problem for current applications because they tell 
> netlink_unicast/broadcast not to wait.

s/applications/kernel implementation/

As I said above, if the kernel waits then it's a bug.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 23:44                         ` Herbert Xu
@ 2004-09-20  0:31                           ` Pablo Neira
  2004-09-20  1:00                             ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-20  0:31 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, jamal, netdev

Hi Herbert,

Herbert Xu wrote:

>On Mon, Sep 20, 2004 at 12:49:10AM +0200, Pablo Neira wrote:
>  
>
>>Sorry, if i repeat things, I'm not willing to be annoying, just clarifying.
>>    
>>
>
>You're alright.  We're on the same side :)
>  
>

that's fine :)

>There is no point for the kernel to wait at all.  For unicast sockets
>used for sending commands to the kernel, you should never get overruns
>if you are reading your responses correctly and have set the queue size
>correctly.
>  
>

Sure, not for commands. When I talk about netlink sockets, I'm not 
focused on commands, because what they do currently is quite enough for 
them, I think. I know that commands are the main application of netlink 
sockets, but there are many others.

>>b) A module needs to send a lot of information from kernel space to user 
>>space, if buffer gets full quickly, buffer overruns and 
>>netlink_unicast/broadcast never wait, so they drop packets.
>>    
>>
>
>I agree that this may be a problem for some, but I don't see how your
>patch addresses it.  For the case of ip_queue the kernel simply cannot
>wait since it just creates a queue through the backdoor.  You might as
>well just increase your receive queue length.
>  
>

I agree, the ip_queue case is kind of complex. Well, think about a logging 
tool for packets in kernel space that sends messages via netlink when a 
packet matches a condition. Now, we receive 300 packets in a very, very 
short period of time (with the default buffer size) and after that 
everything gets calm again, that is, there's nothing more to send to user space.

In that case, the netlink buffer will surely overrun, so those 300 messages 
will be dropped because the kernel didn't wait a bit. This is what happens now; 
I can reproduce it with my tool.

Well, if the system workload is high (in terms of sending netlink messages), 
we don't have too many options...

regards,
Pablo


* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-20  0:31                           ` Pablo Neira
@ 2004-09-20  1:00                             ` Herbert Xu
  0 siblings, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-20  1:00 UTC (permalink / raw)
  To: Pablo Neira; +Cc: David S. Miller, jamal, netdev

On Mon, Sep 20, 2004 at 02:31:32AM +0200, Pablo Neira wrote:
> 
> >There is no point for the kernel to wait at all.  For unicast sockets
> >used for sending commands to the kernel, you should never get overruns
> >if you are reading your responses correctly and have set the queue size
> >correctly.
> 
> sure, not for commands. When I talk about netlink sockets, I'm not 
> focused on commands because what they do currently is quite enough for 
> them I think. I know that commands are the main application of netlink 
> sockets but there are many others.

My message was in response to your problem a) which looks exactly like
a command.

> >>b) A module needs to send a lot of information from kernel space to user 
> >>space, if buffer gets full quickly, buffer overruns and 
> >>netlink_unicast/broadcast never wait, so they drop packets.
>
> In that case, netlink buffer will surely overrun, so those 300 messages 
> will be drop because kernel didn't wait a bit. This is what happens now, 
> I can reproduce this with my tool.

Well I see two solutions.  You either extend the receive queue or tell the
kernel to stop sending.  The second is not an option here because you'll
lose the packets anyway.
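
(The first option is just a setsockopt on the netlink fd from the
application; a minimal sketch follows, where the 256 KB figure is
arbitrary and the kernel clamps the value to net.core.rmem_max:)

#include <sys/socket.h>

/* bump the receive queue so a burst of notifications fits; "fd" is
 * assumed to be an already-created netlink socket */
static int grow_netlink_rcvbuf(int fd)
{
	int rcvbuf = 256 * 1024;

	return setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
}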

Getting the kernel to wait is NOT a solution.  It's just another form of
extending the receive queue, albeit an ugly one.

But hang on a second, the kernel should've woken up your process after the
very first message.  Why isn't that happening?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-19 23:17                       ` Herbert Xu
@ 2004-09-20  2:39                         ` jamal
  2004-09-20  2:58                           ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-20  2:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Sun, 2004-09-19 at 19:17, Herbert Xu wrote:
> Congestion control would be a good alternative to overruns.  I too would
> love to see netlink made more reliable if possible.  But hopefully without
> sacrificing the good bits in it like performance.

Agreed.
Note that the fact that netlink can set the socket error flag provides it
with the workings needed for reliability, i.e. you get to know about lost
messages. If an overrun occurs, you (the app) get to know about it. You just
have to poll for everything _again_, unfortunately.
 
> However, in the scenarios that's been cited so far I can't see how it can
> work.  For example, in the ip_queue context, the kernel really doesn't
> have any alternative to dropping the packet, right?

I haven't paid a lot of attention to ip_queue's workings. But something
that gets the kernel to back off when a certain socket threshold is
exceeded, then gets it going again when below a low watermark, could work. Dumps
already have markers stored in the callbacks, so maybe something along the
same line of thinking. You may have to tell user space "there's more
coming". Not sure how to achieve this, just brainstorming.
Essentially this is where the improvement over standard overrun signalling
lies, IMO.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-20  2:39                         ` jamal
@ 2004-09-20  2:58                           ` Herbert Xu
  2004-09-20 12:34                             ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-20  2:58 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Sun, Sep 19, 2004 at 10:39:04PM -0400, jamal wrote:
>  
> > However, in the scenarios that's been cited so far I can't see how it can
> > work.  For example, in the ip_queue context, the kernel really doesn't
> > have any alternative to dropping the packet, right?
> 
> I haven't paid a lot of attention to ip_queue's workings. But something

Well the ip_queue thing is simply a pipe that redirects packets going
into netfilter to user space.  So we can't really stop that pipe when
there is congestion.

AFAICT the problem Pablo is trying to solve is packet loss due to
netlink congestion.

There might actually be a problem with the kernel not waking up the
user process when we tell it to.  It might even be a scheduling
problem.  But we'll need a test-case to assess that.

> that gets the kernel to back off when a certain socket threshold is
> exceeded, then gets it going again when below a low watermark, could work. Dumps
> already have markers stored in the callbacks, so maybe something along the
> same line of thinking. You may have to tell user space "there's more
> coming". Not sure how to achieve this, just brainstorming.
> Essentially this is where the improvement over standard overrun signalling
> lies, IMO.

This might work.  But we really need to look at something concrete to
see what the implications are.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-20  2:58                           ` Herbert Xu
@ 2004-09-20 12:34                             ` jamal
  2004-09-20 18:14                               ` Pablo Neira
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-20 12:34 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Sun, 2004-09-19 at 22:58, Herbert Xu wrote:

> Well the ip_queue thing is simply a pipe that redirects packets going
> into netfilter to user space.  So we can't really stop that pipe when
> there is congestion.

You can detect congestion by watching thresholds on the socket queue,
i.e. a high watermark to give user space an opportunity to catch up and a
low watermark to let the kernel piece continue.
To be honest you would probably need to do more (maybe borrow some ideas
from lazy receiver processing) to be precise - but that's a good start if
you can pull it off.
At the moment the high watermark is the queue-fill level (at which point
an overrun happens) and a low watermark is not defined. 
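
Just to make the brainstorm concrete, something along these lines (a
rough, untested sketch; the helper names and the 3/4 and 1/4 fractions
are invented here, 2.6-style struct sock field names assumed, and none
of this is from the patch under discussion):

#include <net/sock.h>

/* kernel-side producer asks this before queueing the next notification;
 * high watermark at 3/4 of the receive buffer (fraction picked arbitrarily) */
static int nl_tx_congested(struct sock *sk)
{
	return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf / 4 * 3;
}

/* receive path (or a timer) asks this to decide when the backed-off
 * producer may continue; low watermark at 1/4 */
static int nl_tx_may_resume(struct sock *sk)
{
	return atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf / 4;
}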

> AFAICT the problem Pablo is trying to solve is packet loss due to
> netlink congestion.
> 
> There might actually be a problem with the kernel not waking up the
> user process when we tell it to.  It might even be a scheduling
> problem.  But we'll need a test-case to assess that.
> 

Agreed.
For a test I typically have something adding say 10K items (actions in
my case, but could be ipsec policies) and then try to dump them. On my
Xeon I get an overrun after about 6K items are dumped.
-> Note that the overrun would have been a good enough signal if you
could tell netlink "give me the rest of the stuff just before and
including the overrun". 

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-20 12:34                             ` jamal
@ 2004-09-20 18:14                               ` Pablo Neira
  2004-09-20 21:59                                 ` Herbert Xu
  2004-09-22  0:05                                 ` Herbert Xu
  0 siblings, 2 replies; 96+ messages in thread
From: Pablo Neira @ 2004-09-20 18:14 UTC (permalink / raw)
  To: hadi; +Cc: Herbert Xu, David S. Miller, netdev

Hi,

jamal wrote:

>On Sun, 2004-09-19 at 22:58, Herbert Xu wrote:
>  
>
>>AFAICT the problem Pablo is trying to solve is packet loss due to
>>netlink congestion.
>>
>>There might actually be a problem with the kernel not waking up the
>>user process when we tell it to.  It might even be a scheduling
>>problem.  But we'll need a test-case to assess that.
>>
>>    
>>
>
>Agreed.
>For a test I typically have something adding say 10K items (actions in
>my case, but could be ipsec policies) and then try to dump them. On my
>Xeon I get an overrun after about 6K items are dumped.
>  
>
Yes, this is exactly what I've observed.

Here is a link to the tool I use to stress netlink sockets.

http://eurodev.net/~pablo/netlinkbench-unicast-1.0.tar.gz

We've set up a webpage at the university: 
http://perso.ens-lyon.fr/laurent.lefevre/software/netlinkbench/ but the 
link to download the tool is broken; it will be up soon.

I've also made a version for broadcast sockets, but it's basically a copy 
and paste of the unicast tool; I can send you another link for it.

In nlbench-unicast.c there's a macro to set/unset the MSG_DONTWAIT flag to 
make it hit my code or not.

regards,
Pablo

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-20 18:14                               ` Pablo Neira
@ 2004-09-20 21:59                                 ` Herbert Xu
  2004-09-21 11:47                                   ` jamal
  2004-09-22  0:05                                 ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-20 21:59 UTC (permalink / raw)
  To: Pablo Neira; +Cc: hadi, David S. Miller, netdev

On Mon, Sep 20, 2004 at 08:14:42PM +0200, Pablo Neira wrote:
> 
> jamal wrote:
>
> >Agreed.
> >For a test I typically have something adding say 10K items (actions in
> >my case, but could be ipsec policies) and then try to dump them. On my
> >Xeon I get an overrun after about 6K items are dumped.

Good.  That's something we can look at easily.  Dumping is meant to
be self-controlling as each packet naturally stops the next one from
being sent until the user has done a recvmsg.

> yes, this is exactly what I've observed.
> 
> Here is a link to the tool I use to stress netlink sockets.
> 
> http://eurodev.net/~pablo/netlinkbench-unicast-1.0.tar.gz

Thanks for the link.  I'll take a look.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-20 21:59                                 ` Herbert Xu
@ 2004-09-21 11:47                                   ` jamal
  2004-09-21 12:09                                     ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-21 11:47 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, 2004-09-20 at 17:59, Herbert Xu wrote:

> > jamal wrote:
> >
> > >Agreed.
> > >For a test I typically have something adding say 10K items (actions in
> > >my case, but could be ipsec policies) and then try to dump them. On my
> > >Xeon I get an overrun after about 6K items are dumped.
> 
> Good.  That's something we can look at easily.  Dumping is meant to
> be self-controlling as each packet naturally stops the next one from
> being sent until the user has done a recvmsg.

A get which results in a huge response (can't think of anything that
fits there - I have some stuff I haven't released yet to which this applies) will
also have the same problem. Note that "large" here means it will overflow the socket
buffer (and a setsockopt to increase the size will only delay the problem from
happening).
Note that an overrun in a dump results in lost messages. Maybe we can
detect that and reset the cb pointers appropriately? Have to look at the
code.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-21 11:47                                   ` jamal
@ 2004-09-21 12:09                                     ` Herbert Xu
  0 siblings, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-21 12:09 UTC (permalink / raw)
  To: hadi; +Cc: herbert, pablo, davem, netdev

jamal <hadi@cyberus.ca> wrote:
> 
> A get which results in a huge response (can't think of anything that
> fits there - I have some stuff I haven't released yet to which this applies) will
> also have the same problem. Note that "large" here means it will overflow the socket
> buffer (and a setsockopt to increase the size will only delay the problem from
> happening).

But congestion control doesn't help you there.  We have no concept
of fragmentation for netlink, so the queue size has to accommodate the
biggest response packet.  Seriously, if a single response is going to
overflow your entire queue, there's gotta be something wrong :)

> Note that an overrun in a dump results in lost messages. Maybe we can
> detect that and reset the cb pointers appropriately? Have to look at the
> code.

Dumping should never result in overruns unless the application is buggy.
We always allocate a one-page skb and fill it up before sending it to
the user.  We never start filling in the next one until the user has
received the first skb and the receive queue is less than half full.
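
(The check that enforces this sits in the recvmsg path; paraphrased from
memory rather than quoted verbatim from af_netlink.c, it amounts to
roughly the following fragment:)

	/* after copying a message to the user: only produce the next
	 * chunk of a dump once the receive queue has drained below
	 * half of the receive buffer */
	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
		netlink_dump(sk);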

So I'm intrigued about this problem that Pablo is seeing.  Let me have
a play with it first.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-20 18:14                               ` Pablo Neira
  2004-09-20 21:59                                 ` Herbert Xu
@ 2004-09-22  0:05                                 ` Herbert Xu
  2004-09-22  0:24                                   ` Pablo Neira
                                                     ` (2 more replies)
  1 sibling, 3 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-22  0:05 UTC (permalink / raw)
  To: Pablo Neira; +Cc: hadi, David S. Miller, netdev

On Mon, Sep 20, 2004 at 08:14:42PM +0200, Pablo Neira wrote:
> 
> Here is a link to the tool I use to stress netlink sockets.
> 
> http://eurodev.net/~pablo/netlinkbench-unicast-1.0.tar.gz

Thanks for the link.  I'm afraid that your kernel module is simply
buggy.

First of all as I explained before the kernel must never wait.  It has
exactly the same effect as extending the receive queue length.

Secondly each of your user-space messages is producing a number of
replies.  This should be done as a dump operation.  If you do it as
a dump operation, then you will never get overruns because the kernel
never sends more than the user can handle.

I'm happy to help you work on this if you have questions.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  0:05                                 ` Herbert Xu
@ 2004-09-22  0:24                                   ` Pablo Neira
  2004-09-22  2:48                                   ` Pablo Neira
  2004-09-22  2:57                                   ` jamal
  2 siblings, 0 replies; 96+ messages in thread
From: Pablo Neira @ 2004-09-22  0:24 UTC (permalink / raw)
  To: Herbert Xu; +Cc: hadi, David S. Miller, netdev

Hi Herbert,

Herbert Xu wrote:

>On Mon, Sep 20, 2004 at 08:14:42PM +0200, Pablo Neira wrote:
>  
>
>>Here is a link to the tool I use to stress netlink sockets.
>>
>>http://eurodev.net/~pablo/netlinkbench-unicast-1.0.tar.gz
>>    
>>
>
>Thanks for the link.  I'm afraid that your kernel module is simply
>buggy.
>
>First of all as I explained before the kernel must never wait.  It has
>exactly the same effect as extending the receive queue length.
>
>Secondly each of your user-space messages is producing a number of
>replies.  This should be done as a dump operation.  If you do it as
>a dump operation, then you will never get overruns because the kernel
>never sends more than the user can handle.
>  
>

I'll adapt the module to use dump; I still think that I can reproduce 
the problem.

Thanks for the review; please stay tuned.

regards,
Pablo

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  0:05                                 ` Herbert Xu
  2004-09-22  0:24                                   ` Pablo Neira
@ 2004-09-22  2:48                                   ` Pablo Neira
  2004-09-22  2:50                                     ` David S. Miller
                                                       ` (2 more replies)
  2004-09-22  2:57                                   ` jamal
  2 siblings, 3 replies; 96+ messages in thread
From: Pablo Neira @ 2004-09-22  2:48 UTC (permalink / raw)
  To: Herbert Xu; +Cc: hadi, David S. Miller, netdev

Hi Herbert,

We are thinking about two different things:

a) You think of netlink sockets as a method to pass information to 
kernel space via the command line; in that case, dump is OK.

b) I'm thinking of netlink sockets as a method to generate event 
notifications. My benchmark tool just wants to emulate N events 
happening in a *very* short period of time (the worst case), which implies 
sending N messages. So, forget that my tool first sends a message 
to generate those N messages.

But I'm intrigued by something.

jamal wrote:

>For a test I typically have something adding say 10K items (actions in
>my case, but could be ipsec policies) and then try to dump them. On my
>Xeon I get an overrun after about 6K items are dumped.
>

Jamal, you're getting an overrun dumping ipsec policies. If I'm not 
wrong, that tool is doing it as a dump operation, but this 
shouldn't happen when using dump. Am I missing something?

I would like to have a way to send notifications via netlink sockets 
without losing congestion control ability, that is, without losing 
messages because of an overrun.

regards,
Pablo

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  2:48                                   ` Pablo Neira
@ 2004-09-22  2:50                                     ` David S. Miller
  2004-09-22  2:53                                     ` jamal
  2004-09-22  4:52                                     ` Herbert Xu
  2 siblings, 0 replies; 96+ messages in thread
From: David S. Miller @ 2004-09-22  2:50 UTC (permalink / raw)
  To: Pablo Neira; +Cc: herbert, hadi, davem, netdev


I think you should manage your "events" in your own data structure.
Perhaps a circular buffer of some sort.

Then provide a ->dump() operation which fishes the events out of
the circular buffer.

That way you don't have to spam user space with a bunch of quick
transactions, you just wake him up once and several events may
be consumed and processed at once.

You can make your own buffering policies that way, and therefore
there is no need to complicate netlink for this purpose.
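
Just to illustrate the shape of that, an untested sketch follows;
my_event, my_ring, my_event_add and my_dump are all invented names,
and locking and wraparound corner cases are left out:

#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <linux/string.h>

#define MY_RING_SIZE	1024

struct my_event {
	__u32	type;
	__u32	data;
};

static struct my_event	my_ring[MY_RING_SIZE];
static unsigned long	my_head, my_tail;	/* producer / consumer */

/* producer: overwrite the oldest event rather than dropping the new one */
static void my_event_add(__u32 type, __u32 data)
{
	my_ring[my_head % MY_RING_SIZE].type = type;
	my_ring[my_head % MY_RING_SIZE].data = data;
	if (++my_head - my_tail > MY_RING_SIZE)
		my_tail = my_head - MY_RING_SIZE;
}

/* ->dump() callback: fish events out of the ring, one skb-full per
 * recvmsg(); cb->args[0] remembers how far the previous call got */
static int my_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	unsigned long idx = cb->args[1] ? cb->args[0] : my_tail;

	for (; idx != my_head; idx++) {
		int size = NLMSG_SPACE(sizeof(struct my_event));
		struct nlmsghdr *nlh;

		if (skb_tailroom(skb) < size)
			break;
		nlh = (struct nlmsghdr *)skb_put(skb, size);
		nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct my_event));
		nlh->nlmsg_type = 0;
		nlh->nlmsg_flags = NLM_F_MULTI;
		nlh->nlmsg_seq = cb->nlh->nlmsg_seq;
		nlh->nlmsg_pid = NETLINK_CB(cb->skb).pid;
		memcpy(NLMSG_DATA(nlh), &my_ring[idx % MY_RING_SIZE],
		       sizeof(struct my_event));
	}
	cb->args[0] = idx;
	cb->args[1] = 1;
	return skb->len;	/* non-zero means "call me again" */
}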

Meanwhile, I'm going to revert Pablo's original netlink optimization
until this is all sorted out.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  2:48                                   ` Pablo Neira
  2004-09-22  2:50                                     ` David S. Miller
@ 2004-09-22  2:53                                     ` jamal
  2004-09-22  3:46                                       ` Herbert Xu
  2004-09-22  4:52                                     ` Herbert Xu
  2 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-22  2:53 UTC (permalink / raw)
  To: Pablo Neira; +Cc: Herbert Xu, David S. Miller, netdev

On Tue, 2004-09-21 at 22:48, Pablo Neira wrote:
> jamal wrote:
> 
> >For a test I typically have something adding say 10K items (actions in
> >my case, but could be ipsec policies) and then try to dump them. On my
> >Xeon I get an overrun after about 6K items are dumped.
> >
> 
> Jamal, you're getting an overrun dumping ipsec policies. If I'm not 
> wrong, that tool is doing it as a dump operation, but this 
> shouldn't happen when using dump. Am I missing something?

It's not ipsec policies, rather tc actions - but I expect the same to
happen to ipsec policies being dumped.

> I would like to have a way to send notifications via netlink sockets 
> without losing congestion control ability, that is, without losing 
> messages because of an overrun.

This is a legit requirement. It is also legit to be able to do so for
just a basic get request (not a dump). Nothing in the semantics of
netlink says that it should work in one way or the other only.

cheers,
jamal

BTW, I was never able to download your code; I think you might have taken
it offline?

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  0:05                                 ` Herbert Xu
  2004-09-22  0:24                                   ` Pablo Neira
  2004-09-22  2:48                                   ` Pablo Neira
@ 2004-09-22  2:57                                   ` jamal
  2004-09-22  3:39                                     ` Herbert Xu
  2 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-22  2:57 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, 2004-09-21 at 20:05, Herbert Xu wrote:


> Secondly each of your user-space messages is producing a number of
> replies.  This should be done as a dump operation.  If you do it as
> a dump operation, then you will never get overruns because the kernel
> never sends more than the user can handle.

This is not accurate. You will get overruns. Try adding a few thousand
ipsec policies and then dumping them. Perhaps we can have the cb reset
in case of overrun detection.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  2:57                                   ` jamal
@ 2004-09-22  3:39                                     ` Herbert Xu
  0 siblings, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-22  3:39 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, Sep 21, 2004 at 10:57:26PM -0400, jamal wrote:
> 
> > Secondly each of your user-space messages is producing a number of
> > replies.  This should be done as a dump operation.  If you do it as
> > a dump operation, then you will never get overruns because the kernel
> > never sends more than the user can handle.
> 
> This is not accurate. You will get overruns. Try adding a few thousand
> ipsec policies and then dumping them. Perhaps we can have the cb reset
> in case of overrun detection.

I just happen to have a few thousand ipsec policies :) I dump them
all the time with the great new tool in iproute and I've never had
any overruns.  In contrast whenever I use setkey to dump them over
PFKEY it dies very quickly indeed.

I bet that there is a bug somewhere in the tc stuff.  Let me have a look.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  2:53                                     ` jamal
@ 2004-09-22  3:46                                       ` Herbert Xu
  2004-09-22 11:35                                         ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-22  3:46 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, Sep 21, 2004 at 10:53:44PM -0400, jamal wrote:
> 
> It's not ipsec policies, rather tc actions - but I expect the same to
> happen to ipsec policies being dumped.

Well I haven't seen any with IPsec.  Can you please provide a strace
of a failed tc dump so that I know what I'm looking for?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  2:48                                   ` Pablo Neira
  2004-09-22  2:50                                     ` David S. Miller
  2004-09-22  2:53                                     ` jamal
@ 2004-09-22  4:52                                     ` Herbert Xu
  2004-09-22 12:08                                       ` jamal
  2 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-22  4:52 UTC (permalink / raw)
  To: Pablo Neira; +Cc: hadi, David S. Miller, netdev

On Wed, Sep 22, 2004 at 04:48:05AM +0200, Pablo Neira wrote:
> 
> b) I'm thinking of netlink sockets as a method to generate event 
> notifications. My benchmark tool just wants to emulate N events 
> happening in a *very* short period of time (the worst case), which implies 
> sending N messages. So, forget that my tool first sends a message 
> to generate those N messages.

That's what I thought you wanted too, something like ip_queue.

You've got to remember that for ip_queue netlink is inherently
*unreliable*.  Your application must be able to cope with dropped
packets and recover in a sane way.  The kernel makes it slightly
easier for you in that it'll tell you if messages have been dropped.
But in the end it's up to the application to deal with it.

The only knob that you can tweak is the receive queue length.  You
might think that making the kernel wait/back off is going to solve the
problem.  But it's just another way of extending the receive queue
length.
 
> Jamal, you're getting an overrun dumping ipsec policies. If I'm not 
> wrong, that tool is doing it as a dump operation, but this 
> shouldn't happen when using dump. Am I missing something?

I've never seen any overruns with IPsec.  Perhaps there is a bug
somewhere.

> I would like to have a way to send notifications via netlink sockets 
> without losing congestion control ability, that is, without losing 
> messages because of an overrun.

Congestion control similar to the way dumping works may work for you.
But you need to tell us more about what you're actually using this for.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  3:46                                       ` Herbert Xu
@ 2004-09-22 11:35                                         ` jamal
  2004-09-23 12:05                                           ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-22 11:35 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, 2004-09-21 at 23:46, Herbert Xu wrote:
> 
> Well I haven't seen any with IPsec.  Can you please provide a strace
> of a failed tc dump so that I know what I'm looking for?

Try explicitly reducing the socket buffer size and see what happens

Here's a script I use (you would need to compile the tc action gact and have
the latest iproute2):

---
#!/bin/bash

tc qdisc del dev eth1 ingress
tc qdisc add dev eth1 ingress

for ((i = 1 ; i <= $1 ; i++))
do
        tc actions add action drop index $i
done
----

Set the argument ($1) to 10000 or a little more.

To dump: tc actions ls action drop

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22  4:52                                     ` Herbert Xu
@ 2004-09-22 12:08                                       ` jamal
  2004-09-22 17:52                                         ` David S. Miller
  2004-09-23 12:07                                         ` Herbert Xu
  0 siblings, 2 replies; 96+ messages in thread
From: jamal @ 2004-09-22 12:08 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Wed, 2004-09-22 at 00:52, Herbert Xu wrote: 
> Congestion control similar to the way dumping works may work for you.

Note the initial goal was to have dump work for large tables and a
simple get for a basic table row.
Since netlink is being used for a lot of things now, it may be time to
obsolete those assumptions. 
Note also, there's a lot of wastage of that scarce socket buffer via
page-sized skbs - it's not as trivial to fix, but it would go some way
towards reducing overruns of the socket.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22 12:08                                       ` jamal
@ 2004-09-22 17:52                                         ` David S. Miller
  2004-09-23 15:40                                           ` Pablo Neira
  2004-09-23 12:07                                         ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: David S. Miller @ 2004-09-22 17:52 UTC (permalink / raw)
  To: hadi; +Cc: herbert, pablo, davem, netdev

On 22 Sep 2004 08:08:40 -0400
jamal <hadi@cyberus.ca> wrote:

> Note also, there's a lot of wastage of that scarce socket buffer via
> page-sized skbs - it's not as trivial to fix, but it would go some way
> towards reducing overruns of the socket.

Thanks for reminding me about this.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22 11:35                                         ` jamal
@ 2004-09-23 12:05                                           ` Herbert Xu
  2004-09-24  2:56                                             ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-23 12:05 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Wed, Sep 22, 2004 at 07:35:57AM -0400, jamal wrote:
> On Tue, 2004-09-21 at 23:46, Herbert Xu wrote:
> > 
> > Well I haven't seen any with IPsec.  Can you please provide a strace
> > of a failed tc dump so that I know what I'm looking for?
> 
> Try explicitly reducing the socket buffer size and see what happens

What's wrong with the default sizes? If you decrease it far enough
of course it's going to overflow.
 
> set i to 10000 or a little more

You might want to optimise your add path.  It was painful with 20000 :)

> to dump: tc actions ls action drop

That was much faster, no overflows at all:

angband:~# time tc action ls action gact > out

real    0m0.943s
user    0m0.075s
sys     0m0.867s
angband:~# wc -l out
80000 out
angband:~#

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22 12:08                                       ` jamal
  2004-09-22 17:52                                         ` David S. Miller
@ 2004-09-23 12:07                                         ` Herbert Xu
  2004-09-24  1:19                                           ` Pablo Neira
  2004-09-24  3:04                                           ` jamal
  1 sibling, 2 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-23 12:07 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Wed, Sep 22, 2004 at 08:08:40AM -0400, jamal wrote:
>
> Since netlink is being used for a lot of things now, it may be time to
> obsolete those assumptions. 

I'm not against changes.  But so far I haven't seen anything concrete
about what these new things are yet :)

> Note also, there's a lot of wastage of that scarce socket buffer via
> page-sized skbs - it's not as trivial to fix, but it would go some way
> towards reducing overruns of the socket.

I wasn't aware of this scarcity problem.  Can you elaborate?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-22 17:52                                         ` David S. Miller
@ 2004-09-23 15:40                                           ` Pablo Neira
  2004-09-23 19:16                                             ` David S. Miller
  0 siblings, 1 reply; 96+ messages in thread
From: Pablo Neira @ 2004-09-23 15:40 UTC (permalink / raw)
  To: David S. Miller; +Cc: hadi, herbert, davem, netdev

David S. Miller wrote:

>On 22 Sep 2004 08:08:40 -0400
>jamal <hadi@cyberus.ca> wrote:
>
>  
>
>>Note also, there's a lot of wastage of that scarce socket buffer via
>>page-sized skbs - it's not as trivial to fix, but it would go some way
>>towards reducing overruns of the socket.
>>    
>>
>
>Thanks for reminding me about this.
>  
>

Initially we could allocate an skb of size NLMSG_GOODSIZE; then, after 
all the information has been added, we could use a function (skb_*) 
which allocates a new, smaller buffer, memcpys the old skb data into it and 
releases the old one, so we trim the unused part. This makes us 
waste some extra jiffies on memcpys, but we could save some space in 
the queue. Does such an skb_* function exist?

Another choice could be to use a temporary buffer and then memcpy the 
buffer into an skb which allocates only the space actually used.

regards,
Pablo

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-23 15:40                                           ` Pablo Neira
@ 2004-09-23 19:16                                             ` David S. Miller
  2004-09-24  3:28                                               ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: David S. Miller @ 2004-09-23 19:16 UTC (permalink / raw)
  To: Pablo Neira; +Cc: hadi, herbert, davem, netdev

On Thu, 23 Sep 2004 17:40:24 +0200
Pablo Neira <pablo@eurodev.net> wrote:

> Initially we could allocate an skb of size NLMSG_GOODSIZE; then, after 
> all the information has been added, we could use a function (skb_*) 
> which allocates a new, smaller buffer, memcpys the old skb data into it and 
> releases the old one, so we trim the unused part. This makes us 
> waste some extra jiffies on memcpys, but we could save some space in 
> the queue. Does such an skb_* function exist?

No such function exists.

Such a function would need to modify skb->truesize and that is
very dangerous.  People using such a routine would need to be
_extremely_ careful since if the skb being worked on is on
a socket queue, changing skb->truesize is going to mess up
socket buffer accounting later when the skb gets freed
and the socket buffer space liberated.

That doesn't apply to what you're trying to do here, of course.

Simpler would be:

1) For each netlink socket, allocate a page, much like TCP sockets
   do.

2) Construct the netlink response in this page sized buffer,
   keeping track of how much of the page is actually used.

3) At the end, allocate the skb with the necessary length,
   copy into the skb from the page buffer.

4) Since the RTNL semaphore is held during the length of these
   operations, the per-socket page needs no locking.
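
For what it's worth, steps 2) and 3) could look something like the
sketch below (untested; the function and parameter names are invented
here and "build" stands in for whatever fills in the netlink message):

#include <linux/skbuff.h>
#include <linux/mm.h>
#include <linux/string.h>

static struct sk_buff *nl_reply_from_scratchpad(void *scratch,
						int (*build)(void *buf, int size))
{
	struct sk_buff *skb;
	int used;

	/* 2) build the response into the page-sized scratch buffer,
	 *    remembering how much of it was actually used */
	used = build(scratch, PAGE_SIZE);
	if (used <= 0)
		return NULL;

	/* 3) allocate an skb of exactly that length and copy into it,
	 *    so only the bytes really needed are charged to the queue */
	skb = alloc_skb(used, GFP_KERNEL);
	if (!skb)
		return NULL;
	memcpy(skb_put(skb, used), scratch, used);

	return skb;
}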

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-23 12:07                                         ` Herbert Xu
@ 2004-09-24  1:19                                           ` Pablo Neira
  2004-09-24  3:04                                           ` jamal
  1 sibling, 0 replies; 96+ messages in thread
From: Pablo Neira @ 2004-09-24  1:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: jamal, David S. Miller, netdev

Hi Herbert,

Herbert Xu wrote:

>On Wed, Sep 22, 2004 at 08:08:40AM -0400, jamal wrote:
>  
>
>>Since netlink is being used for a lot of things now, it may be time to
>>obsolete those assumptions. 
>>    
>>
>
>I'm not against changes.  But so far I haven't seen anything concrete
>about what these new things are yet :)
>  
>

Nothing mysterious :). People are willing to put more and more things in 
kernel space, so I wanted to evaluate netlink sockets as a method to 
pass information from kernel to user space and vice versa.

I think that, instead of putting everything in kernel space, a 
kernel component can provide an API for user space programs to interact 
with it, so I wanted to evaluate netlink sockets to figure out what they 
can and cannot do at this moment. I can also apply this to 
component replication.

regards,
Pablo

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-23 12:05                                           ` Herbert Xu
@ 2004-09-24  2:56                                             ` jamal
  2004-09-24  3:20                                               ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-24  2:56 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Thu, 2004-09-23 at 08:05, Herbert Xu wrote:

> > Try explicitly reducing the socket buffer size and see what happens
> 
> What's wrong with the default sizes? If you decrease it far enough
> of course it's going to overflow.

The idea is to reproduce the overrun ;->

> > set i to 10000 or a little more
> 
> You might want to optimise your add path.  It was painful with 20000 :)

Probably debug prints slowing things; you could of course do some
batching like so:

$TC actions add \
 action drop index 1 \
 action drop index 2 \
 action drop index 3 ...

Do 10 at a time in the for loop and it should be roughly 10 times as fast.

> > to dump: tc actions ls action drop
> 
> That was much faster, no overflows at all:

I apologize, I don't have time right now to chase it and experiment; my
test machine can be made to reproduce it. Maybe this weekend.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-23 12:07                                         ` Herbert Xu
  2004-09-24  1:19                                           ` Pablo Neira
@ 2004-09-24  3:04                                           ` jamal
  2004-09-24  3:24                                             ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-24  3:04 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Thu, 2004-09-23 at 08:07, Herbert Xu wrote:
> On Wed, Sep 22, 2004 at 08:08:40AM -0400, jamal wrote:
> >
> > Since netlink is being used for a lot of things now, it may be time to
> > obsolete those assumptions. 
> 
> I'm not against changes.  But so far I haven't seen anything concrete
> about what these new things are yet :)

So let's start by using the following logic:
1) If you make the socket buffers small enough compared to the message
size/arrival rate, then an overrun will happen.
Corollary:
2) If you make the message size/arrival rate high enough relative to the socket
buffer size, an overrun will happen.

If you agree that #1 is equivalent to #2, then you can experiment with #1
to see the issue.

> > Note also, there's a lot of wastage of that scarce socket buffer via
> > page-sized skbs - it's not as trivial to fix, but it would go some way
> > towards reducing overruns of the socket.
> 
> I wasn't aware of this scarcity problem.  Can you elaborate?

The NLM_GOODSIZE allocation done for each netlink skb.
In the case of a dump you don't know a priori how much space you need,
so there it is fair. 

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  2:56                                             ` jamal
@ 2004-09-24  3:20                                               ` Herbert Xu
  2004-09-27 12:41                                                 ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-24  3:20 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Thu, Sep 23, 2004 at 10:56:37PM -0400, jamal wrote:
>
> > What's wrong with the default sizes? If you decrease it far enough
> > of course it's going to overflow.
> 
> The idea is to reproduce the overrun ;->

OK, I've just tried 4096 and nothing:

21619 socket(PF_NETLINK, SOCK_RAW, 0)   = 3
21619 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [32768], 4) = 0
21619 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [4096], 4) = 0
...
21619 recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, sa_data="\0\0\0\0\0\0\0\0\
0\0\0\0\0\0"}, msg_iov(1)=[{"\30\16\0\0002\0\2\0\366@SAsT\0\0\0 00\4\16\1\0p\0\0
\0\24"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 3608
21619 write(1, "\n\taction order 0: gact action dr"..., 4096) = 4096
21619 recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, sa_data="\0\0\0\0\0\0\0\0\
0\0\0\0\0\0"}, msg_iov(1)=[{"\30\16\0\0002\0\2\0\366@SAsT\0\0\0 00\4\16\1\0p\0\0
\0\24"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 3608
21619 write(1, "0\n\t index 19312 ref 1 bind 0\n \n\t"..., 4096) = 4096
21619 recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, sa_data="\0\0\0\0\0\0\0\0\
0\0\0\0\0\0"}, msg_iov(1)=[{"\30\16\0\0002\0\2\0\366@SAsT\0\0\0 00\4\16\1\0p\0\0
\0\24"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 3608
21619 recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, sa_data="\0\0\0\0\0\0\0\0\
0\0\0\0\0\0"}, msg_iov(1)=[{"\30\16\0\0002\0\2\0\366@SAsT\0\0\0 00\4\16\1\0p\0\0
\0\24"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 3608
21619 write(1, "action drop\n\t random type none p"..., 4096) = 4096

> $TC actions add \
>  action drop index 1 \
>  action drop index 2 \
>  action drop index 3 ...
> 
> Do 10 at a time in the for loop and it should be roughly 10 times as fast.

Cool, didn't know that.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  3:04                                           ` jamal
@ 2004-09-24  3:24                                             ` Herbert Xu
  2004-09-27 12:46                                               ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-24  3:24 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Thu, Sep 23, 2004 at 11:04:02PM -0400, jamal wrote:
> 
> So let's start by using the following logic:
> 1) If you make the socket buffers small enough compared to the message
> size/arrival rate, then an overrun will happen.
> Corollary:
> 2) If you make the message size/arrival rate high enough relative to the socket
> buffer size, an overrun will happen.
> 
> If you agree that #1 is equivalent to #2, then you can experiment with #1
> to see the issue.

There are three possible netlink usages:

1) Request/response:

No overruns should occur.

2) Dump:

No overruns should occur, because the dump only fills in the next skb when
the previous one is taken off the queue by the user.

3) Async messages:

Overruns may occur if the arrival rate exceeds the application's
processing capacity or if the queue is too small for a burst.

Now we were discussing how we can do congestion control for 3).
But to do that we need to know exactly what these messages are.  For
example, if they're coming from an external source, as is the case in
ip_queue, then you can't congestion-control it at all.

Oh and never use the same socket for 1+2) and 3) together.  You can
use the socket for 1) and 2), but 3) must be in its own socket.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-23 19:16                                             ` David S. Miller
@ 2004-09-24  3:28                                               ` Herbert Xu
  2004-09-24  5:39                                                 ` David S. Miller
  2004-09-27 12:53                                                 ` jamal
  0 siblings, 2 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-24  3:28 UTC (permalink / raw)
  To: David S. Miller; +Cc: Pablo Neira, hadi, davem, netdev

On Thu, Sep 23, 2004 at 12:16:51PM -0700, David S. Miller wrote:
> 
> Simpler would be:
> 
> 1) For each netlink socket, allocate a page, much like TCP sockets
>    do.
> 
> 2) Construct the netlink response in this page sized buffer,
>    keeping track of how much of the page is actually used.
> 
> 3) At the end, allocate the skb with the necessary length,
>    copy into the skb from the page buffer.
> 
> 4) Since the RTNL semaphore is held during the length of these
>    operations, the per-socket page needs no locking.

Can someone please tell me why we need to do this at all?

Most of the dump messages should be close to PAGE_SIZE anyway, no?
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  3:28                                               ` Herbert Xu
@ 2004-09-24  5:39                                                 ` David S. Miller
  2004-09-24  6:26                                                   ` Herbert Xu
  2004-09-27 12:53                                                 ` jamal
  1 sibling, 1 reply; 96+ messages in thread
From: David S. Miller @ 2004-09-24  5:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: pablo, hadi, davem, netdev

On Fri, 24 Sep 2004 13:28:30 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> Can someone please tell me why we need to do this at all?
> 
> Most of the dump messages should be close to PAGE_SIZE anyway, no?

It's moreso for the "give me a single entry" requests
than for dumps.  NLMSG_GOODSIZE is used for all cases.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  5:39                                                 ` David S. Miller
@ 2004-09-24  6:26                                                   ` Herbert Xu
  2004-09-24 17:58                                                     ` David S. Miller
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-24  6:26 UTC (permalink / raw)
  To: David S. Miller; +Cc: pablo, hadi, davem, netdev

On Thu, Sep 23, 2004 at 10:39:09PM -0700, David S. Miller wrote:
> 
> It's moreso for the "give me a single entry" requests
> than for dumps.  NLMSG_GOODSIZE is used for all cases.

I see.  NLMSG_GOODSIZE is definitely a waste in that case.

> Simpler would be:
> 
> 1) For each netlink socket, allocate a page, much like TCP sockets
>    do.

I'd let the responder allocate that page skb as they do now.
After all what we lack is space on the receive queue, not
kernel memory.  So long as that page doesn't end up on the
queue, it should be fine.

It also means those responders that can calculate the correct size
of the reply don't need to go through this copy process.

> 2) Construct the netlink response in this page sized buffer,
>    keeping track of how much of the page is actually used.

The size is in the NL header so that's easy.

> 3) At the end, allocate the skb with the necessary length,
>    copy into the skb from the page buffer.

We can put that into a helper function and then those netlink
responders which use GOODSIZE (most of them for now) can call
it.

Putting it in a helper means that we can have implementations
in future that don't need to do this.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  6:26                                                   ` Herbert Xu
@ 2004-09-24 17:58                                                     ` David S. Miller
  2004-09-24 22:06                                                       ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: David S. Miller @ 2004-09-24 17:58 UTC (permalink / raw)
  To: Herbert Xu; +Cc: pablo, hadi, davem, netdev

On Fri, 24 Sep 2004 16:26:04 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> It also means those responders that can calculate the correct size
> of the reply don't need to go through this copy process.

This is the crux of the problem, you don't know how big
the response will be until you look at the whole object
you are returning.

This is because there are usually an unknown number of
attributes that will be returned.

This is why I suggested the PAGE_SIZE scratchpad, you build
the response fully into there, then you know the exact size
SKB you need so you allocate it and copy into it from the
scratch pad.

It's just a response build up area, and the goal is to have
the minimum sized SKB necessary to represent the response.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24 17:58                                                     ` David S. Miller
@ 2004-09-24 22:06                                                       ` Herbert Xu
  2004-09-24 22:28                                                         ` Herbert Xu
  2004-09-27 12:59                                                         ` jamal
  0 siblings, 2 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-24 22:06 UTC (permalink / raw)
  To: David S. Miller; +Cc: pablo, hadi, davem, netdev

On Fri, Sep 24, 2004 at 10:58:55AM -0700, David S. Miller wrote:
> On Fri, 24 Sep 2004 16:26:04 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > It also means those responders that can calculate the correct size
> > of the reply don't need to go through this copy process.
> 
> This is the crux of the problem, you don't know how big
> the response will be until you look at the whole object
> you are returning.

I understand.  What I'm saying is that some of these places already know
how big it is going to be.  See tcp_diag.c for an example.  So we shouldn't
impose the copy overhead on everyone.

> This is why I suggested the PAGE_SIZE scratchpad, you build
> the response fully into there, then you know the exact size
> SKB you need so you allocate it and copy into it from the
> scratch pad.

I agree completely.  I'm just saying that the current NLM_GOODSIZE
skb is the perfect scratch-pad so we don't need a new one.  All we
need to do is to insert a helper call just before netlink_unicast
to copy and trim the packet in the places that need it.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24 22:06                                                       ` Herbert Xu
@ 2004-09-24 22:28                                                         ` Herbert Xu
  2004-09-27 12:59                                                         ` jamal
  1 sibling, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-24 22:28 UTC (permalink / raw)
  To: David S. Miller; +Cc: pablo, hadi, davem, netdev

[-- Attachment #1: Type: text/plain, Size: 731 bytes --]

On Sat, Sep 25, 2004 at 08:06:51AM +1000, herbert wrote:
> 
> I agree completely.  I'm just saying that the current NLM_GOODSIZE
> skb is the perfect scratch-pad so we don't need a new one.  All we
> need to do is to insert a helper call just before netlink_unicast
> to copy and trim the packet in the places that need it.

Here is a demonstration of what I meant.  Please note that this is
totally untested.

We could just make the last argument of netlink_unicast into a flag
so that it can do this directly.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 673 bytes --]

===== net/xfrm/xfrm_user.c 1.51 vs edited =====
--- 1.51/net/xfrm/xfrm_user.c	2004-09-12 21:51:43 +10:00
+++ edited/net/xfrm/xfrm_user.c	2004-09-25 08:22:05 +10:00
@@ -433,10 +433,17 @@
 	resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
 	if (IS_ERR(resp_skb)) {
 		err = PTR_ERR(resp_skb);
-	} else {
-		err = netlink_unicast(xfrm_nl, resp_skb,
-				      NETLINK_CB(skb).pid, MSG_DONTWAIT);
+		goto out_put;
 	}
+
+	err = pskb_expand_head(resp_skb, 0, skb->tail - skb->end, GFP_KERNEL);
+	if (err)
+		goto out_put;
+
+	err = netlink_unicast(xfrm_nl, resp_skb,
+			      NETLINK_CB(skb).pid, MSG_DONTWAIT);
+
+out_put:
 	xfrm_state_put(x);
 out_noput:
 	return err;

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  3:20                                               ` Herbert Xu
@ 2004-09-27 12:41                                                 ` jamal
  0 siblings, 0 replies; 96+ messages in thread
From: jamal @ 2004-09-27 12:41 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Thu, 2004-09-23 at 23:20, Herbert Xu wrote:
> On Thu, Sep 23, 2004 at 10:56:37PM -0400, jamal wrote:
> >
> > > What's wrong with the default sizes? If you decrease it far enough
> > > of course it's going to overflow.
> > 
> > The idea is to reproduce the overun ;->
> 
> OK, I've just tried 4096 and nothing:

Try harder! ;-> 
OK, I will go and look up my test machine and get back with an exact test
case (the problem is that whenever I have time to scan my emails I am at
least 30 minutes away from that box).

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  3:24                                             ` Herbert Xu
@ 2004-09-27 12:46                                               ` jamal
  2004-09-27 21:36                                                 ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-27 12:46 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Thu, 2004-09-23 at 23:24, Herbert Xu wrote:

> There are three possible netlink usages:
> 
> 1) Request/response:
> 
> No overruns should occur.

Can't assume this. A request for an ACK is fine. A get is a different
ball game.

> 2) Dump:
> 
> No overruns should occur because of dump only fills in the next one when
> the previous one is taken off the queue by the user.
> 
> 3) Async messages:
> 
> Overruns may occur if the arrival rate exceeds the application's
> processing capacity or if the queue is too small for a burst.
> 
> Now we were discussing about how we can do congestion control for 3).
> But to do that we need to know exactly what these messages are.  For
> example if they're coming from an external source as is the case in
> ip_queue then you can't congestion control it at all.
> 
> Oh and never use the same socket for 1+2) and 3) together.  You can
> use the socket for 1) and 2), but 3) must be in its own socket.

I think the discussion about having a mini-slab-like allocator for netlink
that I see going on in the other thread is what we need. However, so far
that seems to be useful only for 2).

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24  3:28                                               ` Herbert Xu
  2004-09-24  5:39                                                 ` David S. Miller
@ 2004-09-27 12:53                                                 ` jamal
  1 sibling, 0 replies; 96+ messages in thread
From: jamal @ 2004-09-27 12:53 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, Pablo Neira, David S. Miller, netdev

On Thu, 2004-09-23 at 23:28, Herbert Xu wrote:

> Most of the dump messages should be close to PAGE_SIZE anyway, no?

20 bytes for the header; typical messages and attributes are in the
range of 20 bytes. In the problematic cases #2 or #3 (from what I have
seen) in your previous email, you get events being broadcast to multiple
users occupying, say, 400 bytes or about 10 messages (~10%), while 4096 is
charged to the socket receive queue.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-24 22:06                                                       ` Herbert Xu
  2004-09-24 22:28                                                         ` Herbert Xu
@ 2004-09-27 12:59                                                         ` jamal
  1 sibling, 0 replies; 96+ messages in thread
From: jamal @ 2004-09-27 12:59 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, pablo, David S. Miller, netdev

On Fri, 2004-09-24 at 18:06, Herbert Xu wrote:

> I agree completely.  I'm just saying that the current NLM_GOODSIZE
> skb is the perfect scratch-pad so we don't need a new one.  All we
> need to do is to insert a helper call just before netlink_unicast
> to copy and trim the packet in the places that need it.

You are probably talking past each other - to me you seem to be saying
the same thing. A "manager" of some form is required.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-27 12:46                                               ` jamal
@ 2004-09-27 21:36                                                 ` Herbert Xu
  2004-09-28  2:43                                                   ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-27 21:36 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, Sep 27, 2004 at 08:46:30AM -0400, jamal wrote:
> > 
> > 1) Request/response:
> > 
> > No overruns should occur.
> 
> Can't assume this. A request for an ACK is fine; a get is a different
> ball game.

What I meant is that this is a case where the maximum size of the reply
can be known at compile time.  So assuming that your socket receive size
is set correctly, it should never overflow.

> > 3) Async messages:
> > 
> > Overruns may occur if the arrival rate exceeds the application's
> > processing capacity or if the queue is too small for a burst.
> > 
> > Now we were discussing how we can do congestion control for 3).
> > But to do that we need to know exactly what these messages are.  For
> > example if they're coming from an external source as is the case in
> > ip_queue then you can't congestion control it at all.

You still haven't told me what you're going to use 3) for yet...

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-27 21:36                                                 ` Herbert Xu
@ 2004-09-28  2:43                                                   ` jamal
  2004-09-28  2:46                                                     ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-28  2:43 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, 2004-09-27 at 17:36, Herbert Xu wrote:

> You still haven't told me what you're going to use 3) for yet...

Sorry, I keep forgetting to retrieve it from the other remote non-networked
test machine. The idea is:
Create something that generates lots of messages. Make the socket buffer
small (4096 is good enough)

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28  2:43                                                   ` jamal
@ 2004-09-28  2:46                                                     ` Herbert Xu
  2004-09-28  3:06                                                       ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-28  2:46 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, Sep 27, 2004 at 10:43:27PM -0400, jamal wrote:
> On Mon, 2004-09-27 at 17:36, Herbert Xu wrote:
> 
> > You still haven't told me what you're going to use 3) for yet...
> 
> Sorry, I keep forgetting to retrieve it from the other remote non-networked
> test machine. The idea is:
> Create something that generates lots of messages. Make the socket buffer
> small (4096 is good enough)

No that's not what I meant.  What I mean is what is the real-world
scenario where you're going to be doing 3).

Only when we know where the messages are coming from and what they
are can we decide on a meaningful method of limiting their rate.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28  2:46                                                     ` Herbert Xu
@ 2004-09-28  3:06                                                       ` jamal
  2004-09-28  3:23                                                         ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-28  3:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, 2004-09-27 at 22:46, Herbert Xu wrote:
> On Mon, Sep 27, 2004 at 10:43:27PM -0400, jamal wrote:
> > On Mon, 2004-09-27 at 17:36, Herbert Xu wrote:
> > 
> > > You still haven't told me what you're going to use 3) for yet...
> > 
> > Sorry, I keep forgetting to retrieve it from the other remote non-networked
> > test machine. The idea is:
> > Create something that generates lots of messages. Make the socket buffer
> > small (4096 is good enough)
> 
> No that's not what I meant.  What I mean is what is the real-world
> scenario where you're going to be doing 3).

The real-world scenario is just that: massive events generated from the
kernel, for example.

> Only when we know where the messages are coming from and what they
> are can we decide on a meaningful method of limiting their rate.

Ok, here's a sample test case I created just now following those same
instructions I gave you ;->

---
#!/bin/bash

ifconfig dummy0 1.2.1.1 netmask 255.255.255.0 broadcast 1.2.1.255 up
ifconfig dummy0:0 1.2.1.1
for ((i = 1 ; i <= $1 ; i++))
do
        ifconfig dummy0:$i 1.2.1.$i
done
---

Pass 100 to it to create 100 aliases; you may not need to set the socket
buffer to 4096, but that's what I did.

Down dummy0:0 and see the overrun.
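
For reference, a minimal listener along these lines will see the overrun as
ENOBUFS on recv(); the multicast groups and the 4096 buffer below are just an
illustrative choice, not anything mandated above:

---
/* Sketch only: subscribe to rtnetlink events with a tiny receive buffer
 * and report the overrun (ENOBUFS) when the kernel outruns us. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct sockaddr_nl addr;
	char buf[8192];
	int rcvbuf = 4096;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));

	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;
	addr.nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE;
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	for (;;) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);

		if (n < 0 && errno == ENOBUFS) {
			/* Receive queue overran: events were lost. */
			printf("overrun: reread state from the kernel\n");
			continue;
		}
		if (n <= 0)
			break;
		printf("got %zd bytes of events\n", n);
	}
	close(fd);
	return 0;
}
---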

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28  3:06                                                       ` jamal
@ 2004-09-28  3:23                                                         ` Herbert Xu
  2004-09-28  3:45                                                           ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-28  3:23 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, Sep 27, 2004 at 11:06:12PM -0400, jamal wrote:
> 
> Ok, here's a sample test case I created just now following those same
> instructions I gave you ;->
> 
> ---
> #!/bin/bash
> 
> ifconfig dummy0 1.2.1.1 netmask 255.255.255.0 broadcast 1.2.1.255 up
> ifconfig dummy0:0 1.2.1.1
> for ((i = 1 ; i <= $1 ; i++))
> do
>         ifconfig dummy0:$i 1.2.1.$i
> done
> ---
> 
> Pass 100 to it to create 100 aliases; you may not need to set the socket
> buffer to 4096, but that's what I did.
> 
> Down dummy0:0 and see the overrun.

But ifconfig doesn't use netlink.  So I presume you're referring to
some other application that's listening for interface/address/route
events.

Firstly, thanks, this is exactly what I've been asking for -- a real-world
scenario :)

Now that we know where the events are coming from and what they are,
we can decide on the solution.  In this particular case, there is
nothing you can do on the sending side.  Stopping people from operating
on networking objects just because some netlink listener can't keep up
isn't going to work.  So congestion control is out of the question.

That leaves two options AFAICS.  The easy way out is to increase the
receive buffer size.  Of course after a while this is not going to
work.

The second option, which is the one I prefer: if so many events are
occurring and you can't keep up, it's time to give up :)

So just bite the bullet and reread the system state by issuing dump
operations.
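
In concrete terms, that just means firing a dump request on the same socket
once you have seen the overrun. A rough, untested sketch (RTM_GETADDR is only
an example here; the other GET/dump families work the same way):

---
/* Sketch only: after an ENOBUFS overrun, ask for a full address dump.
 * The replies come back as a multipart stream terminated by NLMSG_DONE. */
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int request_addr_dump(int fd, unsigned int seq)
{
	struct {
		struct nlmsghdr nlh;
		struct rtgenmsg g;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.g));
	req.nlh.nlmsg_type = RTM_GETADDR;
	/* NLM_F_ROOT|NLM_F_MATCH is the classic "dump everything" pair. */
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT | NLM_F_MATCH;
	req.nlh.nlmsg_seq = seq;
	req.g.rtgen_family = AF_INET;

	return send(fd, &req, req.nlh.nlmsg_len, 0);
}
---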

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28  3:23                                                         ` Herbert Xu
@ 2004-09-28  3:45                                                           ` jamal
  2004-09-28  3:59                                                             ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-28  3:45 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, 2004-09-27 at 23:23, Herbert Xu wrote:

> But ifconfig doesn't use netlink.  So I presume you're referring to
> some other application that's listening for interface/address/route
> events.

That's right.
I think you should be able to run ip monitor to see the overrun.

> Firstly, thanks, this is exactly what I've been asking for -- a real-world
> scenario :)

I can assure you there are many more like this ;->
My remote app is not this one, btw.

> Now that we know where the events are coming from and what they are,
> we can decide on the solution.  In this particular case, there is
> nothing you can do on the sending side.  Stopping people from operating
> on networking objects just because some netlink listener can't keep up
> isn't going to work.  So congestion control is out of the question.

fixing the NLM_GOODSIZE issue is a very good first step.
Adding congestion control would be harder, but not out of the question.
The hard part is not the maintaining-state part (as in the case of dump);
it's rather that you start getting slowed down (in a broadcast) by one
slow (or buggy) app that's also decided it's going to have very low socket
buffer sizes.

> That leaves two options AFAICS.  The easy way out is to increase the
> receive buffer size.  Of course after a while this is not going to
> work.

Indeed. It just postpones the overrun.

> The second option, which is the one I prefer: if so many events are
> occurring and you can't keep up, it's time to give up :)
> 
> So just bite the bullet and reread the system state by issuing dump
> operations.

We may as well choose to document it this way, mostly because of the
issue I described above. We shouldn't give up so easily though ;->

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28  3:45                                                           ` jamal
@ 2004-09-28  3:59                                                             ` Herbert Xu
  2004-09-28 10:36                                                               ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-28  3:59 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, Sep 27, 2004 at 11:45:25PM -0400, jamal wrote:
> 
> > Now that we know where the events are coming from and what they are,
> > we can decide on the solution.  In this particular case, there is
> > nothing you can do on the sending side.  Stopping people from operating
> > on networking objects just because some netlink listener can't keep up
> > isn't going to work.  So congestion control is out of the question.
> 
> fixing the NLM_GOODSIZE issue is a very good first step.

Well I'm afraid that it doesn't help in your interface address example
because rtmsg_ifa() already allocates a buffer of (approximately) the
right size.  That is, it doesn't use NLM_GOODSIZE at all.

> Adding congestion control would be harder, but not out of the question.

But the question is who are you going to throttle? If you throttle
the source of the messages then you're going to stop people from adding
or deleting IP addresses which can't be right.

If you move the netlink sender into a different execution context and
throttle that then that's just extending the receive queue length by
stealth.

So I'm afraid I don't see how congestion control could be applied in
*this* particular context.
 
> > So just bite the bullet and reread the system state by issuing dump
> > operations.
> 
> We may as well choose to document it this way, mostly because of the
> issue I described above. We shouldn't give up so easily though ;->

Well IMHO this is not giving up at all.

Think of it another way.  Monitoring routes is like maintaining a
program.  Normally you just fix the bugs as they come.  But if the
bug reports are coming in so fast that you can't keep up, perhaps
it's time to throw it away and rewrite it from scratch :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28  3:59                                                             ` Herbert Xu
@ 2004-09-28 10:36                                                               ` jamal
  2004-09-28 11:11                                                                 ` Herbert Xu
  2004-09-28 21:07                                                                 ` Pablo Neira
  0 siblings, 2 replies; 96+ messages in thread
From: jamal @ 2004-09-28 10:36 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Mon, 2004-09-27 at 23:59, Herbert Xu wrote:
> On Mon, Sep 27, 2004 at 11:45:25PM -0400, jamal wrote:

> > fixing the NLM_GOODSIZE issue is a very good first step.
> 
> Well I'm afraid that it doesn't help in your interface address example
> because rtmsg_ifa() already allocates a buffer of (approximately) the
> right size.  That is, it doesn't use NLM_GOODSIZE at all.

er, what about the host scope route msgs generated by the same script? ;->
I am not sure if the IFAs cause any issues - they definitely contribute.
 
> > Adding congestion control would be harder, but not out of the question.
> 
> But the question is who are you going to throttle? If you throttle
> the source of the messages then you're going to stop people from adding
> or deleting IP addresses which can't be right.

The state is per socket. You may need an intermediate queue etc. which
feeds each user socket registered for the event. The socket queue
acts essentially as a retransmitQ for broadcast state. Just waving my
hands throwing ideas here, of course.

> If you move the netlink sender into a different execution context and
> throttle that then that's just extending the receive queue length by
> stealth.
> 
> So I'm afraid I don't see how congestion control could be applied in
> *this* particular context.
>  

We can't control the user space script, for example, that caused those
events. We shouldn't, in fact. Congestion control in this context equates
to the desire not to overload the reader (of events). In other words, if you
know the reader's capacity for swallowing events, then you don't exceed
that rate when sending to said reader. Knowing the capacity requires even
more state:
So on that thought, let's continue with the handwaving approach of an
intermediate queue. Congestion control would mean the puck stops at this
queue and user space doesn't get bogged down reading meaningless state.
The problem is, like I said earlier, that your rate of consumption of
events is going to be bottlenecked by the slowest and most socket-buffer
deprived reader. If you create the intermediate queue I described above,
then it gets to absorb the massive messages before any socket sees them.
If this queue gets full then there is no point in sending any message to
any waiting socket. Just overrun them immediately and they (the readers) get
forced to reread the state. Of course this queue will have to be larger
than any of the active sockets' receive queues.
You will need to have one such queue per event - and yes, there may be
scalability issues.
The moral of this is: you could do it if you wanted - it ain't trivial.
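
Just to make the handwaving slightly more concrete, here is a toy user-space
model of the idea (not kernel code; the fixed slot count is arbitrary): the
per-event queue absorbs the burst, and once it fills up it flushes itself and
flags every listener as overrun so they go and reread state.

---
/* Toy model only: a per-event intermediate queue that, when full, drops
 * everything and forces all listeners into the "reread state" path. */
#include <stdbool.h>
#include <stddef.h>

#define EVQ_SLOTS 1024

struct event_queue {
	const void *msgs[EVQ_SLOTS];
	size_t count;
	bool overrun;		/* listeners must re-dump kernel state */
};

/* Returns false when the event was dropped and the listeners were overrun. */
static bool evq_publish(struct event_queue *q, const void *msg)
{
	if (q->count == EVQ_SLOTS) {
		q->count = 0;		/* flush the whole queue ... */
		q->overrun = true;	/* ... and overrun every listener */
		return false;
	}
	q->msgs[q->count++] = msg;
	return true;
}
---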

> > > So just bite the bullet and reread the system state by issuing dump
> > > operations.
> > 
> > We may as well choose to document it this way, mostly because of the
> > issue I described above. We shouldn't give up so easily though ;->
> 
> Well IMHO this is not giving up at all.
> 
> Think of it another way.  Monitoring routes is like maintaining a
> program.  Normally you just fix the bugs as they come.  But if the
> bug reports are coming in so fast that you can't keep up, perhaps
> it's time to throw it away and rewrite it from scratch :)

Except if you drop incoming bug reports and drop them early (the point of
that intermediate queue).
BTW, Davem gets away with this congestion control alg all the time.
Heck, I think his sanity survives because of it - I bet you he's got this
thread under congestion control right now ;->


cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 10:36                                                               ` jamal
@ 2004-09-28 11:11                                                                 ` Herbert Xu
  2004-09-28 12:16                                                                   ` Herbert Xu
  2004-09-28 12:32                                                                   ` jamal
  2004-09-28 21:07                                                                 ` Pablo Neira
  1 sibling, 2 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-28 11:11 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, Sep 28, 2004 at 06:36:27AM -0400, jamal wrote:
> 
> er, what about the host scope route msgs generated by the same script? ;->

You mean rtmsg_fib()? It also allocates a size much less than
NLM_GOODSIZE, namely sizeof(struct rtmsg) + 256.  However, the
trim function might be able to shave a bit off there since the
256 bytes is mostly reserved for multiple nexthops.

AFAIK, no async netlink event function uses NLM_GOODSIZE at all for
obvious reasons.
 
> The state is per socket. You may need an intermediate queue etc. which
> feeds each user socket registered for the event. The socket queue
> acts essentially as a retransmitQ for broadcast state. Just waving my
> hands throwing ideas here, of course.

Aha, you've fallen into my trap :) Now let me demonstrate why having
an intermediate queue doesn't help at all.

Holding the packet on the intermediate queue is exactly the same as
holding it on the receive queue of the destination socket.  The reason
is that we're simply cloning the packets.  So moving it from one queue
to another does not reduce system resource usage by much.

There is the cost in cloning the skbs.  However, that's an orthogonal
issue altogether.  We can reduce the cost there by making the packets
bigger.  This can either be done at the sender end by coalescing
successive messages.  Or we can merge them in netlink_broadcast.

Granted having an intermediate queue will avoid overruns if it is
large enough.  However, making all the receive queues as big
as your intermediate queue will have exactly the same effect.

In fact this has an advantage over the intermediate queue.  With the
latter, you need to hold the packet in place whether the applications
need it or not.  While currently, the application can choose whether
it wants to receive a large batch of events and if so how large.

Remember just because one application overruns, it doesn't mean that
the other recipients of the same skb will overrun.  They can continue
to receive messages as long as their receive queue allows it.

So applications that really want to see every event should have a
very large receive queue.  Those that can recover easily can make do
with a much smaller queue.

> We can't control the user space script, for example, that caused those
> events. We shouldn't, in fact. Congestion control in this context equates
> to the desire not to overload the reader (of events). In other words, if you
> know the reader's capacity for swallowing events, then you don't exceed
> that rate when sending to said reader. Knowing the capacity requires even
> more state:

I understand what you mean.  But unless you can quench the source of
the messages/events, all you can do is batch them up somewhere.  What
I'm arguing about is that batching them up in the middle is no better
than batching them up at the destination.  In fact it's worse in that
it takes away choice from the receiving application.

> any waiting socket. Just overrun them immediately and they (the readers) get
> forced to reread the state. Of course this queue will have to be larger
> than any of the active sockets' receive queues.

Jamal, maybe I've got the wrong impression but it almost seems
that you think that if one application overruns, then everyone
else on that multicast address will overrun as well.  This is
definitely not the case.

With an intermediate queue, you will in fact impose overruns on
everyone when it overflows which seems to be a step backwards.

> The moral of this is: you could do it if you wanted - it ain't trivial.

Well this is not what I'd call congestion control :) Let's take a
TCP analogy.  This is like batching up TCP packets on a router in
the middle rather than shutting down the sender.  Congestion control
is where you shut the sender up.
 
> Except if you drop incoming bug reports and drop them early (the point of
> that intermediate queue).

Nah.  Dropping them early is overrunning early.

> BTW, Davem gets away with this congestion control alg all the time.
> Heck, I think his sanity survives because of it - I bet you he's got this
> thread under congestion control right now ;->

Yep, his overrun flag sure is set :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 11:11                                                                 ` Herbert Xu
@ 2004-09-28 12:16                                                                   ` Herbert Xu
  2004-09-28 12:39                                                                     ` On DaveMs congestion control algorithm WAS(Re: " jamal
  2004-09-28 12:32                                                                   ` jamal
  1 sibling, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-28 12:16 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, Sep 28, 2004 at 09:11:59PM +1000, herbert wrote:
>
> > BTW, Davem gets away with this congestion control alg all the time.
> > Heck, I think his sanity survives because of it - I bet you he's got this
> > thread under congestion control right now ;->
> 
> Yep, his overrun flag sure is set :)

Now if he puts his mailing admin hat on and tells us to shut up, that
would be congestion control :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 11:11                                                                 ` Herbert Xu
  2004-09-28 12:16                                                                   ` Herbert Xu
@ 2004-09-28 12:32                                                                   ` jamal
  2004-09-29  0:13                                                                     ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-28 12:32 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, 2004-09-28 at 07:11, Herbert Xu wrote:
> On Tue, Sep 28, 2004 at 06:36:27AM -0400, jamal wrote:
> > 
> > er, what about the host scope route msgs generated by the same script? ;->

> AFAIK, no async netlink event function uses NLM_GOODSIZE at all for
> obvious reasons.

Actually I just examined the events generated by the script; they are
IFLA and ROUTE events, not IFA.
So take a look at rtmsg_ifinfo().

> > The state is per socket. You may need an intermediate queue etc. which
> > feeds each user socket registered for the event. The socket queue
> > acts essentially as a retransmitQ for broadcast state. Just waving my
> > hands throwing ideas here, of course.
> 
> Aha, you've fallen into my trap :) 

Oh, goody ;-> 

> Now let me demonstrate why having
> an intermediate queue doesn't help at all.
> 
> Holding the packet on the intermediate queue is exactly the same as
> holding it on the receive queue of the destination socket.  The reason
> is that we're simply cloning the packets.  So moving it from one queue
> to another does not reduce system resource usage by much.

Ah, but there's clearly a benefit in saving packets from crossing to user
space, in particular in the case of overload. You do save on system
resources for sure in that case.
In the case of normal operation, the no-overload case, you end up using a
little more system resource - but that's a price tag that comes with the
benefits (it needs to be weighed out).

> There is the cost in cloning the skbs.  However, that's an orthogonal
> issue altogether.  We can reduce the cost there by making the packets
> bigger.  This can either be done at the sender end by coalescing
> successive messages.  Or we can merge them in netlink_broadcast.
> 
> Granted having an intermediate queue will avoid overruns if it is
> large enough.  However, making all the receive queues as big
> as your intermediate queue will have exactly the same effect.

Agreed, it will postpone the problem, and not cure it.
Where I saw the benefit is: if this queue is full/overloaded, then you
don't bother transferring skbs to the sock receiveQ - instead you overrun
the event listeners (on purpose) before giving them any data. This
assumes you only start feeding the listeners when the event generation
is complete (in a multi-message batch, when DONE is processed).
In the case of overrunning the listeners you should also flush the
intermediate queue.
Again, I am handwaving - there may be a lot of practical issues which
become obvious when you actually get your hands dirty. I actually tried to
implement this a while back for socket packet tap listeners; I can't find
my patches. What I was trying to get was feedback that I could feed all the
way down to NAPI - it proved to be futile because I would need to have
hardware drop selectively and such NICs don't exist.

> In fact this has an advantage over the intermediate queue.  With the
> latter, you need to hold the packet in place whether the applications
> need it or not.  While currently, the application can choose whether
> it wants to receive a large batch of events and if so how large.
> 

Right, but you only find out after reading a subset of messages which cross
into user space. Which is wasted cycles, really.
Now if you could say from user space "please continue where you left
off", the messages before the overrun won't be a waste. I do think that's not
wise for events (you should be able to get a summary of the issue some other way,
as with overruns at the moment) but it is definitely needed for large get
results.

> Remember just because one application overruns, it doesn't mean that
> the other recipients of the same skb will overrun.  They can continue
> to receive messages as long as their receive queue allows it.
> 

Agreed. Note that's a design choice, and the truth of which is the better
scheme only comes out by testing both schemes. It's easier to leave
what's already in place - but what fun is that now? ;->
Also note that this only applies to broadcasts.

> So applications that really want to see every event should have a
> very large receive queue.  Those that can recover easily can make do
> with a much smaller queue.

Depending on how you look at it (since I am drinking the right variant
of coffee right now, let's look at it from a philosophical view: is it the
area of the light radiated or the circumference of darkness surrounding
the light? ;-> Choose your metric ;-> ):
A large queue may actually be a problem if it also overflows, since
it takes relatively longer to find out. You still have to read the damn
state to find out the details.
 
[..]

> Jamal, maybe I've got the wrong impression but it almost seems
> that you think that if one application overruns, then everyone
> else on that multicast address will overrun as well.  This is
> definitely not the case.


I think it's fair to assume that if the intermediate queue overflows,
all listeners will be.

> With an intermediate queue, you will in fact impose overruns on
> everyone when it overflows which seems to be a step backwards.

Refer to above assumption.

> > The moral of this is: you could do it if you wanted - aint trivial.
> 
> Well this is not what I'd call congestion control :) Let's take a
> TCP analogy.  This is like batching up TCP packets on a router in
> the middle rather than shutting down the sender.  Congestion control
> is where you shut the sender up.

It's actually worse than that --> which is a shame since we have more
control over what can be sent to the user.
Congestion could be driven by the receiver as well. Look at TCP zero windows,
for example. Or even ECN.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* On DaveMs congestion control algorithm WAS(Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 12:16                                                                   ` Herbert Xu
@ 2004-09-28 12:39                                                                     ` jamal
  2004-09-28 18:24                                                                       ` David S. Miller
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-28 12:39 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, 2004-09-28 at 08:16, Herbert Xu wrote:
> On Tue, Sep 28, 2004 at 09:11:59PM +1000, herbert wrote:
> >
> > > BTW, Davem gets away with this congestion control alg all the time.
> > > Heck, I think his sanity survives because of it - I bet you he's got this
> > > thread under congestion control right now ;->
> > 
> > Yep, his overrun flag sure is set :)
> 
> Now if he puts his mailing admin hat on and tells us to shut up, that
> would be congestion control :)

He'd have to go and read all the emails first ;-> i.e. the overrun flag
requires that you reread state.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: On DaveMs congestion control algorithm WAS(Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 12:39                                                                     ` On DaveMs congestion control algorithm WAS(Re: " jamal
@ 2004-09-28 18:24                                                                       ` David S. Miller
  2004-09-29  2:23                                                                         ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: David S. Miller @ 2004-09-28 18:24 UTC (permalink / raw)
  To: hadi; +Cc: herbert, pablo, netdev

On 28 Sep 2004 08:39:49 -0400
jamal <hadi@cyberus.ca> wrote:

> On Tue, 2004-09-28 at 08:16, Herbert Xu wrote:
> > On Tue, Sep 28, 2004 at 09:11:59PM +1000, herbert wrote:
> > >
> > > > BTW, Davem gets away with this congestion control alg all the time.
> > > > Heck, I think his sanity survives because of it - I bet you he's got this
> > > > thread under congestion control right now ;->
> > > 
> > > Yep, his overrun flag sure is set :)
> > 
> > Now if he puts his mailing admin hat on and tells us to shut up, that
> > would be congestion control :)
> 
> He'd have to go and read all the emails first ;-> i.e. the overrun flag
> requires that you reread state.

When I see two guys going at it like you two, I just wait
for the dust to settle in the form of a patch and then I
reread the thread as if it were the latest drama show on TV
:-)

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 10:36                                                               ` jamal
  2004-09-28 11:11                                                                 ` Herbert Xu
@ 2004-09-28 21:07                                                                 ` Pablo Neira
  2004-09-28 23:19                                                                   ` Herbert Xu
  2004-09-29  2:28                                                                   ` jamal
  1 sibling, 2 replies; 96+ messages in thread
From: Pablo Neira @ 2004-09-28 21:07 UTC (permalink / raw)
  To: hadi; +Cc: Herbert Xu, David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 559 bytes --]

Hi,

If the buffer overruns we lose all the messages which are enqueued, so
before enqueueing the packet, we can check if there's space available in
the buffer. I think that this way we can at least save these messages.

I'm also thinking of doing something with netlink_ack's, since they can
be dropped if the buffer is full. We could give more priority to error messages
in some way: for example, check if there's space for an error message in
the buffer, and if there's not, drop as many packets in the buffer as needed to make
space to enqueue the error message.

regards,
Pablo

[-- Attachment #2: unicast-dont-overrun.patch --]
[-- Type: text/x-patch, Size: 446 bytes --]

===== net/netlink/af_netlink.c 1.58 vs edited =====
--- 1.58/net/netlink/af_netlink.c	Sat Sep 25 17:43:43 2004
+++ edited/net/netlink/af_netlink.c	Tue Sep 28 22:23:44 2004
@@ -475,7 +475,7 @@
 	if (nlk->handler)
 		return 0;
 #endif
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->len > sk->sk_rcvbuf ||
 	    test_bit(0, &nlk->state)) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!timeo) {

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 21:07                                                                 ` Pablo Neira
@ 2004-09-28 23:19                                                                   ` Herbert Xu
  2004-09-28 23:20                                                                     ` David S. Miller
  2004-09-29  2:28                                                                   ` jamal
  1 sibling, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-28 23:19 UTC (permalink / raw)
  To: Pablo Neira; +Cc: hadi, David S. Miller, netdev

On Tue, Sep 28, 2004 at 11:07:04PM +0200, Pablo Neira wrote:
> 
> If the buffer overruns we lose all the messages which are enqueued, so

This is not true.  You will get an ENOBUFS error but the packets
that have already been queued are still on the queue if you want
to read them.

> ===== net/netlink/af_netlink.c 1.58 vs edited =====
> --- 1.58/net/netlink/af_netlink.c	Sat Sep 25 17:43:43 2004
> +++ edited/net/netlink/af_netlink.c	Tue Sep 28 22:23:44 2004
> @@ -475,7 +475,7 @@
>  	if (nlk->handler)
>  		return 0;
>  #endif
> -	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
> +	if (atomic_read(&sk->sk_rmem_alloc) + skb->len > sk->sk_rcvbuf ||

I don't see the point of this patch.  All it does is make the overrun
happen one packet earlier.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 23:19                                                                   ` Herbert Xu
@ 2004-09-28 23:20                                                                     ` David S. Miller
  0 siblings, 0 replies; 96+ messages in thread
From: David S. Miller @ 2004-09-28 23:20 UTC (permalink / raw)
  To: Herbert Xu; +Cc: pablo, hadi, davem, netdev

On Wed, 29 Sep 2004 09:19:06 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> > ===== net/netlink/af_netlink.c 1.58 vs edited =====
> > --- 1.58/net/netlink/af_netlink.c	Sat Sep 25 17:43:43 2004
> > +++ edited/net/netlink/af_netlink.c	Tue Sep 28 22:23:44 2004
> > @@ -475,7 +475,7 @@
> >  	if (nlk->handler)
> >  		return 0;
> >  #endif
> > -	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
> > +	if (atomic_read(&sk->sk_rmem_alloc) + skb->len > sk->sk_rcvbuf ||
> 
> I don't see the point of this patch.  All it does is make the overrun
> happen one packet earlier.

That's my sentiment as well.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 12:32                                                                   ` jamal
@ 2004-09-29  0:13                                                                     ` Herbert Xu
  2004-09-29  2:52                                                                       ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-29  0:13 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, Sep 28, 2004 at 08:32:17AM -0400, jamal wrote:
> 
> Actually I just examined the events generated by the script; they are
> IFLA and ROUTE events, not IFA.
> So take a look at rtmsg_ifinfo().

You're right.  rtmsg_ifinfo() is using GOODSIZE unnecessarily.  I'll
write up a patch.

> Ah, but there's clearly a benefit in saving packets from crossing to user
> space, in particular in the case of overload. You do save on system
> resources for sure in that case.

Can you please elaborate on those benefits/resources?
 
> Agreed, it will postpone the problem, and not cure it.
> Where I saw the benefit is: if this queue is full/overloaded, then you
> don't bother transferring skbs to the sock receiveQ - instead you overrun
> the event listeners (on purpose) before giving them any data. This

If you do this when only one listener's queue is full, then doesn't this
mean that you penalise everyone just because one task's receive queue is
small?

> > In fact this has an advantage over the intermediate queue.  With the
> > latter, you need to hold the packet in place whether the applications
> > need it or not.  While currently, the application can choose whether
> > it wants to receive a large batch of events and if so how large.
> 
> Right, but you only find out after reading a subset of messages which cross
> into user space. Which is wasted cycles, really.

Can you please elaborate on "crossing into user space"? I don't think
I understand where these cycles are being wasted.

> Now if you could say from user space "please continue where you left
> off", the messages before the overrun won't be a waste. I do think that's not
> wise for events (you should be able to get a summary of the issue some other way,
> as with overruns at the moment) but it is definitely needed for large get
> results.

What do you mean by large get results? I thought we've agreed that
scenarios 1) and 2) (get and dump) cannot generate overruns unless
you're doing something silly like sharing your unicast socket with
multicast.  Please point me to a netlink get function that returns
an unbounded or unreasonably large skb.

> A large queue may actually be a problem if it also overflows, since
> it takes relatively longer to find out. You still have to read the damn
> state to find out the details.

Not quite.  Overrun errors are reported immediately.  What we don't
have is an easy way to flush out the unread messages from before the
overrun.  Well actually you could just close the socket and reopen it.

> It's actually worse than that --> which is a shame since we have more
> control over what can be sent to the user.
> Congestion could be driven by the receiver as well. Look at TCP zero windows,
> for example. Or even ECN.

Yes but the end result is that the sender stops sending.  I'm not aware
of any existing TCP mechanisms that result in packets being queued in a
router on the way.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: On DaveMs congestion control algorithm WAS(Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 18:24                                                                       ` David S. Miller
@ 2004-09-29  2:23                                                                         ` jamal
  0 siblings, 0 replies; 96+ messages in thread
From: jamal @ 2004-09-29  2:23 UTC (permalink / raw)
  To: David S. Miller; +Cc: herbert, pablo, netdev

On Tue, 2004-09-28 at 14:24, David S. Miller wrote:

>  I just wait
> for the dust to settle in the form of a patch and then I
> reread the thread as if it were the latest drama show on TV
> :-)

Ok, turn on your Tivo and go back to your regularly scheduled
programme ;-> Herbert is one determined dude, so the drama ain't
over yet ;->

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-28 21:07                                                                 ` Pablo Neira
  2004-09-28 23:19                                                                   ` Herbert Xu
@ 2004-09-29  2:28                                                                   ` jamal
  2004-09-29  2:30                                                                     ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-29  2:28 UTC (permalink / raw)
  To: Pablo Neira; +Cc: Herbert Xu, David S. Miller, netdev

On Tue, 2004-09-28 at 17:07, Pablo Neira wrote:

> I'm also thinking of doing something with netlink_ack's, since they can
> be dropped if the buffer is full. We could give more priority to error messages
> in some way: for example, check if there's space for an error message in
> the buffer, and if there's not, drop as many packets in the buffer as needed to make
> space to enqueue the error message.

Being able to prioritize control (errors and ACKs) would be valuable.
It would require mucking around with the socket queue.
Something along the lines of what we do for a basic default 3-band queue (probably
two-band in this case) should work.
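
Purely as an illustration of the simpler variant Pablo described (make room
for the control skb by dropping queued data), and not a tested patch - the
function name is made up and none of this has seen a compiler:

---
/* Untested sketch: when queueing an error/ACK skb to a netlink socket,
 * drop the oldest queued data skbs until the control message fits. */
#include <linux/skbuff.h>
#include <net/sock.h>

static void netlink_queue_control(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff *old;

	while (atomic_read(&sk->sk_rmem_alloc) + skb->truesize > sk->sk_rcvbuf &&
	       (old = skb_dequeue(&sk->sk_receive_queue)) != NULL)
		kfree_skb(old);		/* oldest data makes way for control */

	skb_set_owner_r(skb, sk);
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, skb->len);
}
---

A real two-band variant would instead keep control and data skbs on separate
lists and always drain control first; the sketch only shows the make-room idea.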

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29  2:28                                                                   ` jamal
@ 2004-09-29  2:30                                                                     ` Herbert Xu
  2004-09-29  2:59                                                                       ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-29  2:30 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, Sep 28, 2004 at 10:28:34PM -0400, jamal wrote:
> 
> Being able to prioritize control (errors and ACKs) would be valuable.
> It would require mucking around with the socket queue.
> Something along the lines of what we do for a basic default 3-band queue (probably
> two-band in this case) should work.

Obviously neither of you have taken my tip :)

You should never use your unicast socket to receive multicast messages.
Otherwise you get to keep both pieces when it breaks.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29  0:13                                                                     ` Herbert Xu
@ 2004-09-29  2:52                                                                       ` jamal
  2004-09-29  3:27                                                                         ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-29  2:52 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, 2004-09-28 at 20:13, Herbert Xu wrote:
> On Tue, Sep 28, 2004 at 08:32:17AM -0400, jamal wrote:
> > 
> > Actually I just examined the events generated by the script; they are
> > IFLA and ROUTE events, not IFA.
> > So take a look at rtmsg_ifinfo().
> 
> You're right.  rtmsg_ifinfo() is using GOODSIZE unnecessarily.  I'll
> write up a patch.

But why? ;->

> > Ah, but there's clearly a benefit in saving packets from crossing to user
> > space, in particular in the case of overload. You do save on system
> > resources for sure in that case.
> 
> Can you please elaborate on those benefits/resources?

Well, if you are gonna overrun the socket anyways, is there a point
in delivering all that incomplete info?
If you go ahead and deliver it anyways, you will be crossing
kernel->userspace. It's a worthy cause to do so.
 
> If you do this when only one listener's queue is full, then doesn't this
> mean that you penalise everyone just because one task's receive queue is
> small?

Yes, that's part of the issue.


> > Right, but you only find out after reading a subset of messages which cross
> > into user space. Which is wasted cycles, really.
> 
> Can you please elaborate on "crossing into user space"? I don't think
> I understand where these cycles are being wasted.

delivering messages which get obsoleted by an overrun from kernel to user
space uses up unnecessary cycles.

> > Now if you could say from user space "please continue where you left
> > off", the messages before the overrun won't be a waste. I do think that's not
> > wise for events (you should be able to get a summary of the issue some other way,
> > as with overruns at the moment) but it is definitely needed for large get
> > results.
> 
> What do you mean by large get results? I thought we've agreed that
> scenarios 1) and 2) (get and dump) cannot generate overruns unless
> you're doing something silly like sharing your unicast socket with
> multicast.  Please point me to a netlink get function that returns
> an unbounded or unreasonably large skb.

A dump may be harder given it keeps state. A get item which generates
a huge dataset (don't ask me to give you an example) is going to cause
overruns. Think multi-message formatted data.

> > A large queue may actually be a problem if it also overflows, since
> > it takes relatively longer to find out. You still have to read the damn
> > state to find out the details.
> 
> Not quite.  Overrun errors are reported immediately.  

Yes, except they get reported sooner (by virtue of the queue getting filled
sooner) if you have a 4K sock buffer vs 1M if you are registered for the
same events. I know it's a digression - just making a point.

> What we don't
> have is an easy way to flush out the unread messages from before the
> overrun.  Well actually you could just close the socket and reopen it.

I think you should be able to purge the queue from the kernel side. I
have a feeling we are talking about two different things.

> Yes but the end result is that the sender stops sending.  I'm not aware
> of any existing TCP mechanisms that result in packets being queued in a
> router on the way.

I think you are taking it too literally, Herbert ;->
The end goal is what counts, i.e. the cause of congestion is alleviated.
Intermediate buffering is a very well known/used trick to alleviate
congestion - especially in a local scenario. Of course such queues have
finite buffers - you just engineer it so the queue doesn't overflow and
head-of-line blocking is tolerable. If either of those concerns is not
addressed, shit will hit the fan.

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29  2:30                                                                     ` Herbert Xu
@ 2004-09-29  2:59                                                                       ` jamal
  0 siblings, 0 replies; 96+ messages in thread
From: jamal @ 2004-09-29  2:59 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, 2004-09-28 at 22:30, Herbert Xu wrote:
> On Tue, Sep 28, 2004 at 10:28:34PM -0400, jamal wrote:
> > 
> > Being able to prioritize control (errors and ACKs) would be valuable.
> > It would require mucking around with the socket queue.
> > Something along the lines of what we do for a basic default 3-band queue (probably
> > two-band in this case) should work.
> 
> Obviously neither of you have taken my tip :)
> 
> You should never use your unicast socket to receive multicast messages.
> Otherwise you get to keep both pieces when it breaks.

Ok, good point as long as it is common knowledge. 

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29  2:52                                                                       ` jamal
@ 2004-09-29  3:27                                                                         ` Herbert Xu
  2004-09-29  4:02                                                                           ` David S. Miller
  2004-09-29 10:50                                                                           ` jamal
  0 siblings, 2 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-29  3:27 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 3022 bytes --]

On Tue, Sep 28, 2004 at 10:52:05PM -0400, jamal wrote:
>
> > > You're right.  rtmsg_ifinfo() is using GOODSIZE unnecessarily.  I'll
> > > write up a patch.
> 
> But why? ;->

So that the alloc_skb() is slightly less likely to fail.  The dumpers
gain a lot by using GOODSIZE since they can fill it up.  As rtmsg_ifinfo
has no chance of getting anywhere near GOODSIZE, we should provide a
more accurate estimate.

It also means that with netlink_trim() you'll save a realloc/copy.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Dave, you can stop reading now :)
 
> Well, if you are gonna overrun the socket anyways, is there a point
> in delivering all that incomplete info?
>
> If you go ahead and deliver it anyways, you will be crossing
> kernel->userspace. It's a worthy cause to do so.

Hang on a second, if we're going to overrun anyway, then we *can't*
deliver it to user-space.  If we could deliver then we wouldn't be
having an overrun.

> > Can you please elaborate on "crossing into user space"? I don't think
> > I understand where these cycles are being wasted.
> 
> delivering messages which get obsoleted by an overrun from kernel to user
> space uses up unnecessary cycles.

You'll have to spell it out for me I'm afraid :)

If we're overrunning then we can't deliver the message at hand.  If you
are referring to the messages afterwards then the only way we can deliver
them is if the application lets us by clearing the queue.  If you are
referring to the messages that are already on the queue then we've done
the work already so why shouldn't they stay?

> A dump may be harder given it keeps state. A get item which generates
> a huge dataset (don't ask me to give you an example) is going to cause
> overruns. Think multi-message formatted data.

That's a completely different story.  For that problem I'd suggest that
we extend the dump paradigm to cover get as well.  However, to design the
interface, we need to look at potential users of this.  So please give
me an example :)

> > Not quite.  Overrun errors are reported immediately.  
> 
> Yes, except they get reported sooner (by virtue of the queue getting filled
> sooner) if you have a 4K sock buffer vs 1M if you are registered for the
> same events. I know it's a digression - just making a point.

The one with the 1M buffer may not overrun at all if it can process the
events fast enough.

> congestion - especially in a local scenario. Of course such queues have 
> finite buffers - you just engineer it so the queue doesn't overflow and
> head-of-line blocking is tolerable. If either of those concerns is not
> addressed, shit will hit the fan.

I don't see how you can engineer it so that it doesn't overflow.  In
the example that you gave with interface addresses, the number of messages
generated is practically unbounded.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 506 bytes --]

===== net/core/rtnetlink.c 1.28 vs edited =====
--- 1.28/net/core/rtnetlink.c	2004-09-22 12:16:43 +10:00
+++ edited/net/core/rtnetlink.c	2004-09-29 13:23:43 +10:00
@@ -412,7 +412,9 @@
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 {
 	struct sk_buff *skb;
-	int size = NLMSG_GOODSIZE;
+	int size = NLMSG_SPACE(sizeof(struct ifinfomsg) +
+			       sizeof(struct rtnl_link_ifmap) +
+			       sizeof(struct rtnl_link_stats) + 128);
 
 	skb = alloc_skb(size, GFP_KERNEL);
 	if (!skb)

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29  3:27                                                                         ` Herbert Xu
@ 2004-09-29  4:02                                                                           ` David S. Miller
  2004-09-29 10:50                                                                           ` jamal
  1 sibling, 0 replies; 96+ messages in thread
From: David S. Miller @ 2004-09-29  4:02 UTC (permalink / raw)
  To: Herbert Xu; +Cc: hadi, pablo, davem, netdev

On Wed, 29 Sep 2004 13:27:24 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Tue, Sep 28, 2004 at 10:52:05PM -0400, jamal wrote:
> >
> > > You're right.  rtmsg_ifinfo() is using GOODSIZE unnecessarily.  I'll
> > > write up a patch.
> > 
> > But why? ;->
> 
> So that the alloc_skb() is slightly less likely to fail.  The dumpers
> gain a lot by using GOODSIZE since they can fill it up.  As rtmsg_ifinfo
> has no chance of getting anywhere near GOODSIZE, we should provide a
> more accurate estimate.
> 
> It also means that with netlink_trim() you'll save a realloc/copy.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> Dave, you can stop reading now :)

Hehe, ok.  Patch applied :-)

Thanks.

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
@ 2004-09-29  9:53 James Chapman
  2004-09-29  9:59 ` Herbert Xu
  0 siblings, 1 reply; 96+ messages in thread
From: James Chapman @ 2004-09-29  9:53 UTC (permalink / raw)
  To: hadi, herbert; +Cc: pablo, davem, netdev

Re: async netlink messages

Is it possible to deliver async messages to userspace using a polling
mechanism to help solve the overrun problem?

- Have the kernel keep a list of async messages sent towards each
  socket rather than trying to deliver each message to the app
  immediately.  Have the kernel send a netlink event message (say,
  NETLINK_EVENT_WAKEUP) to the app when this queue first goes
  non-empty. No new NETLINK_EVENT_WAKEUP messages are sent until the
  event queue goes empty again.

- On receipt of NETLINK_EVENT_WAKEUP, a process issues a netlink
  GET (or DUMP?) on its netlink socket to read its queued events.

- If the socket event queue overruns, discard new events and flag the
  event queue as having lost messages. When the userspace app reads
  the event queue, it will discover that messages have been lost and
  can then re-read state from the kernel.

- Use a setsockopt on the netlink socket to have the socket configured
  in this polled-event mode.

Just a thought.

/james


-----Original Message-----
From: Herbert Xu [mailto:herbert@gondor.apana.org.au]
Sent: 28 September 2004 04:59
To: hadi@cyberus.ca
Cc: pablo@eurodev.net,  davem@davemloft.net, netdev@oss.sgi.com
Subject: Re: [PATCH] Improve behaviour of Netlink Sockets

On Mon, Sep 27, 2004 at 11:45:25PM -0400, jamal wrote:
>
> > Now that we know where the events are coming from and what they are,
> > we can decide on the solution.  In this particular case, there is
> > nothing you can do on the sending side.  Stopping people from operating
> > on networking objects just because some netlink listener can't keep up
> > isn't going to work.  So congestion control is out of the question.
>
> Fixing the NLMSG_GOODSIZE issue is a very good first step.

Well, I'm afraid that it doesn't help in your interface address example,
because rtmsg_ifa() already allocates a buffer of (approximately) the
right size.  That is, it doesn't use NLMSG_GOODSIZE at all.

> Adding congestion control would be harder but not out of question.

But the question is who are you going to throttle? If you throttle
the source of the messages then you're going to stop people from adding
or deleting IP addresses which can't be right.

If you move the netlink sender into a different execution context and
throttle that then that's just extending the receive queue length by
stealth.

So I'm afraid I don't see how congestion control could be applied in
*this* particular context.

> > So just bite the bullet and reread the system state by issuing dump
> > operations.
>
> We may as well choose to document it this way, mostly because of the
> issue I described above. We shouldn't give up so easily though ;->

Well IMHO this is not giving up at all.

Think of it another way.  Monitoring routes is like maintaining a
program.  Normally you just fix the bugs as they come.  But if the
bug reports are coming in so fast that you can't keep up, perhaps
it's time to throw it away and rewrite it from scratch :)
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt






----- End forwarded message -----

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29  9:53 James Chapman
@ 2004-09-29  9:59 ` Herbert Xu
  0 siblings, 0 replies; 96+ messages in thread
From: Herbert Xu @ 2004-09-29  9:59 UTC (permalink / raw)
  To: James Chapman; +Cc: hadi, pablo, davem, netdev

On Wed, Sep 29, 2004 at 10:53:43AM +0100, James Chapman wrote:
> 
> - Have the kernel keep a list of async messages sent towards each
>   socket rather than trying to deliver each message to the app
>   immediately.  Have the kernel send a netlink event message (say,
>   NETLINK_EVENT_WAKEUP) to the app when this queue first goes
>   non-empty. No new NETLINK_EVENT_WAKEUP messages are sent until the
>   event queue goes empty again.
> 
> - On receipt of NETLINK_EVENT_WAKEUP, a process issues a netlink
>   GET (or DUMP?) on its netlink socket to read its queued events.
> 
> - If the socket event queue overruns, discard new events and flag the
>   event queue as having lost messages. When the userspace app reads
>   the event queue, it will discover that messages have been lost and
>   can then re-read state from the kernel.
> 
> - Use a setsockopt on the netlink socket to have the socket configured
>   in this polled-event mode.

That's basically how it works now.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29  3:27                                                                         ` Herbert Xu
  2004-09-29  4:02                                                                           ` David S. Miller
@ 2004-09-29 10:50                                                                           ` jamal
  2004-09-29 11:42                                                                             ` Herbert Xu
  1 sibling, 1 reply; 96+ messages in thread
From: jamal @ 2004-09-29 10:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Tue, 2004-09-28 at 23:27, Herbert Xu wrote:
> On Tue, Sep 28, 2004 at 10:52:05PM -0400, jamal wrote:
> >
> > > You're right.  rtmsg_ifinfo() is using GOODSIZE unnecessarily.  I'll
> > > write up a patch.
> > 
> > But why? ;->
> 
> So that the alloc_skb() is slightly less likely to fail. 

[..]
OK, you fell into my trap. I just had to say that ;-> We are even.

> > Well, if you are gonna overrun the socket anyways, is there a point
> > in delivering all that incomplete info?
> >
> > If you go ahead and deliver it anyways, you will be crossing
> > kernel->userspace. Its a worthy cause to do so.
> 
> Hang on a second, if we're going to overrun anyway, then we *can't*
> deliver it to user-space.  If we could deliver then we wouldn't be
> having an overrun.

Assume you are delivering a huge atomic multi-message; let's say it
constitutes 50 pkts.
--> You deliver up to 15 and then overrun.
--> The 15 pkts are a waste, really. The user will have to poll
for the info anyway to be sure of all the details, i.e. you don't know
for sure what was lost if you get an overrun. In other words, the
transaction was not delivered to its completion.

> > > Can you please elaborate on "crossing into user space"? I don't think
> > > I understand where these cycles are being wasted.
> > 
> > Delivering messages which get obsoleted by an overrun from kernel to user
> > space uses up unnecessary cycles.
> 
> You'll have to spell it out for me I'm afraid :)

See above.

> If we're overrunning then we can't deliver the message at hand.  If you
> are referring to the messages afterwards then the only way we can deliver
> them is if the application lets us by clearing the queue.  If you are
> referring to the messages that are already on the queue then we've done
> the work already so why shouldn't they stay?

I was referring to the scenario above. To continue that example:
--> 15 pkts made it
--> some gap, then lost skbs (maybe due to allocation issues etc.)
--> If you recover and have the last 10 out of 50, then they are
obsolete. If you already did the work of collecting them, then reduce
the mess by not having the user copy them.
I don't think 10 are a big deal, but multiply by some factor and the cost
is clear to quantify.

[..]
> we extend the dump paradigm to cover get as well.  However, to design the
> interface, we need to look at potential users of this.  So please give
> me an example :)

Extending get to do the dump-like approach is not a bad idea.

I said don't ask me for an example ;-> You provide the mechanism, you be
ready for the consequences - people will use it. That's a good enough
reason ;->
If you insist however, here's one I can visualize that's a legit get from
a semantic view:
A get of a table entry which itself happens to be a table (and if you
want to be funnier, maybe a three-level-deep table). You do a get on a
table entry, you get a _whole table_ back.

> > > Not quite.  Overrun errors are reported immediately.  
> > 
> > Yes, except they get reported sooner (by virtue of the queue getting filled
> > sooner) if you have a 4K sock buffer vs 1M if you are registered for the
> > same events. I know it's a digression - just making a point.
> 
> The one with the 1M buffer may not overrun at all if it can process the
> events fast enough.

OK, I will agree to that, but the assumption in the example is that both will
be overrun by the same set of messages. To argue (for the sake of
arguing), say the 4K buffer user was faster ;-> Let's drop this bit.

> > congestion - especially in a local scenario. Of course such queues have
> > finite buffers - you just engineer it so the queue doesn't overflow and
> > head-of-line blocking is tolerable. If either of those concerns is not
> > addressed, shit will hit the fan.
> 
> I don't see how you can engineer it so that it doesn't overflow. 

Sorry, you can't. I meant you have to get the two to work together.


cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29 10:50                                                                           ` jamal
@ 2004-09-29 11:42                                                                             ` Herbert Xu
  2004-09-29 13:55                                                                               ` jamal
  0 siblings, 1 reply; 96+ messages in thread
From: Herbert Xu @ 2004-09-29 11:42 UTC (permalink / raw)
  To: jamal; +Cc: Pablo Neira, David S. Miller, netdev

On Wed, Sep 29, 2004 at 06:50:46AM -0400, jamal wrote:
>
> OK, you fell into my trap. I just had to say that ;-> We are even.

Fair enough :)
 
> Assume you are delivering a huge atomic multi-message; let's say it
> constitutes 50 pkts.
> --> You deliver up to 15 and then overrun.
> --> The 15 pkts are a waste, really. The user will have to poll
> for the info anyway to be sure of all the details, i.e. you don't know
> for sure what was lost if you get an overrun. In other words, the
> transaction was not delivered to its completion.

I presume that the ifdown script you posted fits this description?
I agree this is suboptimal.  However, I don't see how the intermediate
queue can help here.

Hmm, I suspect that there is something broken here that's causing the
overruns.  Let me have a play with your script/ip monitor and get back
to you.

> I don't think 10 are a big deal, but multiply by some factor and the cost
> is clear to quantify.

Agreed.  Perhaps what we need is a way to identify these messages as
related.  If you can do that then the kernel can forget about them
when an overrun occurs.

I still don't see this as a big deal though since the application can
always weather the storm by closing the socket after an overrun,
sleeping for a while, and then reopening it to listen again.
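
A minimal sketch of that recovery strategy, assuming the usual behaviour where
an overrun is reported to the listener as ENOBUFS on its next recv(); the
multicast groups and the rebuild_state_via_dump() helper are placeholders for
whatever the application actually monitors:

#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static void rebuild_state_via_dump(int fd)
{
	/* Placeholder: issue RTM_GETLINK / RTM_GETADDR dump requests here
	 * and rebuild the cached state from the replies. */
	(void)fd;
}

static int open_listener(void)
{
	struct sockaddr_nl addr = {
		.nl_family = AF_NETLINK,
		.nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR,	/* example groups */
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	bind(fd, (struct sockaddr *)&addr, sizeof(addr));	/* errors ignored in this sketch */
	return fd;
}

int main(void)
{
	char buf[8192];
	int fd = open_listener();

	for (;;) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);

		if (n < 0 && errno == ENOBUFS) {
			/* Overrun: drop the flooded socket, back off briefly,
			 * then re-subscribe and re-read full state. */
			close(fd);
			sleep(1);
			fd = open_listener();
			rebuild_state_via_dump(fd);
			continue;
		}
		if (n <= 0)
			break;
		/* ... walk the nlmsghdr chain in buf and update state ... */
	}
	close(fd);
	return 0;
}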

> If you insist however, here's one I can visualize that's a legit get from
> a semantic view:
> A get of a table entry which itself happens to be a table (and if you
> want to be funnier, maybe a three-level-deep table). You do a get on a
> table entry, you get a _whole table_ back.

That's easy :) Anything that's a table should use dump.  Remember that
dump can take parameters just like get.  So there's nothing wrong with
using dump to get an entry which happens to be a table.
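
For instance, a parameterised dump request might look like the following sketch
(illustrative only - error handling and reply parsing are omitted); the
rtm_family field plays the role of the "parameter" that narrows the dump, much
as it would narrow a get:

#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

void send_route_dump(int fd)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct rtmsg	rtm;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.rtm.rtm_family  = AF_INET;	/* the "parameter": dump IPv4 routes only */

	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
}

The reply is then an ordinary multi-part dump that the application reads back
with recv() until NLMSG_DONE.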

PS We've managed to cut down the size of our emails by half.  Shows that
congestion control is certainly working on this mailing list :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
  2004-09-29 11:42                                                                             ` Herbert Xu
@ 2004-09-29 13:55                                                                               ` jamal
  0 siblings, 0 replies; 96+ messages in thread
From: jamal @ 2004-09-29 13:55 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Pablo Neira, David S. Miller, netdev

On Wed, 2004-09-29 at 07:42, Herbert Xu wrote:

> I presume that the ifdown script you posted fits this description?

Probably not a good fit. A good fit (not sure how to create the race)
is to have (handwave to illustrate) 1000 netdevices getting downed. You
get notified of the first 50, an overrun happens, then you get the last 30 of
the last 50 and then another overrun. And somewhere in the second overrun you
lost the notification that netdevices 5-8 in fact came up.
A scheme which polls every X seconds from user space will catch this,
of course, but depending on how you do it the latency will vary.

> I agree this is suboptimal.  However, I don't see how the intermediate
> queue can help here.
> 

The intermediate queue was just brainstorming. It is based on something
that I tried with taps - it may not be the best approach, and in the end we
may just say we are better off leaving things as they are. The
intermediate queue is a classical approach to managing the mismatch between
speed and buffering when you have multiple senders or receivers (as in our case).
The idea behind it is that if I am going to overflow it with
1000 events, then I am pretty sure that none of the user space apps
can handle that load.

> Hmm, I suspect that there is something broken here that's causing the
> overruns.  Let me have a play with your script/ip monitor and get back
> to you.
> 
> > I dont think 10 are a big deal but multiply by some factor and the cost
> > is clear to quantify.
> 
> Agreed.  Perhaps what we need is a way to identify these messages as
> related.  If you can do that then the kernel can forget about them
> when an overrun occurs.

There's a lot of things that netlink needs to improve on, this being one.
I don't want to go over the laundry list, but it will need major surgery
and will break backward compatibility. I know you are a fireman and will
go fixing things - so I ain't telling you yet ;-> (more worried we'll put in
solutions that get us going as band-aids and postpone the surgery).
In all fairness, you could use the sequence number as a transaction
identifier, but it will require more management.
 
> I still don't see this as a big deal though since the application can
> always weather the storm by closing the socket after an overrun,
> sleeping for a while, and then reopening it to listen again.
> 
> > If you insist however, here's one I can visualize that's a legit get from
> > a semantic view:
> > A get of a table entry which itself happens to be a table (and if you
> > want to be funnier, maybe a three-level-deep table). You do a get on a
> > table entry, you get a _whole table_ back.
> 
> That's easy :) Anything that's a table should use dump.  

That's why I emphasized "legit get from a semantic view" ;->
Assuming you know the item you are getting is a table, yes indeed;
however, it does get tiring and is an abuse of human intelligence to say "dump
table X which is found in entry a of table B which is found in entry c
of table D which is found in entry e of table F which itself is found in
table G entry h .."
instead of just saying "get entry a of table X".

> Remember that
> dump can take parameters just like get.  So there's nothing wrong with
> using dump to get an entry which happens to be a table.

> PS We've managed to cut down the size of our emails by half.  Shows that
> congestion control is certainly working on this mailing list :)

Shall we include Brazilian coffee as a parameter in congestion
control? ;->

cheers,
jamal

^ permalink raw reply	[flat|nested] 96+ messages in thread

* Re: [PATCH] Improve behaviour of Netlink Sockets
@ 2004-09-30  9:50 James Chapman
  0 siblings, 0 replies; 96+ messages in thread
From: James Chapman @ 2004-09-30  9:50 UTC (permalink / raw)
  To: herbert; +Cc: hadi, pablo, davem, netdev

> On Wed, Sep 29, 2004 at 10:53:43AM +0100, James Chapman wrote:
> >
> > - Have the kernel keep a list of async messages sent towards each
> >   socket rather than trying to deliver each message to the app
> >   immediately.  Have the kernel send a netlink event message (say,
> >   NETLINK_EVENT_WAKEUP) to the app when this queue first goes
> >   non-empty. No new NETLINK_EVENT_WAKEUP messages are sent until the
> >   event queue goes empty again.
> >
> > - On receipt of NETLINK_EVENT_WAKEUP, a process issues a netlink
> >   GET (or DUMP?) on its netlink socket to read its queued events.
> >
> > - If the socket event queue overruns, discard new events and flag the
> >   event queue as having lost messages. When the userspace app reads
> >   the event queue, it will discover that messages have been lost and
> >   can then re-read state from the kernel.
> >
> > - Use a setsockopt on the netlink socket to have the socket configured
> >   in this polled-event mode.
>
> That's basically how it works now.

Looks like I was proposing something similar to Jamal's intermediate
queue. With the above scheme, there would be no need for userspace to
periodically poll for events since that would be kicked off by the
wakeup message. Also, while events are held in the kernel, they
wouldn't be charged to the netlink socket -- instead they'd be limited
by a system-wide count, sharing the event resource across all netlink
sockets.

/james

^ permalink raw reply	[flat|nested] 96+ messages in thread

end of thread, other threads:[~2004-09-30  9:50 UTC | newest]

Thread overview: 96+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <412DF807.2040703@eurodev.net>
     [not found] ` <20040826141407.38b56729.davem@redhat.com>
     [not found]   ` <412EB40A.6010100@eurodev.net>
     [not found]     ` <20040826214710.5e322f1a.davem@redhat.com>
     [not found]       ` <412F1269.8090303@eurodev.net>
     [not found]         ` <20040827172736.543dbd54.davem@redhat.com>
2004-08-30  0:37           ` [PATCH] Improve behaviour of Netlink Sockets Pablo Neira
2004-08-31  0:20             ` David S. Miller
2004-08-31 16:37               ` Pablo Neira
2004-08-31 20:16                 ` David S. Miller
2004-09-18 10:25         ` Herbert Xu
2004-09-19  4:36           ` Pablo Neira
2004-09-19  5:18             ` Pablo Neira
2004-09-19  7:58             ` Herbert Xu
2004-09-19 12:02               ` Herbert Xu
2004-09-19 12:07                 ` Herbert Xu
2004-09-19 20:50                   ` Pablo Neira
2004-09-19 21:53                     ` Herbert Xu
2004-09-19 22:49                       ` Pablo Neira
2004-09-19 23:44                         ` Herbert Xu
2004-09-20  0:31                           ` Pablo Neira
2004-09-20  1:00                             ` Herbert Xu
2004-09-19 20:50                 ` Pablo Neira
2004-09-19 21:59                   ` Herbert Xu
2004-09-19 22:39                     ` jamal
2004-09-19 22:55                       ` Pablo Neira
2004-09-19 23:04                         ` jamal
2004-09-19 23:10                           ` Herbert Xu
2004-09-19 23:17                       ` Herbert Xu
2004-09-20  2:39                         ` jamal
2004-09-20  2:58                           ` Herbert Xu
2004-09-20 12:34                             ` jamal
2004-09-20 18:14                               ` Pablo Neira
2004-09-20 21:59                                 ` Herbert Xu
2004-09-21 11:47                                   ` jamal
2004-09-21 12:09                                     ` Herbert Xu
2004-09-22  0:05                                 ` Herbert Xu
2004-09-22  0:24                                   ` Pablo Neira
2004-09-22  2:48                                   ` Pablo Neira
2004-09-22  2:50                                     ` David S. Miller
2004-09-22  2:53                                     ` jamal
2004-09-22  3:46                                       ` Herbert Xu
2004-09-22 11:35                                         ` jamal
2004-09-23 12:05                                           ` Herbert Xu
2004-09-24  2:56                                             ` jamal
2004-09-24  3:20                                               ` Herbert Xu
2004-09-27 12:41                                                 ` jamal
2004-09-22  4:52                                     ` Herbert Xu
2004-09-22 12:08                                       ` jamal
2004-09-22 17:52                                         ` David S. Miller
2004-09-23 15:40                                           ` Pablo Neira
2004-09-23 19:16                                             ` David S. Miller
2004-09-24  3:28                                               ` Herbert Xu
2004-09-24  5:39                                                 ` David S. Miller
2004-09-24  6:26                                                   ` Herbert Xu
2004-09-24 17:58                                                     ` David S. Miller
2004-09-24 22:06                                                       ` Herbert Xu
2004-09-24 22:28                                                         ` Herbert Xu
2004-09-27 12:59                                                         ` jamal
2004-09-27 12:53                                                 ` jamal
2004-09-23 12:07                                         ` Herbert Xu
2004-09-24  1:19                                           ` Pablo Neira
2004-09-24  3:04                                           ` jamal
2004-09-24  3:24                                             ` Herbert Xu
2004-09-27 12:46                                               ` jamal
2004-09-27 21:36                                                 ` Herbert Xu
2004-09-28  2:43                                                   ` jamal
2004-09-28  2:46                                                     ` Herbert Xu
2004-09-28  3:06                                                       ` jamal
2004-09-28  3:23                                                         ` Herbert Xu
2004-09-28  3:45                                                           ` jamal
2004-09-28  3:59                                                             ` Herbert Xu
2004-09-28 10:36                                                               ` jamal
2004-09-28 11:11                                                                 ` Herbert Xu
2004-09-28 12:16                                                                   ` Herbert Xu
2004-09-28 12:39                                                                     ` On DaveMs congestion control algorithm WAS(Re: " jamal
2004-09-28 18:24                                                                       ` David S. Miller
2004-09-29  2:23                                                                         ` jamal
2004-09-28 12:32                                                                   ` jamal
2004-09-29  0:13                                                                     ` Herbert Xu
2004-09-29  2:52                                                                       ` jamal
2004-09-29  3:27                                                                         ` Herbert Xu
2004-09-29  4:02                                                                           ` David S. Miller
2004-09-29 10:50                                                                           ` jamal
2004-09-29 11:42                                                                             ` Herbert Xu
2004-09-29 13:55                                                                               ` jamal
2004-09-28 21:07                                                                 ` Pablo Neira
2004-09-28 23:19                                                                   ` Herbert Xu
2004-09-28 23:20                                                                     ` David S. Miller
2004-09-29  2:28                                                                   ` jamal
2004-09-29  2:30                                                                     ` Herbert Xu
2004-09-29  2:59                                                                       ` jamal
2004-09-22  2:57                                   ` jamal
2004-09-22  3:39                                     ` Herbert Xu
2004-09-19 20:47               ` Pablo Neira
2004-09-19 21:20                 ` Herbert Xu
2004-09-19 22:14                   ` Pablo Neira
2004-09-19 23:31                     ` Herbert Xu
     [not found] ` <E1C90Da-0001V7-00@gondolin.me.apana.org.au>
2004-09-19 12:00   ` Herbert Xu
2004-09-29  9:53 James Chapman
2004-09-29  9:59 ` Herbert Xu
  -- strict thread matches above, loose matches on Subject: below --
2004-09-30  9:50 James Chapman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).