All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <dada1@cosmosbay.com>
To: Brian Bloniarz <bmb@athenacr.com>
Cc: kchang@athenacr.com, netdev@vger.kernel.org,
	"David S. Miller" <davem@davemloft.net>
Subject: Re: Multicast packet loss
Date: Wed, 11 Mar 2009 04:00:38 +0100	[thread overview]
Message-ID: <49B72956.9050504@cosmosbay.com> (raw)
In-Reply-To: <49B6F645.2070408@athenacr.com>

Brian Bloniarz a écrit :
> Hi Eric,
> 
> FYI: with your patch applied and lockdep enabled, I see:
> [   39.114628] ================================================
> [   39.121964] [ BUG: lock held when returning to user space! ]
> [   39.127704] ------------------------------------------------
> [   39.133461] msgtest/5242 is leaving the kernel with locks still held!
> [   39.140132] 1 lock held by msgtest/5242:
> [   39.144287]  #0:  (clock-AF_INET){-.-?}, at: [<ffffffff8041f5b9>]
> sock_def_readable+0x19/0xb0

And you told me you were not a kernel hacker ;)

> 
> I can't reproduced this with the mcasttest program yet, it
> was with an internal test program which does some userspace
> processing on the messages. I'll let you know if I find a way
> to reproduce it with a simple program I can share.

I reproduced it as well here quite easily with a tcpdump of a tcp session,
thanks for the report.

It seems  "if (in_softirq())" doesnt do what I thought.

I wanted to test if we were called from __do_softirq() handler,
since only this function is calling softirq_delay_exec()
to dequeue events.

It appears I have to make current->softirq_context available
even if !CONFIG_TRACE_IRQFLAGS

Here is an updated version of the patch.

I also made the call to softirq_delay_exec() is performed
with interrupts enabled, and that preempt count wont
overflow if many events are queued.

[PATCH] softirq: Introduce mechanism to defer wakeups

Some network workloads need to call scheduler too many times. For example,
each received multicast frame can wakeup many threads. ksoftirqd is then
not able to drain NIC RX queues and we get frame losses and high latencies.

This patch adds an infrastructure to delay part of work done in
sock_def_readable() at end of do_softirq(). This need to
make available current->softirq_context even if !CONFIG_TRACE_IRQFLAGS


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/interrupt.h |   18 +++++++++++++++++
 include/linux/irqflags.h  |    7 ++----
 include/linux/sched.h     |    2 -
 include/net/sock.h        |    1
 kernel/softirq.c          |   34 +++++++++++++++++++++++++++++++++
 net/core/sock.c           |   37 ++++++++++++++++++++++++++++++++++--
 6 files changed, 92 insertions(+), 7 deletions(-)


diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b..a773d0c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -295,6 +295,24 @@ extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softir
 extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
 				  int this_cpu, int softirq);
 
+/*
+ * softirq delayed works : should be delayed at do_softirq() end
+ */
+struct softirq_delay {
+	struct softirq_delay	*next;
+	void 			(*func)(struct softirq_delay *);
+};
+
+int softirq_delay_queue(struct softirq_delay *sdel);
+
+static inline void softirq_delay_init(struct softirq_delay *sdel,
+				      void (*func)(struct softirq_delay *))
+{
+	sdel->next = NULL;
+	sdel->func = func;
+}
+
+
 /* Tasklets --- multithreaded analogue of BHs.
 
    Main feature differing them of generic softirqs: tasklet
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 74bde13..fe55ec4 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -13,6 +13,9 @@
 
 #include <linux/typecheck.h>
 
+#define trace_softirq_enter()	do { current->softirq_context++; } while (0)
+#define trace_softirq_exit()	do { current->softirq_context--; } while (0)
+
 #ifdef CONFIG_TRACE_IRQFLAGS
   extern void trace_softirqs_on(unsigned long ip);
   extern void trace_softirqs_off(unsigned long ip);
@@ -24,8 +27,6 @@
 # define trace_softirqs_enabled(p)	((p)->softirqs_enabled)
 # define trace_hardirq_enter()	do { current->hardirq_context++; } while (0)
 # define trace_hardirq_exit()	do { current->hardirq_context--; } while (0)
-# define trace_softirq_enter()	do { current->softirq_context++; } while (0)
-# define trace_softirq_exit()	do { current->softirq_context--; } while (0)
 # define INIT_TRACE_IRQFLAGS	.softirqs_enabled = 1,
 #else
 # define trace_hardirqs_on()		do { } while (0)
@@ -38,8 +39,6 @@
 # define trace_softirqs_enabled(p)	0
 # define trace_hardirq_enter()		do { } while (0)
 # define trace_hardirq_exit()		do { } while (0)
-# define trace_softirq_enter()		do { } while (0)
-# define trace_softirq_exit()		do { } while (0)
 # define INIT_TRACE_IRQFLAGS
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..5dd8487 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1320,8 +1320,8 @@ struct task_struct {
 	unsigned long softirq_enable_ip;
 	unsigned int softirq_enable_event;
 	int hardirq_context;
-	int softirq_context;
 #endif
+	int softirq_context;
 #ifdef CONFIG_LOCKDEP
 # define MAX_LOCK_DEPTH 48UL
 	u64 curr_chain_key;
diff --git a/include/net/sock.h b/include/net/sock.h
index eefeeaf..1bfd9b8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -260,6 +260,7 @@ struct sock {
 	unsigned long	        sk_lingertime;
 	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
+	struct softirq_delay	sk_delay;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
 				sk_err_soft;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9041ea7..c601730 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -158,6 +158,38 @@ void local_bh_enable_ip(unsigned long ip)
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
 
+
+#define SOFTIRQ_DELAY_END (struct softirq_delay *)1L
+
+static DEFINE_PER_CPU(struct softirq_delay *, softirq_delay_head) = {
+	SOFTIRQ_DELAY_END
+};
+
+/*
+ * Preemption is disabled by caller
+ */
+int softirq_delay_queue(struct softirq_delay *sdel)
+{
+	if (cmpxchg(&sdel->next, NULL, __get_cpu_var(softirq_delay_head)) == NULL) {
+		__get_cpu_var(softirq_delay_head) = sdel;
+		return 1;
+	}
+	return 0;
+}
+
+static void softirq_delay_exec(void)
+{
+	struct softirq_delay *sdel;
+
+	while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
+		__get_cpu_var(softirq_delay_head) = sdel->next;
+		sdel->next = NULL;
+		sdel->func(sdel);
+		}
+}
+
+
+
 /*
  * We restart softirq processing MAX_SOFTIRQ_RESTART times,
  * and we fall back to softirqd after that.
@@ -211,6 +243,8 @@ restart:
 		pending >>= 1;
 	} while (pending);
 
+	softirq_delay_exec();
+
 	local_irq_disable();
 
 	pending = local_softirq_pending();
diff --git a/net/core/sock.c b/net/core/sock.c
index 5f97caa..d51d57d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -212,6 +212,8 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 /* Maximal space eaten by iovec or ancilliary data plus some space */
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 
+static void sock_readable_defer(struct softirq_delay *sdel);
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
 	struct timeval tv;
@@ -1026,6 +1028,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 #endif
 
 		rwlock_init(&newsk->sk_dst_lock);
+		softirq_delay_init(&newsk->sk_delay, sock_readable_defer);
 		rwlock_init(&newsk->sk_callback_lock);
 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
 				af_callback_keys + newsk->sk_family,
@@ -1634,12 +1637,41 @@ static void sock_def_error_report(struct sock *sk)
 	read_unlock(&sk->sk_callback_lock);
 }
 
+static void sock_readable_defer(struct softirq_delay *sdel)
+{
+	struct sock *sk = container_of(sdel, struct sock, sk_delay);
+
+	wake_up_interruptible_sync(sk->sk_sleep);
+	/*
+	 * Before unlocking, we increase preempt_count,
+	 * as it was decreased in sock_def_readable()
+	 */
+	preempt_disable();
+	read_unlock(&sk->sk_callback_lock);
+}
+
 static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_sync(sk->sk_sleep);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
+		if (current->softirq_context) {
+			/*
+			 * If called from __do_softirq(), we want to delay
+			 * calls to wake_up_interruptible_sync()
+			 */
+			if (!softirq_delay_queue(&sk->sk_delay))
+				goto unlock;
+			/*
+			 * We keep sk->sk_callback_lock read locked,
+			 * but decrease preempt_count to avoid an overflow
+			 */
+			preempt_enable_no_resched();
+			return;
+		}
+		wake_up_interruptible_sync(sk->sk_sleep);
+	}
+unlock:
 	read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1720,6 +1752,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 		sk->sk_sleep	=	NULL;
 
 	rwlock_init(&sk->sk_dst_lock);
+	softirq_delay_init(&sk->sk_delay, sock_readable_defer);
 	rwlock_init(&sk->sk_callback_lock);
 	lockdep_set_class_and_name(&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,


  reply	other threads:[~2009-03-11  3:00 UTC|newest]

Thread overview: 70+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-01-30 17:49 Multicast packet loss Kenny Chang
2009-01-30 19:04 ` Eric Dumazet
2009-01-30 19:17 ` Denys Fedoryschenko
2009-01-30 20:03 ` Neil Horman
2009-01-30 22:29   ` Kenny Chang
2009-01-30 22:41     ` Eric Dumazet
2009-01-31 16:03       ` Neil Horman
2009-02-02 16:13         ` Kenny Chang
2009-02-02 16:48         ` Kenny Chang
2009-02-03 11:55           ` Neil Horman
2009-02-03 15:20             ` Kenny Chang
2009-02-04  1:15               ` Neil Horman
2009-02-04 16:07                 ` Kenny Chang
2009-02-04 16:46                   ` Wesley Chow
2009-02-04 18:11                     ` Eric Dumazet
2009-02-05 13:33                       ` Neil Horman
2009-02-05 13:46                         ` Wesley Chow
2009-02-05 13:29                   ` Neil Horman
2009-02-01 12:40       ` Eric Dumazet
2009-02-02 13:45         ` Neil Horman
2009-02-02 16:57           ` Eric Dumazet
2009-02-02 18:22             ` Neil Horman
2009-02-02 19:51               ` Wes Chow
2009-02-02 20:29                 ` Eric Dumazet
2009-02-02 21:09                   ` Wes Chow
2009-02-02 21:31                     ` Eric Dumazet
2009-02-03 17:34                       ` Kenny Chang
2009-02-04  1:21                         ` Neil Horman
2009-02-26 17:15                           ` Kenny Chang
2009-02-28  8:51                             ` Eric Dumazet
2009-03-01 17:03                               ` Eric Dumazet
2009-03-04  8:16                               ` David Miller
2009-03-04  8:36                                 ` Eric Dumazet
2009-03-07  7:46                                   ` Eric Dumazet
2009-03-08 16:46                                     ` Eric Dumazet
2009-03-09  2:49                                       ` David Miller
2009-03-09  6:36                                         ` Eric Dumazet
2009-03-13 21:51                                           ` David Miller
2009-03-13 22:30                                             ` Eric Dumazet
2009-03-13 22:38                                               ` David Miller
2009-03-13 22:45                                                 ` Eric Dumazet
2009-03-14  9:03                                                   ` [PATCH] net: reorder fields of struct socket Eric Dumazet
2009-03-16  2:59                                                     ` David Miller
2009-03-16 22:22                                                 ` Multicast packet loss Eric Dumazet
2009-03-17 10:11                                                   ` Peter Zijlstra
2009-03-17 11:08                                                     ` Eric Dumazet
2009-03-17 11:57                                                       ` Peter Zijlstra
2009-03-17 15:00                                                       ` Brian Bloniarz
2009-03-17 15:16                                                         ` Eric Dumazet
2009-03-17 19:39                                                           ` David Stevens
2009-03-17 21:19                                                             ` Eric Dumazet
2009-04-03 19:28                                                   ` Brian Bloniarz
2009-04-05 13:49                                                     ` Eric Dumazet
2009-04-06 21:53                                                       ` Brian Bloniarz
2009-04-06 22:12                                                         ` Brian Bloniarz
2009-04-07 20:08                                                       ` Brian Bloniarz
2009-04-08  8:12                                                         ` Eric Dumazet
2009-03-09 22:56                                       ` Brian Bloniarz
2009-03-10  5:28                                         ` Eric Dumazet
2009-03-10 23:22                                           ` Brian Bloniarz
2009-03-11  3:00                                             ` Eric Dumazet [this message]
2009-03-12 15:47                                               ` Brian Bloniarz
2009-03-12 16:34                                                 ` Eric Dumazet
2009-02-27 18:40       ` Christoph Lameter
2009-02-27 18:56         ` Eric Dumazet
2009-02-27 19:45           ` Christoph Lameter
2009-02-27 20:12             ` Eric Dumazet
2009-02-27 21:36               ` Eric Dumazet
2009-02-02 13:53     ` Eric Dumazet
  -- strict thread matches above, loose matches on Subject: below --
2009-04-05 14:42 bmb

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=49B72956.9050504@cosmosbay.com \
    --to=dada1@cosmosbay.com \
    --cc=bmb@athenacr.com \
    --cc=davem@davemloft.net \
    --cc=kchang@athenacr.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.