* [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely
@ 2016-05-09 14:50 Luiz Capitulino
  2016-05-12  8:42 ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 7+ messages in thread
From: Luiz Capitulino @ 2016-05-09 14:50 UTC (permalink / raw)
  To: linux-rt-users; +Cc: riel, bigeasy, tglx, srostedt, williams
lru_add_drain_all() works by scheduling lru_add_drain_cpu() to run
on all CPUs that have non-empty LRU pagevecs and then waiting for
the scheduled work to complete. However, workqueue threads may never
have the chance to run on a CPU that's running a SCHED_FIFO task.
This causes lru_add_drain_all() to block forever.
This commit solves this problem by changing lru_add_drain_all()
to drain the LRU pagevecs of remote CPUs. This is done by grabbing
swapvec_lock and calling lru_add_drain_cpu().
PS: This is based on an idea and initial implementation by
    Rik van Riel.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 include/linux/locallock.h | 31 +++++++++++++++++++++++++++++++
 mm/swap.c                 | 35 +++++++++++++++++++++++++----------
 2 files changed, 56 insertions(+), 10 deletions(-)
diff --git a/include/linux/locallock.h b/include/linux/locallock.h
index 6fe5928..f4cd691 100644
--- a/include/linux/locallock.h
+++ b/include/linux/locallock.h
@@ -104,6 +104,17 @@ static inline void __local_unlock(struct local_irq_lock *lv)
 		put_local_var(lvar);				\
 	} while (0)
 
+#define local_lock_other_cpu(lvar, cpu)                         \
+	do {                                                    \
+		__local_lock(&per_cpu(lvar, cpu));              \
+	} while (0)
+
+#define local_unlock_other_cpu(lvar, cpu)                       \
+	do {                                                    \
+		__local_unlock(&per_cpu(lvar, cpu));            \
+	} while (0)
+
+
 static inline void __local_lock_irq(struct local_irq_lock *lv)
 {
 	spin_lock_irqsave(&lv->lock, lv->flags);
@@ -163,6 +174,22 @@ static inline int __local_lock_irqsave(struct local_irq_lock *lv)
 		_flags = per_cpu(lvar, cpu).flags;			\
 	} while (0)
 
+#define local_lock_irqsave_other_cpu(lvar, _flags, cpu)			\
+	do {								\
+		if (cpu == smp_processor_id())				\
+			local_lock_irqsave(lvar, _flags);		\
+		else							\
+			local_lock_other_cpu(lvar, cpu);		\
+	} while (0)
+
+#define local_unlock_irqrestore_other_cpu(lvar, _flags, cpu)	        \
+	do {								\
+		if (cpu == smp_processor_id())				\
+			local_unlock_irqrestore(lvar, _flags);		\
+		else							\
+			local_unlock_other_cpu(lvar, cpu);		\
+	} while (0)
+
 static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
 					    unsigned long flags)
 {
@@ -250,6 +277,10 @@ static inline void local_irq_lock_init(int lvar) { }
 #define local_unlock_irq(lvar)			local_irq_enable()
 #define local_lock_irqsave(lvar, flags)		local_irq_save(flags)
 #define local_unlock_irqrestore(lvar, flags)	local_irq_restore(flags)
+#define local_lock_irqsave_other_cpu(lvar, flags, cpu) \
+	local_irq_save(flags)
+#define local_unlock_irqrestore_other_cpu(lvar, flags, cpu) \
+	local_irq_restore(flags)
 
 #define local_spin_trylock_irq(lvar, lock)	spin_trylock_irq(lock)
 #define local_spin_lock_irq(lvar, lock)		spin_lock_irq(lock)
diff --git a/mm/swap.c b/mm/swap.c
index ca194ae..84c3c21 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -821,9 +821,9 @@ void lru_add_drain_cpu(int cpu)
 		unsigned long flags;
 
 		/* No harm done if a racing interrupt already did this */
-		local_lock_irqsave(rotate_lock, flags);
+		local_lock_irqsave_other_cpu(rotate_lock, flags, cpu);
 		pagevec_move_tail(pvec);
-		local_unlock_irqrestore(rotate_lock, flags);
+		local_unlock_irqrestore_other_cpu(rotate_lock, flags, cpu);
 	}
 
 	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
@@ -866,12 +866,32 @@ void lru_add_drain(void)
 	local_unlock_cpu(swapvec_lock);
 }
 
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+	local_lock_other_cpu(swapvec_lock, cpu);
+	lru_add_drain_cpu(cpu);
+	local_unlock_other_cpu(swapvec_lock, cpu);
+}
+#else
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
 	lru_add_drain();
 }
 
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+		INIT_WORK(work, lru_add_drain_per_cpu);
+		schedule_work_on(cpu, work);
+		cpumask_set_cpu(cpu, has_work);
+
+}
+#endif
+
 
 void lru_add_drain_all(void)
 {
@@ -884,16 +904,11 @@ void lru_add_drain_all(void)
 	cpumask_clear(&has_work);
 
 	for_each_online_cpu(cpu) {
-		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
^ permalink raw reply related	[flat|nested] 7+ messages in thread
* Re: [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely
  2016-05-09 14:50 [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely Luiz Capitulino
@ 2016-05-12  8:42 ` Sebastian Andrzej Siewior
  2016-05-12  9:21   ` Sebastian Andrzej Siewior
  2016-05-12 13:51   ` Luiz Capitulino
  0 siblings, 2 replies; 7+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-05-12  8:42 UTC (permalink / raw)
  To: Luiz Capitulino; +Cc: linux-rt-users, riel, tglx, srostedt, williams
* Luiz Capitulino | 2016-05-09 10:50:37 [-0400]:
>diff --git a/include/linux/locallock.h b/include/linux/locallock.h
>index 6fe5928..f4cd691 100644
>--- a/include/linux/locallock.h
>+++ b/include/linux/locallock.h
>@@ -104,6 +104,17 @@ static inline void __local_unlock(struct local_irq_lock *lv)
> 		put_local_var(lvar);				\
> 	} while (0)
> 
>+#define local_lock_other_cpu(lvar, cpu)                         \
I would prefer to have local_lock_on() to be in sync with
local_lock_irq_on()
>+	do {                                                    \
>+		__local_lock(&per_cpu(lvar, cpu));              \
>+	} while (0)
>+
>+#define local_unlock_other_cpu(lvar, cpu)                       \
>+	do {                                                    \
>+		__local_unlock(&per_cpu(lvar, cpu));            \
>+	} while (0)
>+
>+
> static inline void __local_lock_irq(struct local_irq_lock *lv)
> {
> 	spin_lock_irqsave(&lv->lock, lv->flags);
>@@ -163,6 +174,22 @@ static inline int __local_lock_irqsave(struct local_irq_lock *lv)
> 		_flags = per_cpu(lvar, cpu).flags;			\
> 	} while (0)
> 
>+#define local_lock_irqsave_other_cpu(lvar, _flags, cpu)			\
why not local_lock_irq_on()? 
>+	do {								\
>+		if (cpu == smp_processor_id())				\
>+			local_lock_irqsave(lvar, _flags);		\
>+		else							\
>+			local_lock_other_cpu(lvar, cpu);		\
>+	} while (0)
>+
…
>diff --git a/mm/swap.c b/mm/swap.c
>index ca194ae..84c3c21 100644
>--- a/mm/swap.c
>+++ b/mm/swap.c
>@@ -821,9 +821,9 @@ void lru_add_drain_cpu(int cpu)
> 		unsigned long flags;
> 
> 		/* No harm done if a racing interrupt already did this */
>-		local_lock_irqsave(rotate_lock, flags);
>+		local_lock_irqsave_other_cpu(rotate_lock, flags, cpu);
> 		pagevec_move_tail(pvec);
>-		local_unlock_irqrestore(rotate_lock, flags);
>+		local_unlock_irqrestore_other_cpu(rotate_lock, flags, cpu);
This piece might be required independently of this patch. It would be
nice to have it in page_alloc_cpu_notify() :) I take care of this…
> 	}
> 
> 	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
>@@ -866,12 +866,32 @@ void lru_add_drain(void)
> 	local_unlock_cpu(swapvec_lock);
> }
> 
>+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
>+
>+#ifdef CONFIG_PREEMPT_RT_BASE
>+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
>+{
>+	local_lock_other_cpu(swapvec_lock, cpu);
>+	lru_add_drain_cpu(cpu);
>+	local_unlock_other_cpu(swapvec_lock, cpu);
>+}
>+#else
> static void lru_add_drain_per_cpu(struct work_struct *dummy)
> {
> 	lru_add_drain();
> }
> 
>-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
>+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
>+{
>+		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
>+
>+		INIT_WORK(work, lru_add_drain_per_cpu);
>+		schedule_work_on(cpu, work);
>+		cpumask_set_cpu(cpu, has_work);
>+
>+}
>+#endif
>+
So on RT you use the local locks to perform $work which should be done
on the remote CPU locally. RT's local locks help here. You can't do this
in the !RT case because we optimize the local locks away so you have to
stick with the schedule_work_on() way of dealing with it.
In your description you write
>lru_add_drain_all() works by scheduling lru_add_drain_cpu() to run
>on all CPUs that have non-empty LRU pagevecs and then waiting for
>the scheduled work to complete. However, workqueue threads may never
>have the chance to run on a CPU that's running a SCHED_FIFO task.
>This causes lru_add_drain_all() to block forever.
It might block forever but not necessarily. By default the scheduler
would push RT tasks off the CPU if they run for too long. So you have
disabled this (and I don't complain about it). And once this does not
happen, there is nothing to force the workqueue to run.
But this is not a RT-only problem, is it? You could run high prio RT
task on a CPU and the workqueue would never get on the CPU as well.
Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely
  2016-05-12  8:42 ` Sebastian Andrzej Siewior
@ 2016-05-12  9:21   ` Sebastian Andrzej Siewior
  2016-05-12 13:51   ` Luiz Capitulino
  1 sibling, 0 replies; 7+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-05-12  9:21 UTC (permalink / raw)
  To: Luiz Capitulino; +Cc: linux-rt-users, riel, tglx, srostedt, williams
* Sebastian Andrzej Siewior | 2016-05-12 10:42:30 [+0200]:
>>diff --git a/mm/swap.c b/mm/swap.c
>>index ca194ae..84c3c21 100644
>>--- a/mm/swap.c
>>+++ b/mm/swap.c
>>@@ -821,9 +821,9 @@ void lru_add_drain_cpu(int cpu)
>> 		unsigned long flags;
>> 
>> 		/* No harm done if a racing interrupt already did this */
>>-		local_lock_irqsave(rotate_lock, flags);
>>+		local_lock_irqsave_other_cpu(rotate_lock, flags, cpu);
>> 		pagevec_move_tail(pvec);
>>-		local_unlock_irqrestore(rotate_lock, flags);
>>+		local_unlock_irqrestore_other_cpu(rotate_lock, flags, cpu);
>
>This piece might be required independently of this patch. It would be
>nice to have it in page_alloc_cpu_notify() :) I take care of this…
no, I won't :)
It looks halfway like an upstream bug. Vanila has here local_irq_save()
which always matches the local CPU except when it is invoked from
page_alloc_cpu_notify() CPU_DEAD notifier. Here it does not matter
because the CPU is gone and won't take the locks anymore.
Unless, we start taking them cross-CPU which is only possible with this
patch.
Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely
  2016-05-12  8:42 ` Sebastian Andrzej Siewior
  2016-05-12  9:21   ` Sebastian Andrzej Siewior
@ 2016-05-12 13:51   ` Luiz Capitulino
  2016-05-12 14:49     ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 7+ messages in thread
From: Luiz Capitulino @ 2016-05-12 13:51 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior; +Cc: linux-rt-users, riel, tglx, srostedt, williams
On Thu, 12 May 2016 10:42:30 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> * Luiz Capitulino | 2016-05-09 10:50:37 [-0400]:
> 
> >diff --git a/include/linux/locallock.h b/include/linux/locallock.h
> >index 6fe5928..f4cd691 100644
> >--- a/include/linux/locallock.h
> >+++ b/include/linux/locallock.h
> >@@ -104,6 +104,17 @@ static inline void __local_unlock(struct local_irq_lock *lv)
> > 		put_local_var(lvar);				\
> > 	} while (0)
> > 
> >+#define local_lock_other_cpu(lvar, cpu)                         \  
> 
> I would prefer to have local_lock_on() to be in sync with
> local_lock_irq_on()
> 
> >+	do {                                                    \
> >+		__local_lock(&per_cpu(lvar, cpu));              \
> >+	} while (0)
> >+
> >+#define local_unlock_other_cpu(lvar, cpu)                       \
> >+	do {                                                    \
> >+		__local_unlock(&per_cpu(lvar, cpu));            \
> >+	} while (0)
> >+
> >+
> > static inline void __local_lock_irq(struct local_irq_lock *lv)
> > {
> > 	spin_lock_irqsave(&lv->lock, lv->flags);
> >@@ -163,6 +174,22 @@ static inline int __local_lock_irqsave(struct local_irq_lock *lv)
> > 		_flags = per_cpu(lvar, cpu).flags;			\
> > 	} while (0)
> > 
> >+#define local_lock_irqsave_other_cpu(lvar, _flags, cpu)			\  
> 
> why not local_lock_irq_on()? 
I can change both.
> >+	do {								\
> >+		if (cpu == smp_processor_id())				\
> >+			local_lock_irqsave(lvar, _flags);		\
> >+		else							\
> >+			local_lock_other_cpu(lvar, cpu);		\
> >+	} while (0)
> >+  
> …
> 
> >diff --git a/mm/swap.c b/mm/swap.c
> >index ca194ae..84c3c21 100644
> >--- a/mm/swap.c
> >+++ b/mm/swap.c
> >@@ -821,9 +821,9 @@ void lru_add_drain_cpu(int cpu)
> > 		unsigned long flags;
> > 
> > 		/* No harm done if a racing interrupt already did this */
> >-		local_lock_irqsave(rotate_lock, flags);
> >+		local_lock_irqsave_other_cpu(rotate_lock, flags, cpu);
> > 		pagevec_move_tail(pvec);
> >-		local_unlock_irqrestore(rotate_lock, flags);
> >+		local_unlock_irqrestore_other_cpu(rotate_lock, flags, cpu);  
> 
> This piece might be required independently of this patch. It would be
> nice to have it in page_alloc_cpu_notify() :) I take care of this…
> 
> > 	}
> > 
> > 	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
> >@@ -866,12 +866,32 @@ void lru_add_drain(void)
> > 	local_unlock_cpu(swapvec_lock);
> > }
> > 
> >+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
> >+
> >+#ifdef CONFIG_PREEMPT_RT_BASE
> >+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
> >+{
> >+	local_lock_other_cpu(swapvec_lock, cpu);
> >+	lru_add_drain_cpu(cpu);
> >+	local_unlock_other_cpu(swapvec_lock, cpu);
> >+}
> >+#else
> > static void lru_add_drain_per_cpu(struct work_struct *dummy)
> > {
> > 	lru_add_drain();
> > }
> > 
> >-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
> >+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
> >+{
> >+		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
> >+
> >+		INIT_WORK(work, lru_add_drain_per_cpu);
> >+		schedule_work_on(cpu, work);
> >+		cpumask_set_cpu(cpu, has_work);
> >+
> >+}
> >+#endif
> >+  
> 
> So on RT you use the local locks to perform $work which should be done
> on the remote CPU locally. RT's local locks help here. You can't do this
> in the !RT case because we optimize the local locks away so you have to
> stick with the schedule_work_on() way of dealing with it.
Exactly.
> In your description you write
> >lru_add_drain_all() works by scheduling lru_add_drain_cpu() to run
> >on all CPUs that have non-empty LRU pagevecs and then waiting for
> >the scheduled work to complete. However, workqueue threads may never
> >have the chance to run on a CPU that's running a SCHED_FIFO task.
> >This causes lru_add_drain_all() to block forever.  
> 
> It might block forever but not necessarily. By default the scheduler
> would push RT tasks off the CPU if they run for too long. So you have
> disabled this (and I don't complain about it). And once this does not
> happen, there is nothing to force the workqueue to run.
> But this is not a RT-only problem, is it? You could run high prio RT
> task on a CPU and the workqueue would never get on the CPU as well.
That's correct. This is an -RT patch for two reasons:
 1. This solution builds on top of the per-cpu locks API implemented
    by -RT and already in place in the pagevecs drain code
 2. As SCHED_FIFO is the norm in -RT, the problem is more common there
--
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely
  2016-05-12 13:51   ` Luiz Capitulino
@ 2016-05-12 14:49     ` Sebastian Andrzej Siewior
  2016-05-12 19:52       ` Luiz Capitulino
  0 siblings, 1 reply; 7+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-05-12 14:49 UTC (permalink / raw)
  To: Luiz Capitulino; +Cc: linux-rt-users, riel, tglx, srostedt, williams
* Luiz Capitulino | 2016-05-12 09:51:49 [-0400]:
>That's correct. This is an -RT patch for two reasons:
>
> 1. This solution builds on top of the per-cpu locks API implemented
>    by -RT and already in place in the pagevecs drain code
>
> 2. As SCHED_FIFO is the norm in -RT, the problem is more common there
more common so not RT-only. Would you people prefer this for mainline or
would you rather something else in case this needs to be fixed
mainline, too?
Sebastian
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely
  2016-05-12 14:49     ` Sebastian Andrzej Siewior
@ 2016-05-12 19:52       ` Luiz Capitulino
  2016-05-25 15:59         ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 7+ messages in thread
From: Luiz Capitulino @ 2016-05-12 19:52 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior; +Cc: linux-rt-users, riel, tglx, srostedt, williams
On Thu, 12 May 2016 16:49:43 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> * Luiz Capitulino | 2016-05-12 09:51:49 [-0400]:
> 
> >That's correct. This is an -RT patch for two reasons:
> >
> > 1. This solution builds on top of the per-cpu locks API implemented
> >    by -RT and already in place in the pagevecs drain code
> >
> > 2. As SCHED_FIFO is the norm in -RT, the problem is more common there  
> 
> more common so not RT-only. Would you people prefer this for mainline or
> would you rather something else in case this needs to be fixed
> mainline, too?
To have this for mainline we'd have to forward-port local_locks.
It's doable, but I think it's a bit too much only to solve this
problem.
The alternative, which is what I had in mind, was to forward-port
this change to mainline when local_locks gets merged upstream. But
I guess this will only happen when sleeping spinlocks are merged?
^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely
  2016-05-12 19:52       ` Luiz Capitulino
@ 2016-05-25 15:59         ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 7+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-05-25 15:59 UTC (permalink / raw)
  To: Luiz Capitulino; +Cc: linux-rt-users, riel, tglx, srostedt, williams
* Luiz Capitulino | 2016-05-12 15:52:02 [-0400]:
>The alternative, which is what I had in mind, was to forward-port
>this change to mainline when local_locks gets merged upstream. But
>I guess this will only happen when sleeping spinlocks are merged?
Yeah, I am not sure about the exact path. Sleeping locks are not
required for this. Let me worry about this later. In the meantime I pick
up this (your v2) in -RT.
Sebastian
^ permalink raw reply	[flat|nested] 7+ messages in thread
end of thread, other threads:[~2016-05-25 15:59 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-05-09 14:50 [PATCH RESEND -rt] mm: perform lru_add_drain_all() remotely Luiz Capitulino
2016-05-12  8:42 ` Sebastian Andrzej Siewior
2016-05-12  9:21   ` Sebastian Andrzej Siewior
2016-05-12 13:51   ` Luiz Capitulino
2016-05-12 14:49     ` Sebastian Andrzej Siewior
2016-05-12 19:52       ` Luiz Capitulino
2016-05-25 15:59         ` Sebastian Andrzej Siewior
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).