Re: slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Max Krasnyansky <maxk@qualcomm.com>
To: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Christoph Lameter <clameter@sgi.com>, Ingo Molnar <mingo@elte.hu>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	"Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>,
	Gautham shenoy <ego@in.ibm.com>, Andrew Morton <akpm@osdl.org>,
	linux-kernel@vger.kernel.org
Subject: Re: slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems
Date: Tue, 20 Feb 2007 10:39:08 -0800	[thread overview]
Message-ID: <45DB404C.4070305@qualcomm.com> (raw)
In-Reply-To: <20070129182742.GA158@tv-sign.ru>

Folks,

Oleg Nesterov wrote:
>>> Even if smp_processor_id() was stable during the execution of cache_reap(),
>>> this work_struct can be moved to another CPU if CPU_DEAD happens. We can't
>>> avoid this, and this is correct.
>> Uhh.... This may not be correct in terms of how the slab operates.
> 
> But this is practically impossible to avoid. We can't delay CPU_DOWN until all
> workqueues flush their cwq->worklist. This is livelockable, the work can re-queue
> itself, and new works can be added since the dying CPU is still on cpu_online_map.
> This means that some pending works will be processed on another CPU.
> 
> delayed_work is even worse, the timer can migrate as well.
> 
> The first problem (smp_processor_id() is not stable) could be solved if we
> use freezer or with the help of not-yet-implemented scalable lock_cpu_hotplug.
> 
>>> This means that __get_cpu_var(reap_work) returns a "wrong" struct delayed_work.
>>> This is absolutely harmless right now, but may be it's better to use
>>> container_of(unused, struct delayed_work, work).
>> Well seems that we have a set of unresolved issues with workqueues and cpu 
>> hotplug.

How about storing 'cpu' explicitly in the work item instead of relying on
smp_processor_id() and friends? That way there is no ambiguity when threads/timers get
moved around.
I'm cooking a set of patches to extend the cpu isolation concept a bit. With those, I'd
like one CPU to run the cache_reap timer on behalf of another CPU. See the patch below.

diff --git a/mm/slab.c b/mm/slab.c
index c610062..0f46d11 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -766,7 +766,17 @@ int slab_is_available(void)
  	return g_cpucache_up == FULL;
  }

-static DEFINE_PER_CPU(struct delayed_work, reap_work);
+struct slab_work {
+	struct delayed_work dw;
+	unsigned int       cpu;
+};
+
+static DEFINE_PER_CPU(struct slab_work, reap_work);
+
+static inline struct array_cache *cpu_cache_get_on(struct kmem_cache *cachep, unsigned int cpu)
+{
+	return cachep->array[cpu];
+}

  static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
  {
@@ -915,9 +925,9 @@ static void init_reap_node(int cpu)
  	per_cpu(reap_node, cpu) = node;
  }

-static void next_reap_node(void)
+static void next_reap_node(unsigned int cpu)
  {
-	int node = __get_cpu_var(reap_node);
+	int node = per_cpu(reap_node, cpu);

  	/*
  	 * Also drain per cpu pages on remote zones
@@ -928,12 +938,12 @@ static void next_reap_node(void)
  	node = next_node(node, node_online_map);
  	if (unlikely(node >= MAX_NUMNODES))
  		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
+	per_cpu(reap_node, cpu) = node;
  }

  #else
  #define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
+#define next_reap_node(cpu) do { } while (0)
  #endif

  /*
@@ -945,17 +955,18 @@ static void next_reap_node(void)
   */
  static void __devinit start_cpu_timer(int cpu)
  {
-	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+	struct slab_work *reap_work = &per_cpu(reap_work, cpu);

  	/*
  	 * When this gets called from do_initcalls via cpucache_init(),
  	 * init_workqueues() has already run, so keventd will be setup
  	 * at that time.
  	 */
-	if (keventd_up() && reap_work->work.func == NULL) {
+	if (keventd_up() && reap_work->dw.work.func == NULL) {
  		init_reap_node(cpu);
-		INIT_DELAYED_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work,
+		INIT_DELAYED_WORK(&reap_work->dw, cache_reap);
+		reap_work->cpu = cpu;
+		schedule_delayed_work_on(cpu, &reap_work->dw,
  					__round_jiffies_relative(HZ, cpu));
  	}
  }
@@ -1004,7 +1015,7 @@ static int transfer_objects(struct array_cache *to,
  #ifndef CONFIG_NUMA

  #define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
+#define reap_alien(cachep, l3, cpu) do { } while (0)

  static inline struct array_cache **alloc_alien_cache(int node, int limit)
  {
@@ -1099,9 +1110,9 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
  /*
   * Called from cache_reap() to regularly drain alien caches round robin.
   */
-static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, unsigned int cpu)
  {
-	int node = __get_cpu_var(reap_node);
+	int node = per_cpu(reap_node, cpu);

  	if (l3->alien) {
  		struct array_cache *ac = l3->alien[node];
@@ -4017,16 +4028,17 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
   * If we cannot acquire the cache chain mutex then just give up - we'll try
   * again on the next iteration.
   */
-static void cache_reap(struct work_struct *unused)
+static void cache_reap(struct work_struct *_work)
  {
  	struct kmem_cache *searchp;
  	struct kmem_list3 *l3;
  	int node = numa_node_id();

+	struct slab_work *work = (struct slab_work *) _work;
+
  	if (!mutex_trylock(&cache_chain_mutex)) {
  		/* Give up. Setup the next iteration. */
-		schedule_delayed_work(&__get_cpu_var(reap_work),
-				      round_jiffies_relative(REAPTIMEOUT_CPUC));
+		schedule_delayed_work(&work->dw, round_jiffies_relative(REAPTIMEOUT_CPUC));
  		return;
  	}

@@ -4040,9 +4052,9 @@ static void cache_reap(struct work_struct *unused)
  		 */
  		l3 = searchp->nodelists[node];

-		reap_alien(searchp, l3);
+		reap_alien(searchp, l3, work->cpu);

-		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
+		drain_array(searchp, l3, cpu_cache_get_on(searchp, work->cpu), 0, node);

  		/*
  		 * These are racy checks but it does not matter
@@ -4069,11 +4081,11 @@ next:
  	}
  	check_irq_on();
  	mutex_unlock(&cache_chain_mutex);
-	next_reap_node();
-	refresh_cpu_vm_stats(smp_processor_id());
+	next_reap_node(work->cpu);
+	refresh_cpu_vm_stats(work->cpu);
+
  	/* Set up the next iteration */
-	schedule_delayed_work(&__get_cpu_var(reap_work),
-		round_jiffies_relative(REAPTIMEOUT_CPUC));
+	schedule_delayed_work(&work->dw, round_jiffies_relative(REAPTIMEOUT_CPUC));
  }

  #ifdef CONFIG_PROC_FS


Max

next prev parent reply	other threads:[~2007-02-20 19:06 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-29  1:13 slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems Oleg Nesterov
2007-01-29 16:54 ` Christoph Lameter
2007-01-29 17:19   ` Oleg Nesterov
2007-01-29 17:27     ` Christoph Lameter
2007-01-29 18:27       ` Oleg Nesterov
2007-01-29 19:09         ` Christoph Lameter
2007-01-29 19:29           ` Oleg Nesterov
2007-01-29 19:25         ` Christoph Lameter
2007-01-29 19:49           ` Oleg Nesterov
2007-01-29 20:29             ` Christoph Lameter
2007-01-29 21:05               ` Oleg Nesterov
2007-01-29 21:48                 ` Christoph Lameter
2007-01-29 22:14                   ` Oleg Nesterov
2007-02-20 18:39         ` Max Krasnyansky [this message]
2007-02-20 18:45           ` Christoph Lameter
2007-02-20 20:05             ` Oleg Nesterov
2007-02-20 21:22               ` Max Krasnyansky
2007-02-20 21:35                 ` Christoph Lameter
2007-02-20 22:01                   ` Max Krasnyansky
2007-02-20 22:14                     ` Christoph Lameter
2007-02-20 22:48                       ` SLAB cache reaper on isolated cpus Max Krasnyansky
2007-02-20 23:19                         ` Christoph Lameter
2007-02-21  3:41                           ` Max Krasnyansky
2007-02-20 21:05             ` slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems Max Krasnyansky
2007-02-20 21:34               ` Christoph Lameter

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c610062 dfblob:0f46d11 )
 OR (
bs:"Re: slab: start_cpu_timer/cache_reap CONFIG_HOTPLUG_CPU problems" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=45DB404C.4070305@qualcomm.com \
    --to=maxk@qualcomm.com \
    --cc=akpm@osdl.org \
    --cc=clameter@sgi.com \
    --cc=ego@in.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=oleg@tv-sign.ru \
    --cc=vatsa@in.ibm.com \
    --cc=venkatesh.pallipadi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.