From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1426300AbcBRKyQ (ORCPT ); Thu, 18 Feb 2016 05:54:16 -0500 Received: from mga09.intel.com ([134.134.136.24]:55997 "EHLO mga09.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1425160AbcBRKyM (ORCPT ); Thu, 18 Feb 2016 05:54:12 -0500 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.22,464,1449561600"; d="scan'208";a="654935255" Message-ID: <1455792851.9851.36.camel@linux.intel.com> Subject: Re: [PATCH] [RFC] kernel/cpu: Use lockref for online CPU reference counting From: Joonas Lahtinen To: Peter Zijlstra , Oleg Nesterov Cc: Intel graphics driver community testing & development , Linux kernel development , Ingo Molnar , David Hildenbrand , "Paul E. McKenney" , "Gautham R. Shenoy" , Chris Wilson Date: Thu, 18 Feb 2016 12:54:11 +0200 In-Reply-To: <20160215170618.GL6375@twins.programming.kicks-ass.net> References: <1455539803-13913-1-git-send-email-joonas.lahtinen@linux.intel.com> <20160215141755.GG6357@twins.programming.kicks-ass.net> <20160215170618.GL6375@twins.programming.kicks-ass.net> Organization: Intel Finland Oy - BIC 0357606-4 - Westendinkatu 7, 02160 Espoo Content-Type: text/plain; charset="UTF-8" X-Mailer: Evolution 3.18.4 (3.18.4-1.fc23) Mime-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hi, On ma, 2016-02-15 at 18:06 +0100, Peter Zijlstra wrote: > On Mon, Feb 15, 2016 at 03:17:55PM +0100, Peter Zijlstra wrote: > > On Mon, Feb 15, 2016 at 02:36:43PM +0200, Joonas Lahtinen wrote: > > > Instead of implementing a custom locked reference counting, use lockref. > > > > > > Current implementation leads to a deadlock splat on Intel SKL platforms > > > when lockdep debugging is enabled. > > > > > > This is due to few of CPUfreq drivers (including Intel P-state) having this; > > > policy->rwsem is locked during driver initialization and the functions called > > > during init that actually apply CPU limits use get_online_cpus (because they > > > have other calling paths too), which will briefly lock cpu_hotplug.lock to > > > increase cpu_hotplug.refcount. > > > > > > On later calling path, when doing a suspend, when cpu_hotplug_begin() is called > > > in disable_nonboot_cpus(), callbacks to CPUfreq functions get called after, > > > which will lock policy->rwsem and cpu_hotplug.lock is already held by > > > cpu_hotplug_begin() and we do have a potential deadlock scenario reported by > > > our CI system (though it is a very unlikely one). See the Bugzilla link for more > > > details. > > > > I've been meaning to change the thing into a percpu-rwsem, I just > > haven't had time to look into the lockdep splat that generated. > > > The below has plenty lockdep issues because percpu-rwsem is > reader-writer fair (like the regular rwsem), so it does throw up a fair > number of very icky issues. > > If at all possible, I'd really rather fix those and have a 'saner' > hotplug lock, rather than muddle on with open-coded horror lock we have > now. > I do still agree the below would be a worthy change to proceed with. CC'd Oleg here too to give a comment. Regards, Joonas > > --- a/include/linux/cpu.h > +++ b/include/linux/cpu.h > @@ -225,6 +225,8 @@ extern struct bus_type cpu_subsys; >  #ifdef CONFIG_HOTPLUG_CPU >  /* Stop CPUs going up and down. */ >   > +extern void cpu_hotplug_init_task(struct task_struct *p); > + >  extern void cpu_hotplug_begin(void); >  extern void cpu_hotplug_done(void); >  extern void get_online_cpus(void); > @@ -242,6 +244,8 @@ int cpu_down(unsigned int cpu); >   >  #else /* CONFIG_HOTPLUG_CPU */ >   > +static inline void cpu_hotplug_init_task(struct task_struct *p) {} > + >  static inline void cpu_hotplug_begin(void) {} >  static inline void cpu_hotplug_done(void) {} >  #define get_online_cpus() do { } while (0) > --- a/include/linux/percpu-rwsem.h > +++ b/include/linux/percpu-rwsem.h > @@ -16,6 +16,15 @@ struct percpu_rw_semaphore { >   wait_queue_head_t write_waitq; >  }; >   > +#define DEFINE_STATIC_PERCPU_RWSEM(name) \ > +static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_frc_##name); \ > +static struct percpu_rw_semaphore name = { \ > + .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \ > + .fast_read_ctr = &__percpu_rwsem_frc_##name, \ > + .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ > + .write_waitq = __WAIT_QUEUE_HEAD_INITIALIZER(name.write_waitq), \ > +} > + >  extern void percpu_down_read(struct percpu_rw_semaphore *); >  extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *); >  extern void percpu_up_read(struct percpu_rw_semaphore *); > @@ -33,9 +42,11 @@ extern void percpu_free_rwsem(struct per >   __percpu_init_rwsem(brw, #brw, &rwsem_key); \ >  }) >   > - >  #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) >   > +#define percpu_rwsem_assert_held(sem)                          \ > + lockdep_assert_held(&(sem)->rw_sem) > + >  static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, >   bool read, unsigned long ip) >  { > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1403,6 +1403,9 @@ struct task_struct { >   struct task_struct *last_wakee; >   >   int wake_cpu; > +#ifdef CONFIG_HOTPLUG_CPU > + int cpuhp_ref; > +#endif >  #endif >   int on_rq; >   > --- a/kernel/cpu.c > +++ b/kernel/cpu.c > @@ -23,6 +23,7 @@ >  #include >  #include >  #include > +#include >   >  #include "smpboot.h" >   > @@ -51,121 +52,52 @@ EXPORT_SYMBOL(cpu_notifier_register_done >   >  static RAW_NOTIFIER_HEAD(cpu_chain); >   > -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. > +/* > + * If set, cpu_up and cpu_down will return -EBUSY and do nothing. >   * Should always be manipulated under cpu_add_remove_lock >   */ >  static int cpu_hotplug_disabled; >   >  #ifdef CONFIG_HOTPLUG_CPU >   > -static struct { > - struct task_struct *active_writer; > - /* wait queue to wake up the active_writer */ > - wait_queue_head_t wq; > - /* verifies that no writer will get active while readers are active */ > - struct mutex lock; > - /* > -  * Also blocks the new readers during > -  * an ongoing cpu hotplug operation. > -  */ > - atomic_t refcount; > - > -#ifdef CONFIG_DEBUG_LOCK_ALLOC > - struct lockdep_map dep_map; > -#endif > -} cpu_hotplug = { > - .active_writer = NULL, > - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), > - .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), > -#ifdef CONFIG_DEBUG_LOCK_ALLOC > - .dep_map = {.name = "cpu_hotplug.lock" }, > -#endif > -}; > - > -/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ > -#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) > -#define cpuhp_lock_acquire_tryread() \ > -   lock_map_acquire_tryread(&cpu_hotplug.dep_map) > -#define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map) > -#define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map) > +DEFINE_STATIC_PERCPU_RWSEM(hotplug); >   > +void cpu_hotplug_init_task(struct task_struct *p) > +{ > + if (WARN_ON_ONCE(p->cpuhp_ref)) > + p->cpuhp_ref = 0; > +} >   >  void get_online_cpus(void) >  { >   might_sleep(); > - if (cpu_hotplug.active_writer == current) > + > + if (current->cpuhp_ref++) /* read recursion */ >   return; > - cpuhp_lock_acquire_read(); > - mutex_lock(&cpu_hotplug.lock); > - atomic_inc(&cpu_hotplug.refcount); > - mutex_unlock(&cpu_hotplug.lock); > + > + percpu_down_read(&hotplug); >  } >  EXPORT_SYMBOL_GPL(get_online_cpus); >   >  void put_online_cpus(void) >  { > - int refcount; > - > - if (cpu_hotplug.active_writer == current) > + if (--current->cpuhp_ref) >   return; >   > - refcount = atomic_dec_return(&cpu_hotplug.refcount); > - if (WARN_ON(refcount < 0)) /* try to fix things up */ > - atomic_inc(&cpu_hotplug.refcount); > - > - if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) > - wake_up(&cpu_hotplug.wq); > - > - cpuhp_lock_release(); > - > + percpu_up_read(&hotplug); >  } >  EXPORT_SYMBOL_GPL(put_online_cpus); >   > -/* > - * This ensures that the hotplug operation can begin only when the > - * refcount goes to zero. > - * > - * Note that during a cpu-hotplug operation, the new readers, if any, > - * will be blocked by the cpu_hotplug.lock > - * > - * Since cpu_hotplug_begin() is always called after invoking > - * cpu_maps_update_begin(), we can be sure that only one writer is active. > - * > - * Note that theoretically, there is a possibility of a livelock: > - * - Refcount goes to zero, last reader wakes up the sleeping > - *   writer. > - * - Last reader unlocks the cpu_hotplug.lock. > - * - A new reader arrives at this moment, bumps up the refcount. > - * - The writer acquires the cpu_hotplug.lock finds the refcount > - *   non zero and goes to sleep again. > - * > - * However, this is very difficult to achieve in practice since > - * get_online_cpus() not an api which is called all that often. > - * > - */ >  void cpu_hotplug_begin(void) >  { > - DEFINE_WAIT(wait); > - > - cpu_hotplug.active_writer = current; > - cpuhp_lock_acquire(); > - > - for (;;) { > - mutex_lock(&cpu_hotplug.lock); > - prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); > - if (likely(!atomic_read(&cpu_hotplug.refcount))) > - break; > - mutex_unlock(&cpu_hotplug.lock); > - schedule(); > - } > - finish_wait(&cpu_hotplug.wq, &wait); > + percpu_down_write(&hotplug); > + current->cpuhp_ref++; /* allow read-in-write recursion */ >  } >   >  void cpu_hotplug_done(void) >  { > - cpu_hotplug.active_writer = NULL; > - mutex_unlock(&cpu_hotplug.lock); > - cpuhp_lock_release(); > + current->cpuhp_ref--; > + percpu_up_write(&hotplug); >  } >   >  /* > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1414,6 +1414,8 @@ static struct task_struct *copy_process( >   p->sequential_io_avg = 0; >  #endif >   > + cpu_hotplug_init_task(p); > + >   /* Perform scheduler related setup. Assign this task to a CPU. */ >   retval = sched_fork(clone_flags, p); >   if (retval) > --- a/lib/Kconfig > +++ b/lib/Kconfig > @@ -53,6 +53,11 @@ config GENERIC_IO >  config STMP_DEVICE >   bool >   > +config PERCPU_RWSEM_HOTPLUG > + def_bool y > + depends on HOTPLUG_CPU > + select PERCPU_RWSEM > + >  config ARCH_USE_CMPXCHG_LOCKREF >   bool >   -- Joonas Lahtinen Open Source Technology Center Intel Corporation