All of lore.kernel.org
 help / color / mirror / Atom feed
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
To: Peter Hurley <peter@hurleysoftware.com>
Cc: Byungchul Park <byungchul.park@lge.com>,
	akpm@linux-foundation.org, mingo@kernel.org,
	linux-kernel@vger.kernel.org, akinobu.mita@gmail.com,
	jack@suse.cz, torvalds@linux-foundation.org,
	Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>,
	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Subject: Re: [PATCH 3/3] spinlock_debug: panic on recursive lock spin_dump()
Date: Tue, 2 Feb 2016 16:59:40 +0900	[thread overview]
Message-ID: <20160202075940.GA499@swordfish> (raw)
In-Reply-To: <20160201161458.GA609@swordfish>

On (02/02/16 01:14), Sergey Senozhatsky wrote:
> how about splitting ->owner_cpu 4 bytes as:
> 
>                             |                                   |
>   1 byte spin bug recursion | 1 byte spin_dump recursion counter | 2 bytes owner cpu
>                             |                                   |

after some thinking... no, this will not do the trick. one byte is not
enough for recursion counter -- we can have 8K CPUs on the system and
8K-1 cpus can "suspect a lockup". so, a slightly different approach:

1) split ->owner_cpu 4 bytes in struct raw_spinlock
 unsigned short owner_cpu;
 unsigned short recursion;

I still can use only ->owner_cpu, but it's much easier when they are
apart. with a single 4 byte variable for recursion and cpu owner we
need to take extra care of higher 2 bytes every time we touch the
->owner_cpu

CPU1							CPU2
spin_dump
 ->owner_cpu[recursion_bits] += 1			spin_unlock
 							->owner_cpu = -1
						^^^ need to store cpu_id in
						lower 2 bytes, avoiding
						overwrite of 2 higher bytes, etc.
 ->owner_cpu[recursion_bits] -= 1

which is fragile and ugly.


2) ->recursion has most significant bit for spin_bug() bit, the
remaining bits are for recursion counter.

spin_bug() does
	set SPIN_BUG bit (most significant bit)
	spin_dump
	clear SPIN_BUG bit

spin_dump() does
	read SPIN_BUG bit
	inc ->recursion
	do_checks
	printk...
	dec ->recursion

and the do_checks is:

-- "if the SPIN_BUG bit is set AND recursion counter > NR_CPUS"
   then we have a spin_bug() recursion on at least one of the CPUs
   and we need to panic the system

printk
 spin_lock
  spin_bug
   spin_dump
    printk
     spin_lock
      spin_bug
       spin_dump
        ...


-- "if the SPIN_BUG bit is clear AND recursion counter >= SHRT_MAX/2"
   then we have spin_dump() recursion (16K calls.. can be bigger) and
   we need to panic the system. if recursion counter < SHRT_MAX/2 - keep
   going. "suspected soft lockup" potentially can be resolved (the lock
   owner unlocks the lock), so we need to have a big enough limit before
   we declare panic().

printk
 spin_lock
  spin_dump
   printk
    spin_lock
     spin_dump
      ...

I guess I'll I'll start a new thread with the next submission, to
refresh it.

RFC, any opinions are appreciated.
not yet tested code.

---
 include/linux/spinlock_types.h  |  4 +++-
 kernel/locking/spinlock_debug.c | 40 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 73548eb..c8f6b56 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -23,7 +23,9 @@ typedef struct raw_spinlock {
 	unsigned int break_lock;
 #endif
 #ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned int magic, owner_cpu;
+	unsigned int magic;
+	unsigned short owner_cpu;
+	unsigned short recursion;
 	void *owner;
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 0374a59..f838fe9 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -13,6 +13,8 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 
+#define SPIN_BUG_RECURSION		(1 << 15)
+
 void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
 			  struct lock_class_key *key)
 {
@@ -26,7 +28,8 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
 	lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 	lock->magic = SPINLOCK_MAGIC;
 	lock->owner = SPINLOCK_OWNER_INIT;
-	lock->owner_cpu = -1;
+	lock->owner_cpu = USHRT_MAX;
+	lock->recursion = 0;
 }
 
 EXPORT_SYMBOL(__raw_spin_lock_init);
@@ -49,9 +52,31 @@ void __rwlock_init(rwlock_t *lock, const char *name,
 
 EXPORT_SYMBOL(__rwlock_init);
 
+static void spin_recursion_panic(raw_spinlock_t *lock, const char *msg)
+{
+	panic("lock: %pS %s recursion on CPU#%d, %s/%d\n",
+			lock, msg, raw_smp_processor_id(),
+			current->comm, task_pid_nr(current));
+}
+
 static void spin_dump(raw_spinlock_t *lock, const char *msg)
 {
 	struct task_struct *owner = NULL;
+	unsigned short dump_counter;
+	bool spin_bug;
+
+	spin_bug = lock->recursion & SPIN_BUG_RECURSION;
+	dump_counter = lock->recursion & SHRT_MAX;
+	smp_rmb();
+
+	smp_wmb();
+	lock->recursion += 1;
+	dump_counter++;
+
+	if (spin_bug && dump_counter > NR_CPUS) /* num_online_cpus() */
+		spin_recursion_panic(lock, "spin_bug()");
+	if (dump_counter >= (SHRT_MAX >> 1))
+		spin_recursion_panic(lock, "spin_dump()");
 
 	if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
 		owner = lock->owner;
@@ -63,8 +88,11 @@ static void spin_dump(raw_spinlock_t *lock, const char *msg)
 		lock, lock->magic,
 		owner ? owner->comm : "<none>",
 		owner ? task_pid_nr(owner) : -1,
-		lock->owner_cpu);
+		lock->owner_cpu == USHRT_MAX ? -1 : lock->owner_cpu);
 	dump_stack();
+
+	smp_wmb();
+	lock->recursion -= 1;
 }
 
 static void spin_bug(raw_spinlock_t *lock, const char *msg)
@@ -72,7 +100,13 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
 	if (!debug_locks_off())
 		return;
 
+	smp_wmb();
+	lock->recursion |= SPIN_BUG_RECURSION;
+
 	spin_dump(lock, msg);
+
+	smp_wmb();
+	lock->recursion &= ~SPIN_BUG_RECURSION;
 }
 
 #define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
@@ -100,7 +134,7 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
 	SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
 							lock, "wrong CPU");
 	lock->owner = SPINLOCK_OWNER_INIT;
-	lock->owner_cpu = -1;
+	lock->owner_cpu = USHRT_MAX;
 }
 
 static void __spin_lock_debug(raw_spinlock_t *lock)
-- 
2.7.0

  reply	other threads:[~2016-02-02  7:58 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-27 12:01 [PATCH v4] lib/spinlock_debug.c: prevent a recursive cycle in the debug code Byungchul Park
2016-01-27 22:49 ` Peter Hurley
2016-01-28  7:15   ` Byungchul Park
2016-01-29  8:19     ` Byungchul Park
2016-01-28  1:42 ` Byungchul Park
2016-01-28  2:37   ` Sergey Senozhatsky
2016-01-28  4:36     ` byungchul.park
2016-01-28  6:05       ` Sergey Senozhatsky
2016-01-28  8:13         ` Byungchul Park
2016-01-28 10:41           ` Sergey Senozhatsky
2016-01-28 10:53             ` Sergey Senozhatsky
2016-01-28 15:42               ` Sergey Senozhatsky
2016-01-28 23:08                 ` Peter Hurley
2016-01-28 23:54                   ` Byungchul Park
2016-01-29  0:54                     ` Sergey Senozhatsky
2016-01-29  3:00                       ` Byungchul Park
2016-01-29  4:05                         ` Sergey Senozhatsky
2016-01-29 12:15                           ` Byungchul Park
2016-01-29  0:27                   ` Sergey Senozhatsky
2016-01-29  4:32                     ` Peter Hurley
2016-01-29  5:28                       ` Sergey Senozhatsky
2016-01-29  5:48                         ` Peter Hurley
2016-01-29  6:16                           ` Sergey Senozhatsky
2016-01-29  6:37                             ` Sergey Senozhatsky
2016-01-31 12:30                               ` Sergey Senozhatsky
2016-01-31 12:33                                 ` [PATCH 1/3] printk: introduce console_reset_on_panic() function Sergey Senozhatsky
2016-01-31 12:33                                   ` [PATCH 2/3] printk: introduce reset_console_drivers() Sergey Senozhatsky
2016-01-31 12:47                                     ` kbuild test robot
2016-01-31 12:33                                   ` [PATCH 3/3] spinlock_debug: panic on recursive lock spin_dump() Sergey Senozhatsky
2016-02-01 16:14                                     ` Sergey Senozhatsky
2016-02-02  7:59                                       ` Sergey Senozhatsky [this message]
2016-01-31 12:42                                   ` [PATCH 1/3] printk: introduce console_reset_on_panic() function kbuild test robot
2016-01-29  6:54                     ` [PATCH v4] lib/spinlock_debug.c: prevent a recursive cycle in the debug code Byungchul Park
2016-01-29  7:13                       ` Sergey Senozhatsky
2016-01-29  8:13                         ` Byungchul Park

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160202075940.GA499@swordfish \
    --to=sergey.senozhatsky.work@gmail.com \
    --cc=akinobu.mita@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=byungchul.park@lge.com \
    --cc=jack@suse.cz \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=peter@hurleysoftware.com \
    --cc=sergey.senozhatsky@gmail.com \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.