* [RFC][PATCH 0/7] lockdep
@ 2008-08-04 13:03 Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal Peter Zijlstra
` (9 more replies)
0 siblings, 10 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
Hi,
This series is my pending lockdep queue; it includes David's graph
optimization and my new scheduler runqueue annotation, amongst others.
The part that makes this an RFC is the last few patches, which add the "lock
protection locks" Linus proposed a few days ago. I've not yet written a
test case that actively exercises this functionality, but I wanted to get
this out so that Jeremy can see if it works for him.
Enjoy..
--
^ permalink raw reply [flat|nested] 73+ messages in thread
* [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal.
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
@ 2008-08-04 13:03 ` Peter Zijlstra
2008-08-05 8:34 ` David Miller
2008-08-04 13:03 ` [RFC][PATCH 2/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
` (8 subsequent siblings)
9 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
[-- Attachment #1: davem-optimize-lockdep.patch --]
[-- Type: text/plain, Size: 7563 bytes --]
From: David Miller <davem@davemloft.net>
When we traverse the graph, either forwards or backwards, we
are interested in whether a certain property exists somewhere
in a node reachable in the graph.
Therefore it is never necessary to traverse through a node more
than once to get a correct answer to the given query.
Take advantage of this property using a global ID counter so that we
need not clear all the markers in all the lock_class entries before
doing a traversal. A new ID is chosen when we start to traverse, and
we continue through a lock_class only if its ID hasn't been marked
with the new value yet.
This short-circuiting is essential, especially for high-CPU-count
systems. The scheduler has a runqueue per cpu, and needs to take
two runqueue locks at a time, which leads to long chains of
backwards and forwards subgraphs from these runqueue lock nodes.
Without the short-circuit implemented here, a graph traversal on
a runqueue lock can take up to (1 << (N - 1)) checks on a system
with N cpus.
For anything more than 16 cpus or so, lockdep will eventually bring
the machine to a complete standstill.
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/lockdep.h | 1
kernel/lockdep.c | 86 +++++++++++++++++++++++++++++++++++++++++++++
kernel/lockdep_internals.h | 3 +
kernel/lockdep_proc.c | 34 +----------------
4 files changed, 93 insertions(+), 31 deletions(-)
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -89,6 +89,7 @@ struct lock_class {
struct lockdep_subclass_key *key;
unsigned int subclass;
+ unsigned int dep_gen_id;
/*
* IRQ/softirq usage tracking bits:
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c
+++ linux-2.6/kernel/lockdep.c
@@ -372,6 +372,19 @@ unsigned int nr_process_chains;
unsigned int max_lockdep_depth;
unsigned int max_recursion_depth;
+static unsigned int lockdep_dependency_gen_id;
+
+static bool lockdep_dependency_visit(struct lock_class *source,
+ unsigned int depth)
+{
+ if (!depth)
+ lockdep_dependency_gen_id++;
+ if (source->dep_gen_id == lockdep_dependency_gen_id)
+ return true;
+ source->dep_gen_id = lockdep_dependency_gen_id;
+ return false;
+}
+
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* We cannot printk in early bootup code. Not even early_printk()
@@ -558,6 +571,9 @@ static void print_lock_dependencies(stru
{
struct lock_list *entry;
+ if (lockdep_dependency_visit(class, depth))
+ return;
+
if (DEBUG_LOCKS_WARN_ON(depth >= 20))
return;
@@ -959,6 +975,67 @@ static int noinline print_infinite_recur
return 0;
}
+unsigned long __lockdep_count_forward_deps(struct lock_class *class,
+ unsigned int depth)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ if (lockdep_dependency_visit(class, depth))
+ return 0;
+
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_after, entry)
+ ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+
+ return ret;
+}
+
+unsigned long lockdep_count_forward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_forward_deps(class, 0);
+ __raw_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+unsigned long __lockdep_count_backward_deps(struct lock_class *class,
+ unsigned int depth)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ if (lockdep_dependency_visit(class, depth))
+ return 0;
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_before, entry)
+ ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+
+ return ret;
+}
+
+unsigned long lockdep_count_backward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_backward_deps(class, 0);
+ __raw_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
/*
* Prove that the dependency graph starting at <entry> can not
* lead to <target>. Print an error and return 0 if it does.
@@ -968,6 +1045,9 @@ check_noncircular(struct lock_class *sou
{
struct lock_list *entry;
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
debug_atomic_inc(&nr_cyclic_check_recursions);
if (depth > max_recursion_depth)
max_recursion_depth = depth;
@@ -1011,6 +1091,9 @@ find_usage_forwards(struct lock_class *s
struct lock_list *entry;
int ret;
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
if (depth > max_recursion_depth)
max_recursion_depth = depth;
if (depth >= RECURSION_LIMIT)
@@ -1050,6 +1133,9 @@ find_usage_backwards(struct lock_class *
struct lock_list *entry;
int ret;
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
if (!__raw_spin_is_locked(&lockdep_lock))
return DEBUG_LOCKS_WARN_ON(1);
Index: linux-2.6/kernel/lockdep_internals.h
===================================================================
--- linux-2.6.orig/kernel/lockdep_internals.h
+++ linux-2.6/kernel/lockdep_internals.h
@@ -53,6 +53,9 @@ extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
extern unsigned int max_recursion_depth;
+extern unsigned long lockdep_count_forward_deps(struct lock_class *);
+extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* Various lockdep statistics:
Index: linux-2.6/kernel/lockdep_proc.c
===================================================================
--- linux-2.6.orig/kernel/lockdep_proc.c
+++ linux-2.6/kernel/lockdep_proc.c
@@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, v
{
}
-static unsigned long count_forward_deps(struct lock_class *class)
-{
- struct lock_list *entry;
- unsigned long ret = 1;
-
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_after, entry)
- ret += count_forward_deps(entry->class);
-
- return ret;
-}
-
-static unsigned long count_backward_deps(struct lock_class *class)
-{
- struct lock_list *entry;
- unsigned long ret = 1;
-
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_before, entry)
- ret += count_backward_deps(entry->class);
-
- return ret;
-}
-
static void print_name(struct seq_file *m, struct lock_class *class)
{
char str[128];
@@ -124,10 +96,10 @@ static int l_show(struct seq_file *m, vo
#ifdef CONFIG_DEBUG_LOCKDEP
seq_printf(m, " OPS:%8ld", class->ops);
#endif
- nr_forward_deps = count_forward_deps(class);
+ nr_forward_deps = lockdep_count_forward_deps(class);
seq_printf(m, " FD:%5ld", nr_forward_deps);
- nr_backward_deps = count_backward_deps(class);
+ nr_backward_deps = lockdep_count_backward_deps(class);
seq_printf(m, " BD:%5ld", nr_backward_deps);
get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -350,7 +322,7 @@ static int lockdep_stats_show(struct seq
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
nr_hardirq_read_unsafe++;
- sum_forward_deps += count_forward_deps(class);
+ sum_forward_deps += lockdep_count_forward_deps(class);
}
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
--
* [RFC][PATCH 2/7] lockdep: lock_set_subclass - reset a held locks subclass
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal Peter Zijlstra
@ 2008-08-04 13:03 ` Peter Zijlstra
2008-08-05 8:35 ` David Miller
2008-08-04 13:03 ` [RFC][PATCH 3/7] lockdep: re-annotate scheduler runqueues Peter Zijlstra
` (7 subsequent siblings)
9 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
[-- Attachment #1: lockdep-lock_set_subclass.patch --]
[-- Type: text/plain, Size: 3567 bytes --]
This can be used to reset a held lock's subclass, for arbitrary-depth
iterated data structures such as trees or lists which have per-node
locks.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/lockdep.h | 4 ++
kernel/lockdep.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 73 insertions(+)
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -300,6 +300,9 @@ extern void lock_acquire(struct lockdep_
extern void lock_release(struct lockdep_map *lock, int nested,
unsigned long ip);
+extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass,
+ unsigned long ip);
+
# define INIT_LOCKDEP .lockdep_recursion = 0,
#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
@@ -316,6 +319,7 @@ static inline void lockdep_on(void)
# define lock_acquire(l, s, t, r, c, i) do { } while (0)
# define lock_release(l, n, i) do { } while (0)
+# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_init() do { } while (0)
# define lockdep_info() do { } while (0)
# define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0)
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c
+++ linux-2.6/kernel/lockdep.c
@@ -2658,6 +2658,55 @@ static int check_unlock(struct task_stru
return 1;
}
+static int
+__lock_set_subclass(struct lockdep_map *lock,
+ unsigned int subclass, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class *class;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+ class = register_lock_class(lock, subclass, 0);
+ hlock->class = class;
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock->class->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->acquire_ip))
+ return 0;
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
/*
* Remove the lock to the list of currently held locks in a
* potentially non-nested (out of order) manner. This is a
@@ -2822,6 +2871,26 @@ static void check_flags(unsigned long fl
#endif
}
+void
+lock_set_subclass(struct lockdep_map *lock,
+ unsigned int subclass, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_set_subclass(lock, subclass, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_set_subclass);
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
--
* [RFC][PATCH 3/7] lockdep: re-annotate scheduler runqueues
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 2/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
@ 2008-08-04 13:03 ` Peter Zijlstra
2008-08-05 8:35 ` David Miller
2008-08-04 13:03 ` [RFC][PATCH 4/7] lockdep: shrink held_lock structure Peter Zijlstra
` (6 subsequent siblings)
9 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
[-- Attachment #1: lockdep-sched-annotate.patch --]
[-- Type: text/plain, Size: 4237 bytes --]
Instead of using a per-rq lock class, use the regular nesting operations.
However, take extra care with double_lock_balance() as it can release the
already held rq->lock (and therefore change its nesting class).
So what can happen is:
spin_lock(rq->lock); // this rq subclass 0
double_lock_balance(rq, other_rq);
// release rq
// acquire other_rq->lock subclass 0
// acquire rq->lock subclass 1
spin_unlock(other_rq->lock);
leaving you with rq->lock in subclass 1
So a subsequent double_lock_balance() call can try to nest a subclass 1
lock while already holding a subclass 1 lock.
Fix this by introducing double_unlock_balance() which releases the other
rq's lock, but also re-sets the subclass for this rq's lock to 0.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
kernel/sched.c | 21 +++++++++++++--------
kernel/sched_rt.c | 8 +++++---
2 files changed, 18 insertions(+), 11 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -600,7 +600,6 @@ struct rq {
/* BKL stats */
unsigned int bkl_count;
#endif
- struct lock_class_key rq_lock_key;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq
} else {
if (rq1 < rq2) {
spin_lock(&rq1->lock);
- spin_lock(&rq2->lock);
+ spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
} else {
spin_lock(&rq2->lock);
- spin_lock(&rq1->lock);
+ spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
}
}
update_rq_clock(rq1);
@@ -2805,14 +2804,21 @@ static int double_lock_balance(struct rq
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
- spin_lock(&this_rq->lock);
+ spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
ret = 1;
} else
- spin_lock(&busiest->lock);
+ spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
}
return ret;
}
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
@@ -3637,7 +3643,7 @@ redo:
ld_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, CPU_NEWLY_IDLE,
&all_pinned);
- spin_unlock(&busiest->lock);
+ double_unlock_balance(this_rq, busiest);
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), *cpus);
@@ -3752,7 +3758,7 @@ static void active_load_balance(struct r
else
schedstat_inc(sd, alb_failed);
}
- spin_unlock(&target_rq->lock);
+ double_unlock_balance(busiest_rq, target_rq);
}
#ifdef CONFIG_NO_HZ
@@ -8000,7 +8006,6 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
- lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -861,6 +861,8 @@ static void put_prev_task_rt(struct rq *
#define RT_MAX_TRIES 3
static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
+
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1022,7 +1024,7 @@ static struct rq *find_lock_lowest_rq(st
break;
/* try again */
- spin_unlock(&lowest_rq->lock);
+ double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
}
@@ -1091,7 +1093,7 @@ static int push_rt_task(struct rq *rq)
resched_task(lowest_rq->curr);
- spin_unlock(&lowest_rq->lock);
+ double_unlock_balance(rq, lowest_rq);
ret = 1;
out:
@@ -1197,7 +1199,7 @@ static int pull_rt_task(struct rq *this_
}
skip:
- spin_unlock(&src_rq->lock);
+ double_unlock_balance(this_rq, src_rq);
}
return ret;
--
* [RFC][PATCH 4/7] lockdep: shrink held_lock structure
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
` (2 preceding siblings ...)
2008-08-04 13:03 ` [RFC][PATCH 3/7] lockdep: re-annotate scheduler runqueues Peter Zijlstra
@ 2008-08-04 13:03 ` Peter Zijlstra
2008-08-05 16:08 ` Peter Zijlstra
2008-08-06 7:17 ` Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 5/7] lockdep: map_acquire Peter Zijlstra
` (5 subsequent siblings)
9 siblings, 2 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
[-- Attachment #1: lockdep-shrink-hlock.patch --]
[-- Type: text/plain, Size: 16661 bytes --]
From: Dave Jones <davej@redhat.com>
struct held_lock {
u64 prev_chain_key; /* 0 8 */
struct lock_class * class; /* 8 8 */
long unsigned int acquire_ip; /* 16 8 */
struct lockdep_map * instance; /* 24 8 */
int irq_context; /* 32 4 */
int trylock; /* 36 4 */
int read; /* 40 4 */
int check; /* 44 4 */
int hardirqs_off; /* 48 4 */
/* size: 56, cachelines: 1 */
/* padding: 4 */
/* last cacheline: 56 bytes */
};
struct held_lock {
u64 prev_chain_key; /* 0 8 */
long unsigned int acquire_ip; /* 8 8 */
struct lockdep_map * instance; /* 16 8 */
unsigned int class_idx:11; /* 24:21 4 */
unsigned int irq_context:1; /* 24:20 4 */
unsigned int trylock:1; /* 24:19 4 */
unsigned int read:2; /* 24:17 4 */
unsigned int check:1; /* 24:16 4 */
unsigned int hardirqs_off:1; /* 24:15 4 */
/* size: 32, cachelines: 1 */
/* padding: 4 */
/* bit_padding: 15 bits */
/* last cacheline: 32 bytes */
};
[ From: Ingo Molnar <mingo@elte.hu>, shrunk hlock->class too. ]
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/lockdep.h | 16 +++---
kernel/lockdep.c | 113 +++++++++++++++++++++++++--------------------
kernel/lockdep_internals.h | 3 -
3 files changed, 74 insertions(+), 58 deletions(-)
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -190,6 +190,9 @@ struct lock_chain {
u64 chain_key;
};
+#define MAX_LOCKDEP_KEYS_BITS 11
+#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
+
struct held_lock {
/*
* One-way hash of the dependency chain up to this point. We
@@ -206,14 +209,13 @@ struct held_lock {
* with zero), here we store the previous hash value:
*/
u64 prev_chain_key;
- struct lock_class *class;
unsigned long acquire_ip;
struct lockdep_map *instance;
-
#ifdef CONFIG_LOCK_STAT
u64 waittime_stamp;
u64 holdtime_stamp;
#endif
+ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS;
/*
* The lock-stack is unified in that the lock chains of interrupt
* contexts nest ontop of process context chains, but we 'separate'
@@ -227,11 +229,11 @@ struct held_lock {
* The following field is used to detect when we cross into an
* interrupt context:
*/
- int irq_context;
- int trylock;
- int read;
- int check;
- int hardirqs_off;
+ unsigned int irq_context:1;
+ unsigned int trylock:1;
+ unsigned int read:2;
+ unsigned int check:1;
+ unsigned int hardirqs_off:1;
};
/*
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c
+++ linux-2.6/kernel/lockdep.c
@@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX
unsigned long nr_lock_classes;
static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+ if (!hlock->class_idx) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return NULL;
+ }
+ return lock_classes + hlock->class_idx - 1;
+}
+
#ifdef CONFIG_LOCK_STAT
static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
@@ -222,7 +231,7 @@ static void lock_release_holdtime(struct
holdtime = sched_clock() - hlock->holdtime_stamp;
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (hlock->read)
lock_time_inc(&stats->read_holdtime, holdtime);
else
@@ -518,7 +527,7 @@ static void print_lockdep_cache(struct l
static void print_lock(struct held_lock *hlock)
{
- print_lock_name(hlock->class);
+ print_lock_name(hlock_class(hlock));
printk(", at: ");
print_ip_sym(hlock->acquire_ip);
}
@@ -948,7 +957,7 @@ static noinline int print_circular_bug_t
if (debug_locks_silent)
return 0;
- this.class = check_source->class;
+ this.class = hlock_class(check_source);
if (!save_trace(&this.trace))
return 0;
@@ -1057,7 +1066,7 @@ check_noncircular(struct lock_class *sou
* Check this lock's dependency list:
*/
list_for_each_entry(entry, &source->locks_after, entry) {
- if (entry->class == check_target->class)
+ if (entry->class == hlock_class(check_target))
return print_circular_bug_header(entry, depth+1);
debug_atomic_inc(&nr_cyclic_checks);
if (!check_noncircular(entry->class, depth+1))
@@ -1150,6 +1159,11 @@ find_usage_backwards(struct lock_class *
return 2;
}
+ if (!source && debug_locks_off_graph_unlock()) {
+ WARN_ON(1);
+ return 0;
+ }
+
/*
* Check this lock's dependency list:
*/
@@ -1189,9 +1203,9 @@ print_bad_irq_dependency(struct task_str
printk("\nand this task is already holding:\n");
print_lock(prev);
printk("which would create a new lock dependency:\n");
- print_lock_name(prev->class);
+ print_lock_name(hlock_class(prev));
printk(" ->");
- print_lock_name(next->class);
+ print_lock_name(hlock_class(next));
printk("\n");
printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1232,12 +1246,12 @@ check_usage(struct task_struct *curr, st
find_usage_bit = bit_backwards;
/* fills in <backwards_match> */
- ret = find_usage_backwards(prev->class, 0);
+ ret = find_usage_backwards(hlock_class(prev), 0);
if (!ret || ret == 1)
return ret;
find_usage_bit = bit_forwards;
- ret = find_usage_forwards(next->class, 0);
+ ret = find_usage_forwards(hlock_class(next), 0);
if (!ret || ret == 1)
return ret;
/* ret == 2 */
@@ -1362,7 +1376,7 @@ check_deadlock(struct task_struct *curr,
for (i = 0; i < curr->lockdep_depth; i++) {
prev = curr->held_locks + i;
- if (prev->class != next->class)
+ if (hlock_class(prev) != hlock_class(next))
continue;
/*
* Allow read-after-read recursion of the same
@@ -1415,7 +1429,7 @@ check_prev_add(struct task_struct *curr,
*/
check_source = next;
check_target = prev;
- if (!(check_noncircular(next->class, 0)))
+ if (!(check_noncircular(hlock_class(next), 0)))
return print_circular_bug_tail();
if (!check_prev_add_irq(curr, prev, next))
@@ -1439,8 +1453,8 @@ check_prev_add(struct task_struct *curr,
* chains - the second one will be new, but L1 already has
* L2 added to its dependency list, due to the first chain.)
*/
- list_for_each_entry(entry, &prev->class->locks_after, entry) {
- if (entry->class == next->class) {
+ list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+ if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
return 2;
@@ -1451,26 +1465,28 @@ check_prev_add(struct task_struct *curr,
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(prev->class, next->class,
- &prev->class->locks_after, next->acquire_ip, distance);
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ &hlock_class(prev)->locks_after,
+ next->acquire_ip, distance);
if (!ret)
return 0;
- ret = add_lock_to_list(next->class, prev->class,
- &next->class->locks_before, next->acquire_ip, distance);
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ &hlock_class(next)->locks_before,
+ next->acquire_ip, distance);
if (!ret)
return 0;
/*
* Debugging printouts:
*/
- if (verbose(prev->class) || verbose(next->class)) {
+ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
graph_unlock();
printk("\n new dependency: ");
- print_lock_name(prev->class);
+ print_lock_name(hlock_class(prev));
printk(" => ");
- print_lock_name(next->class);
+ print_lock_name(hlock_class(next));
printk("\n");
dump_stack();
return graph_lock();
@@ -1567,7 +1583,7 @@ static inline int lookup_chain_cache(str
struct held_lock *hlock,
u64 chain_key)
{
- struct lock_class *class = hlock->class;
+ struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr, *hlock_next;
@@ -1640,7 +1656,7 @@ cache_hit:
if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = cn;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class - lock_classes;
+ int lock_id = curr->held_locks[i].class_idx - 1;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
@@ -1736,7 +1752,7 @@ static void check_chain_key(struct task_
WARN_ON(1);
return;
}
- id = hlock->class - lock_classes;
+ id = hlock->class_idx - 1;
if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
return;
@@ -1781,7 +1797,7 @@ print_usage_bug(struct task_struct *curr
print_lock(this);
printk("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(this->class->usage_traces + prev_bit, 1);
+ print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
printk("\nother info that might help us debug this:\n");
@@ -1800,7 +1816,7 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
return print_usage_bug(curr, this, bad_bit, new_bit);
return 1;
}
@@ -1839,7 +1855,7 @@ print_irq_inversion_bug(struct task_stru
lockdep_print_held_locks(curr);
printk("\nthe first lock's dependencies:\n");
- print_lock_dependencies(this->class, 0);
+ print_lock_dependencies(hlock_class(this), 0);
printk("\nthe second lock's dependencies:\n");
print_lock_dependencies(other, 0);
@@ -1862,7 +1878,7 @@ check_usage_forwards(struct task_struct
find_usage_bit = bit;
/* fills in <forwards_match> */
- ret = find_usage_forwards(this->class, 0);
+ ret = find_usage_forwards(hlock_class(this), 0);
if (!ret || ret == 1)
return ret;
@@ -1881,7 +1897,7 @@ check_usage_backwards(struct task_struct
find_usage_bit = bit;
/* fills in <backwards_match> */
- ret = find_usage_backwards(this->class, 0);
+ ret = find_usage_backwards(hlock_class(this), 0);
if (!ret || ret == 1)
return ret;
@@ -1947,7 +1963,7 @@ static int mark_lock_irq(struct task_str
LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ:
@@ -1972,7 +1988,7 @@ static int mark_lock_irq(struct task_str
LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_HARDIRQ_READ:
@@ -1985,7 +2001,7 @@ static int mark_lock_irq(struct task_str
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_HARDIRQS, "hard"))
return 0;
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1998,7 +2014,7 @@ static int mark_lock_irq(struct task_str
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_SOFTIRQS, "soft"))
return 0;
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS:
@@ -2024,7 +2040,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS:
@@ -2050,7 +2066,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS_READ:
@@ -2065,7 +2081,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_HARDIRQ, "hard"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS_READ:
@@ -2080,7 +2096,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_SOFTIRQ, "soft"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
default:
@@ -2396,7 +2412,7 @@ static int mark_lock(struct task_struct
* If already set then do not dirty the cacheline,
* nor do any checks:
*/
- if (likely(this->class->usage_mask & new_mask))
+ if (likely(hlock_class(this)->usage_mask & new_mask))
return 1;
if (!graph_lock())
@@ -2404,14 +2420,14 @@ static int mark_lock(struct task_struct
/*
* Make sure we didnt race:
*/
- if (unlikely(this->class->usage_mask & new_mask)) {
+ if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
graph_unlock();
return 1;
}
- this->class->usage_mask |= new_mask;
+ hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(this->class->usage_traces + new_bit))
+ if (!save_trace(hlock_class(this)->usage_traces + new_bit))
return 0;
switch (new_bit) {
@@ -2545,8 +2561,9 @@ static int __lock_acquire(struct lockdep
return 0;
hlock = curr->held_locks + depth;
-
- hlock->class = class;
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+ hlock->class_idx = class - lock_classes + 1;
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->trylock = trylock;
@@ -2690,7 +2707,7 @@ __lock_set_subclass(struct lockdep_map *
found_it:
class = register_lock_class(lock, subclass, 0);
- hlock->class = class;
+ hlock->class_idx = class - lock_classes + 1;
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -2698,7 +2715,7 @@ found_it:
for (; i < depth; i++) {
hlock = curr->held_locks + i;
if (!__lock_acquire(hlock->instance,
- hlock->class->subclass, hlock->trylock,
+ hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->acquire_ip))
return 0;
@@ -2759,7 +2776,7 @@ found_it:
for (i++; i < depth; i++) {
hlock = curr->held_locks + i;
if (!__lock_acquire(hlock->instance,
- hlock->class->subclass, hlock->trylock,
+ hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->acquire_ip))
return 0;
@@ -2804,7 +2821,7 @@ static int lock_release_nested(struct ta
#ifdef CONFIG_DEBUG_LOCKDEP
hlock->prev_chain_key = 0;
- hlock->class = NULL;
+ hlock->class_idx = 0;
hlock->acquire_ip = 0;
hlock->irq_context = 0;
#endif
@@ -3000,9 +3017,9 @@ __lock_contended(struct lockdep_map *loc
found_it:
hlock->waittime_stamp = sched_clock();
- point = lock_contention_point(hlock->class, ip);
+ point = lock_contention_point(hlock_class(hlock), ip);
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (point < ARRAY_SIZE(stats->contention_point))
stats->contention_point[point]++;
if (lock->cpu != smp_processor_id())
@@ -3048,7 +3065,7 @@ found_it:
hlock->holdtime_stamp = now;
}
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
lock_time_inc(&stats->read_waittime, waittime);
Index: linux-2.6/kernel/lockdep_internals.h
===================================================================
--- linux-2.6.orig/kernel/lockdep_internals.h
+++ linux-2.6/kernel/lockdep_internals.h
@@ -17,9 +17,6 @@
*/
#define MAX_LOCKDEP_ENTRIES 8192UL
-#define MAX_LOCKDEP_KEYS_BITS 11
-#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
-
#define MAX_LOCKDEP_CHAINS_BITS 14
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
--
^ permalink raw reply [flat|nested] 73+ messages in thread
* [RFC][PATCH 5/7] lockdep: map_acquire
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
` (3 preceding siblings ...)
2008-08-04 13:03 ` [RFC][PATCH 4/7] lockdep: shrink held_lock structure Peter Zijlstra
@ 2008-08-04 13:03 ` Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 6/7] lockdep: lock protection locks Peter Zijlstra
` (4 subsequent siblings)
9 siblings, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
[-- Attachment #1: lockdep-map_acquire.patch --]
[-- Type: text/plain, Size: 4486 bytes --]
Most of the free-standing lock_acquire() usages look remarkably similar; sweep
them into a new helper.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
fs/jbd/transaction.c | 4 ++--
fs/jbd2/transaction.c | 4 ++--
include/linux/lockdep.h | 12 ++++++++++++
kernel/workqueue.c | 24 ++++++++++++------------
4 files changed, 28 insertions(+), 16 deletions(-)
Index: linux-2.6/fs/jbd/transaction.c
===================================================================
--- linux-2.6.orig/fs/jbd/transaction.c
+++ linux-2.6/fs/jbd/transaction.c
@@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journ
goto out;
}
- lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ map_acquire(&handle->h_lockdep_map);
out:
return handle;
@@ -1448,7 +1448,7 @@ int journal_stop(handle_t *handle)
spin_unlock(&journal->j_state_lock);
}
- lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+ map_release(&handle->h_lockdep_map);
jbd_free_handle(handle);
return err;
Index: linux-2.6/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.orig/fs/jbd2/transaction.c
+++ linux-2.6/fs/jbd2/transaction.c
@@ -301,7 +301,7 @@ handle_t *jbd2_journal_start(journal_t *
goto out;
}
- lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ map_acquire(&handle->h_lockdep_map);
out:
return handle;
}
@@ -1279,7 +1279,7 @@ int jbd2_journal_stop(handle_t *handle)
spin_unlock(&journal->j_state_lock);
}
- lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+ map_release(&handle->h_lockdep_map);
jbd2_free_handle(handle);
return err;
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -459,4 +459,16 @@ static inline void print_irqtrace_events
# define rwsem_release(l, n, i) do { } while (0)
#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifdef CONFIG_PROVE_LOCKING
+# define map_acquire(l) lock_acquire(l, 0, 0, 0, 2, _THIS_IP_)
+# else
+# define map_acquire(l) lock_acquire(l, 0, 0, 0, 1, _THIS_IP_)
+# endif
+# define map_release(l) lock_release(l, 1, _THIS_IP_)
+#else
+# define map_acquire(l) do { } while (0)
+# define map_release(l) do { } while (0)
+#endif
+
#endif /* __LINUX_LOCKDEP_H */
Index: linux-2.6/kernel/workqueue.c
===================================================================
--- linux-2.6.orig/kernel/workqueue.c
+++ linux-2.6/kernel/workqueue.c
@@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_wor
BUG_ON(get_wq_data(work) != cwq);
work_clear_pending(work);
- lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ map_acquire(&cwq->wq->lockdep_map);
+ map_acquire(&lockdep_map);
f(work);
- lock_release(&lockdep_map, 1, _THIS_IP_);
- lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+ map_release(&lockdep_map);
+ map_release(&cwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
@@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_st
int cpu;
might_sleep();
- lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&wq->lockdep_map, 1, _THIS_IP_);
+ map_acquire(&wq->lockdep_map);
+ map_release(&wq->lockdep_map);
for_each_cpu_mask_nr(cpu, *cpu_map)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
}
@@ -441,8 +441,8 @@ int flush_work(struct work_struct *work)
if (!cwq)
return 0;
- lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+ map_acquire(&cwq->wq->lockdep_map);
+ map_release(&cwq->wq->lockdep_map);
prev = NULL;
spin_lock_irq(&cwq->lock);
@@ -536,8 +536,8 @@ static void wait_on_work(struct work_str
might_sleep();
- lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&work->lockdep_map, 1, _THIS_IP_);
+ map_acquire(&work->lockdep_map);
+ map_release(&work->lockdep_map);
cwq = get_wq_data(work);
if (!cwq)
@@ -872,8 +872,8 @@ static void cleanup_workqueue_thread(str
if (cwq->thread == NULL)
return;
- lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+ map_acquire(&cwq->wq->lockdep_map);
+ map_release(&cwq->wq->lockdep_map);
flush_cpu_workqueue(cwq);
/*
--
* [RFC][PATCH 6/7] lockdep: lock protection locks
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
` (4 preceding siblings ...)
2008-08-04 13:03 ` [RFC][PATCH 5/7] lockdep: map_acquire Peter Zijlstra
@ 2008-08-04 13:03 ` Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
` (3 subsequent siblings)
9 siblings, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
[-- Attachment #1: lockdep-nest-lock.patch --]
[-- Type: text/plain, Size: 9432 bytes --]
On Fri, 2008-08-01 at 16:26 -0700, Linus Torvalds wrote:
> On Fri, 1 Aug 2008, David Miller wrote:
> >
> > Taking more than a few locks of the same class at once is bad
> > news and it's better to find an alternative method.
>
> It's not always wrong.
>
> If you can guarantee that anybody that takes more than one lock of a
> particular class will always take a single top-level lock _first_, then
> that's all good. You can obviously screw up and take the same lock _twice_
> (which will deadlock), but at least you cannot get into ABBA situations.
>
> So maybe the right thing to do is to just teach lockdep about "lock
> protection locks". That would have solved the multi-queue issues for
> networking too - all the actual network drivers would still have taken
> just their single queue lock, but the one case that needs to take all of
> them would have taken a separate top-level lock first.
>
> Never mind that the multi-queue locks were always taken in the same order:
> it's never wrong to just have some top-level serialization, and anybody
> who needs to take <n> locks might as well do <n+1>, because they sure as
> hell aren't going to be on _any_ fastpaths.
>
> So the simplest solution really sounds like just teaching lockdep about
> that one special case. It's not "nesting" exactly, although it's obviously
> related to it.
Do as Linus suggested. The lock protection lock is called nest_lock.
Note that we still have the MAX_LOCK_DEPTH (48) limit to consider, so anything
that spills over that is still up shit creek.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/lockdep.h | 34 ++++++++++++++++++----------------
include/linux/rcuclassic.h | 2 +-
kernel/lockdep.c | 26 +++++++++++++++++++++-----
3 files changed, 40 insertions(+), 22 deletions(-)
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c
+++ linux-2.6/kernel/lockdep.c
@@ -1372,18 +1372,32 @@ check_deadlock(struct task_struct *curr,
struct lockdep_map *next_instance, int read)
{
struct held_lock *prev;
+ struct held_lock *nest = NULL;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
prev = curr->held_locks + i;
+
+ if (prev->instance == next->nest_lock)
+ nest = prev;
+
if (hlock_class(prev) != hlock_class(next))
continue;
+
/*
* Allow read-after-read recursion of the same
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
if ((read == 2) && prev->read)
return 2;
+
+ /*
+ * We're holding the nest_lock, which serializes this lock's
+ * nesting behaviour.
+ */
+ if (nest)
+ return 2;
+
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -2507,7 +2521,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
*/
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
- unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -2566,6 +2580,7 @@ static int __lock_acquire(struct lockdep
hlock->class_idx = class - lock_classes + 1;
hlock->acquire_ip = ip;
hlock->instance = lock;
+ hlock->nest_lock = nest_lock;
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
@@ -2717,7 +2732,7 @@ found_it:
if (!__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->acquire_ip))
+ hlock->nest_lock, hlock->acquire_ip))
return 0;
}
@@ -2778,7 +2793,7 @@ found_it:
if (!__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->acquire_ip))
+ hlock->nest_lock, hlock->acquire_ip))
return 0;
}
@@ -2915,7 +2930,8 @@ EXPORT_SYMBOL_GPL(lock_set_subclass);
* and also avoid lockdep recursion:
*/
void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
- int trylock, int read, int check, unsigned long ip)
+ int trylock, int read, int check,
+ struct lockdep_map *nest_lock, unsigned long ip)
{
unsigned long flags;
@@ -2930,7 +2946,7 @@ void lock_acquire(struct lockdep_map *lo
current->lockdep_recursion = 1;
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), ip);
+ irqs_disabled_flags(flags), nest_lock, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -211,6 +211,7 @@ struct held_lock {
u64 prev_chain_key;
unsigned long acquire_ip;
struct lockdep_map *instance;
+ struct lockdep_map *nest_lock;
#ifdef CONFIG_LOCK_STAT
u64 waittime_stamp;
u64 holdtime_stamp;
@@ -297,7 +298,8 @@ extern void lockdep_init_map(struct lock
* 2: full validation
*/
extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
- int trylock, int read, int check, unsigned long ip);
+ int trylock, int read, int check,
+ struct lockdep_map *nest_lock, unsigned long ip);
extern void lock_release(struct lockdep_map *lock, int nested,
unsigned long ip);
@@ -319,7 +321,7 @@ static inline void lockdep_on(void)
{
}
-# define lock_acquire(l, s, t, r, c, i) do { } while (0)
+# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
# define lock_release(l, n, i) do { } while (0)
# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_init() do { } while (0)
@@ -407,9 +409,9 @@ static inline void print_irqtrace_events
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_PROVE_LOCKING
-# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
+# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
# else
-# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
+# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
# endif
# define spin_release(l, n, i) lock_release(l, n, i)
#else
@@ -419,11 +421,11 @@ static inline void print_irqtrace_events
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_PROVE_LOCKING
-# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
-# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, i)
+# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
+# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, NULL, i)
# else
-# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
-# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, i)
+# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
+# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, NULL, i)
# endif
# define rwlock_release(l, n, i) lock_release(l, n, i)
#else
@@ -434,9 +436,9 @@ static inline void print_irqtrace_events
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_PROVE_LOCKING
-# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
+# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
# else
-# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
+# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
# endif
# define mutex_release(l, n, i) lock_release(l, n, i)
#else
@@ -446,11 +448,11 @@ static inline void print_irqtrace_events
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_PROVE_LOCKING
-# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
-# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, i)
+# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
+# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i)
# else
-# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
-# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, i)
+# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
+# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i)
# endif
# define rwsem_release(l, n, i) lock_release(l, n, i)
#else
@@ -461,9 +463,9 @@ static inline void print_irqtrace_events
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_PROVE_LOCKING
-# define map_acquire(l) lock_acquire(l, 0, 0, 0, 2, _THIS_IP_)
+# define map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_)
# else
-# define map_acquire(l) lock_acquire(l, 0, 0, 0, 1, _THIS_IP_)
+# define map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_)
# endif
# define map_release(l) lock_release(l, 1, _THIS_IP_)
#else
Index: linux-2.6/include/linux/rcuclassic.h
===================================================================
--- linux-2.6.orig/include/linux/rcuclassic.h
+++ linux-2.6/include/linux/rcuclassic.h
@@ -117,7 +117,7 @@ extern int rcu_needs_cpu(int cpu);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern struct lockdep_map rcu_lock_map;
# define rcu_read_acquire() \
- lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
+ lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
#else
# define rcu_read_acquire() do { } while (0)
--
* [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
` (5 preceding siblings ...)
2008-08-04 13:03 ` [RFC][PATCH 6/7] lockdep: lock protection locks Peter Zijlstra
@ 2008-08-04 13:03 ` Peter Zijlstra
2008-08-04 14:07 ` Roland Dreier
` (2 more replies)
2008-08-07 11:25 ` [RFC][PATCH 8/7] lockdep: annotate mm_take_all_locks() Peter Zijlstra
` (2 subsequent siblings)
9 siblings, 3 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 13:03 UTC (permalink / raw)
To: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
a.p.zijlstra, linux-kernel, davej
[-- Attachment #1: lockdep-spin_lock_nest.patch --]
[-- Type: text/plain, Size: 2737 bytes --]
Expose the new lock protection lock.
This can be used to annotate places where we take multiple locks of the
same class and avoid deadlocks by always taking another (top-level) lock
first.
NOTE: we're still bound to the MAX_LOCK_DEPTH (48) limit.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/lockdep.h | 2 ++
include/linux/spinlock.h | 6 ++++++
kernel/spinlock.c | 11 +++++++++++
3 files changed, 19 insertions(+)
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -410,8 +410,10 @@ static inline void print_irqtrace_events
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_PROVE_LOCKING
# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
+# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i)
# else
# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
+# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, NULL, i)
# endif
# define spin_release(l, n, i) lock_release(l, n, i)
#else
Index: linux-2.6/include/linux/spinlock.h
===================================================================
--- linux-2.6.orig/include/linux/spinlock.h
+++ linux-2.6/include/linux/spinlock.h
@@ -183,8 +183,14 @@ do { \
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass)
+# define spin_lock_nest_lock(lock, nest_lock) \
+ do { \
+ typecheck(struct lockdep_map *, &nest_lock->dep_map); \
+ _spin_lock_nest_lock(lock, &nest_lock->dep_map); \
+ } while (0)
#else
# define spin_lock_nested(lock, subclass) _spin_lock(lock)
+# define spin_lock_nest_lock(lock, nest_lock) _spin_lock(lock)
#endif
#define write_lock(lock) _write_lock(lock)
Index: linux-2.6/kernel/spinlock.c
===================================================================
--- linux-2.6.orig/kernel/spinlock.c
+++ linux-2.6/kernel/spinlock.c
@@ -292,6 +292,7 @@ void __lockfunc _spin_lock_nested(spinlo
}
EXPORT_SYMBOL(_spin_lock_nested);
+
unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
{
unsigned long flags;
@@ -314,6 +315,16 @@ unsigned long __lockfunc _spin_lock_irqs
EXPORT_SYMBOL(_spin_lock_irqsave_nested);
+void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ preempt_disable();
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+
+EXPORT_SYMBOL(_spin_lock_nest_lock);
+
#endif
void __lockfunc _spin_unlock(spinlock_t *lock)
--
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 13:03 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
@ 2008-08-04 14:07 ` Roland Dreier
2008-08-04 14:19 ` Peter Zijlstra
2008-08-04 18:06 ` Jeremy Fitzhardinge
2008-08-07 11:25 ` Peter Zijlstra
2 siblings, 1 reply; 73+ messages in thread
From: Roland Dreier @ 2008-08-04 14:07 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel, davej
> NOTE: we're still bound to the MAX_LOCK_DEPTH (48) limit.
A) It is probably a good idea to put this in a comment somewhere near
where spin_lock_nest_lock() is declared.
B) It is probably a good idea to write that comment in such a way that
dumb people like me understand what the limit is. The sentence I
quoted above is too telegraphic for me to get. Is the point that no
more than 48 spinlocks can be held at once, even if the inner locks
are protected by some top level lock? Or do you mean something else?
- R.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 14:07 ` Roland Dreier
@ 2008-08-04 14:19 ` Peter Zijlstra
2008-08-04 14:26 ` Roland Dreier
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 14:19 UTC (permalink / raw)
To: Roland Dreier
Cc: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel, davej
On Mon, 2008-08-04 at 07:07 -0700, Roland Dreier wrote:
> > NOTE: we're still bound to the MAX_LOCK_DEPTH (48) limit.
>
> A) It is probably a good idea to put this in a comment somewhere near
> where spin_lock_nest_lock() is declared.
It's not particular to this annotation - it's true for anything lockdep covers.
> B) It is probably a good idea to write that comment in such a way that
> dumb people like me understand what the limit is. The sentence I
> quoted above is too telegraphic for me to get. Is the point that no
> more than 48 spinlocks can be held at once, even if the inner locks
> are protected by some top level lock? Or do you mean something else?
No more than 48 locks (mutexes, rwlocks, spinlock, RCU, everything
covered by lockdep) held by any one code-path; including nested
interrupt contexts.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 14:19 ` Peter Zijlstra
@ 2008-08-04 14:26 ` Roland Dreier
2008-08-04 14:32 ` Peter Zijlstra
0 siblings, 1 reply; 73+ messages in thread
From: Roland Dreier @ 2008-08-04 14:26 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel, davej
> No more than 48 locks (mutexes, rwlocks, spinlock, RCU, everything
> covered by lockdep) held by any one code-path; including nested
> interrupt contexts.
Does that mean that something like the new mm_take_all_locks() operation
is going to explode if someone tries to use it with lockdep on?
- R.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 14:26 ` Roland Dreier
@ 2008-08-04 14:32 ` Peter Zijlstra
2008-08-04 14:53 ` Dave Jones
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 14:32 UTC (permalink / raw)
To: Roland Dreier
Cc: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel, davej
On Mon, 2008-08-04 at 07:26 -0700, Roland Dreier wrote:
> > No more than 48 locks (mutexes, rwlocks, spinlock, RCU, everything
> > covered by lockdep) held by any one code-path; including nested
> > interrupt contexts.
>
> Does that mean that something like the new mm_take_all_locks() operation
> is going to explode if someone tries to use it with lockdep on?
Gah - yes, clearly nobody tried this.. :-/
Just looking at the code it will not only run into this limit, but it
would warn about recursion on the second file/anon vma due to utter lack
of annotation.
Why are people still developing without lockdep?
/me sad
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 14:32 ` Peter Zijlstra
@ 2008-08-04 14:53 ` Dave Jones
2008-08-04 14:56 ` Peter Zijlstra
0 siblings, 1 reply; 73+ messages in thread
From: Dave Jones @ 2008-08-04 14:53 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Roland Dreier, Linus Torvalds, David Miller, jeremy, hugh, mingo,
akpm, linux-kernel
On Mon, Aug 04, 2008 at 04:32:12PM +0200, Peter Zijlstra wrote:
> On Mon, 2008-08-04 at 07:26 -0700, Roland Dreier wrote:
> > > No more than 48 locks (mutexes, rwlocks, spinlock, RCU, everything
> > > covered by lockdep) held by any one code-path; including nested
> > > interrupt contexts.
> >
> > Does that mean that something like the new mm_take_all_locks() operation
> > is going to explode if someone tries to use it with lockdep on?
>
> Gah - yes, clearly nobody tried this.. :-/
>
> Just looking at the code it will not only run into this limit, but it
> would warn about recursion on the second file/anon vma due to utter lack
> of annotation.
>
> Why are people still developing without lockdep?
More puzzling is why this hasn't triggered in the Fedora rawhide kernels,
which do have lockdep enabled.
Dave
--
http://www.codemonkey.org.uk
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 14:53 ` Dave Jones
@ 2008-08-04 14:56 ` Peter Zijlstra
2008-08-04 16:26 ` Andrea Arcangeli
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 14:56 UTC (permalink / raw)
To: Dave Jones
Cc: Roland Dreier, Linus Torvalds, David Miller, jeremy, hugh, mingo,
akpm, linux-kernel, Andrea Arcangeli
On Mon, 2008-08-04 at 10:53 -0400, Dave Jones wrote:
> On Mon, Aug 04, 2008 at 04:32:12PM +0200, Peter Zijlstra wrote:
> > On Mon, 2008-08-04 at 07:26 -0700, Roland Dreier wrote:
> > > > No more than 48 locks (mutexes, rwlocks, spinlock, RCU, everything
> > > > covered by lockdep) held by any one code-path; including nested
> > > > interrupt contexts.
> > >
> > > Does that mean that something like the new mm_take_all_locks() operation
> > > is going to explode if someone tries to use it with lockdep on?
> >
> > Gah - yes, clearly nobody tried this.. :-/
> >
> > Just looking at the code it will not only run into this limit, but it
> > would warn about recursion on the second file/anon vma due to utter lack
> > of annotation.
> >
> > Why are people still developing without lockdep?
>
> More puzzling, is why hasn't this triggered in the Fedora rawhide kernels,
> which do have lockdep enabled.
My guess is that the kvm thing attaches before there are any vma, and
leaves after all the vma are gone. So it would never actually trigger.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 14:56 ` Peter Zijlstra
@ 2008-08-04 16:26 ` Andrea Arcangeli
2008-08-04 16:38 ` Peter Zijlstra
0 siblings, 1 reply; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 16:26 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
On Mon, Aug 04, 2008 at 04:56:03PM +0200, Peter Zijlstra wrote:
> My guess is that the kvm thing attaches before there are any vma, and
> leaves after all the vma are gone. So it would never actually trigger.
Yes, lockdep seems to be fine with kvm in kernel mainline in my
current and past testing. Andrew asked me to check this long ago.
vmx ~ # zgrep LOCKDEP /proc/config.gz
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_LOCKDEP=y
CONFIG_DEBUG_LOCKDEP=y
CPA self-test:
4k 26112 large 1997 gb 0 x 2652[ffff880000000000-ffff8800bffff000]
miss 262144
4k 186880 large 1683 gb 0 x 43021[ffff880000000000-ffff8800bffff000]
miss 262144
4k 186880 large 1683 gb 0 x 43021[ffff880000000000-ffff8800bffff000]
miss 262144
ok.
loaded kvm module ()
apic write: bad size=1 fee00030
Ignoring de-assert INIT to vcpu 0
Ignoring de-assert INIT to vcpu 0
kvm: emulating exchange as write
apic write: bad size=1 fee00030
Ignoring de-assert INIT to vcpu 0
Ignoring de-assert INIT to vcpu 0
apic write: bad size=1 fee00030
Ignoring de-assert INIT to vcpu 0
Ignoring de-assert INIT to vcpu 0
apic write: bad size=1 fee00030
Ignoring de-assert INIT to vcpu 0
apic write: bad size=1 fee00030
Ignoring de-assert INIT to vcpu 0
apic write: bad size=1 fee00030
Ignoring de-assert INIT to vcpu 0
Ignoring de-assert INIT to vcpu 0
Ignoring de-assert INIT to vcpu 0
I can't see lockdep errors in dmesg when starting one or more VMs in a
loop (all run on a quad-core).
GRU is likely the same.
The only real place where lockdep is unusable in my experience is
preempt-RT; it grinds to a halt during boot on an 8-way before
reaching the shell.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 16:26 ` Andrea Arcangeli
@ 2008-08-04 16:38 ` Peter Zijlstra
2008-08-04 17:27 ` Andrea Arcangeli
2008-08-04 21:32 ` David Miller
0 siblings, 2 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 16:38 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
On Mon, 2008-08-04 at 18:26 +0200, Andrea Arcangeli wrote:
> On Mon, Aug 04, 2008 at 04:56:03PM +0200, Peter Zijlstra wrote:
> > My guess is that the kvm thing attaches before there are any vma, and
> > leaves after all the vma are gone. So it would never actually trigger.
>
> Yes, lockdep seems to be fine with kvm in kernel mainline in my
> current and past testing. Andrew asked me to check this long ago.
>
> vmx ~ # zgrep LOCKDEP /proc/config.gz
> CONFIG_LOCKDEP_SUPPORT=y
> CONFIG_LOCKDEP=y
> CONFIG_DEBUG_LOCKDEP=y
You also need CONFIG_PROVE_LOCKING, but I'll assume that's set too.
> CPA self-test:
> 4k 26112 large 1997 gb 0 x 2652[ffff880000000000-ffff8800bffff000]
> miss 262144
> 4k 186880 large 1683 gb 0 x 43021[ffff880000000000-ffff8800bffff000]
> miss 262144
> 4k 186880 large 1683 gb 0 x 43021[ffff880000000000-ffff8800bffff000]
> miss 262144
> ok.
> loaded kvm module ()
> apic write: bad size=1 fee00030
> Ignoring de-assert INIT to vcpu 0
> Ignoring de-assert INIT to vcpu 0
> kvm: emulating exchange as write
> apic write: bad size=1 fee00030
> Ignoring de-assert INIT to vcpu 0
> Ignoring de-assert INIT to vcpu 0
> apic write: bad size=1 fee00030
> Ignoring de-assert INIT to vcpu 0
> Ignoring de-assert INIT to vcpu 0
> apic write: bad size=1 fee00030
> Ignoring de-assert INIT to vcpu 0
> apic write: bad size=1 fee00030
> Ignoring de-assert INIT to vcpu 0
> apic write: bad size=1 fee00030
> Ignoring de-assert INIT to vcpu 0
> Ignoring de-assert INIT to vcpu 0
> Ignoring de-assert INIT to vcpu 0
>
> I can't see lockdep errors in dmesg when starting one or more VMs in a
> loop (all run on a quad-core).
Dave Jones just handed me:
https://bugzilla.redhat.com/show_bug.cgi?id=457779
> GRU is likely the same.
>
> The only real place where lockdep is unusable in my experience is
> preempt-RT; it grinds to a halt during boot on an 8-way before
> reaching the shell.
David Miller just did a patch that might fix that.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 16:38 ` Peter Zijlstra
@ 2008-08-04 17:27 ` Andrea Arcangeli
2008-08-04 17:46 ` Andrea Arcangeli
2008-08-04 18:48 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
2008-08-04 21:32 ` David Miller
1 sibling, 2 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 17:27 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
On Mon, Aug 04, 2008 at 06:38:55PM +0200, Peter Zijlstra wrote:
> You also need CONFIG_PROVE_LOCKING, but I'll assume that's set too.
Sorry, that wasn't the problem; my current testing passed because of
another error... to test this report I rebuilt a kvm configured for
RHEL, not for mainline, so it wasn't the right test...
When "long ago" I tested that this was working fine (actually when
Andrew asked me), I guess lockdep was working because the
implementation with the vmalloc array was slightly different,
otherwise I don't know. I'm fairly certain that it worked fine at some
point, and I didn't expect the refactoring to generate false positives.
> Dave Jones just handed me:
>
> https://bugzilla.redhat.com/show_bug.cgi?id=457779
I can reproduce this now yes after a 'make sync'.
Obviously it's a bug in lockdep that it trips over this: if lockdep
were right the kernel would deadlock, whereas this is just a false
positive and everything runs fine.
I assume it can't understand that the spinlock addresses are different
(I think it uses the address as a key only for static locks), so I
wonder if you could call print_deadlock_bug() from the failure path of
the spinlock to solve this?
Do things like double_rq_lock work just because rq1 and rq2 don't
have the same name, unlike my case where all the locks are called "mapping"?
> David Miller just did a patch that might fix that.
Wow, cool; after 11 months I had lost any hope that lockdep could ever
work in that environment... Was it an actual bug, or is this some way
to lower the complexity of the graph build?
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 17:27 ` Andrea Arcangeli
@ 2008-08-04 17:46 ` Andrea Arcangeli
2008-08-04 17:57 ` [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks Andrea Arcangeli
2008-08-04 18:48 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
1 sibling, 1 reply; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 17:46 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
From: Andrea Arcangeli <andrea@qumranet.com>
Lockdep can't recognize that the spinlocks are at different addresses,
so using trylock avoids lockdep generating false positives. Once
lockdep is fixed this change can and should be reverted.
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
---
On Mon, Aug 04, 2008 at 07:27:28PM +0200, Andrea Arcangeli wrote:
> I can reproduce this now yes after a 'make sync'.
(btw, it's not that I forgot to sync; I previously synced against the
wrong source tree because it was in the bash history, it's a bit
complicated to explain)
> I assume it can't understand the spinlock address is different (I
> think it uses the address as key only for static locks), so I wonder
> if you could call print_deadlock_bug() from the failure path of the
> spinlock to solve this?
In the meantime (as I doubt lockdep will get fixed any time soon) this
will work around it.
diff -r 3469dce61df1 mm/mmap.c
--- a/mm/mmap.c Tue Jul 29 20:01:28 2008 +0200
+++ b/mm/mmap.c Mon Aug 04 19:41:53 2008 +0200
@@ -2279,8 +2279,13 @@ static void vm_lock_anon_vma(struct anon
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
+ *
+ * spin_lock would confuse lockdep who can't
+ * differentiate between the 'mapping' always changing
+ * address.
*/
- spin_lock(&anon_vma->lock);
+ while (!spin_trylock(&anon_vma->lock))
+ cpu_relax();
/*
* We can safely modify head.next after taking the
* anon_vma->lock. If some other vma in this mm shares
@@ -2310,7 +2315,13 @@ static void vm_lock_mapping(struct addre
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- spin_lock(&mapping->i_mmap_lock);
+ /*
+ * spin_lock would confuse lockdep who can't
+ * differentiate between the 'mapping' always changing
+ * address.
+ */
+ while (!spin_trylock(&mapping->i_mmap_lock))
+ cpu_relax();
}
}
* [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 17:46 ` Andrea Arcangeli
@ 2008-08-04 17:57 ` Andrea Arcangeli
2008-08-04 18:48 ` Peter Zijlstra
0 siblings, 1 reply; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 17:57 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
From: Andrea Arcangeli <andrea@qumranet.com>
Lockdep can't recognize that the spinlocks are at different addresses,
so using trylock in a loop is one way to avoid lockdep generating
false positives. Once lockdep is fixed this change can and should be
reverted.
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
---
Resubmitting because I didn't update the subject, and I improved the
comments too. This is mostly to show that it's a bug in lockdep and
can be trivially worked around without having to fix lockdep for real;
any superior solution to this hack is more than welcome and recommended.
diff -r 3469dce61df1 mm/mmap.c
--- a/mm/mmap.c Tue Jul 29 20:01:28 2008 +0200
+++ b/mm/mmap.c Mon Aug 04 19:54:27 2008 +0200
@@ -2279,8 +2279,12 @@ static void vm_lock_anon_vma(struct anon
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
+ *
+ * spin_lock would confuse lockdep who doesn't notice
+ * the 'anon_vma' always changing address.
*/
- spin_lock(&anon_vma->lock);
+ while (!spin_trylock(&anon_vma->lock))
+ cpu_relax();
/*
* We can safely modify head.next after taking the
* anon_vma->lock. If some other vma in this mm shares
@@ -2310,7 +2314,12 @@ static void vm_lock_mapping(struct addre
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- spin_lock(&mapping->i_mmap_lock);
+ /*
+ * spin_lock would confuse lockdep who doesn't notice
+ * the 'mapping' always changing address.
+ */
+ while (!spin_trylock(&mapping->i_mmap_lock))
+ cpu_relax();
}
}
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 13:03 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
2008-08-04 14:07 ` Roland Dreier
@ 2008-08-04 18:06 ` Jeremy Fitzhardinge
2008-08-04 18:54 ` Peter Zijlstra
2008-08-07 11:25 ` Peter Zijlstra
2 siblings, 1 reply; 73+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-04 18:06 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, hugh, mingo, akpm, linux-kernel,
davej
Peter Zijlstra wrote:
> Expose the new lock protection lock.
>
> This can be used to annotate places where we take multiple locks of the
> same class and avoid deadlocks by always taking another (top-level) lock
> first.
>
OK, so the expected usage is:
spin_lock(&outer_lock);
/* take in any order */
spin_lock_nest_lock(&inner[0], &outer_lock);
spin_lock_nest_lock(&inner[2], &outer_lock);
spin_lock_nest_lock(&inner[1], &outer_lock);
...
?
And it's OK to
1. take inner locks one at a time without holding the outer lock
2. use plain spin_lock on inner locks when you're taking them one at
a time, and
3. release the outer lock before releasing the inner locks
but it's not OK to try to use different outer locks for a given inner lock.
Yes?
J
> NOTE: we're still bound to the MAX_LOCK_DEPTH (48) limit.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
> include/linux/lockdep.h | 2 ++
> include/linux/spinlock.h | 6 ++++++
> kernel/spinlock.c | 11 +++++++++++
> 3 files changed, 19 insertions(+)
>
> Index: linux-2.6/include/linux/lockdep.h
> ===================================================================
> --- linux-2.6.orig/include/linux/lockdep.h
> +++ linux-2.6/include/linux/lockdep.h
> @@ -410,8 +410,10 @@ static inline void print_irqtrace_events
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> # ifdef CONFIG_PROVE_LOCKING
> # define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
> +# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i)
> # else
> # define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
> +# define spin_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, NULL, i)
> # endif
> # define spin_release(l, n, i) lock_release(l, n, i)
> #else
> Index: linux-2.6/include/linux/spinlock.h
> ===================================================================
> --- linux-2.6.orig/include/linux/spinlock.h
> +++ linux-2.6/include/linux/spinlock.h
> @@ -183,8 +183,14 @@ do { \
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> # define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass)
> +# define spin_lock_nest_lock(lock, nest_lock) \
> + do { \
> + typecheck(struct lockdep_map *, &nest_lock->dep_map); \
> + _spin_lock_nest_lock(lock, &nest_lock->dep_map); \
> + } while (0)
> #else
> # define spin_lock_nested(lock, subclass) _spin_lock(lock)
> +# define spin_lock_nest_lock(lock, nest_lock) _spin_lock(lock)
> #endif
>
> #define write_lock(lock) _write_lock(lock)
> Index: linux-2.6/kernel/spinlock.c
> ===================================================================
> --- linux-2.6.orig/kernel/spinlock.c
> +++ linux-2.6/kernel/spinlock.c
> @@ -292,6 +292,7 @@ void __lockfunc _spin_lock_nested(spinlo
> }
>
> EXPORT_SYMBOL(_spin_lock_nested);
> +
> unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
> {
> unsigned long flags;
> @@ -314,6 +315,16 @@ unsigned long __lockfunc _spin_lock_irqs
>
> EXPORT_SYMBOL(_spin_lock_irqsave_nested);
>
> +void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
> + struct lockdep_map *nest_lock)
> +{
> + preempt_disable();
> + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
> + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
> +}
> +
> +EXPORT_SYMBOL(_spin_lock_nest_lock);
> +
> #endif
>
> void __lockfunc _spin_unlock(spinlock_t *lock)
>
>
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 17:27 ` Andrea Arcangeli
2008-08-04 17:46 ` Andrea Arcangeli
@ 2008-08-04 18:48 ` Peter Zijlstra
1 sibling, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 18:48 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
On Mon, 2008-08-04 at 19:27 +0200, Andrea Arcangeli wrote:
> On Mon, Aug 04, 2008 at 06:38:55PM +0200, Peter Zijlstra wrote:
> > You also need CONFIG_PROVE_LOCKING, but I'll assume that's set too.
>
> Sorry, that wasn't the problem, but my current testing passed because
> of another error... to test this report I rebuilt a kvm configured for
> rhel not for mainline so it wasn't the right test...
>
> When "long ago" I tested that this was working fine (actually when
> Andrew asked me), I guess lockdep was working because the
> implementation with the vmalloc array was slightly different,
> otherwise I don't know. I'm fairly certain that it worked fine at some
> point, and I didn't expect the refactoring to generate false positives.
>
> > Dave Jones just handed me:
> >
> > https://bugzilla.redhat.com/show_bug.cgi?id=457779
>
> I can reproduce this now yes after a 'make sync'.
>
> Obviously this is a bug in lockdep that it trips over this otherwise
> if lockdep was right the kernel should deadlock while this is just a
> false positive and everything runs fine.
>
> I assume it can't understand the spinlock address is different (I
> think it uses the address as key only for static locks), so I wonder
> if you could call print_deadlock_bug() from the failure path of the
> spinlock to solve this?
>
> Do things like double_rq_lock works just because rq1 and rq2 don't
> have the same name like in my case where all locks are called "mapping"->?
Lockdep doesn't look at individual lock instances, but instead works on
lock classes. The rq locks used to have a separate class each, but the
third patch in this series changes that.
The advantage of not working on lock instances is that the dependency
graph becomes much smaller and problems are reported much earlier; the
down-side is that you have to augment the information a little.
Normally locks are grouped in classes based on their init site; eg.
every instance initialized by a particular spin_lock_init() will belong
to the same class.
Besides this grouping, you can use things like spin_lock_nested(), which
can be used to annotate simple recursion (up to 8 sub-classes).
Then there is the possibility to explicitly create classes and assign
locks to classes using lockdep_set_class().
Either way - there is a limit of MAX_LOCK_DEPTH (48) locks that can be
tracked per task (including nesting interrupts).
Making that dynamic is rather challenging - as it would involve
allocating memory while potentially holding allocator locks.
> > David Miller just did a patch that might fix that.
>
> Woow cool, after 11 months I lost any hope that lockdep could ever
> work in that environment... Was it an actual bug or is this some way
> to lower the complexity of the graph build?
He reduced the search complexity of the graph, look at the first patch
in this series.
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 17:57 ` [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks Andrea Arcangeli
@ 2008-08-04 18:48 ` Peter Zijlstra
2008-08-04 18:56 ` Roland Dreier
2008-08-04 20:15 ` Andrea Arcangeli
0 siblings, 2 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 18:48 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
On Mon, 2008-08-04 at 19:57 +0200, Andrea Arcangeli wrote:
> From: Andrea Arcangeli <andrea@qumranet.com>
>
> Lockdep can't recognize if spinlocks are at a different address. So
> using trylock in a loop is one way to avoid lockdep to generate false
> positives. After lockdep will be fixed this change can and should be
> reverted.
>
> Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
NAK; come on, you didn't even bother to look at the available
annotations..
> ---
>
> Resubmit because I didn't update the subject and so I improved the
> comments too. This is mostly to show it's a bug in lockdep and it can
> be trivially worked around without having to fix lockdep for real, any
> superior solution to this hack is more than welcome and recommended.
>
> diff -r 3469dce61df1 mm/mmap.c
> --- a/mm/mmap.c Tue Jul 29 20:01:28 2008 +0200
> +++ b/mm/mmap.c Mon Aug 04 19:54:27 2008 +0200
> @@ -2279,8 +2279,12 @@ static void vm_lock_anon_vma(struct anon
> /*
> * The LSB of head.next can't change from under us
> * because we hold the mm_all_locks_mutex.
> + *
> + * spin_lock would confuse lockdep who doesn't notice
> + * the 'anon_vma' always changing address.
> */
> - spin_lock(&anon_vma->lock);
> + while (!spin_trylock(&anon_vma->lock))
> + cpu_relax();
> /*
> * We can safely modify head.next after taking the
> * anon_vma->lock. If some other vma in this mm shares
> @@ -2310,7 +2314,12 @@ static void vm_lock_mapping(struct addre
> */
> if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
> BUG();
> - spin_lock(&mapping->i_mmap_lock);
> + /*
> + * spin_lock would confuse lockdep who doesn't notice
> + * the 'mapping' always changing address.
> + */
> + while (!spin_trylock(&mapping->i_mmap_lock))
> + cpu_relax();
> }
> }
>
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 18:06 ` Jeremy Fitzhardinge
@ 2008-08-04 18:54 ` Peter Zijlstra
2008-08-04 19:26 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 18:54 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Linus Torvalds, David Miller, hugh, mingo, akpm, linux-kernel,
davej
On Mon, 2008-08-04 at 11:06 -0700, Jeremy Fitzhardinge wrote:
> Peter Zijlstra wrote:
> > Expose the new lock protection lock.
> >
> > This can be used to annotate places where we take multiple locks of the
> > same class and avoid deadlocks by always taking another (top-level) lock
> > first.
> >
>
> OK, so the expected usage is:
>
> spin_lock(&outer_lock);
> /* take in any order */
> spin_lock_nest_lock(&inner[0], &outer_lock);
> spin_lock_nest_lock(&inner[2], &outer_lock);
> spin_lock_nest_lock(&inner[1], &outer_lock);
> ...
>
> ?
Yes (there is no requirement that the outer lock is a spinlock_t, just
that it has a ->dep_map member - so: mutex, rwsem and spinlock will do).
> And it's OK to
>
> 1. take inner locks one at a time without holding the outer lock
Yes
> 2. use plain spin_lock on inner locks when you're taking them one at
> a time, and
Yes
> 3. release the outer lock before releasing the inner locks
Only if you then release the inner locks in the reverse order you took
them - the nested release code (releasing a lock that is not on the
top of the stack) basically pops and pushes all the locks, and the
push will fail if the outer lock has been released.
> but it's not OK to try to use different outer locks for a given inner lock.
It doesn't validate this part - as with most lockdep annotations you can
annotate away real deadlocks.
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 18:48 ` Peter Zijlstra
@ 2008-08-04 18:56 ` Roland Dreier
2008-08-04 19:05 ` Peter Zijlstra
2008-08-04 20:15 ` Andrea Arcangeli
1 sibling, 1 reply; 73+ messages in thread
From: Roland Dreier @ 2008-08-04 18:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Andrea Arcangeli, Dave Jones, Linus Torvalds, David Miller,
jeremy, hugh, mingo, akpm, linux-kernel
> NAK, come-on, you didn't even bother to look at the available
> annotations..
Agree that this trylock-in-a-loop is a horrible hack that should never
be merged.
However I wonder what you think the right way to annotate the
potentially unbounded number of locks taken in mm_take_all_locks() is?
- R.
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 18:56 ` Roland Dreier
@ 2008-08-04 19:05 ` Peter Zijlstra
0 siblings, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 19:05 UTC (permalink / raw)
To: Roland Dreier
Cc: Andrea Arcangeli, Dave Jones, Linus Torvalds, David Miller,
jeremy, hugh, mingo, akpm, linux-kernel
On Mon, 2008-08-04 at 11:56 -0700, Roland Dreier wrote:
> > NAK, come-on, you didn't even bother to look at the available
> > annotations..
>
> Agree that this trylock-in-a-loop is a horrible hack that should never
> be merged.
>
> However I wonder what you think the right way to annotate the
> potentially unbounded number of locks taken in mm_take_all_locks() is?
Good question, one which I don't currently have an answer for - see my
other mail why getting rid of the MAX_LOCK_DEPTH limitation is hard.
I'll give it another serious go tomorrow (if nothing else preempts me).
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 18:54 ` Peter Zijlstra
@ 2008-08-04 19:26 ` Jeremy Fitzhardinge
2008-08-04 19:31 ` Linus Torvalds
0 siblings, 1 reply; 73+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-04 19:26 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, hugh, mingo, akpm, linux-kernel,
davej
Peter Zijlstra wrote:
>> 3. release the outer lock before releasing the inner locks
>>
>
> Only if you then release the inner locks in the reverse order you took
> them - the nested release code (releasing a lock that is not on the top
> of the stack) basically pops and pushes all the locks, the push will
> fail if the outer lock is released.
>
OK. I don't actually need to do this, but I was asking for
completeness. But to clarify, you only need to do the reverse unlock if
you do it after unlocking the outer lock? If you're still holding the
outer lock, you can unlock in any order?
>> but it's not OK to try to use different outer locks for a given inner lock.
>>
>
> It doesn't validate this part - as with most lockdep annotations you can
> annotate away real deadlocks.
>
Hm, OK.
J
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 19:26 ` Jeremy Fitzhardinge
@ 2008-08-04 19:31 ` Linus Torvalds
2008-08-04 19:39 ` Peter Zijlstra
` (2 more replies)
0 siblings, 3 replies; 73+ messages in thread
From: Linus Torvalds @ 2008-08-04 19:31 UTC (permalink / raw)
To: Jeremy Fitzhardinge
Cc: Peter Zijlstra, David Miller, hugh, mingo, akpm, linux-kernel,
davej
On Mon, 4 Aug 2008, Jeremy Fitzhardinge wrote:
>
> OK. I don't actually need to do this, but I was asking for completeness. But
> to clarify, you only need to do the reverse unlock if you do it after
> unlocking the outer lock? If you're still holding the outer lock, you can
> unlock in any order?
Release order should always be totally irrelevant, whether you hold outer
locks or not. Only the order of _getting_ locks matter.
And yes, if there is an outer lock, even the order of getting locks is
irrelevant, as long as anybody who gets more than one inner lock always
holds the outer one.
Linus
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 19:31 ` Linus Torvalds
@ 2008-08-04 19:39 ` Peter Zijlstra
2008-08-04 20:16 ` Jeremy Fitzhardinge
2008-10-08 15:27 ` Steven Rostedt
2 siblings, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 19:39 UTC (permalink / raw)
To: Linus Torvalds
Cc: Jeremy Fitzhardinge, David Miller, hugh, mingo, akpm,
linux-kernel, davej
On Mon, 2008-08-04 at 12:31 -0700, Linus Torvalds wrote:
>
> On Mon, 4 Aug 2008, Jeremy Fitzhardinge wrote:
> >
> > OK. I don't actually need to do this, but I was asking for completeness. But
> > to clarify, you only need to do the reverse unlock if you do it after
> > unlocking the outer lock? If you're still holding the outer lock, you can
> > unlock in any order?
>
> Release order should always be totally irrelevant, whether you hold outer
> locks or not. Only the order of _getting_ locks matter.
>
> And yes, if there is an outer lock, even the order of getting locks is
> irrelevant, as long as anybody who gets more than one inner lock always
> holds the outer one.
I agree, it's just non-trivial to convince lockdep of this. I can give
it a go though.
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 18:48 ` Peter Zijlstra
2008-08-04 18:56 ` Roland Dreier
@ 2008-08-04 20:15 ` Andrea Arcangeli
2008-08-04 20:37 ` Peter Zijlstra
1 sibling, 1 reply; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 20:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel
On Mon, Aug 04, 2008 at 08:48:59PM +0200, Peter Zijlstra wrote:
> On Mon, 2008-08-04 at 19:57 +0200, Andrea Arcangeli wrote:
> > From: Andrea Arcangeli <andrea@qumranet.com>
> >
> > Lockdep can't recognize if spinlocks are at a different address. So
> > using trylock in a loop is one way to avoid lockdep to generate false
> > positives. After lockdep will be fixed this change can and should be
> > reverted.
> >
> > Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
>
> NAK, come-on, you didn't even bother to look at the available
> annotations..
Let's say that when I hear prove-locking my instinct tells me to walk
away as fast as I can. I'll try to explain why I prefer the trylock
loop (which, btw, is the only thing that will hide the lockdep false
positives here; I welcome you to fix it another way. One other way
that comes to mind is to call __raw_spin_lock, which I didn't do
because I don't like those low-level details in common code).
So about prove-locking:
1) in production it's disabled, so when I get bug reports I have to
grab deadlock information as usual (sysrq+t/p or preferably
lkcd/kdump)
2) while coding it's useless as well, because I don't need this thing
to debug and fix deadlocks
3) it only finds bugs after the system is hung, and I can fix it by
other means then
Despite having used this for a while, it has never once been useful to
me; I only ran into false positives and systems grinding to a halt,
preventing debug kernels from running, which reduced the ability to
debug the kernel even more. I'm not aware of anybody else who, while
debugging, fixed bugs thanks to this code. So in my experience this
has only hurt me, and I don't want it ever enabled in anything I deal
with.
If only this had a slight chance to find bugs that don't easily
trigger at runtime it'd be an entirely different matter.
To be clear: my criticism applies only to prove-locking. For example,
I find it very useful that there are other things like the spinlock
debug options that warn if you allocate with GFP_KERNEL while you're
in atomic context (which unfortunately only works with preempt
enabled, for no good reason, since atomic context is tracked without
preempt too, but anyway). The spinlock sleep debugging effectively
finds bugs that would otherwise go unnoticed. Let's say I'm only
interested in stuff that shows bugs that would otherwise go unnoticed;
everything else I can debug by myself without prove-locking printks in
the logs.
So this can only help if you aren't capable of running and decoding
sysrq+p/t. And it can't help in production either, because it has to
be disabled there (and production would be the only place where
explaining how to grab a sysrq+p/t is a problem; normally it's simpler
to grab kdump/lkcd).
Furthermore, I don't like the slowdown (up to grinding the system to a
halt before these fixes, and still not so fast) from such an
unnecessary feature while debugging. This also explains why, after
checking that the patch was OK when Andrew asked, I disabled
prove-locking pretty quickly.
So perhaps I misunderstand how lockdep works, in which case you're
welcome to correct me; otherwise I'd really like to know if anybody
really felt some bug couldn't have been fixed in equal time without it
(and I mean excluding production, where obviously this is useless as
it can't possibly be enabled). Understanding the implications of the
deadlock is surely slower than comparing the disassembly of the
sysrq+p/t (which 99% of the time has to be done anyway, because the
report comes from a production system; if the bug were reproducible it
would have been tripped before it hit production).
I see it as a didactic, theoretical exercise, and I hope this explains
why I'd prefer the trylock loop any day over a more complex
prove-locking-specific API polluting the common code. And if it were
up to me, I'd remove prove-locking from the kernel the moment it
started to pollute the common code for zero debugging advantage.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 19:31 ` Linus Torvalds
2008-08-04 19:39 ` Peter Zijlstra
@ 2008-08-04 20:16 ` Jeremy Fitzhardinge
2008-10-08 15:27 ` Steven Rostedt
2 siblings, 0 replies; 73+ messages in thread
From: Jeremy Fitzhardinge @ 2008-08-04 20:16 UTC (permalink / raw)
To: Linus Torvalds
Cc: Peter Zijlstra, David Miller, hugh, mingo, akpm, linux-kernel,
davej
Linus Torvalds wrote:
> On Mon, 4 Aug 2008, Jeremy Fitzhardinge wrote:
>
>> OK. I don't actually need to do this, but I was asking for completeness. But
>> to clarify, you only need to do the reverse unlock if you do it after
>> unlocking the outer lock? If you're still holding the outer lock, you can
>> unlock in any order?
>>
>
> Release order should always be totally irrelevant, whether you hold outer
> locks or not. Only the order of _getting_ locks matter.
>
Right. But Peter said lockdep was fussy about it for some reason. If
the point of the exercise is to eliminate spurious warnings, it would be
nice to avoid new ones in the process...
J
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 20:15 ` Andrea Arcangeli
@ 2008-08-04 20:37 ` Peter Zijlstra
2008-08-04 21:09 ` Andrea Arcangeli
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-04 20:37 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel, arjan
On Mon, 2008-08-04 at 22:15 +0200, Andrea Arcangeli wrote:
> On Mon, Aug 04, 2008 at 08:48:59PM +0200, Peter Zijlstra wrote:
> > On Mon, 2008-08-04 at 19:57 +0200, Andrea Arcangeli wrote:
> > > From: Andrea Arcangeli <andrea@qumranet.com>
> > >
> > > Lockdep can't recognize if spinlocks are at a different address. So
> > > using trylock in a loop is one way to avoid lockdep to generate false
> > > positives. After lockdep will be fixed this change can and should be
> > > reverted.
> > >
> > > Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
> >
> > NAK, come-on, you didn't even bother to look at the available
> > annotations..
>
> Let's say when I hear prove-locking my instinct tells me to walk away
> as fast as I can. I'll try to explain why I prefer the trylock loop
> (which btw is the only one that will hide the lockdep false positives
> here and I welcome you to fix it in another way, well another way that
> comes to mind is to call __raw_spin_lock which I didn't do because I
> don't like those lowlevel details in common code).
>
> So about prove-locking:
>
> 1) in production is disabled so when I get bugreports I've to grab
> locking deadlock information as usual (sysrq+t/p or preferably
> lkcd/kdump)
>
> 2) while coding it's useless as well because I don't need this thing
> to debug and fix any deadlocks
>
> 3) this only finds bugs after the system is hung and I can fix it by
> other means then
You're so wrong it's not even funny. It reports deadlocks before they
happen: all it needs is to observe a lock order violation and it will
report it. For the deadlock to actually happen, you would need to hit
the violation concurrently with the normal order.
IOW, lockdep can even spot deadlocks on a non-preempt single cpu setup
where they can never actually happen.
Furthermore, it does more than the simple lock deadlocks, it also takes
IRQ state into account. So it can tell about hard or soft irq recursion
deadlocks.
Having lockdep on while developing saves a lot of trouble - in fact it
_has_ caught many real bugs before they could be introduced to mainline,
ask Arjan who has supervised driver development.
Not only that, it caught plenty of real bugs in mainline as well as -rt.
These days it appears to not catch many because the tree is in such good
shape, but that is fully thanks to lockdep.
That is not to say it's perfect - lockdep certainly does have its
limitations. But your portrayal is very inaccurate.
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 20:37 ` Peter Zijlstra
@ 2008-08-04 21:09 ` Andrea Arcangeli
2008-08-04 21:14 ` Pekka Enberg
` (3 more replies)
0 siblings, 4 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 21:09 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Dave Jones, Roland Dreier, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel, arjan
On Mon, Aug 04, 2008 at 10:37:22PM +0200, Peter Zijlstra wrote:
> You're so wrong it not even funny. It reports about deadlocks before
> they happen. All it needs is to observe a lock order violation and it
Now tell me how reporting them helps... It tells me the system has
crashed and where; it's not like I couldn't figure that out by myself
just by noticing that nothing works, seeing all CPUs spinning in some
spinlock slow path, and pressing sysrq+t/p.
> will report it. In order for the dead-lock to happen, you need to
> actually hit the violation concurrently with the normal order.
The point is that this is a runtime evaluation of lock orders; if the
run isn't the lucky one that reproduces the bad ordering, it'll find
nothing at all.
> IOW, lockdep can even spot deadlocks on a non-preempt single cpu setup
> where they can never actually happen.
Ask the preempt-RT folks, who advertise infinite-cpu-smp on UP. There's
no need for lockdep to reproduce deadlocks on UP that couldn't be found
without it. In fact lockdep only avoids optimizing away the spinlock;
you don't need preempt-RT to reproduce, at zero runtime cost and in an
equally debuggable way, whatever lockdep can reproduce on a UP build.
You only have to run an SMP kernel on UP - big deal that lockdep solves,
saving you from running an SMP kernel on UP...
> Furthermore, it does more than the simple lock deadlocks, it also takes
> IRQ state into account. So it can tell about hard or soft irq recursion
> deadlocks.
Ok this gets just a bit more interesting but my understanding is that
again this only "reports" stuff that crashes hard and is trivially
debuggable by other means.
> Having lockdep on while developing saves a lot of trouble - in fact it
> _has_ caught many real bugs before they could be introduced to mainline,
> ask Arjan who has supervised driver development.
Are you saying those bugs weren't trivially debuggable anyway by
kernel-hackers with a simple sysrq+p/t or kgdb stack trace?
If yes, I can only imagine that nobody takes care of setting up
debugging options that actually are _zero_ runtime cost and don't
pollute the kernel common code.
> Not only that, it caught plenty of real bugs in mainline as well as -rt.
> These days it appears to not catch many because the tree is in such good
> shape, but that is fully thanks to lockdep.
>
> That is not to say it's perferct - lockdep certainly does have it
> limitations. But your portrail is very in-accurate.
I have a hard time believing you; the only thing I can agree with is
that if you don't want to run an SMP kernel on UP (or, even better,
preempt-RT on UP) you can run lockdep to find recursion.
smp-preempt-RT on UP is a real feature that is useful for debugging,
because it makes bugs visible that weren't; lockdep is useless as far
as I can tell.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:09 ` Andrea Arcangeli
@ 2008-08-04 21:14 ` Pekka Enberg
2008-08-04 21:30 ` Andrea Arcangeli
2008-08-04 21:27 ` Arjan van de Ven
` (2 subsequent siblings)
3 siblings, 1 reply; 73+ messages in thread
From: Pekka Enberg @ 2008-08-04 21:14 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Peter Zijlstra, Dave Jones, Roland Dreier, Linus Torvalds,
David Miller, jeremy, hugh, mingo, akpm, linux-kernel, arjan
Hi Andrea,
On Mon, Aug 04, 2008 at 10:37:22PM +0200, Peter Zijlstra wrote:
>> You're so wrong it's not even funny. It reports about deadlocks before
>> they happen. All it needs is to observe a lock order violation and it
On Tue, Aug 5, 2008 at 12:09 AM, Andrea Arcangeli <andrea@qumranet.com> wrote:
> Now tell me how it helps to report them... It tells me the system has
> crashed and where; it's not like I couldn't figure that out by myself
> just by noticing nothing works and all CPUs are spinning in some
> spinlock slow path, and pressing sysrq+t/p.
Sometimes it's hard to actually trigger a deadlock condition during
development, especially if you're not developing on a big iron
machine.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:09 ` Andrea Arcangeli
2008-08-04 21:14 ` Pekka Enberg
@ 2008-08-04 21:27 ` Arjan van de Ven
2008-08-04 21:54 ` Andrea Arcangeli
2008-08-04 21:57 ` David Miller
2008-08-05 2:00 ` Roland Dreier
3 siblings, 1 reply; 73+ messages in thread
From: Arjan van de Ven @ 2008-08-04 21:27 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Peter Zijlstra, Dave Jones, Roland Dreier, Linus Torvalds,
David Miller, jeremy, hugh, mingo, akpm, linux-kernel
On Mon, 4 Aug 2008 23:09:54 +0200
Andrea Arcangeli <andrea@qumranet.com> wrote:
> On Mon, Aug 04, 2008 at 10:37:22PM +0200, Peter Zijlstra wrote:
> > You're so wrong it's not even funny. It reports about deadlocks before
> > they happen. All it needs is to observe a lock order violation and
> > it
>
> Now tell me how it helps to report them... It tells me the system has
> crashed and where,
I think you totally misunderstand things then.
Lockdep will report a problem if it *ever* sees a BA order after it has
seen an AB order. They don't have to happen at the same time. Or even
within hours of each other.
They MIGHT happen... in a narrow time window, when you have a deadlock.
But lockdep will warn you about the order violation without actually
having to deadlock... because the AB is likely to be done already most
of the time when the BA happens.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:14 ` Pekka Enberg
@ 2008-08-04 21:30 ` Andrea Arcangeli
2008-08-04 21:41 ` Andrew Morton
2008-08-04 21:42 ` Arjan van de Ven
0 siblings, 2 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 21:30 UTC (permalink / raw)
To: Pekka Enberg
Cc: Peter Zijlstra, Dave Jones, Roland Dreier, Linus Torvalds,
David Miller, jeremy, hugh, mingo, akpm, linux-kernel, arjan
On Tue, Aug 05, 2008 at 12:14:48AM +0300, Pekka Enberg wrote:
> Sometimes it's hard to actually trigger a deadlock condition during
> development, especially if you're not developing on a big iron
> machine.
My understanding is that lockdep can't do static analysis of code; it's
not a checker. It can only evaluate the locks in the order the CPU
takes them. And if the CPU never takes them in the wrong order, because
only your unlikely-to-happen slow path takes them in the wrong order
while the other CPUs hold them in the opposite order, lockdep can't
say anything at all - how could it? If it said something when the
CPU doesn't deadlock a moment later, it would be a false positive
99% of the time.
So now we learn lockdep also leads developers to think that if their
code doesn't generate lockdep errors it will be safe in production -
again, more damage thanks to lockdep.
Lockdep is a printk that reports that you were incredibly lucky, that
you got the lock inversion during development even if there was only
one chance in a thousand, and that you can celebrate. But I know I
can figure it out by myself when the system has crashed without
something eating all cpu and polluting kernel common code to tell me.
In other words I imagine lockdep as something like this:
char *ptr = NULL;
BUG_ON(!ptr); /* this is lockdep */
*ptr = 100;
Luckily nobody dreams of adding a BUG_ON that checks for a null pointer
before dereferencing it, as there's the MMU for that.
Just as for the null pointer dereference, there are zero-cost
means to debug all sorts of kernel-crashing deadlocks.
Now perhaps I misunderstood lockdep entirely, but I doubt it, as this
thing is driven by calls made by the spin_lock functions, and it can't
know anything about the spin_locks that haven't been run by the CPU yet.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 16:38 ` Peter Zijlstra
2008-08-04 17:27 ` Andrea Arcangeli
@ 2008-08-04 21:32 ` David Miller
1 sibling, 0 replies; 73+ messages in thread
From: David Miller @ 2008-08-04 21:32 UTC (permalink / raw)
To: a.p.zijlstra
Cc: andrea, davej, rdreier, torvalds, jeremy, hugh, mingo, akpm,
linux-kernel
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 04 Aug 2008 18:38:55 +0200
> On Mon, 2008-08-04 at 18:26 +0200, Andrea Arcangeli wrote:
> > The only real place where lockdep is unusable in my experience is
> > preempt-RT, it grinds it to a halt during boot on 8-way before
> > reaching the shell.
>
> David Miller just did a patch that might fix that.
Peter, your patches are just as effective at making all of my 64-cpu,
128-cpu, etc. machines usable with lockdep.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:30 ` Andrea Arcangeli
@ 2008-08-04 21:41 ` Andrew Morton
2008-08-04 22:12 ` Andrea Arcangeli
2008-08-04 21:42 ` Arjan van de Ven
1 sibling, 1 reply; 73+ messages in thread
From: Andrew Morton @ 2008-08-04 21:41 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: penberg, a.p.zijlstra, davej, rdreier, torvalds, davem, jeremy,
hugh, mingo, linux-kernel, arjan
On Mon, 4 Aug 2008 23:30:18 +0200
Andrea Arcangeli <andrea@qumranet.com> wrote:
> Now perhaps I misunderstood lockdep entirely
You did ;)
At runtime lockdep will "learn" the lock ordering rules, building a
graph in memory. If it ever sees the thus-learnt rules violated, it
will warn.
Simple example:
static DEFINE_MUTEX(a);
static DEFINE_MUTEX(b);

void f1(void)
{
	mutex_lock(&a);
	mutex_lock(&b);
	mutex_unlock(&b);
	mutex_unlock(&a);
}

void f2(void)
{
	mutex_lock(&b);
	mutex_lock(&a);
	mutex_unlock(&a);
	mutex_unlock(&b);
}

void doit(void)
{
	f1();
	f2();
}
lockdep will warn here about the ranking violation. As soon as it
occurs. Even though the system never deadlocked.
It's very very clever and powerful.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:30 ` Andrea Arcangeli
2008-08-04 21:41 ` Andrew Morton
@ 2008-08-04 21:42 ` Arjan van de Ven
2008-08-04 22:30 ` Andrea Arcangeli
1 sibling, 1 reply; 73+ messages in thread
From: Arjan van de Ven @ 2008-08-04 21:42 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Pekka Enberg, Peter Zijlstra, Dave Jones, Roland Dreier,
Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel
On Mon, 4 Aug 2008 23:30:18 +0200
Andrea Arcangeli <andrea@qumranet.com> wrote:
> On Tue, Aug 05, 2008 at 12:14:48AM +0300, Pekka Enberg wrote:
> > Sometimes it's hard to actually trigger a deadlock condition during
> > development, especially if you're not developing on a big iron
> > machine.
>
> My understanding is that lockdep can't do static analysis of code; it's
> not a checker. It can only evaluate the locks in the order the CPU
> takes them. And if the CPU never takes them in the wrong order, because
> only your unlikely-to-happen slow path takes them in the wrong order
Yes, lockdep will only complain WHEN you take them in the wrong order.
But you claimed you would for sure be in a deadlock at that point,
which is generally not correct.
> So now we learn lockdep also leads developers to think that if their
> code doesn't generate lockdep errors it will be safe in production -
> again, more damage thanks to lockdep.
this comment totally puzzles me... rather than calling you naive...
where was this said or even implied????
--
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:27 ` Arjan van de Ven
@ 2008-08-04 21:54 ` Andrea Arcangeli
0 siblings, 0 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 21:54 UTC (permalink / raw)
To: Arjan van de Ven
Cc: Peter Zijlstra, Dave Jones, Roland Dreier, Linus Torvalds,
David Miller, jeremy, hugh, mingo, akpm, linux-kernel
On Mon, Aug 04, 2008 at 02:27:52PM -0700, Arjan van de Ven wrote:
> I think you totally misunderstand things then.
>
> Lockdep will report a problem if it *ever* sees a BA order after it has
> seen an AB order. They don't have to happen at the same time. Or even
> within hours of each other.
For me the false positive was generated by the recursive spinlock
feature, not the deadlock inversion feature. If you limit my comments
to the recursive spinlock feature perhaps you'll be more likely to
agree with them.
> They MIGHT happen... in a narrow time window, when you have a deadlock.
> But lockdep will warn you about the order violation without actually
> having to deadlock... because the AB is likely to be done already most
> of the time when the BA happens.
So this is about the lock inversion feature; I admit I didn't realize
it remembers that locks have been taken in AB order and warns the
first time locks are taken in BA order. But this will lead to more
false positives, because there's nothing wrong with doing AB BA if
lock C is taken before both!
Perhaps enforcing lockdep all over the kernel is worth it just for
this AB BA thing, if it doesn't only emit false positives, but
still I can't like something as inaccurate and error-prone as lockdep
spreading all over the place to try to reduce the false positives it
emits. I can't like that, but that's just me... others clearly love it.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:09 ` Andrea Arcangeli
2008-08-04 21:14 ` Pekka Enberg
2008-08-04 21:27 ` Arjan van de Ven
@ 2008-08-04 21:57 ` David Miller
2008-08-05 2:00 ` Roland Dreier
3 siblings, 0 replies; 73+ messages in thread
From: David Miller @ 2008-08-04 21:57 UTC (permalink / raw)
To: andrea
Cc: a.p.zijlstra, davej, rdreier, torvalds, jeremy, hugh, mingo, akpm,
linux-kernel, arjan
From: Andrea Arcangeli <andrea@qumranet.com>
Date: Mon, 4 Aug 2008 23:09:54 +0200
> On Mon, Aug 04, 2008 at 10:37:22PM +0200, Peter Zijlstra wrote:
> > You're so wrong it's not even funny. It reports about deadlocks before
> > they happen. All it needs is to observe a lock order violation and it
>
> Now tell me how it helps to report them... It tells me the system has
> crashed and where; it's not like I couldn't figure that out by myself
> just by noticing nothing works and all CPUs are spinning in some
> spinlock slow path, and pressing sysrq+t/p.
It tells you about deadlock scenarios that you haven't even encountered
yet. It shows bugs for parallel code path sequences that have not
even occurred yet.
But I think you're beyond the point of being able to see why lockdep
is valuable. So why don't we just move on.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:41 ` Andrew Morton
@ 2008-08-04 22:12 ` Andrea Arcangeli
0 siblings, 0 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 22:12 UTC (permalink / raw)
To: Andrew Morton
Cc: penberg, a.p.zijlstra, davej, rdreier, torvalds, davem, jeremy,
hugh, mingo, linux-kernel, arjan
On Mon, Aug 04, 2008 at 02:41:14PM -0700, Andrew Morton wrote:
> lockdep will warn here about the ranking violation. As soon as it
> occurs. Even though the system never deadlocked.
Yes, I wasn't aware of the AB BA feature that doesn't require the
inversion to happen on both CPUs at the same time, despite having
spent weeks on it when it was crashing systems at boot (which shows
it's not immediately easy to understand how that thing works by
reading the code).
static DEFINE_MUTEX(a);
static DEFINE_MUTEX(b);
static DEFINE_MUTEX(c);

void f1(void)
{
	mutex_lock(&c);
	mutex_lock(&a);
	mutex_lock(&b);
	mutex_unlock(&c);
	mutex_unlock(&a);
	mutex_unlock(&b);
}

void f2(void)
{
	mutex_lock(&c);
	mutex_lock(&b);
	mutex_lock(&a);
	mutex_unlock(&c);
	mutex_unlock(&b);
	mutex_unlock(&a);
}

void f3(void)
{
	mutex_lock(&a);
	mutex_lock(&b);
	mutex_unlock(&a);
	mutex_unlock(&b);
}

void f4(void)
{
	mutex_lock(&b);
	mutex_lock(&a);
	mutex_unlock(&b);
	mutex_unlock(&a);
}

void doit(void)
{
	f1();
	f2();
	stop_machine(f3);
	stop_machine(f4);
}
> It's very very clever and powerful.
I'm curious: does it also handle the above? I mean, it's not a big deal
to refactor the code to shut down lockdep warnings, but it doesn't look
that clever to me. It seems just barely clever enough to be
useful with its AB BA memory.
Now I see lockdep has a good reason to exist, for the lock inversion
"memory", but this is by no means a feature I can remotely like in
general, given how inaccurate and prone to false positives it is. It
looks more like a necessary evil to deal with, as I tried to do with my
patch (even before understanding it was more useful than I
initially thought).
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:42 ` Arjan van de Ven
@ 2008-08-04 22:30 ` Andrea Arcangeli
2008-08-04 23:38 ` Arjan van de Ven
0 siblings, 1 reply; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-04 22:30 UTC (permalink / raw)
To: Arjan van de Ven
Cc: Pekka Enberg, Peter Zijlstra, Dave Jones, Roland Dreier,
Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel
On Mon, Aug 04, 2008 at 02:42:28PM -0700, Arjan van de Ven wrote:
> yes lockdep will only complain WHEN you take them in the wrong order
>
> But you claimed you would for sure be in a deadlock at that point which
> is generally not correct.
I already said I didn't know about that despite having spent a fair
amount of time trying to understand why lockdep crashed systems at
boot about a year ago. I admit I didn't understand much about it and
reducing its computation time didn't look feasible, perhaps my fault,
and I'm glad if Peter found a way to make it boot after 1 year.
> this comment totally puzzles me... rather than calling you naive...
> where was this said or even implied????
It was just a logical conclusion of the statement that lockdep will
warn of the following classes of locking bugs: "lock inversion
scenarios"... There's no warning anywhere that it might not find them
at all. Or point me to where it is warned that you still have to read
the code and verify it yourself, or you still risk an AB BA in
production. If it were warned I wouldn't have mentioned it; people seem
to talk as if lockdep were a checker doing static analysis of all
paths, when it can't be.
Part of what I said before is true, even if I didn't understand the
actual details of the AB BA memory by reading the code: the BA
may happen only when the system is OOM etc., so even lockdep's memory
may never find it. So my warning that code might AB BA deadlock even if
lockdep doesn't warn sounds fair enough, and without it I'm still
afraid it can lead developers to think everything is ok with regard to
AB BA. In any case (even if everyone already understands lockdep better
than I did before this discussion), even if I'm wrong, a warning
sign that lockdep isn't enough can't hurt.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 22:30 ` Andrea Arcangeli
@ 2008-08-04 23:38 ` Arjan van de Ven
2008-08-05 0:47 ` Andrea Arcangeli
0 siblings, 1 reply; 73+ messages in thread
From: Arjan van de Ven @ 2008-08-04 23:38 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Pekka Enberg, Peter Zijlstra, Dave Jones, Roland Dreier,
Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel
On Tue, 5 Aug 2008 00:30:11 +0200
Andrea Arcangeli <andrea@qumranet.com> wrote:
> On Mon, Aug 04, 2008 at 02:42:28PM -0700, Arjan van de Ven wrote:
> > yes lockdep will only complain WHEN you take them in the wrong order
> >
> > But you claimed you would for sure be in a deadlock at that point
> > which is generally not correct.
>
> I already said I didn't know about that despite having spent a fair
> amount of time trying to understand why lockdep crashed systems at
> boot about a year ago. I admit I didn't understand much about it and
> reducing its computation time didn't look feasible, perhaps my fault,
> and I'm glad if Peter found a way to make it boot after 1 year.
interesting; lockdep has been working for the last.. 2 1/2 years at
least, and I don't remember seeing bug reports against it from you that
would describe it as totally non-functional.
Oh well.. seems you're rather preoccupied about it; that's ok, you're
entitled to your opinion even if I don't agree with it ;-)
--
If you want to reach me at my work email, use arjan@linux.intel.com
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 23:38 ` Arjan van de Ven
@ 2008-08-05 0:47 ` Andrea Arcangeli
0 siblings, 0 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-05 0:47 UTC (permalink / raw)
To: Arjan van de Ven
Cc: Pekka Enberg, Peter Zijlstra, Dave Jones, Roland Dreier,
Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel
On Mon, Aug 04, 2008 at 04:38:16PM -0700, Arjan van de Ven wrote:
> interesting; lockdep has been working for the last.. 2 1/2 years at
> least, and I don't remember seeing bugreports against it from you that
> would describe it as totally non-functional.
I reported it to Peter. If you see David's email, I guess it can be
implied that I wasn't the only one aware that prove-locking made
certain systems non-functional; I thought it was widespread knowledge,
but maybe not.
It's amazing that things seem to have improved on that side, it surely
gives me more confidence in prove-locking!
> Oh well.. seems you're rather preoccupied about it; that's ok, you're
> entitled to your opinion even if I don't agree with it ;-)
So let me understand better: your opinion is that all of lockdep is
useful, not just the AB BA detection?
Reading the source again after 11 months, it still looks to me like
check_deadlock() only has knowledge of the current context. It loops
over the task struct checking all the locks of the current task!
Combine the great feature that check_deadlock provides with crashing
at boot, and I hope that better explains my feelings about
lockdep prove-locking.
This check_deadlock() thing is the real core of my dislike of
prove-locking! I totally agree the check_noncircular part is useful
now that I see it works differently from check_deadlock (when I read
it last time I thought it worked the same as check_deadlock).
check_noncircular being useful doesn't automatically make
check_deadlock useful.
And incidentally it's exactly this check_deadlock part that is
trapping on my code and that is now requiring silly changes to the
common code (the ones I did) or to make the common code even more
complex (what Peter is planning I guess).
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-04 21:09 ` Andrea Arcangeli
` (2 preceding siblings ...)
2008-08-04 21:57 ` David Miller
@ 2008-08-05 2:00 ` Roland Dreier
2008-08-05 2:18 ` Andrea Arcangeli
3 siblings, 1 reply; 73+ messages in thread
From: Roland Dreier @ 2008-08-05 2:00 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Peter Zijlstra, Dave Jones, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel, arjan
> The point is that this is a runtime evaluation of lock orders, if
> runtime isn't the lucky one that reproduces the deadlock, it'll find
> nothing at all.
I think the point you miss is that lockdep can report a potential
deadlock, even if the deadlock does not actually occur. For example
suppose there is an AB-BA deadlock somewhere. For this to actually
trigger, we have to have one CPU running the AB code path at exactly the
moment another CPU runs the BA code path, with the right timing so one
CPU holds A and tries to grab B while the other CPU already holds B.
With lockdep, we just have to have the AB code path run once at any
point, and then the BA code path run at any later time (even days after
the AB code path has released all the locks). And then we get a
warning dump that explains the exact potential deadlock.
- R.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-05 2:00 ` Roland Dreier
@ 2008-08-05 2:18 ` Andrea Arcangeli
2008-08-05 12:02 ` Roland Dreier
0 siblings, 1 reply; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-05 2:18 UTC (permalink / raw)
To: Roland Dreier
Cc: Peter Zijlstra, Dave Jones, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel, arjan
On Mon, Aug 04, 2008 at 07:00:03PM -0700, Roland Dreier wrote:
> > The point is that this is a runtime evaluation of lock orders, if
> > runtime isn't the lucky one that reproduces the deadlock, it'll find
> > nothing at all.
>
> I think the point you miss is that lockdep can report a potential
> deadlock, even if the deadlock does not actually occur. For example
> suppose there is an AB-BA deadlock somewhere. For this to actually
> trigger, we have to have one CPU running the AB code path at exactly the
> moment another CPU runs the BA code path, with the right timing so one
> CPU holds A and tries to grab B while the other CPU already holds B.
>
> With lockdep, we just have to have the AB code path run once at any
> point, and then the BA code path run at any later time (even days after
> the AB code path has released all the locks). And then we get a
> warning dump that explains the exact potential deadlock.
Thanks a lot for the detailed explanation of check_noncircular. I
agree check_noncircular is surely a good argument not to get rid of
prove-locking as a whole. But check_noncircular is also a red herring
in this context. It's not check_noncircular that is trapping here;
check_deadlock traps with false positives instead. The question is:
what are those false positives buying us? Saving a developer from
pressing sysrq+p or breaking in with kgdb?
Let's focus on check_deadlock->print_deadlock_bug, and will somebody
who's not beyond the point please explain what print_deadlock_bug
reports that does not actually occur, and why it's a good idea to
change the common code to accommodate its false positives instead of
getting rid of it for good.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal.
2008-08-04 13:03 ` [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal Peter Zijlstra
@ 2008-08-05 8:34 ` David Miller
2008-08-05 8:46 ` Peter Zijlstra
0 siblings, 1 reply; 73+ messages in thread
From: David Miller @ 2008-08-05 8:34 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: torvalds, jeremy, hugh, mingo, akpm, linux-kernel, davej
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 04 Aug 2008 15:03:18 +0200
> When we traverse the graph, either forwards or backwards, we
> are interested in whether a certain property exists somewhere
> in a node reachable in the graph.
...
> Signed-off-by: David S. Miller <davem@davemloft.net>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Peter, when Ingo added this to his tree he needed to add a build
fix of some sort. Did you include that here?
I think the problem was that if CONFIG_PROVE_LOCKING is not
set, we need to add nop inline versions of lockdep_count_*_deps()
in kernel/lockdep_internals.h
See:
http://marc.info/?l=linux-kernel&m=121758260130275&w=2
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [RFC][PATCH 2/7] lockdep: lock_set_subclass - reset a held locks subclass
2008-08-04 13:03 ` [RFC][PATCH 2/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
@ 2008-08-05 8:35 ` David Miller
0 siblings, 0 replies; 73+ messages in thread
From: David Miller @ 2008-08-05 8:35 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: torvalds, jeremy, hugh, mingo, akpm, linux-kernel, davej
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 04 Aug 2008 15:03:19 +0200
> this can be used to reset a held lock's subclass, for arbitrary-depth
> iterated data structures such as trees or lists which have per-node
> locks.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: David S. Miller <davem@davemloft.net>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [RFC][PATCH 3/7] lockdep: re-annotate scheduler runqueues
2008-08-04 13:03 ` [RFC][PATCH 3/7] lockdep: re-annotate scheduler runqueues Peter Zijlstra
@ 2008-08-05 8:35 ` David Miller
0 siblings, 0 replies; 73+ messages in thread
From: David Miller @ 2008-08-05 8:35 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: torvalds, jeremy, hugh, mingo, akpm, linux-kernel, davej
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 04 Aug 2008 15:03:20 +0200
> Instead of using a per-rq lock class, use the regular nesting operations.
>
> However, take extra care with double_lock_balance() as it can release the
> already held rq->lock (and therefore change its nesting class).
>
> So what can happen is:
>
> spin_lock(rq->lock); // this rq subclass 0
>
> double_lock_balance(rq, other_rq);
> // release rq
> // acquire other_rq->lock subclass 0
> // acquire rq->lock subclass 1
>
> spin_unlock(other_rq->lock);
>
> leaving you with rq->lock in subclass 1
>
> So a subsequent double_lock_balance() call can try to nest a subclass 1
> lock while already holding a subclass 1 lock.
>
> Fix this by introducing double_unlock_balance() which releases the other
> rq's lock, but also re-sets the subclass for this rq's lock to 0.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: David S. Miller <davem@davemloft.net>
I also tested this on 64-cpu and 128-cpu systems.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal.
2008-08-05 8:34 ` David Miller
@ 2008-08-05 8:46 ` Peter Zijlstra
2008-08-13 3:48 ` Tim Pepper
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-05 8:46 UTC (permalink / raw)
To: David Miller; +Cc: torvalds, jeremy, hugh, mingo, akpm, linux-kernel, davej
On Tue, 2008-08-05 at 01:34 -0700, David Miller wrote:
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Mon, 04 Aug 2008 15:03:18 +0200
>
> > When we traverse the graph, either forwards or backwards, we
> > are interested in whether a certain property exists somewhere
> > in a node reachable in the graph.
> ...
> > Signed-off-by: David S. Miller <davem@davemloft.net>
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
>
> Peter, when Ingo added this to his tree he needed to add a build
> fix of some sort. Did you include that here?
>
> I think the problem was that if CONFIG_PROVE_LOCKING is not
> set, we need to add nop inline versions of lockdep_count_*_deps()
> in kernel/lockdep_internals.h
>
> See:
>
> http://marc.info/?l=linux-kernel&m=121758260130275&w=2
Probably missed that, thanks for reminding me.
Andrew, please pick up:
---
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 1 Aug 2008 11:23:50 +0200
Subject: [PATCH] lockdep: build fix
fix:
kernel/built-in.o: In function `lockdep_stats_show':
lockdep_proc.c:(.text+0x3cb2f): undefined reference to `lockdep_count_forward_deps'
kernel/built-in.o: In function `l_show':
lockdep_proc.c:(.text+0x3d02b): undefined reference to `lockdep_count_forward_deps'
lockdep_proc.c:(.text+0x3d047): undefined reference to `lockdep_count_backward_deps'
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/lockdep_internals.h | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 68d44ec..f5c6a14 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -53,8 +53,21 @@ extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
extern unsigned int max_recursion_depth;
+#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+ return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+ return 0;
+}
+#endif
#ifdef CONFIG_DEBUG_LOCKDEP
/*
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-05 2:18 ` Andrea Arcangeli
@ 2008-08-05 12:02 ` Roland Dreier
2008-08-05 12:20 ` Andrea Arcangeli
0 siblings, 1 reply; 73+ messages in thread
From: Roland Dreier @ 2008-08-05 12:02 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Peter Zijlstra, Dave Jones, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel, arjan
> Let's focus on check_deadlock->print_deadlock_bug and somebody who's
> not beyond the point please explain what print_deadlock_bug reports
> that does not actually occur and why it's a good idea to change the
> common code to accommodate for its false positives instead of getting
> rid of it for good.
check_deadlock operates on classes of locks, so it can warn about
potential deadlocks, eg if we have
foo(obj1, obj2)
{
lock(obj1);
lock(obj2);
...
then foo(obj, obj) is a deadlock, but lockdep can also warn about foo(obj,
different_obj), which never triggers the deadlock in reality. Of course
this leads to false positives, and we sometimes have to change correct
code to help lockdep, but usually such rewriting leads to simpler,
clearer, better locking anyway.
- R.
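The class-based check Roland describes can be sketched in a few lines of C. This is an illustrative model, not lockdep's actual code: lock instances map to classes, and a warning fires whenever a task acquires a class it already holds, even if the instances differ (all names here are hypothetical):

```c
#include <assert.h>

#define MAX_HELD 8

/* Per-task stack of held lock classes (a toy model of curr->held_locks). */
static int held[MAX_HELD];
static int nr_held;

/*
 * Model of check_deadlock(): return 0 (i.e. warn) if the class of the
 * lock being acquired is already held, even by a different instance.
 */
static int model_acquire(int class_id)
{
	int i;

	for (i = 0; i < nr_held; i++) {
		if (held[i] == class_id)
			return 0;	/* possible recursive deadlock */
	}
	held[nr_held++] = class_id;
	return 1;
}
```

In this model foo(obj, different_obj) maps both arguments to the same class, so the second acquire warns even though that run would not actually deadlock; that is exactly the false positive under discussion.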
* Re: [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks
2008-08-05 12:02 ` Roland Dreier
@ 2008-08-05 12:20 ` Andrea Arcangeli
0 siblings, 0 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-05 12:20 UTC (permalink / raw)
To: Roland Dreier
Cc: Peter Zijlstra, Dave Jones, Linus Torvalds, David Miller, jeremy,
hugh, mingo, akpm, linux-kernel, arjan
On Tue, Aug 05, 2008 at 05:02:07AM -0700, Roland Dreier wrote:
> check_deadlock operates on classes of locks, so it can warn about
> potential deadlocks, eg if we have
>
> foo(obj1, obj2)
> {
> lock(obj1);
> lock(obj2);
> ...
>
> then foo(obj, obj); is a deadlock but lockdep can warn about foo(obj,
> different_obj) without triggering the deadlock in reality. Of course
> this leads to false positives, and we sometimes have to change correct
> code to help lockdep, but usually such rewriting leads to simpler
> clearer better locking anyway.
It certainly doesn't lead to simpler, clearer, or better locking in the
case we're discussing here, and I don't know of other cases where it
leads to better locking, but feel free to give examples.
* Re: [RFC][PATCH 4/7] lockdep: shrink held_lock structure
2008-08-04 13:03 ` [RFC][PATCH 4/7] lockdep: shrink held_lock structure Peter Zijlstra
@ 2008-08-05 16:08 ` Peter Zijlstra
2008-08-06 7:17 ` Peter Zijlstra
1 sibling, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-05 16:08 UTC (permalink / raw)
To: Linus Torvalds
Cc: David Miller, jeremy, hugh, mingo, akpm, linux-kernel, davej
Replacement patch - on account of the old one having a bug that
basically disables lockdep.
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -229,10 +229,10 @@ struct held_lock {
* The following field is used to detect when we cross into an
* interrupt context:
*/
- unsigned int irq_context:1;
+ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
unsigned int trylock:1;
- unsigned int read:2;
- unsigned int check:1;
+ unsigned int read:2; /* see lock_acquire() comment */
+ unsigned int check:2; /* see lock_acquire() comment */
unsigned int hardirqs_off:1;
};
---
Subject: lockdep: shrink held_lock structure
From: Dave Jones <davej@redhat.com>
struct held_lock {
u64 prev_chain_key; /* 0 8 */
struct lock_class * class; /* 8 8 */
long unsigned int acquire_ip; /* 16 8 */
struct lockdep_map * instance; /* 24 8 */
int irq_context; /* 32 4 */
int trylock; /* 36 4 */
int read; /* 40 4 */
int check; /* 44 4 */
int hardirqs_off; /* 48 4 */
/* size: 56, cachelines: 1 */
/* padding: 4 */
/* last cacheline: 56 bytes */
};
struct held_lock {
u64 prev_chain_key; /* 0 8 */
long unsigned int acquire_ip; /* 8 8 */
struct lockdep_map * instance; /* 16 8 */
unsigned int class_idx:11; /* 24:21 4 */
unsigned int irq_context:2; /* 24:19 4 */
unsigned int trylock:1; /* 24:18 4 */
unsigned int read:2; /* 24:16 4 */
unsigned int check:2; /* 24:14 4 */
unsigned int hardirqs_off:1; /* 24:13 4 */
/* size: 32, cachelines: 1 */
/* padding: 4 */
/* bit_padding: 13 bits */
/* last cacheline: 32 bytes */
};
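The saving shown by the two pahole dumps above comes from replacing full ints with bitfields and the class pointer with an 11-bit index. A minimal sketch of the same packing technique (simplified, hypothetical field set; exact sizes assume a 64-bit ABI):

```c
#include <assert.h>

/* Before: five flags each occupy a full int, plus a full class pointer. */
struct held_lock_old {
	unsigned long long prev_chain_key;
	void *class;			/* full 8-byte pointer */
	unsigned long acquire_ip;
	void *instance;
	int irq_context;
	int trylock;
	int read;
	int check;
	int hardirqs_off;
};

/* After: flags become bitfields, the pointer becomes an 11-bit index. */
struct held_lock_packed {
	unsigned long long prev_chain_key;
	unsigned long acquire_ip;
	void *instance;
	unsigned int class_idx:11;	/* index into lock_classes[], not a pointer */
	unsigned int irq_context:2;
	unsigned int trylock:1;
	unsigned int read:2;
	unsigned int check:2;
	unsigned int hardirqs_off:1;	/* 19 bits total: all flags share one word */
};
```

On an LP64 target the packed layout fits in 32 bytes against 56 for the original, matching the pahole output in the changelog.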
[mingo@elte.hu: shrunk hlock->class too]
[peterz@infradead.org: fixup bit sizes]
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/lockdep.h | 16 +++---
kernel/lockdep.c | 113 +++++++++++++++++++++++++--------------------
kernel/lockdep_internals.h | 3 -
3 files changed, 74 insertions(+), 58 deletions(-)
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -190,6 +190,9 @@ struct lock_chain {
u64 chain_key;
};
+#define MAX_LOCKDEP_KEYS_BITS 11
+#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
+
struct held_lock {
/*
* One-way hash of the dependency chain up to this point. We
@@ -206,14 +209,13 @@ struct held_lock {
* with zero), here we store the previous hash value:
*/
u64 prev_chain_key;
- struct lock_class *class;
unsigned long acquire_ip;
struct lockdep_map *instance;
-
#ifdef CONFIG_LOCK_STAT
u64 waittime_stamp;
u64 holdtime_stamp;
#endif
+ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS;
/*
* The lock-stack is unified in that the lock chains of interrupt
* contexts nest ontop of process context chains, but we 'separate'
@@ -227,11 +229,11 @@ struct held_lock {
* The following field is used to detect when we cross into an
* interrupt context:
*/
- int irq_context;
- int trylock;
- int read;
- int check;
- int hardirqs_off;
+ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
+ unsigned int trylock:1;
+ unsigned int read:2; /* see lock_acquire() comment */
+ unsigned int check:2; /* see lock_acquire() comment */
+ unsigned int hardirqs_off:1;
};
/*
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c
+++ linux-2.6/kernel/lockdep.c
@@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX
unsigned long nr_lock_classes;
static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+ if (!hlock->class_idx) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return NULL;
+ }
+ return lock_classes + hlock->class_idx - 1;
+}
+
#ifdef CONFIG_LOCK_STAT
static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
@@ -222,7 +231,7 @@ static void lock_release_holdtime(struct
holdtime = sched_clock() - hlock->holdtime_stamp;
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (hlock->read)
lock_time_inc(&stats->read_holdtime, holdtime);
else
@@ -518,7 +527,7 @@ static void print_lockdep_cache(struct l
static void print_lock(struct held_lock *hlock)
{
- print_lock_name(hlock->class);
+ print_lock_name(hlock_class(hlock));
printk(", at: ");
print_ip_sym(hlock->acquire_ip);
}
@@ -948,7 +957,7 @@ static noinline int print_circular_bug_t
if (debug_locks_silent)
return 0;
- this.class = check_source->class;
+ this.class = hlock_class(check_source);
if (!save_trace(&this.trace))
return 0;
@@ -1057,7 +1066,7 @@ check_noncircular(struct lock_class *sou
* Check this lock's dependency list:
*/
list_for_each_entry(entry, &source->locks_after, entry) {
- if (entry->class == check_target->class)
+ if (entry->class == hlock_class(check_target))
return print_circular_bug_header(entry, depth+1);
debug_atomic_inc(&nr_cyclic_checks);
if (!check_noncircular(entry->class, depth+1))
@@ -1150,6 +1159,11 @@ find_usage_backwards(struct lock_class *
return 2;
}
+ if (!source && debug_locks_off_graph_unlock()) {
+ WARN_ON(1);
+ return 0;
+ }
+
/*
* Check this lock's dependency list:
*/
@@ -1189,9 +1203,9 @@ print_bad_irq_dependency(struct task_str
printk("\nand this task is already holding:\n");
print_lock(prev);
printk("which would create a new lock dependency:\n");
- print_lock_name(prev->class);
+ print_lock_name(hlock_class(prev));
printk(" ->");
- print_lock_name(next->class);
+ print_lock_name(hlock_class(next));
printk("\n");
printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1232,12 +1246,12 @@ check_usage(struct task_struct *curr, st
find_usage_bit = bit_backwards;
/* fills in <backwards_match> */
- ret = find_usage_backwards(prev->class, 0);
+ ret = find_usage_backwards(hlock_class(prev), 0);
if (!ret || ret == 1)
return ret;
find_usage_bit = bit_forwards;
- ret = find_usage_forwards(next->class, 0);
+ ret = find_usage_forwards(hlock_class(next), 0);
if (!ret || ret == 1)
return ret;
/* ret == 2 */
@@ -1362,7 +1376,7 @@ check_deadlock(struct task_struct *curr,
for (i = 0; i < curr->lockdep_depth; i++) {
prev = curr->held_locks + i;
- if (prev->class != next->class)
+ if (hlock_class(prev) != hlock_class(next))
continue;
/*
* Allow read-after-read recursion of the same
@@ -1415,7 +1429,7 @@ check_prev_add(struct task_struct *curr,
*/
check_source = next;
check_target = prev;
- if (!(check_noncircular(next->class, 0)))
+ if (!(check_noncircular(hlock_class(next), 0)))
return print_circular_bug_tail();
if (!check_prev_add_irq(curr, prev, next))
@@ -1439,8 +1453,8 @@ check_prev_add(struct task_struct *curr,
* chains - the second one will be new, but L1 already has
* L2 added to its dependency list, due to the first chain.)
*/
- list_for_each_entry(entry, &prev->class->locks_after, entry) {
- if (entry->class == next->class) {
+ list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+ if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
return 2;
@@ -1451,26 +1465,28 @@ check_prev_add(struct task_struct *curr,
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(prev->class, next->class,
- &prev->class->locks_after, next->acquire_ip, distance);
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ &hlock_class(prev)->locks_after,
+ next->acquire_ip, distance);
if (!ret)
return 0;
- ret = add_lock_to_list(next->class, prev->class,
- &next->class->locks_before, next->acquire_ip, distance);
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ &hlock_class(next)->locks_before,
+ next->acquire_ip, distance);
if (!ret)
return 0;
/*
* Debugging printouts:
*/
- if (verbose(prev->class) || verbose(next->class)) {
+ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
graph_unlock();
printk("\n new dependency: ");
- print_lock_name(prev->class);
+ print_lock_name(hlock_class(prev));
printk(" => ");
- print_lock_name(next->class);
+ print_lock_name(hlock_class(next));
printk("\n");
dump_stack();
return graph_lock();
@@ -1567,7 +1583,7 @@ static inline int lookup_chain_cache(str
struct held_lock *hlock,
u64 chain_key)
{
- struct lock_class *class = hlock->class;
+ struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr, *hlock_next;
@@ -1640,7 +1656,7 @@ cache_hit:
if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = cn;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class - lock_classes;
+ int lock_id = curr->held_locks[i].class_idx - 1;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
@@ -1736,7 +1752,7 @@ static void check_chain_key(struct task_
WARN_ON(1);
return;
}
- id = hlock->class - lock_classes;
+ id = hlock->class_idx - 1;
if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
return;
@@ -1781,7 +1797,7 @@ print_usage_bug(struct task_struct *curr
print_lock(this);
printk("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(this->class->usage_traces + prev_bit, 1);
+ print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
printk("\nother info that might help us debug this:\n");
@@ -1800,7 +1816,7 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
return print_usage_bug(curr, this, bad_bit, new_bit);
return 1;
}
@@ -1839,7 +1855,7 @@ print_irq_inversion_bug(struct task_stru
lockdep_print_held_locks(curr);
printk("\nthe first lock's dependencies:\n");
- print_lock_dependencies(this->class, 0);
+ print_lock_dependencies(hlock_class(this), 0);
printk("\nthe second lock's dependencies:\n");
print_lock_dependencies(other, 0);
@@ -1862,7 +1878,7 @@ check_usage_forwards(struct task_struct
find_usage_bit = bit;
/* fills in <forwards_match> */
- ret = find_usage_forwards(this->class, 0);
+ ret = find_usage_forwards(hlock_class(this), 0);
if (!ret || ret == 1)
return ret;
@@ -1881,7 +1897,7 @@ check_usage_backwards(struct task_struct
find_usage_bit = bit;
/* fills in <backwards_match> */
- ret = find_usage_backwards(this->class, 0);
+ ret = find_usage_backwards(hlock_class(this), 0);
if (!ret || ret == 1)
return ret;
@@ -1947,7 +1963,7 @@ static int mark_lock_irq(struct task_str
LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ:
@@ -1972,7 +1988,7 @@ static int mark_lock_irq(struct task_str
LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_HARDIRQ_READ:
@@ -1985,7 +2001,7 @@ static int mark_lock_irq(struct task_str
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_HARDIRQS, "hard"))
return 0;
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1998,7 +2014,7 @@ static int mark_lock_irq(struct task_str
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_SOFTIRQS, "soft"))
return 0;
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS:
@@ -2024,7 +2040,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS:
@@ -2050,7 +2066,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS_READ:
@@ -2065,7 +2081,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_HARDIRQ, "hard"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS_READ:
@@ -2080,7 +2096,7 @@ static int mark_lock_irq(struct task_str
LOCK_USED_IN_SOFTIRQ, "soft"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
default:
@@ -2396,7 +2412,7 @@ static int mark_lock(struct task_struct
* If already set then do not dirty the cacheline,
* nor do any checks:
*/
- if (likely(this->class->usage_mask & new_mask))
+ if (likely(hlock_class(this)->usage_mask & new_mask))
return 1;
if (!graph_lock())
@@ -2404,14 +2420,14 @@ static int mark_lock(struct task_struct
/*
* Make sure we didnt race:
*/
- if (unlikely(this->class->usage_mask & new_mask)) {
+ if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
graph_unlock();
return 1;
}
- this->class->usage_mask |= new_mask;
+ hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(this->class->usage_traces + new_bit))
+ if (!save_trace(hlock_class(this)->usage_traces + new_bit))
return 0;
switch (new_bit) {
@@ -2545,8 +2561,9 @@ static int __lock_acquire(struct lockdep
return 0;
hlock = curr->held_locks + depth;
-
- hlock->class = class;
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+ hlock->class_idx = class - lock_classes + 1;
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->trylock = trylock;
@@ -2690,7 +2707,7 @@ __lock_set_subclass(struct lockdep_map *
found_it:
class = register_lock_class(lock, subclass, 0);
- hlock->class = class;
+ hlock->class_idx = class - lock_classes + 1;
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -2698,7 +2715,7 @@ found_it:
for (; i < depth; i++) {
hlock = curr->held_locks + i;
if (!__lock_acquire(hlock->instance,
- hlock->class->subclass, hlock->trylock,
+ hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->acquire_ip))
return 0;
@@ -2759,7 +2776,7 @@ found_it:
for (i++; i < depth; i++) {
hlock = curr->held_locks + i;
if (!__lock_acquire(hlock->instance,
- hlock->class->subclass, hlock->trylock,
+ hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->acquire_ip))
return 0;
@@ -2804,7 +2821,7 @@ static int lock_release_nested(struct ta
#ifdef CONFIG_DEBUG_LOCKDEP
hlock->prev_chain_key = 0;
- hlock->class = NULL;
+ hlock_class(hlock) = NULL;
hlock->acquire_ip = 0;
hlock->irq_context = 0;
#endif
@@ -3000,9 +3017,9 @@ __lock_contended(struct lockdep_map *loc
found_it:
hlock->waittime_stamp = sched_clock();
- point = lock_contention_point(hlock->class, ip);
+ point = lock_contention_point(hlock_class(hlock), ip);
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (point < ARRAY_SIZE(stats->contention_point))
stats->contention_point[i]++;
if (lock->cpu != smp_processor_id())
@@ -3048,7 +3065,7 @@ found_it:
hlock->holdtime_stamp = now;
}
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
lock_time_inc(&stats->read_waittime, waittime);
Index: linux-2.6/kernel/lockdep_internals.h
===================================================================
--- linux-2.6.orig/kernel/lockdep_internals.h
+++ linux-2.6/kernel/lockdep_internals.h
@@ -17,9 +17,6 @@
*/
#define MAX_LOCKDEP_ENTRIES 8192UL
-#define MAX_LOCKDEP_KEYS_BITS 11
-#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
-
#define MAX_LOCKDEP_CHAINS_BITS 14
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
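The hlock_class() helper added by this patch stores `class - lock_classes + 1`, reserving class_idx 0 to mean "no class" so that it can be caught by the warning. A standalone sketch of this encode/decode pair (toy types, not the kernel's):

```c
#include <assert.h>
#include <stddef.h>

struct lock_class { int dummy; };

#define MAX_CLASSES 16
static struct lock_class lock_classes[MAX_CLASSES];

/* Encode: array index biased by one, so 0 is reserved for "none". */
static unsigned int class_to_idx(struct lock_class *class)
{
	return (unsigned int)(class - lock_classes) + 1;
}

/* Decode: model of hlock_class(); NULL stands in for the warn-and-bail path. */
static struct lock_class *idx_to_class(unsigned int class_idx)
{
	if (!class_idx)
		return NULL;	/* would trigger DEBUG_LOCKS_WARN_ON in lockdep */
	return lock_classes + class_idx - 1;
}
```

The bias is what makes a zeroed held_lock (as after lock_release_nested() under CONFIG_DEBUG_LOCKDEP) distinguishable from a lock of class 0.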
* Re: [RFC][PATCH 4/7] lockdep: shrink held_lock structure
2008-08-04 13:03 ` [RFC][PATCH 4/7] lockdep: shrink held_lock structure Peter Zijlstra
2008-08-05 16:08 ` Peter Zijlstra
@ 2008-08-06 7:17 ` Peter Zijlstra
1 sibling, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-06 7:17 UTC (permalink / raw)
To: Linus Torvalds
Cc: David Miller, jeremy, hugh, mingo, akpm, linux-kernel, davej
A build fix; probably runaway search-and-replace damage.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c
+++ linux-2.6/kernel/lockdep.c
@@ -2821,7 +2821,7 @@ static int lock_release_nested(struct ta
#ifdef CONFIG_DEBUG_LOCKDEP
hlock->prev_chain_key = 0;
- hlock_class(hlock) = NULL;
+ hlock->class_idx = 0;
hlock->acquire_ip = 0;
hlock->irq_context = 0;
#endif
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 13:03 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
2008-08-04 14:07 ` Roland Dreier
2008-08-04 18:06 ` Jeremy Fitzhardinge
@ 2008-08-07 11:25 ` Peter Zijlstra
2 siblings, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-07 11:25 UTC (permalink / raw)
To: Linus Torvalds
Cc: David Miller, jeremy, hugh, mingo, akpm, linux-kernel, davej
Please fold (or tell me to provide a new patch)
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
diff -u linux-2.6/include/linux/spinlock.h linux-2.6/include/linux/spinlock.h
--- linux-2.6/include/linux/spinlock.h
+++ linux-2.6/include/linux/spinlock.h
@@ -185,8 +185,8 @@
# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass)
# define spin_lock_nest_lock(lock, nest_lock) \
do { \
- typecheck(struct lockdep_map *, &nest_lock->dep_map); \
- _spin_lock_nest_lock(lock, &nest_lock->dep_map); \
+ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\
+ _spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \
} while (0)
#else
# define spin_lock_nested(lock, subclass) _spin_lock(lock)
diff -u linux-2.6/kernel/spinlock.c linux-2.6/kernel/spinlock.c
--- linux-2.6/kernel/spinlock.c
+++ linux-2.6/kernel/spinlock.c
@@ -323,7 +323,7 @@
LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
}
-EXPORT_SYMBOL(_spin_lock_nest_lock)
+EXPORT_SYMBOL(_spin_lock_nest_lock);
#endif
--- linux-2.6.orig/include/linux/spinlock_api_smp.h
+++ linux-2.6/include/linux/spinlock_api_smp.h
@@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr
void __lockfunc _spin_lock(spinlock_t *lock) __acquires(lock);
void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
__acquires(lock);
+void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *map)
+ __acquires(lock);
void __lockfunc _read_lock(rwlock_t *lock) __acquires(lock);
void __lockfunc _write_lock(rwlock_t *lock) __acquires(lock);
void __lockfunc _spin_lock_bh(spinlock_t *lock) __acquires(lock);
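The spinlock.h hunk in this fold patch is a standard macro-hygiene fix: without parentheses around the macro argument, an expression argument binds to the surrounding operator incorrectly. The effect is easiest to see with arithmetic (toy macros, not from the kernel):

```c
#include <assert.h>

#define SCALE_BAD(x)	(x * 2)		/* argument not parenthesized */
#define SCALE_GOOD(x)	((x) * 2)	/* argument parenthesized */

/*
 * With the argument `1 + 2`:
 *   SCALE_BAD  expands to (1 + 2 * 2)   == 5  -- precedence surprise
 *   SCALE_GOOD expands to ((1 + 2) * 2) == 6  -- intended result
 */
```

In spin_lock_nest_lock() the same problem shows up with `->`: `&nest_lock->dep_map` breaks for any argument more complex than a plain identifier, while `&(nest_lock)->dep_map` does not.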
* [RFC][PATCH 8/7] lockdep: annotate mm_take_all_locks()
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
` (6 preceding siblings ...)
2008-08-04 13:03 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
@ 2008-08-07 11:25 ` Peter Zijlstra
2008-08-07 11:25 ` [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order Peter Zijlstra
2008-08-11 10:08 ` [RFC][PATCH 0/7] lockdep Ingo Molnar
9 siblings, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-07 11:25 UTC (permalink / raw)
To: Linus Torvalds
Cc: David Miller, jeremy, hugh, mingo, akpm, linux-kernel, davej,
Andrea Arcangeli, Roland Dreier
Subject: lockdep: annotate mm_take_all_locks()
The nesting is correct because we hold mmap_sem; use the new annotation
to express this.
This doesn't solve the MAX_LOCK_DEPTH issue, but 48 seems enough for
the current kvm usage, which does this early on in the life of an mm.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
mm/mmap.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -2273,14 +2273,14 @@ int install_special_mapping(struct mm_st
static DEFINE_MUTEX(mm_all_locks_mutex);
-static void vm_lock_anon_vma(struct anon_vma *anon_vma)
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock(&anon_vma->lock);
+ spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
* anon_vma->lock. If some other vma in this mm shares
@@ -2296,7 +2296,7 @@ static void vm_lock_anon_vma(struct anon
}
}
-static void vm_lock_mapping(struct address_space *mapping)
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
/*
@@ -2310,7 +2310,7 @@ static void vm_lock_mapping(struct addre
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- spin_lock(&mapping->i_mmap_lock);
+ spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
}
}
@@ -2359,9 +2359,9 @@ int mm_take_all_locks(struct mm_struct *
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
- vm_lock_anon_vma(vma->anon_vma);
+ vm_lock_anon_vma(mm, vma->anon_vma);
if (vma->vm_file && vma->vm_file->f_mapping)
- vm_lock_mapping(vma->vm_file->f_mapping);
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
ret = 0;
* [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
` (7 preceding siblings ...)
2008-08-07 11:25 ` [RFC][PATCH 8/7] lockdep: annotate mm_take_all_locks() Peter Zijlstra
@ 2008-08-07 11:25 ` Peter Zijlstra
2008-08-07 12:14 ` Hugh Dickins
2008-08-07 21:46 ` Andrea Arcangeli
2008-08-11 10:08 ` [RFC][PATCH 0/7] lockdep Ingo Molnar
9 siblings, 2 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-07 11:25 UTC (permalink / raw)
To: Linus Torvalds
Cc: David Miller, jeremy, hugh, mingo, akpm, linux-kernel, davej
Subject: mm: fix mm_take_all_locks() locking order
Lockdep spotted:
=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.27-rc1 #270
-------------------------------------------------------
qemu-kvm/2033 is trying to acquire lock:
(&inode->i_data.i_mmap_lock){----}, at: [<ffffffff802996cc>] mm_take_all_locks+0xc2/0xea
but task is already holding lock:
(&anon_vma->lock){----}, at: [<ffffffff8029967a>] mm_take_all_locks+0x70/0xea
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&anon_vma->lock){----}:
[<ffffffff8025cd37>] __lock_acquire+0x11be/0x14d2
[<ffffffff8025d0a9>] lock_acquire+0x5e/0x7a
[<ffffffff804c655b>] _spin_lock+0x3b/0x47
[<ffffffff8029a2ef>] vma_adjust+0x200/0x444
[<ffffffff8029a662>] split_vma+0x12f/0x146
[<ffffffff8029bc60>] mprotect_fixup+0x13c/0x536
[<ffffffff8029c203>] sys_mprotect+0x1a9/0x21e
[<ffffffff8020c0db>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
-> #0 (&inode->i_data.i_mmap_lock){----}:
[<ffffffff8025ca54>] __lock_acquire+0xedb/0x14d2
[<ffffffff8025d397>] lock_release_non_nested+0x1c2/0x219
[<ffffffff8025d515>] lock_release+0x127/0x14a
[<ffffffff804c6403>] _spin_unlock+0x1e/0x50
[<ffffffff802995d9>] mm_drop_all_locks+0x7f/0xb0
[<ffffffff802a965d>] do_mmu_notifier_register+0xe2/0x112
[<ffffffff802a96a8>] mmu_notifier_register+0xe/0x10
[<ffffffffa0043b6b>] kvm_dev_ioctl+0x11e/0x287 [kvm]
[<ffffffff802bd0ca>] vfs_ioctl+0x2a/0x78
[<ffffffff802bd36f>] do_vfs_ioctl+0x257/0x274
[<ffffffff802bd3e1>] sys_ioctl+0x55/0x78
[<ffffffff8020c0db>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
other info that might help us debug this:
5 locks held by qemu-kvm/2033:
#0: (&mm->mmap_sem){----}, at: [<ffffffff802a95d0>] do_mmu_notifier_register+0x55/0x112
#1: (mm_all_locks_mutex){--..}, at: [<ffffffff8029963e>] mm_take_all_locks+0x34/0xea
#2: (&anon_vma->lock){----}, at: [<ffffffff8029967a>] mm_take_all_locks+0x70/0xea
#3: (&anon_vma->lock){----}, at: [<ffffffff8029967a>] mm_take_all_locks+0x70/0xea
#4: (&anon_vma->lock){----}, at: [<ffffffff8029967a>] mm_take_all_locks+0x70/0xea
stack backtrace:
Pid: 2033, comm: qemu-kvm Not tainted 2.6.27-rc1 #270
Call Trace:
[<ffffffff8025b7c7>] print_circular_bug_tail+0xb8/0xc3
[<ffffffff8025ca54>] __lock_acquire+0xedb/0x14d2
[<ffffffff80259bb1>] ? add_lock_to_list+0x7e/0xad
[<ffffffff8029967a>] ? mm_take_all_locks+0x70/0xea
[<ffffffff8029967a>] ? mm_take_all_locks+0x70/0xea
[<ffffffff8025d397>] lock_release_non_nested+0x1c2/0x219
[<ffffffff802996cc>] ? mm_take_all_locks+0xc2/0xea
[<ffffffff802996cc>] ? mm_take_all_locks+0xc2/0xea
[<ffffffff8025b202>] ? trace_hardirqs_on_caller+0x4d/0x115
[<ffffffff802995d9>] ? mm_drop_all_locks+0x7f/0xb0
[<ffffffff8025d515>] lock_release+0x127/0x14a
[<ffffffff804c6403>] _spin_unlock+0x1e/0x50
[<ffffffff802995d9>] mm_drop_all_locks+0x7f/0xb0
[<ffffffff802a965d>] do_mmu_notifier_register+0xe2/0x112
[<ffffffff802a96a8>] mmu_notifier_register+0xe/0x10
[<ffffffffa0043b6b>] kvm_dev_ioctl+0x11e/0x287 [kvm]
[<ffffffff8033f9f2>] ? file_has_perm+0x83/0x8e
[<ffffffff802bd0ca>] vfs_ioctl+0x2a/0x78
[<ffffffff802bd36f>] do_vfs_ioctl+0x257/0x274
[<ffffffff802bd3e1>] sys_ioctl+0x55/0x78
[<ffffffff8020c0db>] system_call_fastpath+0x16/0x1b
Which the locking hierarchy in mm/rmap.c confirms is valid.
Although I don't think there are any users of these two locks that don't
also hold the mmap_sem (so the nesting is strictly OK), since we already
have an established order we might as well respect it.
Fix this by first taking all the mapping->i_mmap_lock instances and then
taking all the anon_vma->lock instances.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
mm/mmap.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -2358,11 +2358,17 @@ int mm_take_all_locks(struct mm_struct *
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (signal_pending(current))
goto out_unlock;
- if (vma->anon_vma)
- vm_lock_anon_vma(mm, vma->anon_vma);
if (vma->vm_file && vma->vm_file->f_mapping)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ vm_lock_anon_vma(mm, vma->anon_vma);
+ }
+
ret = 0;
out_unlock:
* Re: [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order
2008-08-07 11:25 ` [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order Peter Zijlstra
@ 2008-08-07 12:14 ` Hugh Dickins
2008-08-07 12:41 ` Peter Zijlstra
2008-08-07 21:46 ` Andrea Arcangeli
1 sibling, 1 reply; 73+ messages in thread
From: Hugh Dickins @ 2008-08-07 12:14 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, jeremy, mingo, akpm, linux-kernel,
davej
On Thu, 7 Aug 2008, Peter Zijlstra wrote:
>
> Which the locking hierarchy in mm/rmap.c confirms as 'valid'.
>
> Although I don't think there are any users of these two locks that don't
> hold the mmap_sem, therefore the nesting is strictly ok, but since we
> already have an established order, we might as well respect it.
Yes, I agree.
> Fix this by first taking all the mapping->i_mmap_lock instances and then
> take all anon_vma->lock instances.
Okay. I'd have preferred taking the anon_vma lock after i_mmap_lock
each time around the loop, but I imagine that's just as problematic
for lockdep as the original.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Hugh Dickins <hugh@veritas.com>
> ---
> mm/mmap.c | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
>
> Index: linux-2.6/mm/mmap.c
> ===================================================================
> --- linux-2.6.orig/mm/mmap.c
> +++ linux-2.6/mm/mmap.c
> @@ -2358,11 +2358,17 @@ int mm_take_all_locks(struct mm_struct *
> for (vma = mm->mmap; vma; vma = vma->vm_next) {
> if (signal_pending(current))
> goto out_unlock;
> - if (vma->anon_vma)
> - vm_lock_anon_vma(mm, vma->anon_vma);
> if (vma->vm_file && vma->vm_file->f_mapping)
> vm_lock_mapping(mm, vma->vm_file->f_mapping);
> }
> +
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if (signal_pending(current))
> + goto out_unlock;
> + if (vma->anon_vma)
> + vm_lock_anon_vma(mm, vma->anon_vma);
> + }
> +
> ret = 0;
>
> out_unlock:
* Re: [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order
2008-08-07 12:14 ` Hugh Dickins
@ 2008-08-07 12:41 ` Peter Zijlstra
2008-08-07 13:27 ` Hugh Dickins
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-07 12:41 UTC (permalink / raw)
To: Hugh Dickins
Cc: Linus Torvalds, David Miller, jeremy, mingo, akpm, linux-kernel,
davej
On Thu, 2008-08-07 at 13:14 +0100, Hugh Dickins wrote:
> On Thu, 7 Aug 2008, Peter Zijlstra wrote:
> >
> > Which the locking hierarchy in mm/rmap.c confirms as 'valid'.
> >
> > Although I don't think there are any users of these two locks that don't
> > hold the mmap_sem, therefore the nesting is strictly ok, but since we
> > already have an established order, we might as well respect it.
>
> Yes, I agree.
>
> > Fix this by first taking all the mapping->i_mmap_lock instances and then
> > take all anon_vma->lock instances.
>
> Okay. I'd have preferred taking anon_vma lock after i_mmap_lock
> each time around the loop, but imagine that's just as problematic
> for lockdep as the original.
I'm a little confused as to what you mean here, are you suggesting:
for_each_vma() {
if (file)
vm_lock_mapping();
if (anon)
vm_lock_anon();
}
?
That can still create the inverse lock order due to each vma being only
of a single type, and therefore the lock order is set by the vma order,
which can be anything.
* Re: [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order
2008-08-07 12:41 ` Peter Zijlstra
@ 2008-08-07 13:27 ` Hugh Dickins
0 siblings, 0 replies; 73+ messages in thread
From: Hugh Dickins @ 2008-08-07 13:27 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, jeremy, mingo, akpm, linux-kernel,
davej
On Thu, 7 Aug 2008, Peter Zijlstra wrote:
> On Thu, 2008-08-07 at 13:14 +0100, Hugh Dickins wrote:
> > On Thu, 7 Aug 2008, Peter Zijlstra wrote:
> > >
> > > Which the locking hierarchy in mm/rmap.c confirms as 'valid'.
> > >
> > > Although I don't think there are any users of these two locks that don't
> > > hold the mmap_sem, therefore the nesting is strictly ok, but since we
> > > already have an established order, we might as well respect it.
> >
> > Yes, I agree.
> >
> > > Fix this by first taking all the mapping->i_mmap_lock instances and then
> > > take all anon_vma->lock instances.
> >
> > Okay. I'd have preferred taking anon_vma lock after i_mmap_lock
> > each time around the loop, but imagine that's just as problematic
> > for lockdep as the original.
>
> I'm a little confused as to what you mean here, are you suggesting:
>
> for_each_vma() {
> if (file)
> vm_lock_mapping();
> if (anon)
> vm_lock_anon();
> }
>
> ?
Yes, I would have preferred that.
>
> That can still create the inverse lock order due to each vma being only
> of a single type, and therefore the lock order is set by the vma order,
> which can be anything.
Yes, I imagined it would be just as problematic for lockdep.
Hugh
* Re: [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order
2008-08-07 11:25 ` [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order Peter Zijlstra
2008-08-07 12:14 ` Hugh Dickins
@ 2008-08-07 21:46 ` Andrea Arcangeli
2008-08-08 1:34 ` Andrea Arcangeli
2008-08-08 7:16 ` Peter Zijlstra
1 sibling, 2 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-07 21:46 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel, davej
On Thu, Aug 07, 2008 at 01:25:49PM +0200, Peter Zijlstra wrote:
> Index: linux-2.6/mm/mmap.c
> ===================================================================
> --- linux-2.6.orig/mm/mmap.c
> +++ linux-2.6/mm/mmap.c
> @@ -2358,11 +2358,17 @@ int mm_take_all_locks(struct mm_struct *
> for (vma = mm->mmap; vma; vma = vma->vm_next) {
> if (signal_pending(current))
> goto out_unlock;
> - if (vma->anon_vma)
> - vm_lock_anon_vma(mm, vma->anon_vma);
> if (vma->vm_file && vma->vm_file->f_mapping)
> vm_lock_mapping(mm, vma->vm_file->f_mapping);
> }
> +
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if (signal_pending(current))
> + goto out_unlock;
> + if (vma->anon_vma)
> + vm_lock_anon_vma(mm, vma->anon_vma);
> + }
> +
> ret = 0;
I'd suggest adding a comment to document that this is just to reduce the
amount of false positives that lockdep emits; otherwise it'll be
optimized away again sooner or later, I guess. I'm fine either way,
even if this runs a bit slower. Note that I _strongly_ support
this kind of lockdep change, because changes like the above accommodate
check_noncircular, which is a very useful feature of
prove-locking that can find bugs that would otherwise go unnoticed
(even if it clearly emits false positives at will, like above).
As for 8/7 you know my opinion from somebody who's way beyond the
point: check_deadlock should be dropped and we'd rather spend time
incorporating kdb/nlkd/whatever if sysrq/nmiwatchdog/kgdb aren't
already friendly enough for casual driver developers who may not be
able to read assembly or set up kgdb, to make a recursion deadlock
trivial to identify by other means (furthermore, if those developers
can't use sysrq/nmiwatchdog/kgdb/systemtap, a real debugger will help
them with many other things like single-stepping, so they don't have to
add printk all over the place to debug). So I really dislike 8/7 and
furthermore I doubt it works because with regular kvm I get 57 vmas
filebacked alone, did you test 8/7, I didn't yet but I can test if you
didn't. It's true mmu notifier registration happens at the very early
stage of vm creation but most libs are loaded by the dynamic linker
before it. In any case, instead of 8/7 I prefer my trylock patch to
trying to accommodate the useless check_deadlock (again, useless from
someone who's beyond the point; agree to disagree here).
Thanks for the help in cleaning up these lockdep bits!
* Re: [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order
2008-08-07 21:46 ` Andrea Arcangeli
@ 2008-08-08 1:34 ` Andrea Arcangeli
2008-08-08 7:16 ` Peter Zijlstra
1 sibling, 0 replies; 73+ messages in thread
From: Andrea Arcangeli @ 2008-08-08 1:34 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel, davej
On Thu, Aug 07, 2008 at 11:46:42PM +0200, Andrea Arcangeli wrote:
> furthermore I doubt it works because with regular kvm I get 57 vmas
> filebacked alone, did you test 8/7, I didn't yet but I can test if you
Correction: 57 is the number of vmas, but counting with something less
coarse than grep lib | wc -l, there are fewer inodes backing those vmas
than the nesting limit, so 8/7 is OK for now (as long as check_deadlock
exists, at least).
Thanks again!
* Re: [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order
2008-08-07 21:46 ` Andrea Arcangeli
2008-08-08 1:34 ` Andrea Arcangeli
@ 2008-08-08 7:16 ` Peter Zijlstra
1 sibling, 0 replies; 73+ messages in thread
From: Peter Zijlstra @ 2008-08-08 7:16 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Linus Torvalds, David Miller, jeremy, hugh, mingo, akpm,
linux-kernel, davej
On Thu, 2008-08-07 at 23:46 +0200, Andrea Arcangeli wrote:
> As for 8/7 you know my opinion from somebody who's way beyond the
> point: check_deadlock should be dropped
I'll try again one more time, don't feel obliged to reply or
anything :-)
Suppose you have objects initialized from a single site:
struct my_obj *create_obj()
{
...
spin_lock_init(&obj->lock);
...
return obj;
}
So that all these objects' locks are in a single class.
Now, put these objects into two lists without fixed order.
L1: A B C D
L2: B A D C
If you were to lock-couple your way through these lists there is a
deadlock potential.
The check_noncircular stuff will not catch this due to it all being of a
single class. The only thing we have to indicate you need to pay
attention is check_deadlock.
Yes, check_deadlock is a tad too sensitive in the amount of false
positives, but without it there are gaping holes in 'proving' lock
correctness.
Currently - if you get 100% code coverage and lockdep doesn't shout,
you're good. If we were to drop check_deadlock we can't say that
anymore.
* Re: [RFC][PATCH 0/7] lockdep
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
` (8 preceding siblings ...)
2008-08-07 11:25 ` [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order Peter Zijlstra
@ 2008-08-11 10:08 ` Ingo Molnar
9 siblings, 0 replies; 73+ messages in thread
From: Ingo Molnar @ 2008-08-11 10:08 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Linus Torvalds, David Miller, jeremy, hugh, akpm, linux-kernel,
davej
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Hi,
>
> This series is my pending lockdep queue, it includes David's graph
> optimization, my new scheduler runqueue annotation amongst others.
>
> The part that makes this RFC are the latter few patches that add the
> "lock protection locks" thing Linus proposed a few days ago. I've not
> yet written a test case that actively tests this functionality, but
> wanted to get this out so that Jeremy can see if it works for him.
update: i'm now back from vacation and i've added these patches to
tip/core/locking, but -tip testing found a few problems with them so i
won't push them upstream yet.
Ingo
* Re: [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal.
2008-08-05 8:46 ` Peter Zijlstra
@ 2008-08-13 3:48 ` Tim Pepper
2008-08-13 10:56 ` Ingo Molnar
0 siblings, 1 reply; 73+ messages in thread
From: Tim Pepper @ 2008-08-13 3:48 UTC (permalink / raw)
To: Peter Zijlstra
Cc: David Miller, torvalds, jeremy, hugh, mingo, akpm, linux-kernel,
davej
On Tue, Aug 5, 2008 at 1:46 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Tue, 2008-08-05 at 01:34 -0700, David Miller wrote:
>> See:
>>
>> http://marc.info/?l=linux-kernel&m=121758260130275&w=2
>
> Probably missed that, thanks for reminding me.
>
> Andrew, please pick up:
>
> ---
> From: Ingo Molnar <mingo@elte.hu>
> Date: Fri, 1 Aug 2008 11:23:50 +0200
> Subject: [PATCH] lockdep: build fix
>
> fix:
>
> kernel/built-in.o: In function `lockdep_stats_show':
> lockdep_proc.c:(.text+0x3cb2f): undefined reference to `lockdep_count_forward_deps'
> kernel/built-in.o: In function `l_show':
> lockdep_proc.c:(.text+0x3d02b): undefined reference to `lockdep_count_forward_deps'
> lockdep_proc.c:(.text+0x3d047): undefined reference to `lockdep_count_backward_deps'
>
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> ---
> kernel/lockdep_internals.h | 13 +++++++++++++
> 1 files changed, 13 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
> index 68d44ec..f5c6a14 100644
> --- a/kernel/lockdep_internals.h
> +++ b/kernel/lockdep_internals.h
> @@ -53,8 +53,21 @@ extern unsigned int nr_process_chains;
> extern unsigned int max_lockdep_depth;
> extern unsigned int max_recursion_depth;
>
> +#ifdef CONFIG_PROVE_LOCKING
> extern unsigned long lockdep_count_forward_deps(struct lock_class *);
> extern unsigned long lockdep_count_backward_deps(struct lock_class *);
> +#else
> +static inline unsigned long
> +lockdep_count_forward_deps(struct lock_class *class)
> +{
> + return 0;
> +}
> +static inline unsigned long
> +lockdep_count_backward_deps(struct lock_class *class)
> +{
> + return 0;
> +}
> +#endif
>
> #ifdef CONFIG_DEBUG_LOCKDEP
> /*
>
>
Looks like this is needed for 2.6.27-rc3 to build here. Also noted,
regardless of the patch:
kernel/lockdep.c:580: warning: 'print_lock_dependencies' defined but not used
Tim
* Re: [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal.
2008-08-13 3:48 ` Tim Pepper
@ 2008-08-13 10:56 ` Ingo Molnar
0 siblings, 0 replies; 73+ messages in thread
From: Ingo Molnar @ 2008-08-13 10:56 UTC (permalink / raw)
To: Tim Pepper
Cc: Peter Zijlstra, David Miller, torvalds, jeremy, hugh, akpm,
linux-kernel, davej
* Tim Pepper <tpepper@gmail.com> wrote:
> On Tue, Aug 5, 2008 at 1:46 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > On Tue, 2008-08-05 at 01:34 -0700, David Miller wrote:
> >> See:
> >>
> >> http://marc.info/?l=linux-kernel&m=121758260130275&w=2
> >
> > Probably missed that, thanks for reminding me.
> >
> > Andrew, please pick up:
> >
> > ---
> > From: Ingo Molnar <mingo@elte.hu>
> > Date: Fri, 1 Aug 2008 11:23:50 +0200
> > Subject: [PATCH] lockdep: build fix
> >
> > fix:
> >
> > kernel/built-in.o: In function `lockdep_stats_show':
> > lockdep_proc.c:(.text+0x3cb2f): undefined reference to `lockdep_count_forward_deps'
> > kernel/built-in.o: In function `l_show':
> > lockdep_proc.c:(.text+0x3d02b): undefined reference to `lockdep_count_forward_deps'
> > lockdep_proc.c:(.text+0x3d047): undefined reference to `lockdep_count_backward_deps'
> >
> > Signed-off-by: Ingo Molnar <mingo@elte.hu>
> > ---
> > kernel/lockdep_internals.h | 13 +++++++++++++
> > 1 files changed, 13 insertions(+), 0 deletions(-)
> >
> > diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
> > index 68d44ec..f5c6a14 100644
> > --- a/kernel/lockdep_internals.h
> > +++ b/kernel/lockdep_internals.h
> > @@ -53,8 +53,21 @@ extern unsigned int nr_process_chains;
> > extern unsigned int max_lockdep_depth;
> > extern unsigned int max_recursion_depth;
> >
> > +#ifdef CONFIG_PROVE_LOCKING
> > extern unsigned long lockdep_count_forward_deps(struct lock_class *);
> > extern unsigned long lockdep_count_backward_deps(struct lock_class *);
> > +#else
> > +static inline unsigned long
> > +lockdep_count_forward_deps(struct lock_class *class)
> > +{
> > + return 0;
> > +}
> > +static inline unsigned long
> > +lockdep_count_backward_deps(struct lock_class *class)
> > +{
> > + return 0;
> > +}
> > +#endif
> >
> > #ifdef CONFIG_DEBUG_LOCKDEP
> > /*
> >
> >
>
>
> Looks like this is needed for 2.6.27-rc3 to build here. [...]
ok, i've put this into tip/core/urgent, i mistakenly applied it to
tip/master so it didn't go to Linus in time. Below is the commit from
tip/core/urgent.
Ingo
----------------->
From d6672c501852d577097f6757c311d937aca0b04b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 1 Aug 2008 11:23:50 +0200
Subject: [PATCH] lockdep: build fix
fix:
kernel/built-in.o: In function `lockdep_stats_show':
lockdep_proc.c:(.text+0x3cb2f): undefined reference to `lockdep_count_forward_deps'
kernel/built-in.o: In function `l_show':
lockdep_proc.c:(.text+0x3d02b): undefined reference to `lockdep_count_forward_deps'
lockdep_proc.c:(.text+0x3d047): undefined reference to `lockdep_count_backward_deps'
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/lockdep_internals.h | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 55db193..56b1969 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -50,8 +50,21 @@ extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
extern unsigned int max_recursion_depth;
+#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+ return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+ return 0;
+}
+#endif
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-08-04 19:31 ` Linus Torvalds
2008-08-04 19:39 ` Peter Zijlstra
2008-08-04 20:16 ` Jeremy Fitzhardinge
@ 2008-10-08 15:27 ` Steven Rostedt
2008-10-08 15:43 ` Linus Torvalds
2008-10-08 15:52 ` Nick Piggin
2 siblings, 2 replies; 73+ messages in thread
From: Steven Rostedt @ 2008-10-08 15:27 UTC (permalink / raw)
To: Linus Torvalds
Cc: Jeremy Fitzhardinge, Peter Zijlstra, David Miller, hugh, mingo,
akpm, linux-kernel, davej, srostedt
On Mon, Aug 04, 2008 at 12:31:22PM -0700, Linus Torvalds wrote:
>
>
> On Mon, 4 Aug 2008, Jeremy Fitzhardinge wrote:
> >
> > OK. I don't actually need to do this, but I was asking for completeness. But
> > to clarify, you only need to do the reverse unlock if you do it after
> > unlocking the outer lock? If you're still holding the outer lock, you can
> > unlock in any order?
>
> Release order should always be totally irrelevant, whether you hold outer
> locks or not. Only the order of _getting_ locks matter.
Technically, you are 100% correct.
>
> And yes, if there is an outer lock, even the order of getting locks is
> irrelevant, as long as anybody who gets more than one inner lock always
> holds the outer one.
But I need to disagree on programming practice style. Unlocking locks
in a non-nested order is just bad programming practice. Unless there is
a good reason to do so, one should never release locks other than in the
reverse of the order they were taken.
This can be a source of bugs, where people might notice an outer lock
being released and think the inner locks were too.
Lately the kernel has been going through a lot of clean ups that have
been making the kernel a much more maintainable beast. I feel we should
enforce the rule of unlocking order (again, unless there is a good
reason not to). Not for a technical reason, but just for a more
maintainable one.
-- Steve
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-10-08 15:27 ` Steven Rostedt
@ 2008-10-08 15:43 ` Linus Torvalds
2008-10-08 16:03 ` Steven Rostedt
2008-10-08 15:52 ` Nick Piggin
1 sibling, 1 reply; 73+ messages in thread
From: Linus Torvalds @ 2008-10-08 15:43 UTC (permalink / raw)
To: Steven Rostedt
Cc: Jeremy Fitzhardinge, Peter Zijlstra, David Miller, hugh, mingo,
akpm, linux-kernel, davej, srostedt
On Wed, 8 Oct 2008, Steven Rostedt wrote:
> >
> > And yes, if there is an outer lock, even the order of getting locks is
> > irrelevant, as long as anybody who gets more than one inner lock always
> > holds the outer one.
>
> But I need to disagree on a programming practice style. Unlocking locks
> in a non nested order is just bad programming practice.
No it is not.
Unlocking locks in non-nested order can sometimes be very much the right
thing to do, and thinking otherwise is (a) naive and (b) can generate
totally unnecessary and pointless bugs.
The thing is, sometimes you have to do it, and imposing totally made-up
rules ("unlocks have to nest") just confuses everybody.
The FACT is, that unlocks do not have to nest cleanly. That's a rock solid
*FACT*. The locking order matters, and the unlocking order does not.
And if you cannot accept that as a fact, and you then say "unlock order
should matter just to keep things nice and clean", then you end up being
screwed and/or confused when you can't hold to the unlock order.
There are many perfectly valid reasons not to unlock in reverse order.
Don't create make-believe rules that break those reasons for no gain.
For example:
- let's say that you have a singly-linked list of objects.
- you need to lock all objects, do something, and then unlock all
objects.
- the *only* sane way to do that is to just traverse the list twice.
- that means that you will unlock the objects in the same order you
locked them, _not_ in reverse ("nested") order.
- if you force a rule of "unlocks must be nested", then
YOU ARE A F*CKING MORON.
It's that simple. Don't do made-up rules that have no technical reason for
them.
Lock ordering matters. Unlock ordering does not. It really is that simple.
Don't confuse the issue by claiming anything else.
Linus
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-10-08 15:27 ` Steven Rostedt
2008-10-08 15:43 ` Linus Torvalds
@ 2008-10-08 15:52 ` Nick Piggin
2008-10-08 17:18 ` Steven Rostedt
1 sibling, 1 reply; 73+ messages in thread
From: Nick Piggin @ 2008-10-08 15:52 UTC (permalink / raw)
To: Steven Rostedt
Cc: Linus Torvalds, Jeremy Fitzhardinge, Peter Zijlstra, David Miller,
hugh, mingo, akpm, linux-kernel, davej, srostedt
On Thursday 09 October 2008 02:27, Steven Rostedt wrote:
> On Mon, Aug 04, 2008 at 12:31:22PM -0700, Linus Torvalds wrote:
> > On Mon, 4 Aug 2008, Jeremy Fitzhardinge wrote:
> > > OK. I don't actually need to do this, but I was asking for
> > > completeness. But to clarify, you only need to do the reverse unlock
> > > if you do it after unlocking the outer lock? If you're still holding
> > > the outer lock, you can unlock in any order?
> >
> > Release order should always be totally irrelevant, whether you hold outer
> > locks or not. Only the order of _getting_ locks matter.
>
> Technically, you are 100% correct.
>
> > And yes, if there is an outer lock, even the order of getting locks is
> > irrelevant, as long as anybody who gets more than one inner lock always
> > holds the outer one.
>
> But I need to disagree on a programming practice style. Unlocking locks
> in a non nested order is just bad programming practice. Unless there is
> a good reason to do so, one should never release locks in a non reverse
> order they were taken.
An outer one might be more likely to be contended, so you might want
to release it asap.
Other times, you have lock A and lock B held (like scheduler rqs).
You can say unlock(A); unlock(B);
or: if (A < B) unlock(B); unlock(A); if (A > B) unlock(B);
> This can be a source of bugs, where people might notice an outer lock
> being released and think the inner locks were too.
>
> Lately the kernel has been going through a lot of clean ups that have
> been making the kernel a much more maintainable beast. I feel we should
> enforce the rule of unlocking order (again, unless there is a good
> reason not to). Not for a technical reason, but just for a more
> maintainable one.
I don't really think it would make things more maintainable, FWIW.
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-10-08 15:43 ` Linus Torvalds
@ 2008-10-08 16:03 ` Steven Rostedt
2008-10-08 16:19 ` Linus Torvalds
0 siblings, 1 reply; 73+ messages in thread
From: Steven Rostedt @ 2008-10-08 16:03 UTC (permalink / raw)
To: Linus Torvalds
Cc: Steven Rostedt, Jeremy Fitzhardinge, Peter Zijlstra, David Miller,
hugh, mingo, akpm, linux-kernel, davej
Linus Torvalds wrote:
> On Wed, 8 Oct 2008, Steven Rostedt wrote:
>
>>> And yes, if there is an outer lock, even the order of getting locks is
>>> irrelevant, as long as anybody who gets more than one inner lock always
>>> holds the outer one.
>>>
>> But I need to disagree on a programming practice style. Unlocking locks
>> in a non nested order is just bad programming practice.
>>
>
> No it is not.
>
> Unlocking locks in non-nested order can sometimes be very much the rigth
> thing to do, and thinking otherwise is (a) naive and (b) can generate
> totally unnecessary and pointless bugs.
>
> The thing is, sometimes you have to do it, and imposing totally made-up
> rules ("unlocks have to nest") just confuses everybody.
>
> The FACT is, that unlocks do not have to nest cleanly. That's a rock solid
> *FACT*. The locking order matters, and the unlocking order does not.
>
> And if you cannot accept that as a fact, and you then say "unlock order
> should matter just to keep things nice and clean", then you end up being
> screwed and/or confused when you can't hold to the unlock order.
>
> There are many perfectly valid reasons not to unlock in reverse order.
> Don't create make-believe rules that break those reasons for no gain.
>
Unfortunately, you cut out my comment where I stated "unless there is a
good reason not to", and the example below is just such a reason ;-)
> For example:
> - let's say that you have a singly-linked list of objects.
> - you need to lock all objects, do something, and then unlock all
> objects.
> - the *only* sane way to do that is to just traverse the list twice.
> - that means that you will unlock the objects in the same order you
> locked them, _not_ in reverse ("nested") order.
> - if you force a rule of "unlocks must be nested", then
>
> YOU ARE A F*CKING MORON.
>
> It's that simple. Don't do made-up rules that have no technical reason for
> them.
>
> Lock ordering matters. Unlock ordering does not. It really is that simple.
> Don't confuse the issue by claiming anything else.
>
Yes, I totally agree that there are good reasons not to unlock in
reverse order, and the example you gave happens to be one of them.
I just find that seeing something like:
lock(A);
lock(B);
[do something]
unlock(A);
unlock(B);
just seems to be sloppy.
I won't harp on this; it only came up in conversation when someone
pointed out your post.
-- Steve
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-10-08 16:03 ` Steven Rostedt
@ 2008-10-08 16:19 ` Linus Torvalds
2008-10-08 16:53 ` Steven Rostedt
0 siblings, 1 reply; 73+ messages in thread
From: Linus Torvalds @ 2008-10-08 16:19 UTC (permalink / raw)
To: Steven Rostedt
Cc: Steven Rostedt, Jeremy Fitzhardinge, Peter Zijlstra, David Miller,
hugh, mingo, akpm, linux-kernel, davej
On Wed, 8 Oct 2008, Steven Rostedt wrote:
>
> Unfortunately, you cut out my comment that I stated "unless there is a good
> reason not to", which the below example is a good reason ;-)
Well, the thing is, as long as you think nesting is good, you end up being
very confused when it isn't.
> I just find that seeing something like:
>
> lock(A);
> lock(B);
>
> [do something]
>
> unlock(A);
> unlock(B);
>
> just seems to be sloppy.
Of course you'll often get nesting almost by mistake.
For example, any normal locking that is hierarchical tends to nest
naturally, especially if you end up using lots of small functions (which
you should). Or simply due to error handling.
So in that sense, nesting may be a "natural" thing, but partly exactly
_because_ it is fairly natural, we should not try to make it even more
than that, because then when things don't nest (and it does happen), if
the "things should nest" camp gets too strong, bugs happen.
They happen because somebody looks at the non-nesting case, and thinks
it's sloppy, and tries to fix it. And in the process just complicates
matters - and very likely introduces bugs.
And THAT is why people shouldn't think that locks "should" nest. Because
they shouldn't. They often do, but we're better off making people very
aware of the fact that 'often' isn't 'good design', it's just a matter of
'it happens in practice' rather than 'it should be that way'.
I know for a fact that some people thought unlocking in non-nested order
was a bug. And I believe that belief is a dangerous one.
Linus
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-10-08 16:19 ` Linus Torvalds
@ 2008-10-08 16:53 ` Steven Rostedt
0 siblings, 0 replies; 73+ messages in thread
From: Steven Rostedt @ 2008-10-08 16:53 UTC (permalink / raw)
To: Linus Torvalds
Cc: Steven Rostedt, Jeremy Fitzhardinge, Peter Zijlstra, David Miller,
hugh, mingo, akpm, linux-kernel, davej
Linus Torvalds wrote:
>
>
> I know for a fact that some people thought unlocking in non-nested order
> was a bug. And I believe that belief is a dangerous one.
>
Ah, OK. You are fighting against nesting nazis, fair enough.
I have written a bit of code where nesting was not possible (similar to
your example, though I call that traversal locking rather than nesting).
I just find that locks should be nested when the nesting is natural.
Breaking the nesting on naturally nesting locks is a bug, IMHO. But as
you know, there are several programmers out there who cannot tell the
difference between naturally nesting locks and non-nesting locks.
By adding such a rule, those who cannot tell the difference would make a
lot of needless noise, hence it is best not to make any such rule.
Lesson learned. I'll now go back to debugging my code.
-- Steve
* Re: [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock()
2008-10-08 15:52 ` Nick Piggin
@ 2008-10-08 17:18 ` Steven Rostedt
0 siblings, 0 replies; 73+ messages in thread
From: Steven Rostedt @ 2008-10-08 17:18 UTC (permalink / raw)
To: Nick Piggin
Cc: Steven Rostedt, Linus Torvalds, Jeremy Fitzhardinge,
Peter Zijlstra, David Miller, hugh, mingo, akpm, linux-kernel,
davej
Nick Piggin wrote:
>
>> This can be a source of bugs, where people might notice an outer lock
>> being released and think the inner locks were too.
>>
>> Lately the kernel has been going through a lot of clean ups that have
>> been making the kernel a much more maintainable beast. I feel we should
>> enforce the rule of unlocking order (again, unless there is a good
>> reason not to). Not for a technical reason, but just for a more
>> maintainable one.
>>
>
> I don't really think it would make things more maintainable, FWIW.
>
I actually did come across one bug in my lifetime caused by unlocking in
non-nested order. IIRC, it had to do (as Linus mentioned) with lots of
little functions that required locking. One of these functions sat between
the out-of-order unlocks and was taking another inner lock.
I don't remember the exact details, but it was something that made me try
to nest locks and unlocks nicely when possible. And as Linus pointed out,
there are several cases where it just doesn't make sense to nest.
-- Steve
end of thread, other threads:[~2008-10-08 17:19 UTC | newest]
Thread overview: 73+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-08-04 13:03 [RFC][PATCH 0/7] lockdep Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 1/7] lockdep: Fix combinatorial explosion in lock subgraph traversal Peter Zijlstra
2008-08-05 8:34 ` David Miller
2008-08-05 8:46 ` Peter Zijlstra
2008-08-13 3:48 ` Tim Pepper
2008-08-13 10:56 ` Ingo Molnar
2008-08-04 13:03 ` [RFC][PATCH 2/7] lockdep: lock_set_subclass - reset a held locks subclass Peter Zijlstra
2008-08-05 8:35 ` David Miller
2008-08-04 13:03 ` [RFC][PATCH 3/7] lockdep: re-annotate scheduler runqueues Peter Zijlstra
2008-08-05 8:35 ` David Miller
2008-08-04 13:03 ` [RFC][PATCH 4/7] lockdep: shrink held_lock structure Peter Zijlstra
2008-08-05 16:08 ` Peter Zijlstra
2008-08-06 7:17 ` Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 5/7] lockdep: map_acquire Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 6/7] lockdep: lock protection locks Peter Zijlstra
2008-08-04 13:03 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
2008-08-04 14:07 ` Roland Dreier
2008-08-04 14:19 ` Peter Zijlstra
2008-08-04 14:26 ` Roland Dreier
2008-08-04 14:32 ` Peter Zijlstra
2008-08-04 14:53 ` Dave Jones
2008-08-04 14:56 ` Peter Zijlstra
2008-08-04 16:26 ` Andrea Arcangeli
2008-08-04 16:38 ` Peter Zijlstra
2008-08-04 17:27 ` Andrea Arcangeli
2008-08-04 17:46 ` Andrea Arcangeli
2008-08-04 17:57 ` [PATCH] workaround minor lockdep bug triggered by mm_take_all_locks Andrea Arcangeli
2008-08-04 18:48 ` Peter Zijlstra
2008-08-04 18:56 ` Roland Dreier
2008-08-04 19:05 ` Peter Zijlstra
2008-08-04 20:15 ` Andrea Arcangeli
2008-08-04 20:37 ` Peter Zijlstra
2008-08-04 21:09 ` Andrea Arcangeli
2008-08-04 21:14 ` Pekka Enberg
2008-08-04 21:30 ` Andrea Arcangeli
2008-08-04 21:41 ` Andrew Morton
2008-08-04 22:12 ` Andrea Arcangeli
2008-08-04 21:42 ` Arjan van de Ven
2008-08-04 22:30 ` Andrea Arcangeli
2008-08-04 23:38 ` Arjan van de Ven
2008-08-05 0:47 ` Andrea Arcangeli
2008-08-04 21:27 ` Arjan van de Ven
2008-08-04 21:54 ` Andrea Arcangeli
2008-08-04 21:57 ` David Miller
2008-08-05 2:00 ` Roland Dreier
2008-08-05 2:18 ` Andrea Arcangeli
2008-08-05 12:02 ` Roland Dreier
2008-08-05 12:20 ` Andrea Arcangeli
2008-08-04 18:48 ` [RFC][PATCH 7/7] lockdep: spin_lock_nest_lock() Peter Zijlstra
2008-08-04 21:32 ` David Miller
2008-08-04 18:06 ` Jeremy Fitzhardinge
2008-08-04 18:54 ` Peter Zijlstra
2008-08-04 19:26 ` Jeremy Fitzhardinge
2008-08-04 19:31 ` Linus Torvalds
2008-08-04 19:39 ` Peter Zijlstra
2008-08-04 20:16 ` Jeremy Fitzhardinge
2008-10-08 15:27 ` Steven Rostedt
2008-10-08 15:43 ` Linus Torvalds
2008-10-08 16:03 ` Steven Rostedt
2008-10-08 16:19 ` Linus Torvalds
2008-10-08 16:53 ` Steven Rostedt
2008-10-08 15:52 ` Nick Piggin
2008-10-08 17:18 ` Steven Rostedt
2008-08-07 11:25 ` Peter Zijlstra
2008-08-07 11:25 ` [RFC][PATCH 8/7] lockdep: annotate mm_take_all_locks() Peter Zijlstra
2008-08-07 11:25 ` [RFC][PATCH 9/7] mm: fix mm_take_all_locks() locking order Peter Zijlstra
2008-08-07 12:14 ` Hugh Dickins
2008-08-07 12:41 ` Peter Zijlstra
2008-08-07 13:27 ` Hugh Dickins
2008-08-07 21:46 ` Andrea Arcangeli
2008-08-08 1:34 ` Andrea Arcangeli
2008-08-08 7:16 ` Peter Zijlstra
2008-08-11 10:08 ` [RFC][PATCH 0/7] lockdep Ingo Molnar