linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] fs: fix data races on inode->i_flctx
@ 2015-09-21  7:43 Dmitry Vyukov
  2015-09-21 10:50 ` Jeff Layton
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Vyukov @ 2015-09-21  7:43 UTC (permalink / raw)
  To: jlayton, bfields, viro, linux-fsdevel, linux-kernel
  Cc: kcc, andreyknvl, glider, ktsan, paulmck, Dmitry Vyukov

locks_get_lock_context() uses cmpxchg() to install i_flctx.
cmpxchg() is a release operation which is correct. But it uses
a plain load to load i_flctx. This is incorrect. Subsequent loads
from i_flctx can hoist above the load of i_flctx pointer itself
and observe uninitialized garbage there. This in turn can lead
to corruption of ctx->flc_lock and other members.

Documentation/memory-barriers.txt explicitly requires to use
a barrier in such context:
"A load-load control dependency requires a full read memory barrier".

Use smp_load_acquire() in locks_get_lock_context() and in bunch
of other functions that can proceed concurrently with
locks_get_lock_context().

The data race was found with KernelThreadSanitizer (KTSAN).

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
---
For the record, here is the KTSAN report:

ThreadSanitizer: data-race in _raw_spin_lock

Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
 [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
 [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
 [<     inline     >] spin_lock include/linux/spinlock.h:312
 [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
 [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
 [<ffffffff812e2814>] SyS_flock+0x224/0x23

Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
 [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
 [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
 [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
 [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
 [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
 [<     inline     >] SYSC_flock fs/locks.c:1941
 [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
 [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
---
 fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 2a54c80..316e474 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
 static struct file_lock_context *
 locks_get_lock_context(struct inode *inode, int type)
 {
-	struct file_lock_context *new;
+	struct file_lock_context *ctx;
 
-	if (likely(inode->i_flctx) || type == F_UNLCK)
+	/* paired with cmpxchg() below */
+	ctx = smp_load_acquire(&inode->i_flctx);
+	if (likely(ctx) || type == F_UNLCK)
 		goto out;
 
-	new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
-	if (!new)
+	ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+	if (!ctx)
 		goto out;
 
-	spin_lock_init(&new->flc_lock);
-	INIT_LIST_HEAD(&new->flc_flock);
-	INIT_LIST_HEAD(&new->flc_posix);
-	INIT_LIST_HEAD(&new->flc_lease);
+	spin_lock_init(&ctx->flc_lock);
+	INIT_LIST_HEAD(&ctx->flc_flock);
+	INIT_LIST_HEAD(&ctx->flc_posix);
+	INIT_LIST_HEAD(&ctx->flc_lease);
 
 	/*
 	 * Assign the pointer if it's not already assigned. If it is, then
 	 * free the context we just allocated.
 	 */
-	if (cmpxchg(&inode->i_flctx, NULL, new))
-		kmem_cache_free(flctx_cache, new);
+	if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+		kmem_cache_free(flctx_cache, ctx);
+		ctx = smp_load_acquire(&inode->i_flctx);
+	}
 out:
-	return inode->i_flctx;
+	return ctx;
 }
 
 void
@@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 	struct file_lock_context *ctx;
 	struct inode *inode = file_inode(filp);
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
 		fl->fl_type = F_UNLCK;
 		return;
@@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
 	struct file_lock_context *ctx;
 	struct file_lock *fl;
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix))
 		return 0;
 
@@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
 int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	int error = 0;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	struct file_lock *new_fl, *fl, *tmp;
 	unsigned long break_time;
 	int want_write = (mode & O_ACCMODE) != O_RDONLY;
@@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	new_fl->fl_flags = type;
 
 	/* typically we will check that ctx is non-NULL before calling */
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx) {
 		WARN_ON_ONCE(1);
 		return error;
@@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
 void lease_get_mtime(struct inode *inode, struct timespec *time)
 {
 	bool has_lease = false;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	struct file_lock *fl;
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
 		if (!list_empty(&ctx->flc_lease)) {
@@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
 	struct inode *inode = file_inode(filp);
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	int type = F_UNLCK;
 	LIST_HEAD(dispose);
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
 		time_out_leases(file_inode(filp), &dispose);
@@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
 	struct file_lock *fl, *victim = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	LIST_HEAD(dispose);
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx) {
 		trace_generic_delete_lease(inode, NULL);
 		return error;
@@ -2359,13 +2367,14 @@ out:
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
 	struct file_lock lock;
-	struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+	struct file_lock_context *ctx;
 
 	/*
 	 * If there are no locks held on this file, we don't need to call
 	 * posix_lock_file().  Another process could be setting a lock on this
 	 * file at the same time, but we wouldn't remove that lock anyway.
 	 */
+	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
 	if (!ctx || list_empty(&ctx->flc_posix))
 		return;
 
@@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
 
 /* The i_flctx must be valid when calling into here */
 static void
-locks_remove_flock(struct file *filp)
+locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
 {
 	struct file_lock fl = {
 		.fl_owner = filp,
@@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
 		.fl_end = OFFSET_MAX,
 	};
 	struct inode *inode = file_inode(filp);
-	struct file_lock_context *flctx = inode->i_flctx;
 
 	if (list_empty(&flctx->flc_flock))
 		return;
@@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
 
 /* The i_flctx must be valid when calling into here */
 static void
-locks_remove_lease(struct file *filp)
+locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
-	struct file_lock_context *ctx = inode->i_flctx;
 	struct file_lock *fl, *tmp;
 	LIST_HEAD(dispose);
 
@@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
  */
 void locks_remove_file(struct file *filp)
 {
-	if (!file_inode(filp)->i_flctx)
+	struct file_lock_context *ctx;
+
+	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+	if (!ctx)
 		return;
 
 	/* remove any OFD locks */
 	locks_remove_posix(filp, filp);
 
 	/* remove flock locks */
-	locks_remove_flock(filp);
+	locks_remove_flock(filp, ctx);
 
 	/* remove any leases */
-	locks_remove_lease(filp);
+	locks_remove_lease(filp, ctx);
 }
 
 /**
@@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
 	struct file_lock_context *ctx;
 	int id = 0;
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx)
 		return;
 
-- 
2.6.0.rc0.131.gf624c3d

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21  7:43 [PATCH] fs: fix data races on inode->i_flctx Dmitry Vyukov
@ 2015-09-21 10:50 ` Jeff Layton
  2015-09-21 10:53   ` Dmitry Vyukov
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Layton @ 2015-09-21 10:50 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: bfields, viro, linux-fsdevel, linux-kernel, kcc, andreyknvl,
	glider, ktsan, paulmck

On Mon, 21 Sep 2015 09:43:06 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> cmpxchg() is a release operation which is correct. But it uses
> a plain load to load i_flctx. This is incorrect. Subsequent loads
> from i_flctx can hoist above the load of i_flctx pointer itself
> and observe uninitialized garbage there. This in turn can lead
> to corruption of ctx->flc_lock and other members.
> 

I don't get this bit. The i_flctx field is initialized to zero when the
inode is allocated, and we only assign it with cmpxchg(). How would you
ever see uninitialized garbage in there? It should either be NULL or a
valid pointer, no?

If that'st the case, then most of the places where you're adding
smp_load_acquire are places that can tolerate seeing NULL there. e.g.
you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
something.

> Documentation/memory-barriers.txt explicitly requires to use
> a barrier in such context:
> "A load-load control dependency requires a full read memory barrier".
> 

The same file also says:

"Any atomic operation that modifies some state in memory and returns information
about the state (old or new) implies an SMP-conditional general memory barrier
(smp_mb()) on each side of the actual operation (with the exception of
explicit lock operations, described later).  These include:

...
/* when succeeds */
cmpxchg();
"

If there is an implied smp_mb() before and after the cmpxchg(), how
could the CPU hoist anything before that point?

Note that I'm not saying that you're wrong, I just want to make sure I
fully understand the problem before we go trying to fix it.

> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> of other functions that can proceed concurrently with
> locks_get_lock_context().
> 
> The data race was found with KernelThreadSanitizer (KTSAN).
> 
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> ---
> For the record, here is the KTSAN report:
> 
> ThreadSanitizer: data-race in _raw_spin_lock
> 
> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> 
> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>  [<     inline     >] SYSC_flock fs/locks.c:1941
>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> ---
>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>  1 file changed, 36 insertions(+), 27 deletions(-)
> 
> diff --git a/fs/locks.c b/fs/locks.c
> index 2a54c80..316e474 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>  static struct file_lock_context *
>  locks_get_lock_context(struct inode *inode, int type)
>  {
> -	struct file_lock_context *new;
> +	struct file_lock_context *ctx;
>  
> -	if (likely(inode->i_flctx) || type == F_UNLCK)
> +	/* paired with cmpxchg() below */
> +	ctx = smp_load_acquire(&inode->i_flctx);
> +	if (likely(ctx) || type == F_UNLCK)
>  		goto out;
>  
> -	new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> -	if (!new)
> +	ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> +	if (!ctx)
>  		goto out;
>  
> -	spin_lock_init(&new->flc_lock);
> -	INIT_LIST_HEAD(&new->flc_flock);
> -	INIT_LIST_HEAD(&new->flc_posix);
> -	INIT_LIST_HEAD(&new->flc_lease);
> +	spin_lock_init(&ctx->flc_lock);
> +	INIT_LIST_HEAD(&ctx->flc_flock);
> +	INIT_LIST_HEAD(&ctx->flc_posix);
> +	INIT_LIST_HEAD(&ctx->flc_lease);
>  
>  	/*
>  	 * Assign the pointer if it's not already assigned. If it is, then
>  	 * free the context we just allocated.
>  	 */
> -	if (cmpxchg(&inode->i_flctx, NULL, new))
> -		kmem_cache_free(flctx_cache, new);
> +	if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> +		kmem_cache_free(flctx_cache, ctx);
> +		ctx = smp_load_acquire(&inode->i_flctx);
> +	}
>  out:
> -	return inode->i_flctx;
> +	return ctx;
>  }
>  
>  void
> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>  	struct file_lock_context *ctx;
>  	struct inode *inode = file_inode(filp);
>  
> -	ctx = inode->i_flctx;
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>  		fl->fl_type = F_UNLCK;
>  		return;
> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>  	struct file_lock_context *ctx;
>  	struct file_lock *fl;
>  
> -	ctx = inode->i_flctx;
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx || list_empty_careful(&ctx->flc_posix))
>  		return 0;
>  
> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>  {
>  	int error = 0;
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	struct file_lock *new_fl, *fl, *tmp;
>  	unsigned long break_time;
>  	int want_write = (mode & O_ACCMODE) != O_RDONLY;
> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>  	new_fl->fl_flags = type;
>  
>  	/* typically we will check that ctx is non-NULL before calling */
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx) {
>  		WARN_ON_ONCE(1);
>  		return error;
> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>  {
>  	bool has_lease = false;
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	struct file_lock *fl;
>  
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>  		spin_lock(&ctx->flc_lock);
>  		if (!list_empty(&ctx->flc_lease)) {
> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>  {
>  	struct file_lock *fl;
>  	struct inode *inode = file_inode(filp);
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	int type = F_UNLCK;
>  	LIST_HEAD(dispose);
>  
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>  		spin_lock(&ctx->flc_lock);
>  		time_out_leases(file_inode(filp), &dispose);
> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>  	struct file_lock *fl, *victim = NULL;
>  	struct dentry *dentry = filp->f_path.dentry;
>  	struct inode *inode = dentry->d_inode;
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	LIST_HEAD(dispose);
>  
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx) {
>  		trace_generic_delete_lease(inode, NULL);
>  		return error;
> @@ -2359,13 +2367,14 @@ out:
>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>  {
>  	struct file_lock lock;
> -	struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> +	struct file_lock_context *ctx;
>  
>  	/*
>  	 * If there are no locks held on this file, we don't need to call
>  	 * posix_lock_file().  Another process could be setting a lock on this
>  	 * file at the same time, but we wouldn't remove that lock anyway.
>  	 */
> +	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>  	if (!ctx || list_empty(&ctx->flc_posix))
>  		return;
>  
> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>  
>  /* The i_flctx must be valid when calling into here */
>  static void
> -locks_remove_flock(struct file *filp)
> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>  {
>  	struct file_lock fl = {
>  		.fl_owner = filp,
> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>  		.fl_end = OFFSET_MAX,
>  	};
>  	struct inode *inode = file_inode(filp);
> -	struct file_lock_context *flctx = inode->i_flctx;
>  
>  	if (list_empty(&flctx->flc_flock))
>  		return;
> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>  
>  /* The i_flctx must be valid when calling into here */
>  static void
> -locks_remove_lease(struct file *filp)
> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>  {
> -	struct inode *inode = file_inode(filp);
> -	struct file_lock_context *ctx = inode->i_flctx;
>  	struct file_lock *fl, *tmp;
>  	LIST_HEAD(dispose);
>  
> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>   */
>  void locks_remove_file(struct file *filp)
>  {
> -	if (!file_inode(filp)->i_flctx)
> +	struct file_lock_context *ctx;
> +
> +	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> +	if (!ctx)
>  		return;
>  
>  	/* remove any OFD locks */
>  	locks_remove_posix(filp, filp);
>  
>  	/* remove flock locks */
> -	locks_remove_flock(filp);
> +	locks_remove_flock(filp, ctx);
>  
>  	/* remove any leases */
> -	locks_remove_lease(filp);
> +	locks_remove_lease(filp, ctx);
>  }
>  
>  /**
> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>  	struct file_lock_context *ctx;
>  	int id = 0;
>  
> -	ctx = inode->i_flctx;
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx)
>  		return;
>  


-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 10:50 ` Jeff Layton
@ 2015-09-21 10:53   ` Dmitry Vyukov
  2015-09-21 10:56     ` Jeff Layton
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Vyukov @ 2015-09-21 10:53 UTC (permalink / raw)
  To: Jeff Layton
  Cc: bfields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 09:43:06 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> cmpxchg() is a release operation which is correct. But it uses
>> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> from i_flctx can hoist above the load of i_flctx pointer itself
>> and observe uninitialized garbage there. This in turn can lead
>> to corruption of ctx->flc_lock and other members.
>>
>
> I don't get this bit. The i_flctx field is initialized to zero when the
> inode is allocated, and we only assign it with cmpxchg(). How would you
> ever see uninitialized garbage in there? It should either be NULL or a
> valid pointer, no?

This is not about i_flctx pointer, this is about i_flctx object
contents (pointee).
Readers can see a non-NULL pointer, but read garbage from *i_flctx.

> If that'st the case, then most of the places where you're adding
> smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> something.
>
>> Documentation/memory-barriers.txt explicitly requires to use
>> a barrier in such context:
>> "A load-load control dependency requires a full read memory barrier".
>>
>
> The same file also says:
>
> "Any atomic operation that modifies some state in memory and returns information
> about the state (old or new) implies an SMP-conditional general memory barrier
> (smp_mb()) on each side of the actual operation (with the exception of
> explicit lock operations, described later).  These include:
>
> ...
> /* when succeeds */
> cmpxchg();
> "
>
> If there is an implied smp_mb() before and after the cmpxchg(), how
> could the CPU hoist anything before that point?
>
> Note that I'm not saying that you're wrong, I just want to make sure I
> fully understand the problem before we go trying to fix it.
>
>> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> of other functions that can proceed concurrently with
>> locks_get_lock_context().
>>
>> The data race was found with KernelThreadSanitizer (KTSAN).
>>
>> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> ---
>> For the record, here is the KTSAN report:
>>
>> ThreadSanitizer: data-race in _raw_spin_lock
>>
>> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>>
>> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>>  [<     inline     >] SYSC_flock fs/locks.c:1941
>>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> ---
>>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>>  1 file changed, 36 insertions(+), 27 deletions(-)
>>
>> diff --git a/fs/locks.c b/fs/locks.c
>> index 2a54c80..316e474 100644
>> --- a/fs/locks.c
>> +++ b/fs/locks.c
>> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>>  static struct file_lock_context *
>>  locks_get_lock_context(struct inode *inode, int type)
>>  {
>> -     struct file_lock_context *new;
>> +     struct file_lock_context *ctx;
>>
>> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> +     /* paired with cmpxchg() below */
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>> +     if (likely(ctx) || type == F_UNLCK)
>>               goto out;
>>
>> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> -     if (!new)
>> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> +     if (!ctx)
>>               goto out;
>>
>> -     spin_lock_init(&new->flc_lock);
>> -     INIT_LIST_HEAD(&new->flc_flock);
>> -     INIT_LIST_HEAD(&new->flc_posix);
>> -     INIT_LIST_HEAD(&new->flc_lease);
>> +     spin_lock_init(&ctx->flc_lock);
>> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> +     INIT_LIST_HEAD(&ctx->flc_lease);
>>
>>       /*
>>        * Assign the pointer if it's not already assigned. If it is, then
>>        * free the context we just allocated.
>>        */
>> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> -             kmem_cache_free(flctx_cache, new);
>> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> +             kmem_cache_free(flctx_cache, ctx);
>> +             ctx = smp_load_acquire(&inode->i_flctx);
>> +     }
>>  out:
>> -     return inode->i_flctx;
>> +     return ctx;
>>  }
>>
>>  void
>> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>>       struct file_lock_context *ctx;
>>       struct inode *inode = file_inode(filp);
>>
>> -     ctx = inode->i_flctx;
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>>               fl->fl_type = F_UNLCK;
>>               return;
>> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>>       struct file_lock_context *ctx;
>>       struct file_lock *fl;
>>
>> -     ctx = inode->i_flctx;
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>>               return 0;
>>
>> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>>  {
>>       int error = 0;
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       struct file_lock *new_fl, *fl, *tmp;
>>       unsigned long break_time;
>>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>>       new_fl->fl_flags = type;
>>
>>       /* typically we will check that ctx is non-NULL before calling */
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx) {
>>               WARN_ON_ONCE(1);
>>               return error;
>> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>>  {
>>       bool has_lease = false;
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       struct file_lock *fl;
>>
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>>               spin_lock(&ctx->flc_lock);
>>               if (!list_empty(&ctx->flc_lease)) {
>> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>>  {
>>       struct file_lock *fl;
>>       struct inode *inode = file_inode(filp);
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       int type = F_UNLCK;
>>       LIST_HEAD(dispose);
>>
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>>               spin_lock(&ctx->flc_lock);
>>               time_out_leases(file_inode(filp), &dispose);
>> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>>       struct file_lock *fl, *victim = NULL;
>>       struct dentry *dentry = filp->f_path.dentry;
>>       struct inode *inode = dentry->d_inode;
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       LIST_HEAD(dispose);
>>
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx) {
>>               trace_generic_delete_lease(inode, NULL);
>>               return error;
>> @@ -2359,13 +2367,14 @@ out:
>>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>>  {
>>       struct file_lock lock;
>> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> +     struct file_lock_context *ctx;
>>
>>       /*
>>        * If there are no locks held on this file, we don't need to call
>>        * posix_lock_file().  Another process could be setting a lock on this
>>        * file at the same time, but we wouldn't remove that lock anyway.
>>        */
>> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>>       if (!ctx || list_empty(&ctx->flc_posix))
>>               return;
>>
>> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>>
>>  /* The i_flctx must be valid when calling into here */
>>  static void
>> -locks_remove_flock(struct file *filp)
>> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>>  {
>>       struct file_lock fl = {
>>               .fl_owner = filp,
>> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>>               .fl_end = OFFSET_MAX,
>>       };
>>       struct inode *inode = file_inode(filp);
>> -     struct file_lock_context *flctx = inode->i_flctx;
>>
>>       if (list_empty(&flctx->flc_flock))
>>               return;
>> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>>
>>  /* The i_flctx must be valid when calling into here */
>>  static void
>> -locks_remove_lease(struct file *filp)
>> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>>  {
>> -     struct inode *inode = file_inode(filp);
>> -     struct file_lock_context *ctx = inode->i_flctx;
>>       struct file_lock *fl, *tmp;
>>       LIST_HEAD(dispose);
>>
>> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>>   */
>>  void locks_remove_file(struct file *filp)
>>  {
>> -     if (!file_inode(filp)->i_flctx)
>> +     struct file_lock_context *ctx;
>> +
>> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> +     if (!ctx)
>>               return;
>>
>>       /* remove any OFD locks */
>>       locks_remove_posix(filp, filp);
>>
>>       /* remove flock locks */
>> -     locks_remove_flock(filp);
>> +     locks_remove_flock(filp, ctx);
>>
>>       /* remove any leases */
>> -     locks_remove_lease(filp);
>> +     locks_remove_lease(filp, ctx);
>>  }
>>
>>  /**
>> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>>       struct file_lock_context *ctx;
>>       int id = 0;
>>
>> -     ctx = inode->i_flctx;
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx)
>>               return;
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>



-- 
Dmitry Vyukov, Software Engineer, dvyukov@google.com
Google Germany GmbH, Dienerstraße 12, 80331, München
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Diese E-Mail ist vertraulich. Wenn Sie nicht der richtige Adressat
sind, leiten Sie diese bitte nicht weiter, informieren Sie den
Absender und löschen Sie die E-Mail und alle Anhänge. Vielen Dank.
This e-mail is confidential. If you are not the right addressee please
do not forward it, please inform the sender, and please erase this
e-mail including any attachments. Thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 10:53   ` Dmitry Vyukov
@ 2015-09-21 10:56     ` Jeff Layton
  2015-09-21 11:00       ` Dmitry Vyukov
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Layton @ 2015-09-21 10:56 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: bfields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

On Mon, 21 Sep 2015 12:53:40 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> > On Mon, 21 Sep 2015 09:43:06 +0200
> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> >> cmpxchg() is a release operation which is correct. But it uses
> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
> >> from i_flctx can hoist above the load of i_flctx pointer itself
> >> and observe uninitialized garbage there. This in turn can lead
> >> to corruption of ctx->flc_lock and other members.
> >>
> >
> > I don't get this bit. The i_flctx field is initialized to zero when the
> > inode is allocated, and we only assign it with cmpxchg(). How would you
> > ever see uninitialized garbage in there? It should either be NULL or a
> > valid pointer, no?
> 
> This is not about i_flctx pointer, this is about i_flctx object
> contents (pointee).
> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
> 

Again, I don't get it though. The cmpxchg happens after we've
initialized the structure. Given that there are implicit full memory
barriers before and after the cmpxchg(), how do you end up with another
task seeing the pointer before it was ever initialized?

The store should only happen after the initialization has occurred, and
the loads in the other tasks shouldn't be able to see the results of
that store until then. No?

> > If that'st the case, then most of the places where you're adding
> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> > something.
> >
> >> Documentation/memory-barriers.txt explicitly requires to use
> >> a barrier in such context:
> >> "A load-load control dependency requires a full read memory barrier".
> >>
> >
> > The same file also says:
> >
> > "Any atomic operation that modifies some state in memory and returns information
> > about the state (old or new) implies an SMP-conditional general memory barrier
> > (smp_mb()) on each side of the actual operation (with the exception of
> > explicit lock operations, described later).  These include:
> >
> > ...
> > /* when succeeds */
> > cmpxchg();
> > "
> >
> > If there is an implied smp_mb() before and after the cmpxchg(), how
> > could the CPU hoist anything before that point?
> >
> > Note that I'm not saying that you're wrong, I just want to make sure I
> > fully understand the problem before we go trying to fix it.
> >
> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> >> of other functions that can proceed concurrently with
> >> locks_get_lock_context().
> >>
> >> The data race was found with KernelThreadSanitizer (KTSAN).
> >>
> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> >> ---
> >> For the record, here is the KTSAN report:
> >>
> >> ThreadSanitizer: data-race in _raw_spin_lock
> >>
> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> >>
> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> >> ---
> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
> >>  1 file changed, 36 insertions(+), 27 deletions(-)
> >>
> >> diff --git a/fs/locks.c b/fs/locks.c
> >> index 2a54c80..316e474 100644
> >> --- a/fs/locks.c
> >> +++ b/fs/locks.c
> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
> >>  static struct file_lock_context *
> >>  locks_get_lock_context(struct inode *inode, int type)
> >>  {
> >> -     struct file_lock_context *new;
> >> +     struct file_lock_context *ctx;
> >>
> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
> >> +     /* paired with cmpxchg() below */
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> +     if (likely(ctx) || type == F_UNLCK)
> >>               goto out;
> >>
> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> -     if (!new)
> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> +     if (!ctx)
> >>               goto out;
> >>
> >> -     spin_lock_init(&new->flc_lock);
> >> -     INIT_LIST_HEAD(&new->flc_flock);
> >> -     INIT_LIST_HEAD(&new->flc_posix);
> >> -     INIT_LIST_HEAD(&new->flc_lease);
> >> +     spin_lock_init(&ctx->flc_lock);
> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
> >>
> >>       /*
> >>        * Assign the pointer if it's not already assigned. If it is, then
> >>        * free the context we just allocated.
> >>        */
> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
> >> -             kmem_cache_free(flctx_cache, new);
> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> >> +             kmem_cache_free(flctx_cache, ctx);
> >> +             ctx = smp_load_acquire(&inode->i_flctx);
> >> +     }
> >>  out:
> >> -     return inode->i_flctx;
> >> +     return ctx;
> >>  }
> >>
> >>  void
> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> >>       struct file_lock_context *ctx;
> >>       struct inode *inode = file_inode(filp);
> >>
> >> -     ctx = inode->i_flctx;
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> >>               fl->fl_type = F_UNLCK;
> >>               return;
> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
> >>       struct file_lock_context *ctx;
> >>       struct file_lock *fl;
> >>
> >> -     ctx = inode->i_flctx;
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
> >>               return 0;
> >>
> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >>  {
> >>       int error = 0;
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       struct file_lock *new_fl, *fl, *tmp;
> >>       unsigned long break_time;
> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >>       new_fl->fl_flags = type;
> >>
> >>       /* typically we will check that ctx is non-NULL before calling */
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx) {
> >>               WARN_ON_ONCE(1);
> >>               return error;
> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
> >>  {
> >>       bool has_lease = false;
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       struct file_lock *fl;
> >>
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >>               spin_lock(&ctx->flc_lock);
> >>               if (!list_empty(&ctx->flc_lease)) {
> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
> >>  {
> >>       struct file_lock *fl;
> >>       struct inode *inode = file_inode(filp);
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       int type = F_UNLCK;
> >>       LIST_HEAD(dispose);
> >>
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >>               spin_lock(&ctx->flc_lock);
> >>               time_out_leases(file_inode(filp), &dispose);
> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
> >>       struct file_lock *fl, *victim = NULL;
> >>       struct dentry *dentry = filp->f_path.dentry;
> >>       struct inode *inode = dentry->d_inode;
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       LIST_HEAD(dispose);
> >>
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx) {
> >>               trace_generic_delete_lease(inode, NULL);
> >>               return error;
> >> @@ -2359,13 +2367,14 @@ out:
> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
> >>  {
> >>       struct file_lock lock;
> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>
> >>       /*
> >>        * If there are no locks held on this file, we don't need to call
> >>        * posix_lock_file().  Another process could be setting a lock on this
> >>        * file at the same time, but we wouldn't remove that lock anyway.
> >>        */
> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> >>       if (!ctx || list_empty(&ctx->flc_posix))
> >>               return;
> >>
> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
> >>
> >>  /* The i_flctx must be valid when calling into here */
> >>  static void
> >> -locks_remove_flock(struct file *filp)
> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
> >>  {
> >>       struct file_lock fl = {
> >>               .fl_owner = filp,
> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
> >>               .fl_end = OFFSET_MAX,
> >>       };
> >>       struct inode *inode = file_inode(filp);
> >> -     struct file_lock_context *flctx = inode->i_flctx;
> >>
> >>       if (list_empty(&flctx->flc_flock))
> >>               return;
> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
> >>
> >>  /* The i_flctx must be valid when calling into here */
> >>  static void
> >> -locks_remove_lease(struct file *filp)
> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
> >>  {
> >> -     struct inode *inode = file_inode(filp);
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >>       struct file_lock *fl, *tmp;
> >>       LIST_HEAD(dispose);
> >>
> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
> >>   */
> >>  void locks_remove_file(struct file *filp)
> >>  {
> >> -     if (!file_inode(filp)->i_flctx)
> >> +     struct file_lock_context *ctx;
> >> +
> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> >> +     if (!ctx)
> >>               return;
> >>
> >>       /* remove any OFD locks */
> >>       locks_remove_posix(filp, filp);
> >>
> >>       /* remove flock locks */
> >> -     locks_remove_flock(filp);
> >> +     locks_remove_flock(filp, ctx);
> >>
> >>       /* remove any leases */
> >> -     locks_remove_lease(filp);
> >> +     locks_remove_lease(filp, ctx);
> >>  }
> >>
> >>  /**
> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
> >>       struct file_lock_context *ctx;
> >>       int id = 0;
> >>
> >> -     ctx = inode->i_flctx;
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx)
> >>               return;
> >>
> >
> >
> > --
> > Jeff Layton <jlayton@poochiereds.net>
> 
> 
> 


-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 10:56     ` Jeff Layton
@ 2015-09-21 11:00       ` Dmitry Vyukov
  2015-09-21 11:17         ` Jeff Layton
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Vyukov @ 2015-09-21 11:00 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 12:53:40 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> > On Mon, 21 Sep 2015 09:43:06 +0200
>> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >
>> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> >> cmpxchg() is a release operation which is correct. But it uses
>> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> >> from i_flctx can hoist above the load of i_flctx pointer itself
>> >> and observe uninitialized garbage there. This in turn can lead
>> >> to corruption of ctx->flc_lock and other members.
>> >>
>> >
>> > I don't get this bit. The i_flctx field is initialized to zero when the
>> > inode is allocated, and we only assign it with cmpxchg(). How would you
>> > ever see uninitialized garbage in there? It should either be NULL or a
>> > valid pointer, no?
>>
>> This is not about i_flctx pointer, this is about i_flctx object
>> contents (pointee).
>> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
>>
>
> Again, I don't get it though. The cmpxchg happens after we've
> initialized the structure. Given that there are implicit full memory
> barriers before and after the cmpxchg(), how do you end up with another
> task seeing the pointer before it was ever initialized?
>
> The store should only happen after the initialization has occurred, and
> the loads in the other tasks shouldn't be able to see the results of
> that store until then. No?

If a reader looks at things in the opposite order, then it does not
matter how you store them. Reader still can observe them in the wrong
order.
Memory barriers is always a game of two. Writer needs to do stores in
the correct order and reader must do reads in the correct order. A
single memory barrier cannot possible make any sense.


>> > If that'st the case, then most of the places where you're adding
>> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
>> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
>> > something.
>> >
>> >> Documentation/memory-barriers.txt explicitly requires to use
>> >> a barrier in such context:
>> >> "A load-load control dependency requires a full read memory barrier".
>> >>
>> >
>> > The same file also says:
>> >
>> > "Any atomic operation that modifies some state in memory and returns information
>> > about the state (old or new) implies an SMP-conditional general memory barrier
>> > (smp_mb()) on each side of the actual operation (with the exception of
>> > explicit lock operations, described later).  These include:
>> >
>> > ...
>> > /* when succeeds */
>> > cmpxchg();
>> > "
>> >
>> > If there is an implied smp_mb() before and after the cmpxchg(), how
>> > could the CPU hoist anything before that point?
>> >
>> > Note that I'm not saying that you're wrong, I just want to make sure I
>> > fully understand the problem before we go trying to fix it.
>> >
>> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> >> of other functions that can proceed concurrently with
>> >> locks_get_lock_context().
>> >>
>> >> The data race was found with KernelThreadSanitizer (KTSAN).
>> >>
>> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> >> ---
>> >> For the record, here is the KTSAN report:
>> >>
>> >> ThreadSanitizer: data-race in _raw_spin_lock
>> >>
>> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>> >>
>> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
>> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> >> ---
>> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>> >>  1 file changed, 36 insertions(+), 27 deletions(-)
>> >>
>> >> diff --git a/fs/locks.c b/fs/locks.c
>> >> index 2a54c80..316e474 100644
>> >> --- a/fs/locks.c
>> >> +++ b/fs/locks.c
>> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>> >>  static struct file_lock_context *
>> >>  locks_get_lock_context(struct inode *inode, int type)
>> >>  {
>> >> -     struct file_lock_context *new;
>> >> +     struct file_lock_context *ctx;
>> >>
>> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> >> +     /* paired with cmpxchg() below */
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> +     if (likely(ctx) || type == F_UNLCK)
>> >>               goto out;
>> >>
>> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> -     if (!new)
>> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> +     if (!ctx)
>> >>               goto out;
>> >>
>> >> -     spin_lock_init(&new->flc_lock);
>> >> -     INIT_LIST_HEAD(&new->flc_flock);
>> >> -     INIT_LIST_HEAD(&new->flc_posix);
>> >> -     INIT_LIST_HEAD(&new->flc_lease);
>> >> +     spin_lock_init(&ctx->flc_lock);
>> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
>> >>
>> >>       /*
>> >>        * Assign the pointer if it's not already assigned. If it is, then
>> >>        * free the context we just allocated.
>> >>        */
>> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> >> -             kmem_cache_free(flctx_cache, new);
>> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> >> +             kmem_cache_free(flctx_cache, ctx);
>> >> +             ctx = smp_load_acquire(&inode->i_flctx);
>> >> +     }
>> >>  out:
>> >> -     return inode->i_flctx;
>> >> +     return ctx;
>> >>  }
>> >>
>> >>  void
>> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>> >>       struct file_lock_context *ctx;
>> >>       struct inode *inode = file_inode(filp);
>> >>
>> >> -     ctx = inode->i_flctx;
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>> >>               fl->fl_type = F_UNLCK;
>> >>               return;
>> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>> >>       struct file_lock_context *ctx;
>> >>       struct file_lock *fl;
>> >>
>> >> -     ctx = inode->i_flctx;
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>> >>               return 0;
>> >>
>> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >>  {
>> >>       int error = 0;
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       struct file_lock *new_fl, *fl, *tmp;
>> >>       unsigned long break_time;
>> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >>       new_fl->fl_flags = type;
>> >>
>> >>       /* typically we will check that ctx is non-NULL before calling */
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx) {
>> >>               WARN_ON_ONCE(1);
>> >>               return error;
>> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>> >>  {
>> >>       bool has_lease = false;
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       struct file_lock *fl;
>> >>
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >>               spin_lock(&ctx->flc_lock);
>> >>               if (!list_empty(&ctx->flc_lease)) {
>> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>> >>  {
>> >>       struct file_lock *fl;
>> >>       struct inode *inode = file_inode(filp);
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       int type = F_UNLCK;
>> >>       LIST_HEAD(dispose);
>> >>
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >>               spin_lock(&ctx->flc_lock);
>> >>               time_out_leases(file_inode(filp), &dispose);
>> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>> >>       struct file_lock *fl, *victim = NULL;
>> >>       struct dentry *dentry = filp->f_path.dentry;
>> >>       struct inode *inode = dentry->d_inode;
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       LIST_HEAD(dispose);
>> >>
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx) {
>> >>               trace_generic_delete_lease(inode, NULL);
>> >>               return error;
>> >> @@ -2359,13 +2367,14 @@ out:
>> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>> >>  {
>> >>       struct file_lock lock;
>> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>
>> >>       /*
>> >>        * If there are no locks held on this file, we don't need to call
>> >>        * posix_lock_file().  Another process could be setting a lock on this
>> >>        * file at the same time, but we wouldn't remove that lock anyway.
>> >>        */
>> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>> >>       if (!ctx || list_empty(&ctx->flc_posix))
>> >>               return;
>> >>
>> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>> >>
>> >>  /* The i_flctx must be valid when calling into here */
>> >>  static void
>> >> -locks_remove_flock(struct file *filp)
>> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>> >>  {
>> >>       struct file_lock fl = {
>> >>               .fl_owner = filp,
>> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>> >>               .fl_end = OFFSET_MAX,
>> >>       };
>> >>       struct inode *inode = file_inode(filp);
>> >> -     struct file_lock_context *flctx = inode->i_flctx;
>> >>
>> >>       if (list_empty(&flctx->flc_flock))
>> >>               return;
>> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>> >>
>> >>  /* The i_flctx must be valid when calling into here */
>> >>  static void
>> >> -locks_remove_lease(struct file *filp)
>> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>> >>  {
>> >> -     struct inode *inode = file_inode(filp);
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >>       struct file_lock *fl, *tmp;
>> >>       LIST_HEAD(dispose);
>> >>
>> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>> >>   */
>> >>  void locks_remove_file(struct file *filp)
>> >>  {
>> >> -     if (!file_inode(filp)->i_flctx)
>> >> +     struct file_lock_context *ctx;
>> >> +
>> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> +     if (!ctx)
>> >>               return;
>> >>
>> >>       /* remove any OFD locks */
>> >>       locks_remove_posix(filp, filp);
>> >>
>> >>       /* remove flock locks */
>> >> -     locks_remove_flock(filp);
>> >> +     locks_remove_flock(filp, ctx);
>> >>
>> >>       /* remove any leases */
>> >> -     locks_remove_lease(filp);
>> >> +     locks_remove_lease(filp, ctx);
>> >>  }
>> >>
>> >>  /**
>> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>> >>       struct file_lock_context *ctx;
>> >>       int id = 0;
>> >>
>> >> -     ctx = inode->i_flctx;
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx)
>> >>               return;
>> >>
>> >
>> >
>> > --
>> > Jeff Layton <jlayton@poochiereds.net>
>>
>>
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>



-- 
Dmitry Vyukov, Software Engineer, dvyukov@google.com
Google Germany GmbH, Dienerstraße 12, 80331, München
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Diese E-Mail ist vertraulich. Wenn Sie nicht der richtige Adressat
sind, leiten Sie diese bitte nicht weiter, informieren Sie den
Absender und löschen Sie die E-Mail und alle Anhänge. Vielen Dank.
This e-mail is confidential. If you are not the right addressee please
do not forward it, please inform the sender, and please erase this
e-mail including any attachments. Thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 11:00       ` Dmitry Vyukov
@ 2015-09-21 11:17         ` Jeff Layton
  2015-09-21 11:38           ` Dmitry Vyukov
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Layton @ 2015-09-21 11:17 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

On Mon, 21 Sep 2015 13:00:04 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> > On Mon, 21 Sep 2015 12:53:40 +0200
> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> >> > On Mon, 21 Sep 2015 09:43:06 +0200
> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >> >
> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> >> >> cmpxchg() is a release operation which is correct. But it uses
> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
> >> >> and observe uninitialized garbage there. This in turn can lead
> >> >> to corruption of ctx->flc_lock and other members.
> >> >>
> >> >
> >> > I don't get this bit. The i_flctx field is initialized to zero when the
> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
> >> > ever see uninitialized garbage in there? It should either be NULL or a
> >> > valid pointer, no?
> >>
> >> This is not about i_flctx pointer, this is about i_flctx object
> >> contents (pointee).
> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
> >>
> >
> > Again, I don't get it though. The cmpxchg happens after we've
> > initialized the structure. Given that there are implicit full memory
> > barriers before and after the cmpxchg(), how do you end up with another
> > task seeing the pointer before it was ever initialized?
> >
> > The store should only happen after the initialization has occurred, and
> > the loads in the other tasks shouldn't be able to see the results of
> > that store until then. No?
> 
> If a reader looks at things in the opposite order, then it does not
> matter how you store them. Reader still can observe them in the wrong
> order.
> Memory barriers is always a game of two. Writer needs to do stores in
> the correct order and reader must do reads in the correct order. A
> single memory barrier cannot possible make any sense.
> 

I get that, but...isn't there a data dependency there? In order for the
reader to see bogus fields inside of i_flctx, it needs to be able to
see a valid pointer in i_flctx...and in order for that to occur the
store has to have occurred.

I guess the concern is that the reader CPU could stumble across some
memory that is eventually going to part of i_flctx prior to loading
that pointer, and assume that its contents are still valid after the
pointer store has occurred? Is that the basic scenario?

> 
> >> > If that'st the case, then most of the places where you're adding
> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> >> > something.
> >> >
> >> >> Documentation/memory-barriers.txt explicitly requires to use
> >> >> a barrier in such context:
> >> >> "A load-load control dependency requires a full read memory barrier".
> >> >>
> >> >
> >> > The same file also says:
> >> >
> >> > "Any atomic operation that modifies some state in memory and returns information
> >> > about the state (old or new) implies an SMP-conditional general memory barrier
> >> > (smp_mb()) on each side of the actual operation (with the exception of
> >> > explicit lock operations, described later).  These include:
> >> >
> >> > ...
> >> > /* when succeeds */
> >> > cmpxchg();
> >> > "
> >> >
> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
> >> > could the CPU hoist anything before that point?
> >> >
> >> > Note that I'm not saying that you're wrong, I just want to make sure I
> >> > fully understand the problem before we go trying to fix it.
> >> >
> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> >> >> of other functions that can proceed concurrently with
> >> >> locks_get_lock_context().
> >> >>
> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
> >> >>
> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> >> >> ---
> >> >> For the record, here is the KTSAN report:
> >> >>
> >> >> ThreadSanitizer: data-race in _raw_spin_lock
> >> >>
> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> >> >>
> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> >> >> ---
> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
> >> >>
> >> >> diff --git a/fs/locks.c b/fs/locks.c
> >> >> index 2a54c80..316e474 100644
> >> >> --- a/fs/locks.c
> >> >> +++ b/fs/locks.c
> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
> >> >>  static struct file_lock_context *
> >> >>  locks_get_lock_context(struct inode *inode, int type)
> >> >>  {
> >> >> -     struct file_lock_context *new;
> >> >> +     struct file_lock_context *ctx;
> >> >>
> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
> >> >> +     /* paired with cmpxchg() below */
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> +     if (likely(ctx) || type == F_UNLCK)
> >> >>               goto out;
> >> >>
> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> -     if (!new)
> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> +     if (!ctx)
> >> >>               goto out;
> >> >>
> >> >> -     spin_lock_init(&new->flc_lock);
> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
> >> >> +     spin_lock_init(&ctx->flc_lock);
> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
> >> >>
> >> >>       /*
> >> >>        * Assign the pointer if it's not already assigned. If it is, then
> >> >>        * free the context we just allocated.
> >> >>        */
> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
> >> >> -             kmem_cache_free(flctx_cache, new);
> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> >> >> +             kmem_cache_free(flctx_cache, ctx);
> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
> >> >> +     }
> >> >>  out:
> >> >> -     return inode->i_flctx;
> >> >> +     return ctx;
> >> >>  }
> >> >>
> >> >>  void
> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> >> >>       struct file_lock_context *ctx;
> >> >>       struct inode *inode = file_inode(filp);
> >> >>
> >> >> -     ctx = inode->i_flctx;
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> >> >>               fl->fl_type = F_UNLCK;
> >> >>               return;
> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
> >> >>       struct file_lock_context *ctx;
> >> >>       struct file_lock *fl;
> >> >>
> >> >> -     ctx = inode->i_flctx;
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
> >> >>               return 0;
> >> >>
> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >>  {
> >> >>       int error = 0;
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       struct file_lock *new_fl, *fl, *tmp;
> >> >>       unsigned long break_time;
> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >>       new_fl->fl_flags = type;
> >> >>
> >> >>       /* typically we will check that ctx is non-NULL before calling */
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx) {
> >> >>               WARN_ON_ONCE(1);
> >> >>               return error;
> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
> >> >>  {
> >> >>       bool has_lease = false;
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       struct file_lock *fl;
> >> >>
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >>               spin_lock(&ctx->flc_lock);
> >> >>               if (!list_empty(&ctx->flc_lease)) {
> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
> >> >>  {
> >> >>       struct file_lock *fl;
> >> >>       struct inode *inode = file_inode(filp);
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       int type = F_UNLCK;
> >> >>       LIST_HEAD(dispose);
> >> >>
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >>               spin_lock(&ctx->flc_lock);
> >> >>               time_out_leases(file_inode(filp), &dispose);
> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
> >> >>       struct file_lock *fl, *victim = NULL;
> >> >>       struct dentry *dentry = filp->f_path.dentry;
> >> >>       struct inode *inode = dentry->d_inode;
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       LIST_HEAD(dispose);
> >> >>
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx) {
> >> >>               trace_generic_delete_lease(inode, NULL);
> >> >>               return error;
> >> >> @@ -2359,13 +2367,14 @@ out:
> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
> >> >>  {
> >> >>       struct file_lock lock;
> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>
> >> >>       /*
> >> >>        * If there are no locks held on this file, we don't need to call
> >> >>        * posix_lock_file().  Another process could be setting a lock on this
> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
> >> >>        */
> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
> >> >>               return;
> >> >>
> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
> >> >>
> >> >>  /* The i_flctx must be valid when calling into here */
> >> >>  static void
> >> >> -locks_remove_flock(struct file *filp)
> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
> >> >>  {
> >> >>       struct file_lock fl = {
> >> >>               .fl_owner = filp,
> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
> >> >>               .fl_end = OFFSET_MAX,
> >> >>       };
> >> >>       struct inode *inode = file_inode(filp);
> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
> >> >>
> >> >>       if (list_empty(&flctx->flc_flock))
> >> >>               return;
> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
> >> >>
> >> >>  /* The i_flctx must be valid when calling into here */
> >> >>  static void
> >> >> -locks_remove_lease(struct file *filp)
> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
> >> >>  {
> >> >> -     struct inode *inode = file_inode(filp);
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >>       struct file_lock *fl, *tmp;
> >> >>       LIST_HEAD(dispose);
> >> >>
> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
> >> >>   */
> >> >>  void locks_remove_file(struct file *filp)
> >> >>  {
> >> >> -     if (!file_inode(filp)->i_flctx)
> >> >> +     struct file_lock_context *ctx;
> >> >> +
> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >> +     if (!ctx)
> >> >>               return;
> >> >>
> >> >>       /* remove any OFD locks */
> >> >>       locks_remove_posix(filp, filp);
> >> >>
> >> >>       /* remove flock locks */
> >> >> -     locks_remove_flock(filp);
> >> >> +     locks_remove_flock(filp, ctx);
> >> >>
> >> >>       /* remove any leases */
> >> >> -     locks_remove_lease(filp);
> >> >> +     locks_remove_lease(filp, ctx);
> >> >>  }
> >> >>
> >> >>  /**
> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
> >> >>       struct file_lock_context *ctx;
> >> >>       int id = 0;
> >> >>
> >> >> -     ctx = inode->i_flctx;
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx)
> >> >>               return;
> >> >>
> >> >
> >> >
> >> > --
> >> > Jeff Layton <jlayton@poochiereds.net>
> >>
> >>
> >>
> >
> >
> > --
> > Jeff Layton <jlayton@poochiereds.net>
> 
> 
> 


-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 11:17         ` Jeff Layton
@ 2015-09-21 11:38           ` Dmitry Vyukov
  2015-09-21 11:44             ` Jeff Layton
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Vyukov @ 2015-09-21 11:38 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

Yes. That processor is Alpha and that's documented in DATA DEPENDENCY
BARRIERS section of Documentation/memory-barriers.txt.



On Mon, Sep 21, 2015 at 1:17 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 13:00:04 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> > On Mon, 21 Sep 2015 12:53:40 +0200
>> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >
>> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> >> > On Mon, 21 Sep 2015 09:43:06 +0200
>> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >> >
>> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> >> >> cmpxchg() is a release operation which is correct. But it uses
>> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
>> >> >> and observe uninitialized garbage there. This in turn can lead
>> >> >> to corruption of ctx->flc_lock and other members.
>> >> >>
>> >> >
>> >> > I don't get this bit. The i_flctx field is initialized to zero when the
>> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
>> >> > ever see uninitialized garbage in there? It should either be NULL or a
>> >> > valid pointer, no?
>> >>
>> >> This is not about i_flctx pointer, this is about i_flctx object
>> >> contents (pointee).
>> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
>> >>
>> >
>> > Again, I don't get it though. The cmpxchg happens after we've
>> > initialized the structure. Given that there are implicit full memory
>> > barriers before and after the cmpxchg(), how do you end up with another
>> > task seeing the pointer before it was ever initialized?
>> >
>> > The store should only happen after the initialization has occurred, and
>> > the loads in the other tasks shouldn't be able to see the results of
>> > that store until then. No?
>>
>> If a reader looks at things in the opposite order, then it does not
>> matter how you store them. Reader still can observe them in the wrong
>> order.
>> Memory barriers is always a game of two. Writer needs to do stores in
>> the correct order and reader must do reads in the correct order. A
>> single memory barrier cannot possible make any sense.
>>
>
> I get that, but...isn't there a data dependency there? In order for the
> reader to see bogus fields inside of i_flctx, it needs to be able to
> see a valid pointer in i_flctx...and in order for that to occur the
> store has to have occurred.
>
> I guess the concern is that the reader CPU could stumble across some
> memory that is eventually going to part of i_flctx prior to loading
> that pointer, and assume that its contents are still valid after the
> pointer store has occurred? Is that the basic scenario?
>
>>
>> >> > If that'st the case, then most of the places where you're adding
>> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
>> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
>> >> > something.
>> >> >
>> >> >> Documentation/memory-barriers.txt explicitly requires to use
>> >> >> a barrier in such context:
>> >> >> "A load-load control dependency requires a full read memory barrier".
>> >> >>
>> >> >
>> >> > The same file also says:
>> >> >
>> >> > "Any atomic operation that modifies some state in memory and returns information
>> >> > about the state (old or new) implies an SMP-conditional general memory barrier
>> >> > (smp_mb()) on each side of the actual operation (with the exception of
>> >> > explicit lock operations, described later).  These include:
>> >> >
>> >> > ...
>> >> > /* when succeeds */
>> >> > cmpxchg();
>> >> > "
>> >> >
>> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
>> >> > could the CPU hoist anything before that point?
>> >> >
>> >> > Note that I'm not saying that you're wrong, I just want to make sure I
>> >> > fully understand the problem before we go trying to fix it.
>> >> >
>> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> >> >> of other functions that can proceed concurrently with
>> >> >> locks_get_lock_context().
>> >> >>
>> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
>> >> >>
>> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> >> >> ---
>> >> >> For the record, here is the KTSAN report:
>> >> >>
>> >> >> ThreadSanitizer: data-race in _raw_spin_lock
>> >> >>
>> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>> >> >>
>> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
>> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> >> >> ---
>> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
>> >> >>
>> >> >> diff --git a/fs/locks.c b/fs/locks.c
>> >> >> index 2a54c80..316e474 100644
>> >> >> --- a/fs/locks.c
>> >> >> +++ b/fs/locks.c
>> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>> >> >>  static struct file_lock_context *
>> >> >>  locks_get_lock_context(struct inode *inode, int type)
>> >> >>  {
>> >> >> -     struct file_lock_context *new;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>
>> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> >> >> +     /* paired with cmpxchg() below */
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> +     if (likely(ctx) || type == F_UNLCK)
>> >> >>               goto out;
>> >> >>
>> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> -     if (!new)
>> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> +     if (!ctx)
>> >> >>               goto out;
>> >> >>
>> >> >> -     spin_lock_init(&new->flc_lock);
>> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
>> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
>> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
>> >> >> +     spin_lock_init(&ctx->flc_lock);
>> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
>> >> >>
>> >> >>       /*
>> >> >>        * Assign the pointer if it's not already assigned. If it is, then
>> >> >>        * free the context we just allocated.
>> >> >>        */
>> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> >> >> -             kmem_cache_free(flctx_cache, new);
>> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> >> >> +             kmem_cache_free(flctx_cache, ctx);
>> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> +     }
>> >> >>  out:
>> >> >> -     return inode->i_flctx;
>> >> >> +     return ctx;
>> >> >>  }
>> >> >>
>> >> >>  void
>> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>> >> >>       struct file_lock_context *ctx;
>> >> >>       struct inode *inode = file_inode(filp);
>> >> >>
>> >> >> -     ctx = inode->i_flctx;
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>> >> >>               fl->fl_type = F_UNLCK;
>> >> >>               return;
>> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>> >> >>       struct file_lock_context *ctx;
>> >> >>       struct file_lock *fl;
>> >> >>
>> >> >> -     ctx = inode->i_flctx;
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>> >> >>               return 0;
>> >> >>
>> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >>  {
>> >> >>       int error = 0;
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       struct file_lock *new_fl, *fl, *tmp;
>> >> >>       unsigned long break_time;
>> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >>       new_fl->fl_flags = type;
>> >> >>
>> >> >>       /* typically we will check that ctx is non-NULL before calling */
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx) {
>> >> >>               WARN_ON_ONCE(1);
>> >> >>               return error;
>> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>> >> >>  {
>> >> >>       bool has_lease = false;
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       struct file_lock *fl;
>> >> >>
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >>               spin_lock(&ctx->flc_lock);
>> >> >>               if (!list_empty(&ctx->flc_lease)) {
>> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>> >> >>  {
>> >> >>       struct file_lock *fl;
>> >> >>       struct inode *inode = file_inode(filp);
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       int type = F_UNLCK;
>> >> >>       LIST_HEAD(dispose);
>> >> >>
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >>               spin_lock(&ctx->flc_lock);
>> >> >>               time_out_leases(file_inode(filp), &dispose);
>> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>> >> >>       struct file_lock *fl, *victim = NULL;
>> >> >>       struct dentry *dentry = filp->f_path.dentry;
>> >> >>       struct inode *inode = dentry->d_inode;
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       LIST_HEAD(dispose);
>> >> >>
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx) {
>> >> >>               trace_generic_delete_lease(inode, NULL);
>> >> >>               return error;
>> >> >> @@ -2359,13 +2367,14 @@ out:
>> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>> >> >>  {
>> >> >>       struct file_lock lock;
>> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>
>> >> >>       /*
>> >> >>        * If there are no locks held on this file, we don't need to call
>> >> >>        * posix_lock_file().  Another process could be setting a lock on this
>> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
>> >> >>        */
>> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
>> >> >>               return;
>> >> >>
>> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>> >> >>
>> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >>  static void
>> >> >> -locks_remove_flock(struct file *filp)
>> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>> >> >>  {
>> >> >>       struct file_lock fl = {
>> >> >>               .fl_owner = filp,
>> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>> >> >>               .fl_end = OFFSET_MAX,
>> >> >>       };
>> >> >>       struct inode *inode = file_inode(filp);
>> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
>> >> >>
>> >> >>       if (list_empty(&flctx->flc_flock))
>> >> >>               return;
>> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>> >> >>
>> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >>  static void
>> >> >> -locks_remove_lease(struct file *filp)
>> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>> >> >>  {
>> >> >> -     struct inode *inode = file_inode(filp);
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >>       struct file_lock *fl, *tmp;
>> >> >>       LIST_HEAD(dispose);
>> >> >>
>> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>> >> >>   */
>> >> >>  void locks_remove_file(struct file *filp)
>> >> >>  {
>> >> >> -     if (!file_inode(filp)->i_flctx)
>> >> >> +     struct file_lock_context *ctx;
>> >> >> +
>> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >> +     if (!ctx)
>> >> >>               return;
>> >> >>
>> >> >>       /* remove any OFD locks */
>> >> >>       locks_remove_posix(filp, filp);
>> >> >>
>> >> >>       /* remove flock locks */
>> >> >> -     locks_remove_flock(filp);
>> >> >> +     locks_remove_flock(filp, ctx);
>> >> >>
>> >> >>       /* remove any leases */
>> >> >> -     locks_remove_lease(filp);
>> >> >> +     locks_remove_lease(filp, ctx);
>> >> >>  }
>> >> >>
>> >> >>  /**
>> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>> >> >>       struct file_lock_context *ctx;
>> >> >>       int id = 0;
>> >> >>
>> >> >> -     ctx = inode->i_flctx;
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx)
>> >> >>               return;
>> >> >>
>> >> >
>> >> >
>> >> > --
>> >> > Jeff Layton <jlayton@poochiereds.net>
>> >>
>> >>
>> >>
>> >
>> >
>> > --
>> > Jeff Layton <jlayton@poochiereds.net>
>>
>>
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>



-- 
Dmitry Vyukov, Software Engineer, dvyukov@google.com
Google Germany GmbH, Dienerstraße 12, 80331, München
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Diese E-Mail ist vertraulich. Wenn Sie nicht der richtige Adressat
sind, leiten Sie diese bitte nicht weiter, informieren Sie den
Absender und löschen Sie die E-Mail und alle Anhänge. Vielen Dank.
This e-mail is confidential. If you are not the right addressee please
do not forward it, please inform the sender, and please erase this
e-mail including any attachments. Thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 11:38           ` Dmitry Vyukov
@ 2015-09-21 11:44             ` Jeff Layton
  2015-09-21 11:53               ` Dmitry Vyukov
  2015-10-19 15:18               ` William Dauchy
  0 siblings, 2 replies; 16+ messages in thread
From: Jeff Layton @ 2015-09-21 11:44 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

On Mon, 21 Sep 2015 13:38:45 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> Yes. That processor is Alpha and that's documented in DATA DEPENDENCY
> BARRIERS section of Documentation/memory-barriers.txt.
> 
> 

Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
and merge it for v4.4. Let me know though if you think this needs to go
in sooner.

Thanks,
Jeff

> 
> On Mon, Sep 21, 2015 at 1:17 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> > On Mon, 21 Sep 2015 13:00:04 +0200
> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> >> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> >> > On Mon, 21 Sep 2015 12:53:40 +0200
> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >> >
> >> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> >> >> > On Mon, 21 Sep 2015 09:43:06 +0200
> >> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >> >> >
> >> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> >> >> >> cmpxchg() is a release operation which is correct. But it uses
> >> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
> >> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
> >> >> >> and observe uninitialized garbage there. This in turn can lead
> >> >> >> to corruption of ctx->flc_lock and other members.
> >> >> >>
> >> >> >
> >> >> > I don't get this bit. The i_flctx field is initialized to zero when the
> >> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
> >> >> > ever see uninitialized garbage in there? It should either be NULL or a
> >> >> > valid pointer, no?
> >> >>
> >> >> This is not about i_flctx pointer, this is about i_flctx object
> >> >> contents (pointee).
> >> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
> >> >>
> >> >
> >> > Again, I don't get it though. The cmpxchg happens after we've
> >> > initialized the structure. Given that there are implicit full memory
> >> > barriers before and after the cmpxchg(), how do you end up with another
> >> > task seeing the pointer before it was ever initialized?
> >> >
> >> > The store should only happen after the initialization has occurred, and
> >> > the loads in the other tasks shouldn't be able to see the results of
> >> > that store until then. No?
> >>
> >> If a reader looks at things in the opposite order, then it does not
> >> matter how you store them. Reader still can observe them in the wrong
> >> order.
> >> Memory barriers is always a game of two. Writer needs to do stores in
> >> the correct order and reader must do reads in the correct order. A
> >> single memory barrier cannot possible make any sense.
> >>
> >
> > I get that, but...isn't there a data dependency there? In order for the
> > reader to see bogus fields inside of i_flctx, it needs to be able to
> > see a valid pointer in i_flctx...and in order for that to occur the
> > store has to have occurred.
> >
> > I guess the concern is that the reader CPU could stumble across some
> > memory that is eventually going to part of i_flctx prior to loading
> > that pointer, and assume that its contents are still valid after the
> > pointer store has occurred? Is that the basic scenario?
> >
> >>
> >> >> > If that'st the case, then most of the places where you're adding
> >> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> >> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> >> >> > something.
> >> >> >
> >> >> >> Documentation/memory-barriers.txt explicitly requires to use
> >> >> >> a barrier in such context:
> >> >> >> "A load-load control dependency requires a full read memory barrier".
> >> >> >>
> >> >> >
> >> >> > The same file also says:
> >> >> >
> >> >> > "Any atomic operation that modifies some state in memory and returns information
> >> >> > about the state (old or new) implies an SMP-conditional general memory barrier
> >> >> > (smp_mb()) on each side of the actual operation (with the exception of
> >> >> > explicit lock operations, described later).  These include:
> >> >> >
> >> >> > ...
> >> >> > /* when succeeds */
> >> >> > cmpxchg();
> >> >> > "
> >> >> >
> >> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
> >> >> > could the CPU hoist anything before that point?
> >> >> >
> >> >> > Note that I'm not saying that you're wrong, I just want to make sure I
> >> >> > fully understand the problem before we go trying to fix it.
> >> >> >
> >> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> >> >> >> of other functions that can proceed concurrently with
> >> >> >> locks_get_lock_context().
> >> >> >>
> >> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
> >> >> >>
> >> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> >> >> >> ---
> >> >> >> For the record, here is the KTSAN report:
> >> >> >>
> >> >> >> ThreadSanitizer: data-race in _raw_spin_lock
> >> >> >>
> >> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
> >> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
> >> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
> >> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
> >> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> >> >> >>
> >> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
> >> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
> >> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
> >> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
> >> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
> >> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> >> >> >> ---
> >> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
> >> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
> >> >> >>
> >> >> >> diff --git a/fs/locks.c b/fs/locks.c
> >> >> >> index 2a54c80..316e474 100644
> >> >> >> --- a/fs/locks.c
> >> >> >> +++ b/fs/locks.c
> >> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
> >> >> >>  static struct file_lock_context *
> >> >> >>  locks_get_lock_context(struct inode *inode, int type)
> >> >> >>  {
> >> >> >> -     struct file_lock_context *new;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>
> >> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
> >> >> >> +     /* paired with cmpxchg() below */
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >> +     if (likely(ctx) || type == F_UNLCK)
> >> >> >>               goto out;
> >> >> >>
> >> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> >> -     if (!new)
> >> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> >> +     if (!ctx)
> >> >> >>               goto out;
> >> >> >>
> >> >> >> -     spin_lock_init(&new->flc_lock);
> >> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
> >> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
> >> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
> >> >> >> +     spin_lock_init(&ctx->flc_lock);
> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
> >> >> >>
> >> >> >>       /*
> >> >> >>        * Assign the pointer if it's not already assigned. If it is, then
> >> >> >>        * free the context we just allocated.
> >> >> >>        */
> >> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
> >> >> >> -             kmem_cache_free(flctx_cache, new);
> >> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> >> >> >> +             kmem_cache_free(flctx_cache, ctx);
> >> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >> +     }
> >> >> >>  out:
> >> >> >> -     return inode->i_flctx;
> >> >> >> +     return ctx;
> >> >> >>  }
> >> >> >>
> >> >> >>  void
> >> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> >> >> >>       struct file_lock_context *ctx;
> >> >> >>       struct inode *inode = file_inode(filp);
> >> >> >>
> >> >> >> -     ctx = inode->i_flctx;
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> >> >> >>               fl->fl_type = F_UNLCK;
> >> >> >>               return;
> >> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
> >> >> >>       struct file_lock_context *ctx;
> >> >> >>       struct file_lock *fl;
> >> >> >>
> >> >> >> -     ctx = inode->i_flctx;
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
> >> >> >>               return 0;
> >> >> >>
> >> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
> >> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >> >>  {
> >> >> >>       int error = 0;
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       struct file_lock *new_fl, *fl, *tmp;
> >> >> >>       unsigned long break_time;
> >> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
> >> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >> >>       new_fl->fl_flags = type;
> >> >> >>
> >> >> >>       /* typically we will check that ctx is non-NULL before calling */
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx) {
> >> >> >>               WARN_ON_ONCE(1);
> >> >> >>               return error;
> >> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
> >> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
> >> >> >>  {
> >> >> >>       bool has_lease = false;
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       struct file_lock *fl;
> >> >> >>
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >> >>               spin_lock(&ctx->flc_lock);
> >> >> >>               if (!list_empty(&ctx->flc_lease)) {
> >> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
> >> >> >>  {
> >> >> >>       struct file_lock *fl;
> >> >> >>       struct inode *inode = file_inode(filp);
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       int type = F_UNLCK;
> >> >> >>       LIST_HEAD(dispose);
> >> >> >>
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >> >>               spin_lock(&ctx->flc_lock);
> >> >> >>               time_out_leases(file_inode(filp), &dispose);
> >> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
> >> >> >>       struct file_lock *fl, *victim = NULL;
> >> >> >>       struct dentry *dentry = filp->f_path.dentry;
> >> >> >>       struct inode *inode = dentry->d_inode;
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       LIST_HEAD(dispose);
> >> >> >>
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx) {
> >> >> >>               trace_generic_delete_lease(inode, NULL);
> >> >> >>               return error;
> >> >> >> @@ -2359,13 +2367,14 @@ out:
> >> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
> >> >> >>  {
> >> >> >>       struct file_lock lock;
> >> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>
> >> >> >>       /*
> >> >> >>        * If there are no locks held on this file, we don't need to call
> >> >> >>        * posix_lock_file().  Another process could be setting a lock on this
> >> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
> >> >> >>        */
> >> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
> >> >> >>               return;
> >> >> >>
> >> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
> >> >> >>
> >> >> >>  /* The i_flctx must be valid when calling into here */
> >> >> >>  static void
> >> >> >> -locks_remove_flock(struct file *filp)
> >> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
> >> >> >>  {
> >> >> >>       struct file_lock fl = {
> >> >> >>               .fl_owner = filp,
> >> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
> >> >> >>               .fl_end = OFFSET_MAX,
> >> >> >>       };
> >> >> >>       struct inode *inode = file_inode(filp);
> >> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
> >> >> >>
> >> >> >>       if (list_empty(&flctx->flc_flock))
> >> >> >>               return;
> >> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
> >> >> >>
> >> >> >>  /* The i_flctx must be valid when calling into here */
> >> >> >>  static void
> >> >> >> -locks_remove_lease(struct file *filp)
> >> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
> >> >> >>  {
> >> >> >> -     struct inode *inode = file_inode(filp);
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >>       struct file_lock *fl, *tmp;
> >> >> >>       LIST_HEAD(dispose);
> >> >> >>
> >> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
> >> >> >>   */
> >> >> >>  void locks_remove_file(struct file *filp)
> >> >> >>  {
> >> >> >> -     if (!file_inode(filp)->i_flctx)
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >> +
> >> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >> >> +     if (!ctx)
> >> >> >>               return;
> >> >> >>
> >> >> >>       /* remove any OFD locks */
> >> >> >>       locks_remove_posix(filp, filp);
> >> >> >>
> >> >> >>       /* remove flock locks */
> >> >> >> -     locks_remove_flock(filp);
> >> >> >> +     locks_remove_flock(filp, ctx);
> >> >> >>
> >> >> >>       /* remove any leases */
> >> >> >> -     locks_remove_lease(filp);
> >> >> >> +     locks_remove_lease(filp, ctx);
> >> >> >>  }
> >> >> >>
> >> >> >>  /**
> >> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
> >> >> >>       struct file_lock_context *ctx;
> >> >> >>       int id = 0;
> >> >> >>
> >> >> >> -     ctx = inode->i_flctx;
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx)
> >> >> >>               return;
> >> >> >>
> >> >> >
> >> >> >
> >> >> > --
> >> >> > Jeff Layton <jlayton@poochiereds.net>
> >> >>
> >> >>
> >> >>
> >> >
> >> >
> >> > --
> >> > Jeff Layton <jlayton@poochiereds.net>
> >>
> >>
> >>
> >
> >
> > --
> > Jeff Layton <jlayton@poochiereds.net>
> 
> 
> 


-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 11:44             ` Jeff Layton
@ 2015-09-21 11:53               ` Dmitry Vyukov
  2015-10-19 15:18               ` William Dauchy
  1 sibling, 0 replies; 16+ messages in thread
From: Dmitry Vyukov @ 2015-09-21 11:53 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

No hurry on my side.
Thanks

On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 13:38:45 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> Yes. That processor is Alpha and that's documented in DATA DEPENDENCY
>> BARRIERS section of Documentation/memory-barriers.txt.
>>
>>
>
> Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
> and merge it for v4.4. Let me know though if you think this needs to go
> in sooner.
>
> Thanks,
> Jeff
>
>>
>> On Mon, Sep 21, 2015 at 1:17 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> > On Mon, 21 Sep 2015 13:00:04 +0200
>> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >
>> >> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> >> > On Mon, 21 Sep 2015 12:53:40 +0200
>> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >> >
>> >> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> >> >> > On Mon, 21 Sep 2015 09:43:06 +0200
>> >> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >> >> >
>> >> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> >> >> >> cmpxchg() is a release operation which is correct. But it uses
>> >> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> >> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
>> >> >> >> and observe uninitialized garbage there. This in turn can lead
>> >> >> >> to corruption of ctx->flc_lock and other members.
>> >> >> >>
>> >> >> >
>> >> >> > I don't get this bit. The i_flctx field is initialized to zero when the
>> >> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
>> >> >> > ever see uninitialized garbage in there? It should either be NULL or a
>> >> >> > valid pointer, no?
>> >> >>
>> >> >> This is not about i_flctx pointer, this is about i_flctx object
>> >> >> contents (pointee).
>> >> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
>> >> >>
>> >> >
>> >> > Again, I don't get it though. The cmpxchg happens after we've
>> >> > initialized the structure. Given that there are implicit full memory
>> >> > barriers before and after the cmpxchg(), how do you end up with another
>> >> > task seeing the pointer before it was ever initialized?
>> >> >
>> >> > The store should only happen after the initialization has occurred, and
>> >> > the loads in the other tasks shouldn't be able to see the results of
>> >> > that store until then. No?
>> >>
>> >> If a reader looks at things in the opposite order, then it does not
>> >> matter how you store them. Reader still can observe them in the wrong
>> >> order.
>> >> Memory barriers is always a game of two. Writer needs to do stores in
>> >> the correct order and reader must do reads in the correct order. A
>> >> single memory barrier cannot possible make any sense.
>> >>
>> >
>> > I get that, but...isn't there a data dependency there? In order for the
>> > reader to see bogus fields inside of i_flctx, it needs to be able to
>> > see a valid pointer in i_flctx...and in order for that to occur the
>> > store has to have occurred.
>> >
>> > I guess the concern is that the reader CPU could stumble across some
>> > memory that is eventually going to part of i_flctx prior to loading
>> > that pointer, and assume that its contents are still valid after the
>> > pointer store has occurred? Is that the basic scenario?
>> >
>> >>
>> >> >> > If that'st the case, then most of the places where you're adding
>> >> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
>> >> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
>> >> >> > something.
>> >> >> >
>> >> >> >> Documentation/memory-barriers.txt explicitly requires to use
>> >> >> >> a barrier in such context:
>> >> >> >> "A load-load control dependency requires a full read memory barrier".
>> >> >> >>
>> >> >> >
>> >> >> > The same file also says:
>> >> >> >
>> >> >> > "Any atomic operation that modifies some state in memory and returns information
>> >> >> > about the state (old or new) implies an SMP-conditional general memory barrier
>> >> >> > (smp_mb()) on each side of the actual operation (with the exception of
>> >> >> > explicit lock operations, described later).  These include:
>> >> >> >
>> >> >> > ...
>> >> >> > /* when succeeds */
>> >> >> > cmpxchg();
>> >> >> > "
>> >> >> >
>> >> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
>> >> >> > could the CPU hoist anything before that point?
>> >> >> >
>> >> >> > Note that I'm not saying that you're wrong, I just want to make sure I
>> >> >> > fully understand the problem before we go trying to fix it.
>> >> >> >
>> >> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> >> >> >> of other functions that can proceed concurrently with
>> >> >> >> locks_get_lock_context().
>> >> >> >>
>> >> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
>> >> >> >>
>> >> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> >> >> >> ---
>> >> >> >> For the record, here is the KTSAN report:
>> >> >> >>
>> >> >> >> ThreadSanitizer: data-race in _raw_spin_lock
>> >> >> >>
>> >> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>> >> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>> >> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>> >> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>> >> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>> >> >> >>
>> >> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>> >> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>> >> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>> >> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>> >> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
>> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>> >> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> >> >> >> ---
>> >> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>> >> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
>> >> >> >>
>> >> >> >> diff --git a/fs/locks.c b/fs/locks.c
>> >> >> >> index 2a54c80..316e474 100644
>> >> >> >> --- a/fs/locks.c
>> >> >> >> +++ b/fs/locks.c
>> >> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>> >> >> >>  static struct file_lock_context *
>> >> >> >>  locks_get_lock_context(struct inode *inode, int type)
>> >> >> >>  {
>> >> >> >> -     struct file_lock_context *new;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>
>> >> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> >> >> >> +     /* paired with cmpxchg() below */
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >> +     if (likely(ctx) || type == F_UNLCK)
>> >> >> >>               goto out;
>> >> >> >>
>> >> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> >> -     if (!new)
>> >> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> >> +     if (!ctx)
>> >> >> >>               goto out;
>> >> >> >>
>> >> >> >> -     spin_lock_init(&new->flc_lock);
>> >> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
>> >> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
>> >> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
>> >> >> >> +     spin_lock_init(&ctx->flc_lock);
>> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
>> >> >> >>
>> >> >> >>       /*
>> >> >> >>        * Assign the pointer if it's not already assigned. If it is, then
>> >> >> >>        * free the context we just allocated.
>> >> >> >>        */
>> >> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> >> >> >> -             kmem_cache_free(flctx_cache, new);
>> >> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> >> >> >> +             kmem_cache_free(flctx_cache, ctx);
>> >> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >> +     }
>> >> >> >>  out:
>> >> >> >> -     return inode->i_flctx;
>> >> >> >> +     return ctx;
>> >> >> >>  }
>> >> >> >>
>> >> >> >>  void
>> >> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>> >> >> >>       struct file_lock_context *ctx;
>> >> >> >>       struct inode *inode = file_inode(filp);
>> >> >> >>
>> >> >> >> -     ctx = inode->i_flctx;
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>> >> >> >>               fl->fl_type = F_UNLCK;
>> >> >> >>               return;
>> >> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>> >> >> >>       struct file_lock_context *ctx;
>> >> >> >>       struct file_lock *fl;
>> >> >> >>
>> >> >> >> -     ctx = inode->i_flctx;
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>> >> >> >>               return 0;
>> >> >> >>
>> >> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>> >> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >> >>  {
>> >> >> >>       int error = 0;
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       struct file_lock *new_fl, *fl, *tmp;
>> >> >> >>       unsigned long break_time;
>> >> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> >> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >> >>       new_fl->fl_flags = type;
>> >> >> >>
>> >> >> >>       /* typically we will check that ctx is non-NULL before calling */
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx) {
>> >> >> >>               WARN_ON_ONCE(1);
>> >> >> >>               return error;
>> >> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>> >> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>> >> >> >>  {
>> >> >> >>       bool has_lease = false;
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       struct file_lock *fl;
>> >> >> >>
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >> >>               spin_lock(&ctx->flc_lock);
>> >> >> >>               if (!list_empty(&ctx->flc_lease)) {
>> >> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>> >> >> >>  {
>> >> >> >>       struct file_lock *fl;
>> >> >> >>       struct inode *inode = file_inode(filp);
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       int type = F_UNLCK;
>> >> >> >>       LIST_HEAD(dispose);
>> >> >> >>
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >> >>               spin_lock(&ctx->flc_lock);
>> >> >> >>               time_out_leases(file_inode(filp), &dispose);
>> >> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>> >> >> >>       struct file_lock *fl, *victim = NULL;
>> >> >> >>       struct dentry *dentry = filp->f_path.dentry;
>> >> >> >>       struct inode *inode = dentry->d_inode;
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       LIST_HEAD(dispose);
>> >> >> >>
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx) {
>> >> >> >>               trace_generic_delete_lease(inode, NULL);
>> >> >> >>               return error;
>> >> >> >> @@ -2359,13 +2367,14 @@ out:
>> >> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>> >> >> >>  {
>> >> >> >>       struct file_lock lock;
>> >> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>
>> >> >> >>       /*
>> >> >> >>        * If there are no locks held on this file, we don't need to call
>> >> >> >>        * posix_lock_file().  Another process could be setting a lock on this
>> >> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
>> >> >> >>        */
>> >> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
>> >> >> >>               return;
>> >> >> >>
>> >> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>> >> >> >>
>> >> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >> >>  static void
>> >> >> >> -locks_remove_flock(struct file *filp)
>> >> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>> >> >> >>  {
>> >> >> >>       struct file_lock fl = {
>> >> >> >>               .fl_owner = filp,
>> >> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>> >> >> >>               .fl_end = OFFSET_MAX,
>> >> >> >>       };
>> >> >> >>       struct inode *inode = file_inode(filp);
>> >> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
>> >> >> >>
>> >> >> >>       if (list_empty(&flctx->flc_flock))
>> >> >> >>               return;
>> >> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>> >> >> >>
>> >> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >> >>  static void
>> >> >> >> -locks_remove_lease(struct file *filp)
>> >> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>> >> >> >>  {
>> >> >> >> -     struct inode *inode = file_inode(filp);
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >>       struct file_lock *fl, *tmp;
>> >> >> >>       LIST_HEAD(dispose);
>> >> >> >>
>> >> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>> >> >> >>   */
>> >> >> >>  void locks_remove_file(struct file *filp)
>> >> >> >>  {
>> >> >> >> -     if (!file_inode(filp)->i_flctx)
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >> +
>> >> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >> >> +     if (!ctx)
>> >> >> >>               return;
>> >> >> >>
>> >> >> >>       /* remove any OFD locks */
>> >> >> >>       locks_remove_posix(filp, filp);
>> >> >> >>
>> >> >> >>       /* remove flock locks */
>> >> >> >> -     locks_remove_flock(filp);
>> >> >> >> +     locks_remove_flock(filp, ctx);
>> >> >> >>
>> >> >> >>       /* remove any leases */
>> >> >> >> -     locks_remove_lease(filp);
>> >> >> >> +     locks_remove_lease(filp, ctx);
>> >> >> >>  }
>> >> >> >>
>> >> >> >>  /**
>> >> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>> >> >> >>       struct file_lock_context *ctx;
>> >> >> >>       int id = 0;
>> >> >> >>
>> >> >> >> -     ctx = inode->i_flctx;
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx)
>> >> >> >>               return;
>> >> >> >>
>> >> >> >
>> >> >> >
>> >> >> > --
>> >> >> > Jeff Layton <jlayton@poochiereds.net>
>> >> >>
>> >> >>
>> >> >>
>> >> >
>> >> >
>> >> > --
>> >> > Jeff Layton <jlayton@poochiereds.net>
>> >>
>> >>
>> >>
>> >
>> >
>> > --
>> > Jeff Layton <jlayton@poochiereds.net>
>>
>>
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>



-- 
Dmitry Vyukov, Software Engineer, dvyukov@google.com
Google Germany GmbH, Dienerstraße 12, 80331, München
Geschäftsführer: Graham Law, Christine Elizabeth Flores
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Diese E-Mail ist vertraulich. Wenn Sie nicht der richtige Adressat
sind, leiten Sie diese bitte nicht weiter, informieren Sie den
Absender und löschen Sie die E-Mail und alle Anhänge. Vielen Dank.
This e-mail is confidential. If you are not the right addressee please
do not forward it, please inform the sender, and please erase this
e-mail including any attachments. Thanks.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-09-21 11:44             ` Jeff Layton
  2015-09-21 11:53               ` Dmitry Vyukov
@ 2015-10-19 15:18               ` William Dauchy
  2015-10-19 15:27                 ` Dmitry Vyukov
  2015-10-19 16:44                 ` Jeff Layton
  1 sibling, 2 replies; 16+ messages in thread
From: William Dauchy @ 2015-10-19 15:18 UTC (permalink / raw)
  To: Jeff Layton, Dmitry Vyukov
  Cc: Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

Hello Dmitry,

On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
> and merge it for v4.4. Let me know though if you think this needs to go
> in sooner.

I am getting a null deref on a v4.1.x
Do you think your patch could fix the following trace? It looks
similar in my opinion.

BUG: unable to handle kernel NULL pointer dereference at 00000000000001c8
IP: [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
PGD 0
Oops: 0000 [#1] SMP
CPU: 1 PID: 1773 Comm: kworker/1:1H Not tainted 4.1.11-rc1 #1
Workqueue: rpciod ffffffff8164fff0
task: ffff8810374deba0 ti: ffff8810374df150 task.ti: ffff8810374df150
RIP: 0010:[<ffffffff811d0cf3>]  [<ffffffff811d0cf3>]
locks_get_lock_context+0x3/0xc0
RSP: 0000:ffff881036007bb0  EFLAGS: 00010246
RAX: ffff881036007c30 RBX: ffff881001981880 RCX: 0000000000000002
RDX: 00000000000006ed RSI: 0000000000000002 RDI: 0000000000000000
RBP: ffff881036007c08 R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000000 R11: ffff88101db69948 R12: ffff8810019818d8
R13: ffff881036007bc8 R14: ffff880e225d81c0 R15: ffff881edfd2b400
FS:  0000000000000000(0000) GS:ffff88103fc20000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000001c8 CR3: 000000000169b000 CR4: 00000000000606f0
Stack:
 ffffffff811d2710 ffff881036007bc8 ffffffff819f1af1 ffff881036007bc8
 ffff881036007bc8 ffff881036007c08 ffff881001981880 ffff8810019818d8
 ffff881036007c48 ffff880e225d81c0 ffff881edfd2b400 ffff881036007c88
Call Trace:
 [<ffffffff811d2710>] ? flock_lock_file+0x30/0x270
 [<ffffffff811d3ad1>] flock_lock_file_wait+0x41/0xf0
 [<ffffffff8168be66>] ? _raw_spin_unlock+0x26/0x40
 [<ffffffff81268de9>] do_vfs_lock+0x19/0x40
 [<ffffffff812695cc>] nfs4_locku_done+0x5c/0xf0
 [<ffffffff8164f3f4>] rpc_exit_task+0x34/0xb0
 [<ffffffff8164fcd9>] __rpc_execute+0x79/0x390
 [<ffffffff81650000>] rpc_async_schedule+0x10/0x20
 [<ffffffff81086095>] process_one_work+0x1a5/0x450
 [<ffffffff81086024>] ? process_one_work+0x134/0x450
 [<ffffffff8108638b>] worker_thread+0x4b/0x4a0
 [<ffffffff81086340>] ? process_one_work+0x450/0x450
 [<ffffffff81086340>] ? process_one_work+0x450/0x450
 [<ffffffff8108d777>] kthread+0xf7/0x110
 [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
 [<ffffffff8168ce3e>] ret_from_fork+0x3e/0x70
 [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
Code: 48 b8 00 00 00 00 00 00 00 80 55 48 89 e5 48 09 c1 ff d1 5d 85
c0 0f 95 c0 0f b6 c0 eb b9 66 2e 0f 1f 84 00 00 00 00 00 83 fe 02 <48>
8b 87 c8 01 00 00 0f 84 a0 00 00 00 48 85 c0 0f 85 97 00 00
RIP  [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
 RSP <ffff881036007bb0>
CR2: 00000000000001c8
---[ end trace 2da9686dda1b5574 ]---


Thanks,
-- 
William

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-10-19 15:18               ` William Dauchy
@ 2015-10-19 15:27                 ` Dmitry Vyukov
  2015-10-19 15:36                   ` William Dauchy
  2015-10-19 16:44                 ` Jeff Layton
  1 sibling, 1 reply; 16+ messages in thread
From: Dmitry Vyukov @ 2015-10-19 15:27 UTC (permalink / raw)
  To: William Dauchy
  Cc: Jeff Layton, Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org,
	LKML, Kostya Serebryany, Andrey Konovalov, Alexander Potapenko,
	ktsan, Paul McKenney

I would expect that you see something else.
The issue that I fixed would fire very infrequently and unreproducibly.


On Mon, Oct 19, 2015 at 5:18 PM, William Dauchy <wdauchy@gmail.com> wrote:
> Hello Dmitry,
>
> On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
>> and merge it for v4.4. Let me know though if you think this needs to go
>> in sooner.
>
> I am getting a null deref on a v4.1.x
> Do you think your patch could fix the following trace? It looks
> similar in my opinion.
>
> BUG: unable to handle kernel NULL pointer dereference at 00000000000001c8
> IP: [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
> PGD 0
> Oops: 0000 [#1] SMP
> CPU: 1 PID: 1773 Comm: kworker/1:1H Not tainted 4.1.11-rc1 #1
> Workqueue: rpciod ffffffff8164fff0
> task: ffff8810374deba0 ti: ffff8810374df150 task.ti: ffff8810374df150
> RIP: 0010:[<ffffffff811d0cf3>]  [<ffffffff811d0cf3>]
> locks_get_lock_context+0x3/0xc0
> RSP: 0000:ffff881036007bb0  EFLAGS: 00010246
> RAX: ffff881036007c30 RBX: ffff881001981880 RCX: 0000000000000002
> RDX: 00000000000006ed RSI: 0000000000000002 RDI: 0000000000000000
> RBP: ffff881036007c08 R08: 0000000000000000 R09: 0000000000000001
> R10: 0000000000000000 R11: ffff88101db69948 R12: ffff8810019818d8
> R13: ffff881036007bc8 R14: ffff880e225d81c0 R15: ffff881edfd2b400
> FS:  0000000000000000(0000) GS:ffff88103fc20000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000001c8 CR3: 000000000169b000 CR4: 00000000000606f0
> Stack:
>  ffffffff811d2710 ffff881036007bc8 ffffffff819f1af1 ffff881036007bc8
>  ffff881036007bc8 ffff881036007c08 ffff881001981880 ffff8810019818d8
>  ffff881036007c48 ffff880e225d81c0 ffff881edfd2b400 ffff881036007c88
> Call Trace:
>  [<ffffffff811d2710>] ? flock_lock_file+0x30/0x270
>  [<ffffffff811d3ad1>] flock_lock_file_wait+0x41/0xf0
>  [<ffffffff8168be66>] ? _raw_spin_unlock+0x26/0x40
>  [<ffffffff81268de9>] do_vfs_lock+0x19/0x40
>  [<ffffffff812695cc>] nfs4_locku_done+0x5c/0xf0
>  [<ffffffff8164f3f4>] rpc_exit_task+0x34/0xb0
>  [<ffffffff8164fcd9>] __rpc_execute+0x79/0x390
>  [<ffffffff81650000>] rpc_async_schedule+0x10/0x20
>  [<ffffffff81086095>] process_one_work+0x1a5/0x450
>  [<ffffffff81086024>] ? process_one_work+0x134/0x450
>  [<ffffffff8108638b>] worker_thread+0x4b/0x4a0
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff8108d777>] kthread+0xf7/0x110
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
>  [<ffffffff8168ce3e>] ret_from_fork+0x3e/0x70
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
> Code: 48 b8 00 00 00 00 00 00 00 80 55 48 89 e5 48 09 c1 ff d1 5d 85
> c0 0f 95 c0 0f b6 c0 eb b9 66 2e 0f 1f 84 00 00 00 00 00 83 fe 02 <48>
> 8b 87 c8 01 00 00 0f 84 a0 00 00 00 48 85 c0 0f 85 97 00 00
> RIP  [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
>  RSP <ffff881036007bb0>
> CR2: 00000000000001c8
> ---[ end trace 2da9686dda1b5574 ]---
>
>
> Thanks,
> --
> William
>
> --
> You received this message because you are subscribed to the Google Groups "ktsan" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to ktsan+unsubscribe@googlegroups.com.
> To post to this group, send email to ktsan@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/ktsan/CAJ75kXYAFAWUh6RScKqsFbt4D462Rbc3%2BW3dd6x4-bh7EOKXcQ%40mail.gmail.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-10-19 15:27                 ` Dmitry Vyukov
@ 2015-10-19 15:36                   ` William Dauchy
  0 siblings, 0 replies; 16+ messages in thread
From: William Dauchy @ 2015-10-19 15:36 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Jeff Layton, Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org,
	LKML, Kostya Serebryany, Andrey Konovalov, Alexander Potapenko,
	ktsan, Paul McKenney

Hello Dmitry,

On Mon, Oct 19, 2015 at 5:27 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> I would expect that you see something else.
> The issue that I fixed would fire very infrequently and unreproducibly.

Thanks for the quick reply. I will open another thread for this issue.
-- 
William

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-10-19 15:18               ` William Dauchy
  2015-10-19 15:27                 ` Dmitry Vyukov
@ 2015-10-19 16:44                 ` Jeff Layton
  2015-10-19 16:53                   ` William Dauchy
  1 sibling, 1 reply; 16+ messages in thread
From: Jeff Layton @ 2015-10-19 16:44 UTC (permalink / raw)
  To: William Dauchy, Dmitry Vyukov
  Cc: Bruce Fields, Al Viro, linux-fsdevel@vger.kernel.org, LKML,
	Kostya Serebryany, Andrey Konovalov, Alexander Potapenko, ktsan,
	Paul McKenney

On Mon, 2015-10-19 at 17:18 +0200, William Dauchy wrote:
> Hello Dmitry,
> 
> On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net
> > wrote:
> > Ok, thanks for the explanation. Patch looks fine to me. I'll go
> > ahead
> > and merge it for v4.4. Let me know though if you think this needs
> > to go
> > in sooner.
> 
> I am getting a null deref on a v4.1.x
> Do you think your patch could fix the following trace? It looks
> similar in my opinion.
> 
> BUG: unable to handle kernel NULL pointer dereference at
> 00000000000001c8
> IP: [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
> PGD 0
> Oops: 0000 [#1] SMP
> CPU: 1 PID: 1773 Comm: kworker/1:1H Not tainted 4.1.11-rc1 #1
> Workqueue: rpciod ffffffff8164fff0
> task: ffff8810374deba0 ti: ffff8810374df150 task.ti: ffff8810374df150
> RIP: 0010:[<ffffffff811d0cf3>]  [<ffffffff811d0cf3>]
> locks_get_lock_context+0x3/0xc0
> RSP: 0000:ffff881036007bb0  EFLAGS: 00010246
> RAX: ffff881036007c30 RBX: ffff881001981880 RCX: 0000000000000002
> RDX: 00000000000006ed RSI: 0000000000000002 RDI: 0000000000000000
> RBP: ffff881036007c08 R08: 0000000000000000 R09: 0000000000000001
> R10: 0000000000000000 R11: ffff88101db69948 R12: ffff8810019818d8
> R13: ffff881036007bc8 R14: ffff880e225d81c0 R15: ffff881edfd2b400
> FS:  0000000000000000(0000) GS:ffff88103fc20000(0000)
> knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000001c8 CR3: 000000000169b000 CR4: 00000000000606f0
> Stack:
>  ffffffff811d2710 ffff881036007bc8 ffffffff819f1af1 ffff881036007bc8
>  ffff881036007bc8 ffff881036007c08 ffff881001981880 ffff8810019818d8
>  ffff881036007c48 ffff880e225d81c0 ffff881edfd2b400 ffff881036007c88
> Call Trace:
>  [<ffffffff811d2710>] ? flock_lock_file+0x30/0x270
>  [<ffffffff811d3ad1>] flock_lock_file_wait+0x41/0xf0
>  [<ffffffff8168be66>] ? _raw_spin_unlock+0x26/0x40
>  [<ffffffff81268de9>] do_vfs_lock+0x19/0x40
>  [<ffffffff812695cc>] nfs4_locku_done+0x5c/0xf0
>  [<ffffffff8164f3f4>] rpc_exit_task+0x34/0xb0
>  [<ffffffff8164fcd9>] __rpc_execute+0x79/0x390
>  [<ffffffff81650000>] rpc_async_schedule+0x10/0x20
>  [<ffffffff81086095>] process_one_work+0x1a5/0x450
>  [<ffffffff81086024>] ? process_one_work+0x134/0x450
>  [<ffffffff8108638b>] worker_thread+0x4b/0x4a0
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff8108d777>] kthread+0xf7/0x110
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
>  [<ffffffff8168ce3e>] ret_from_fork+0x3e/0x70
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
> Code: 48 b8 00 00 00 00 00 00 00 80 55 48 89 e5 48 09 c1 ff d1 5d 85
> c0 0f 95 c0 0f b6 c0 eb b9 66 2e 0f 1f 84 00 00 00 00 00 83 fe 02
> <48>
> 8b 87 c8 01 00 00 0f 84 a0 00 00 00 48 85 c0 0f 85 97 00 00
> RIP  [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
>  RSP <ffff881036007bb0>
> CR2: 00000000000001c8
> ---[ end trace 2da9686dda1b5574 ]---
> 
> 
> Thanks,


This should be fixed by this series of four commits that are already in
mainline:

   
bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6
b3e388538d9

The basic problem is that when the struct file is being torn down,
file_inode(file) may already return NULL. So we need to be able to pass
in the inode directly from the reference held by the nfs_open_context.

Are those patches reasonable to pull in?

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-10-19 16:44                 ` Jeff Layton
@ 2015-10-19 16:53                   ` William Dauchy
  2015-10-19 17:24                     ` William Dauchy
  0 siblings, 1 reply; 16+ messages in thread
From: William Dauchy @ 2015-10-19 16:53 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Dmitry Vyukov, Bruce Fields, Al Viro,
	linux-fsdevel@vger.kernel.org, LKML, Kostya Serebryany,
	Andrey Konovalov, Alexander Potapenko, ktsan, Paul McKenney

Hi Jeff,

Thank you for you reply.

On Mon, Oct 19, 2015 at 6:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> This should be fixed by this series of four commits that are already in
> mainline:
> bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6
> b3e388538d9

Am I missing something, I see three of them between
bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6b3e388538d9
(and not four)

ee296d7c5709440f8abd36b5b65c6b3e388538d9 locks: inline
posix_lock_file_wait and flock_lock_file_wait
83bfff23e9ed19f37c4ef0bba84e75bd88e5cf21 nfs4: have do_vfs_lock take
an inode pointer
29d01b22eaa18d8b46091d3c98c6001c49f78e4a locks: new helpers -
flock_lock_inode_wait and posix_lock_inode_wait

Thanks,
-- 
William

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-10-19 16:53                   ` William Dauchy
@ 2015-10-19 17:24                     ` William Dauchy
  2015-10-19 17:46                       ` Jeff Layton
  0 siblings, 1 reply; 16+ messages in thread
From: William Dauchy @ 2015-10-19 17:24 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Dmitry Vyukov, Bruce Fields, Al Viro,
	linux-fsdevel@vger.kernel.org, LKML, Kostya Serebryany,
	Andrey Konovalov, Alexander Potapenko, ktsan, Paul McKenney

On Mon, Oct 19, 2015 at 6:53 PM, William Dauchy <wdauchy@gmail.com> wrote:
> On Mon, Oct 19, 2015 at 6:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> This should be fixed by this series of four commits that are already in
>> mainline:
>> bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6
>> b3e388538d9
>
> Am I missing something, I see three of them between
> bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6b3e388538d9
> (and not four)
>
> ee296d7c5709440f8abd36b5b65c6b3e388538d9 locks: inline
> posix_lock_file_wait and flock_lock_file_wait
> 83bfff23e9ed19f37c4ef0bba84e75bd88e5cf21 nfs4: have do_vfs_lock take
> an inode pointer
> 29d01b22eaa18d8b46091d3c98c6001c49f78e4a locks: new helpers -
> flock_lock_inode_wait and posix_lock_inode_wait

I suppose I found the missing one
bcd7f78 locks: have flock_lock_file take an inode pointer instead of a filp

-- 
William

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] fs: fix data races on inode->i_flctx
  2015-10-19 17:24                     ` William Dauchy
@ 2015-10-19 17:46                       ` Jeff Layton
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Layton @ 2015-10-19 17:46 UTC (permalink / raw)
  To: William Dauchy
  Cc: Dmitry Vyukov, Bruce Fields, Al Viro,
	linux-fsdevel@vger.kernel.org, LKML, Kostya Serebryany,
	Andrey Konovalov, Alexander Potapenko, ktsan, Paul McKenney

On Mon, 2015-10-19 at 19:24 +0200, William Dauchy wrote:
> On Mon, Oct 19, 2015 at 6:53 PM, William Dauchy <wdauchy@gmail.com>
> wrote:
> > On Mon, Oct 19, 2015 at 6:44 PM, Jeff Layton <
> > jlayton@poochiereds.net> wrote:
> > > This should be fixed by this series of four commits that are
> > > already in
> > > mainline:
> > > bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b
> > > 5b65c6
> > > b3e388538d9
> > 
> > Am I missing something, I see three of them between
> > bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b
> > 65c6b3e388538d9
> > (and not four)
> > 
> > ee296d7c5709440f8abd36b5b65c6b3e388538d9 locks: inline
> > posix_lock_file_wait and flock_lock_file_wait
> > 83bfff23e9ed19f37c4ef0bba84e75bd88e5cf21 nfs4: have do_vfs_lock
> > take
> > an inode pointer
> > 29d01b22eaa18d8b46091d3c98c6001c49f78e4a locks: new helpers -
> > flock_lock_inode_wait and posix_lock_inode_wait
> 
> I suppose I found the missing one
> bcd7f78 locks: have flock_lock_file take an inode pointer instead of
> a filp
> 

Ahh yes, sorry -- that one is also required.

Thanks!
-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2015-10-19 17:46 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-09-21  7:43 [PATCH] fs: fix data races on inode->i_flctx Dmitry Vyukov
2015-09-21 10:50 ` Jeff Layton
2015-09-21 10:53   ` Dmitry Vyukov
2015-09-21 10:56     ` Jeff Layton
2015-09-21 11:00       ` Dmitry Vyukov
2015-09-21 11:17         ` Jeff Layton
2015-09-21 11:38           ` Dmitry Vyukov
2015-09-21 11:44             ` Jeff Layton
2015-09-21 11:53               ` Dmitry Vyukov
2015-10-19 15:18               ` William Dauchy
2015-10-19 15:27                 ` Dmitry Vyukov
2015-10-19 15:36                   ` William Dauchy
2015-10-19 16:44                 ` Jeff Layton
2015-10-19 16:53                   ` William Dauchy
2015-10-19 17:24                     ` William Dauchy
2015-10-19 17:46                       ` Jeff Layton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).