* [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
@ 2009-02-18 0:59 Eric Anholt
2009-02-18 8:02 ` Wang Chen
2009-02-18 15:08 ` [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex Peter Zijlstra
0 siblings, 2 replies; 20+ messages in thread
From: Eric Anholt @ 2009-02-18 0:59 UTC (permalink / raw)
To: linux-kernel; +Cc: dri-devel, Eric Anholt
The basic problem was
mmap_sem (do_mmap()) -> struct_mutex (drm_gem_mmap(), i915_gem_fault())
struct_mutex (i915_gem_execbuffer()) -> mmap_sem (copy_from/to_user())
We have plenty of places where we want to hold device state the same
(struct_mutex) while we move a non-trivial amount of data
(copy_from/to_user()), such as i915_gem_pwrite(). Solve this by moving the
easy things that needed struct_mutex with mmap_sem held to using a lock to
cover just those data structures (offset hash and offset manager), and do
trylock and reschedule in fault.
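Schematically, the reversal being fixed is the classic AB-BA interleaving; a simplified timeline (not actual kernel code):

```
CPU0 (ioctl path)                      CPU1 (fault path, same mm)
mutex_lock(&dev->struct_mutex)
                                       down_read(&mm->mmap_sem)
copy_from_user(...)                    i915_gem_fault()
  faults -> wants mmap_sem               -> wants struct_mutex (held by CPU0)
Each side now waits for a lock the other holds -- the AB-BA pattern
lockdep reports as a lock order reversal.
```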
Signed-off-by: Eric Anholt <eric@anholt.net>
---
drivers/gpu/drm/drm_gem.c | 8 ++++----
drivers/gpu/drm/i915/i915_gem.c | 15 ++++++++++++++-
include/drm/drmP.h | 1 +
3 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 88d3368..13a0184 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -97,6 +97,7 @@ drm_gem_init(struct drm_device *dev)
dev->mm_private = mm;
+ mutex_init(&mm->offset_mutex);
if (drm_ht_create(&mm->offset_hash, 19)) {
drm_free(mm, sizeof(struct drm_gem_mm), DRM_MEM_MM);
return -ENOMEM;
@@ -508,10 +509,9 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
unsigned long prot;
int ret = 0;
- mutex_lock(&dev->struct_mutex);
-
+ mutex_lock(&mm->offset_mutex);
if (drm_ht_find_item(&mm->offset_hash, vma->vm_pgoff, &hash)) {
- mutex_unlock(&dev->struct_mutex);
+ mutex_unlock(&mm->offset_mutex);
return drm_mmap(filp, vma);
}
@@ -556,7 +556,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
drm_vm_open_locked(vma);
out_unlock:
- mutex_unlock(&dev->struct_mutex);
+ mutex_unlock(&mm->offset_mutex);
return ret;
}
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ac534c9..da9a2cb 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -573,8 +573,16 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
PAGE_SHIFT;
+ /* Get the struct mutex before accessing GEM data structures, but
+ * keep the struct_mutex -> mmap_sem lock ordering so that we don't
+ * need to mangle pwrite/pread to allow mmap_sem -> struct_mutex.
+ */
+ if (!mutex_trylock(&dev->struct_mutex)) {
+ need_resched();
+ return VM_FAULT_NOPAGE;
+ }
+
/* Now bind it into the GTT if needed */
- mutex_lock(&dev->struct_mutex);
if (!obj_priv->gtt_space) {
ret = i915_gem_object_bind_to_gtt(obj, obj_priv->gtt_alignment);
if (ret) {
@@ -646,6 +654,7 @@ i915_gem_create_mmap_offset(struct drm_gem_object *obj)
map->size = obj->size;
map->handle = obj;
+ mutex_lock(&mm->offset_mutex);
/* Get a DRM GEM mmap offset allocated... */
list->file_offset_node = drm_mm_search_free(&mm->offset_manager,
obj->size / PAGE_SIZE, 0, 0);
@@ -671,12 +680,14 @@ i915_gem_create_mmap_offset(struct drm_gem_object *obj)
/* By now we should be all set, any drm_mmap request on the offset
* below will get to our mmap & fault handler */
obj_priv->mmap_offset = ((uint64_t) list->hash.key) << PAGE_SHIFT;
+ mutex_unlock(&mm->offset_mutex);
return 0;
out_free_mm:
drm_mm_put_block(list->file_offset_node);
out_free_list:
+ mutex_unlock(&mm->offset_mutex);
drm_free(list->map, sizeof(struct drm_map_list), DRM_MEM_DRIVER);
return ret;
@@ -690,6 +701,7 @@ i915_gem_free_mmap_offset(struct drm_gem_object *obj)
struct drm_gem_mm *mm = dev->mm_private;
struct drm_map_list *list;
+ mutex_lock(&mm->offset_mutex);
list = &obj->map_list;
drm_ht_remove_item(&mm->offset_hash, &list->hash);
@@ -704,6 +716,7 @@ i915_gem_free_mmap_offset(struct drm_gem_object *obj)
}
obj_priv->mmap_offset = 0;
+ mutex_unlock(&mm->offset_mutex);
}
/**
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index e5f4ae9..04f765b 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -570,6 +570,7 @@ struct drm_ati_pcigart_info {
struct drm_gem_mm {
struct drm_mm offset_manager; /**< Offset mgmt for buffer objects */
struct drm_open_hash offset_hash; /**< User token hash table for maps */
+ struct mutex offset_mutex; /**< covers offset_manager and offset_hash */
};
/**
--
1.5.6.5
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-18 0:59 [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex Eric Anholt
@ 2009-02-18 8:02 ` Wang Chen
2009-02-18 16:38 ` [PATCH] drm: Take mmap_sem up front to avoid lock order violations krh
2009-02-18 15:08 ` [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex Peter Zijlstra
1 sibling, 1 reply; 20+ messages in thread
From: Wang Chen @ 2009-02-18 8:02 UTC (permalink / raw)
To: Eric Anholt; +Cc: linux-kernel, dri-devel
Eric Anholt said the following on 2009-2-18 8:59:
> The basic problem was
> mmap_sem (do_mmap()) -> struct_mutex (drm_gem_mmap(), i915_gem_fault())
> struct_mutex (i915_gem_execbuffer()) -> mmap_sem (copy_from/to_user())
>
> We have plenty of places where we want to hold device state the same
> (struct_mutex) while we move a non-trivial amount of data
> (copy_from/to_user()), such as i915_gem_pwrite(). Solve this by moving the
> easy things that needed struct_mutex with mmap_sem held to using a lock to
> cover just those data structures (offset hash and offset manager), and do
> trylock and reschedule in fault.
>
Eric, I tested the patch.
But the following bug still doesn't disappear.
http://bugzilla.kernel.org/show_bug.cgi?id=12419
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-18 0:59 [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex Eric Anholt
2009-02-18 8:02 ` Wang Chen
@ 2009-02-18 15:08 ` Peter Zijlstra
2009-02-19 21:02 ` Thomas Hellstrom
1 sibling, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2009-02-18 15:08 UTC (permalink / raw)
To: Eric Anholt; +Cc: linux-kernel, dri-devel, Ingo Molnar, Nick Piggin, Wang Chen
On Tue, 2009-02-17 at 16:59 -0800, Eric Anholt wrote:
> The basic problem was
> mmap_sem (do_mmap()) -> struct_mutex (drm_gem_mmap(), i915_gem_fault())
> struct_mutex (i915_gem_execbuffer()) -> mmap_sem (copy_from/to_user())
That's not the only problem, there's also:
dup_mmap()
down_write(mmap_sem)
vm_ops->open() -> drm_vm_open()
mutex_lock(struct_mutex);
> We have plenty of places where we want to hold device state the same
> (struct_mutex) while we move a non-trivial amount of data
> (copy_from/to_user()), such as i915_gem_pwrite(). Solve this by moving the
> easy things that needed struct_mutex with mmap_sem held to using a lock to
> cover just those data structures (offset hash and offset manager), and do
> trylock and reschedule in fault.
So we establish,
mmap_sem
offset_mutex
i915_gem_mmap_gtt_ioctl()
mutex_lock(struct_mutex)
i915_gem_create_mmap_offset()
mutex_lock(offset_mutex)
However we still have
struct_mutex
mmap_sem
in basically every copy_*_user() case
But you cannot seem to switch ->fault() to use offset_mutex, which would
sort out the inversion, because you then have:
struct_mutex
mmap_sem
offset_mutex
So why bother with the offset_mutex? Instead you make your ->fault()
fail randomly.
I'm not sure what Wang Chen sees after this patch, but it should not be
the exact same splat; still, it would not surprise me at all if there's
plenty left.
The locking looks very fragile and I don't think this patch is helping
anything, sorry.
> Signed-off-by: Eric Anholt <eric@anholt.net>
> ---
> drivers/gpu/drm/drm_gem.c | 8 ++++----
> drivers/gpu/drm/i915/i915_gem.c | 15 ++++++++++++++-
> include/drm/drmP.h | 1 +
> 3 files changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
> index 88d3368..13a0184 100644
> --- a/drivers/gpu/drm/drm_gem.c
> +++ b/drivers/gpu/drm/drm_gem.c
> @@ -97,6 +97,7 @@ drm_gem_init(struct drm_device *dev)
>
> dev->mm_private = mm;
>
> + mutex_init(&mm->offset_mutex);
> if (drm_ht_create(&mm->offset_hash, 19)) {
> drm_free(mm, sizeof(struct drm_gem_mm), DRM_MEM_MM);
> return -ENOMEM;
> @@ -508,10 +509,9 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
> unsigned long prot;
> int ret = 0;
>
> - mutex_lock(&dev->struct_mutex);
> -
> + mutex_lock(&mm->offset_mutex);
> if (drm_ht_find_item(&mm->offset_hash, vma->vm_pgoff, &hash)) {
> - mutex_unlock(&dev->struct_mutex);
> + mutex_unlock(&mm->offset_mutex);
> return drm_mmap(filp, vma);
> }
>
> @@ -556,7 +556,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
> drm_vm_open_locked(vma);
>
> out_unlock:
> - mutex_unlock(&dev->struct_mutex);
> + mutex_unlock(&mm->offset_mutex);
>
> return ret;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ac534c9..da9a2cb 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -573,8 +573,16 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
> PAGE_SHIFT;
>
> + /* Get the struct mutex before accessing GEM data structures, but
> + * keep the struct_mutex -> mmap_sem lock ordering so that we don't
> + * need to mangle pwrite/pread to allow mmap_sem -> struct_mutex.
> + */
> + if (!mutex_trylock(&dev->struct_mutex)) {
> + need_resched();
> + return VM_FAULT_NOPAGE;
> + }
So we just fail the fault if someone happens to hold the struct_mutex?
Seems rather fragile, could be another thread doing an ioctl.
> /* Now bind it into the GTT if needed */
> - mutex_lock(&dev->struct_mutex);
> if (!obj_priv->gtt_space) {
> ret = i915_gem_object_bind_to_gtt(obj, obj_priv->gtt_alignment);
> if (ret) {
> @@ -646,6 +654,7 @@ i915_gem_create_mmap_offset(struct drm_gem_object *obj)
> map->size = obj->size;
> map->handle = obj;
>
> + mutex_lock(&mm->offset_mutex);
> /* Get a DRM GEM mmap offset allocated... */
> list->file_offset_node = drm_mm_search_free(&mm->offset_manager,
> obj->size / PAGE_SIZE, 0, 0);
> @@ -671,12 +680,14 @@ i915_gem_create_mmap_offset(struct drm_gem_object *obj)
> /* By now we should be all set, any drm_mmap request on the offset
> * below will get to our mmap & fault handler */
> obj_priv->mmap_offset = ((uint64_t) list->hash.key) << PAGE_SHIFT;
> + mutex_unlock(&mm->offset_mutex);
>
> return 0;
>
> out_free_mm:
> drm_mm_put_block(list->file_offset_node);
> out_free_list:
> + mutex_unlock(&mm->offset_mutex);
> drm_free(list->map, sizeof(struct drm_map_list), DRM_MEM_DRIVER);
>
> return ret;
> @@ -690,6 +701,7 @@ i915_gem_free_mmap_offset(struct drm_gem_object *obj)
> struct drm_gem_mm *mm = dev->mm_private;
> struct drm_map_list *list;
>
> + mutex_lock(&mm->offset_mutex);
> list = &obj->map_list;
> drm_ht_remove_item(&mm->offset_hash, &list->hash);
>
> @@ -704,6 +716,7 @@ i915_gem_free_mmap_offset(struct drm_gem_object *obj)
> }
>
> obj_priv->mmap_offset = 0;
> + mutex_unlock(&mm->offset_mutex);
> }
>
> /**
> diff --git a/include/drm/drmP.h b/include/drm/drmP.h
> index e5f4ae9..04f765b 100644
> --- a/include/drm/drmP.h
> +++ b/include/drm/drmP.h
> @@ -570,6 +570,7 @@ struct drm_ati_pcigart_info {
> struct drm_gem_mm {
> struct drm_mm offset_manager; /**< Offset mgmt for buffer objects */
> struct drm_open_hash offset_hash; /**< User token hash table for maps */
> + struct mutex offset_mutex; /**< covers offset_manager and offset_hash */
> };
>
> /**
* [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-18 8:02 ` Wang Chen
@ 2009-02-18 16:38 ` krh
2009-02-19 9:19 ` Peter Zijlstra
0 siblings, 1 reply; 20+ messages in thread
From: krh @ 2009-02-18 16:38 UTC (permalink / raw)
To: eric; +Cc: Wang Chen, dri-devel, linux-kernel, Kristian Høgsberg
From: Kristian Høgsberg <krh@redhat.com>
A number of GEM operations (and legacy drm ones) want to copy data to
or from userspace while holding the struct_mutex lock. However, the
fault handler calls us with the mmap_sem held and thus enforces the
opposite locking order. This patch downs the mmap_sem up front for
those operations that access userspace data under the struct_mutex
lock to ensure the locking order is consistent.
Signed-off-by: Kristian Høgsberg <krh@redhat.com>
---
Here's a different and simpler attempt to fix the locking order
problem. We can just down_read() the mmap_sem pre-emptively up front,
and the locking order is respected. It's simpler than the
mutex_trylock() game and avoids introducing a new mutex.
(forgot to add lkml, resending)
cheers,
Kristian
drivers/gpu/drm/i915/i915_dma.c | 6 +++++-
drivers/gpu/drm/i915/i915_gem.c | 20 +++++++++++++-------
2 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 81f1cff..d8b58d9 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -642,9 +642,11 @@ static int i915_batchbuffer(struct drm_device *dev, void *data,
sizeof(struct drm_clip_rect)))
return -EFAULT;
+ down_read(&current->mm->mmap_sem);
mutex_lock(&dev->struct_mutex);
ret = i915_dispatch_batchbuffer(dev, batch);
mutex_unlock(&dev->struct_mutex);
+ up_read(&current->mm->mmap_sem);
if (sarea_priv)
sarea_priv->last_dispatch = READ_BREADCRUMB(dev_priv);
@@ -674,14 +676,16 @@ static int i915_cmdbuffer(struct drm_device *dev, void *data,
return -EFAULT;
}
+ down_read(&current->mm->mmap_sem);
mutex_lock(&dev->struct_mutex);
ret = i915_dispatch_cmdbuffer(dev, cmdbuf);
mutex_unlock(&dev->struct_mutex);
+ up_read(&current->mm->mmap_sem);
+
if (ret) {
DRM_ERROR("i915_dispatch_cmdbuffer failed\n");
return ret;
}
-
if (sarea_priv)
sarea_priv->last_dispatch = READ_BREADCRUMB(dev_priv);
return 0;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d9cd42f..3dd8b6e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -171,6 +171,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
return -EINVAL;
}
+ down_read(&current->mm->mmap_sem);
mutex_lock(&dev->struct_mutex);
ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
@@ -196,6 +197,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
drm_gem_object_unreference(obj);
mutex_unlock(&dev->struct_mutex);
+ up_read(&current->mm->mmap_sem);
return 0;
}
@@ -264,7 +266,7 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
if (!access_ok(VERIFY_READ, user_data, remain))
return -EFAULT;
-
+ down_read(&current->mm->mmap_sem);
mutex_lock(&dev->struct_mutex);
ret = i915_gem_object_pin(obj, 0);
if (ret) {
@@ -315,6 +317,7 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
fail:
i915_gem_object_unpin(obj);
mutex_unlock(&dev->struct_mutex);
+ up_read(&current->mm->mmap_sem);
return ret;
}
@@ -328,6 +331,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
loff_t offset;
ssize_t written;
+ down_read(&current->mm->mmap_sem);
mutex_lock(&dev->struct_mutex);
ret = i915_gem_object_set_to_cpu_domain(obj, 1);
@@ -350,6 +354,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
}
mutex_unlock(&dev->struct_mutex);
+ up_read(&current->mm->mmap_sem);
return 0;
}
@@ -2473,22 +2478,21 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
goto pre_mutex_err;
}
+ down_read(&current->mm->mmap_sem);
mutex_lock(&dev->struct_mutex);
i915_verify_inactive(dev, __FILE__, __LINE__);
if (dev_priv->mm.wedged) {
DRM_ERROR("Execbuf while wedged\n");
- mutex_unlock(&dev->struct_mutex);
ret = -EIO;
- goto pre_mutex_err;
+ goto mutex_err;
}
if (dev_priv->mm.suspended) {
DRM_ERROR("Execbuf while VT-switched.\n");
- mutex_unlock(&dev->struct_mutex);
ret = -EBUSY;
- goto pre_mutex_err;
+ goto mutex_err;
}
/* Look up object handles */
@@ -2641,8 +2645,6 @@ err:
for (i = 0; i < args->buffer_count; i++)
drm_gem_object_unreference(object_list[i]);
- mutex_unlock(&dev->struct_mutex);
-
if (!ret) {
/* Copy the new buffer offsets back to the user's exec list. */
ret = copy_to_user((struct drm_i915_relocation_entry __user *)
@@ -2655,6 +2657,10 @@ err:
args->buffer_count, ret);
}
+mutex_err:
+ mutex_unlock(&dev->struct_mutex);
+ up_read(&current->mm->mmap_sem);
+
pre_mutex_err:
drm_free(object_list, sizeof(*object_list) * args->buffer_count,
DRM_MEM_DRIVER);
--
1.6.1.3
* Re: [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-18 16:38 ` [PATCH] drm: Take mmap_sem up front to avoid lock order violations krh
@ 2009-02-19 9:19 ` Peter Zijlstra
2009-02-19 10:33 ` Peter Zijlstra
2009-02-19 12:57 ` Nick Piggin
0 siblings, 2 replies; 20+ messages in thread
From: Peter Zijlstra @ 2009-02-19 9:19 UTC (permalink / raw)
To: krh
Cc: eric, Wang Chen, dri-devel, linux-kernel, Kristian Høgsberg,
Nick Piggin, Andrew Morton, Hugh Dickins
On Wed, 2009-02-18 at 11:38 -0500, krh@bitplanet.net wrote:
> From: Kristian Høgsberg <krh@redhat.com>
>
> A number of GEM operations (and legacy drm ones) want to copy data to
> or from userspace while holding the struct_mutex lock. However, the
> fault handler calls us with the mmap_sem held and thus enforces the
> opposite locking order. This patch downs the mmap_sem up front for
> those operations that access userspace data under the struct_mutex
> lock to ensure the locking order is consistent.
>
> Signed-off-by: Kristian Høgsberg <krh@redhat.com>
> ---
>
> Here's a different and simpler attempt to fix the locking order
> problem. We can just down_read() the mmap_sem pre-emptively up-front,
> and the locking order is respected. It's simpler than the
> mutex_trylock() game, avoids introducing a new mutex.
>
Hell no!
for one, mmap_sem is not a recursive lock, so a pagefault will utterly
fail with this in place.
Secondly, holding mmap_sem for no good reason just sucks.
> drivers/gpu/drm/i915/i915_dma.c | 6 +++++-
> drivers/gpu/drm/i915/i915_gem.c | 20 +++++++++++++-------
> 2 files changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 81f1cff..d8b58d9 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -642,9 +642,11 @@ static int i915_batchbuffer(struct drm_device *dev, void *data,
> sizeof(struct drm_clip_rect)))
> return -EFAULT;
>
> + down_read(&current->mm->mmap_sem);
> mutex_lock(&dev->struct_mutex);
> ret = i915_dispatch_batchbuffer(dev, batch);
> mutex_unlock(&dev->struct_mutex);
> + up_read(&current->mm->mmap_sem);
>
> if (sarea_priv)
> sarea_priv->last_dispatch = READ_BREADCRUMB(dev_priv);
> @@ -674,14 +676,16 @@ static int i915_cmdbuffer(struct drm_device *dev, void *data,
> return -EFAULT;
> }
>
> + down_read(&current->mm->mmap_sem);
> mutex_lock(&dev->struct_mutex);
> ret = i915_dispatch_cmdbuffer(dev, cmdbuf);
> mutex_unlock(&dev->struct_mutex);
> + up_read(&current->mm->mmap_sem);
> +
> if (ret) {
> DRM_ERROR("i915_dispatch_cmdbuffer failed\n");
> return ret;
> }
> -
> if (sarea_priv)
> sarea_priv->last_dispatch = READ_BREADCRUMB(dev_priv);
> return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d9cd42f..3dd8b6e 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -171,6 +171,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
> return -EINVAL;
> }
>
> + down_read(&current->mm->mmap_sem);
> mutex_lock(&dev->struct_mutex);
>
> ret = i915_gem_object_set_cpu_read_domain_range(obj, args->offset,
> @@ -196,6 +197,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
>
> drm_gem_object_unreference(obj);
> mutex_unlock(&dev->struct_mutex);
> + up_read(&current->mm->mmap_sem);
>
> return 0;
> }
> @@ -264,7 +266,7 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
> if (!access_ok(VERIFY_READ, user_data, remain))
> return -EFAULT;
>
> -
> + down_read(&current->mm->mmap_sem);
> mutex_lock(&dev->struct_mutex);
> ret = i915_gem_object_pin(obj, 0);
> if (ret) {
> @@ -315,6 +317,7 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
> fail:
> i915_gem_object_unpin(obj);
> mutex_unlock(&dev->struct_mutex);
> + up_read(&current->mm->mmap_sem);
>
> return ret;
> }
> @@ -328,6 +331,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
> loff_t offset;
> ssize_t written;
>
> + down_read(&current->mm->mmap_sem);
> mutex_lock(&dev->struct_mutex);
>
> ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> @@ -350,6 +354,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
> }
>
> mutex_unlock(&dev->struct_mutex);
> + up_read(&current->mm->mmap_sem);
>
> return 0;
> }
> @@ -2473,22 +2478,21 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
> goto pre_mutex_err;
> }
>
> + down_read(&current->mm->mmap_sem);
> mutex_lock(&dev->struct_mutex);
>
> i915_verify_inactive(dev, __FILE__, __LINE__);
>
> if (dev_priv->mm.wedged) {
> DRM_ERROR("Execbuf while wedged\n");
> - mutex_unlock(&dev->struct_mutex);
> ret = -EIO;
> - goto pre_mutex_err;
> + goto mutex_err;
> }
>
> if (dev_priv->mm.suspended) {
> DRM_ERROR("Execbuf while VT-switched.\n");
> - mutex_unlock(&dev->struct_mutex);
> ret = -EBUSY;
> - goto pre_mutex_err;
> + goto mutex_err;
> }
>
> /* Look up object handles */
> @@ -2641,8 +2645,6 @@ err:
> for (i = 0; i < args->buffer_count; i++)
> drm_gem_object_unreference(object_list[i]);
>
> - mutex_unlock(&dev->struct_mutex);
> -
> if (!ret) {
> /* Copy the new buffer offsets back to the user's exec list. */
> ret = copy_to_user((struct drm_i915_relocation_entry __user *)
> @@ -2655,6 +2657,10 @@ err:
> args->buffer_count, ret);
> }
>
> +mutex_err:
> + mutex_unlock(&dev->struct_mutex);
> + up_read(&current->mm->mmap_sem);
> +
> pre_mutex_err:
> drm_free(object_list, sizeof(*object_list) * args->buffer_count,
> DRM_MEM_DRIVER);
* Re: [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-19 9:19 ` Peter Zijlstra
@ 2009-02-19 10:33 ` Peter Zijlstra
2009-02-19 14:49 ` Kristian Høgsberg
2009-02-19 12:57 ` Nick Piggin
1 sibling, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2009-02-19 10:33 UTC (permalink / raw)
To: krh
Cc: eric, Wang Chen, dri-devel, linux-kernel, Kristian Høgsberg,
Nick Piggin, Andrew Morton, Hugh Dickins
On Thu, 2009-02-19 at 10:19 +0100, Peter Zijlstra wrote:
> On Wed, 2009-02-18 at 11:38 -0500, krh@bitplanet.net wrote:
> > From: Kristian Høgsberg <krh@redhat.com>
> >
> > A number of GEM operations (and legacy drm ones) want to copy data to
> > or from userspace while holding the struct_mutex lock. However, the
> > fault handler calls us with the mmap_sem held and thus enforces the
> > opposite locking order. This patch downs the mmap_sem up front for
> > those operations that access userspace data under the struct_mutex
> > lock to ensure the locking order is consistent.
> >
> > Signed-off-by: Kristian Høgsberg <krh@redhat.com>
> > ---
> >
> > Here's a different and simpler attempt to fix the locking order
> > problem. We can just down_read() the mmap_sem pre-emptively up-front,
> > and the locking order is respected. It's simpler than the
> > mutex_trylock() game, avoids introducing a new mutex.
> >
OK let me try that again -- my initial response was a tad curt :/
While I appreciate your efforts in fixing GEM (I too have an interest in
seeing it done), I cannot support your patch.
Firstly, you're using mmap_sem well outside its problem domain, this is
bad form. Furthermore, holding it for extended durations for no good
reason affects all other users.
Secondly, mmap_sem is not a recursive lock (very few kernel locks are,
and we generally frown upon recursive locking schemes), this means that
the fault handler still cannot function properly.
* Re: [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-19 9:19 ` Peter Zijlstra
2009-02-19 10:33 ` Peter Zijlstra
@ 2009-02-19 12:57 ` Nick Piggin
2009-02-21 2:33 ` Eric Anholt
1 sibling, 1 reply; 20+ messages in thread
From: Nick Piggin @ 2009-02-19 12:57 UTC (permalink / raw)
To: Peter Zijlstra
Cc: krh, eric, Wang Chen, dri-devel, linux-kernel,
Kristian Høgsberg, Andrew Morton, Hugh Dickins
On Thu, Feb 19, 2009 at 10:19:05AM +0100, Peter Zijlstra wrote:
> On Wed, 2009-02-18 at 11:38 -0500, krh@bitplanet.net wrote:
> > From: Kristian Høgsberg <krh@redhat.com>
> >
> > A number of GEM operations (and legacy drm ones) want to copy data to
> > or from userspace while holding the struct_mutex lock. However, the
> > fault handler calls us with the mmap_sem held and thus enforces the
> > opposite locking order. This patch downs the mmap_sem up front for
> > those operations that access userspace data under the struct_mutex
> > lock to ensure the locking order is consistent.
> >
> > Signed-off-by: Kristian Høgsberg <krh@redhat.com>
> > ---
> >
> > Here's a different and simpler attempt to fix the locking order
> > problem. We can just down_read() the mmap_sem pre-emptively up-front,
> > and the locking order is respected. It's simpler than the
> > mutex_trylock() game, avoids introducing a new mutex.
The "simple" way to fix this is to just allocate a temporary buffer
to copy a snapshot of the data going to/from userspace. Then do the
real usercopy to/from that buffer outside the locks.
You don't have any performance critical bulk copies (ie. that will
blow the L1 cache), do you?
* Re: [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-19 10:33 ` Peter Zijlstra
@ 2009-02-19 14:49 ` Kristian Høgsberg
2009-02-19 15:17 ` Nick Piggin
0 siblings, 1 reply; 20+ messages in thread
From: Kristian Høgsberg @ 2009-02-19 14:49 UTC (permalink / raw)
To: Peter Zijlstra
Cc: krh, eric, Wang Chen, dri-devel, linux-kernel, Nick Piggin,
Andrew Morton, Hugh Dickins
On Thu, 2009-02-19 at 11:33 +0100, Peter Zijlstra wrote:
> On Thu, 2009-02-19 at 10:19 +0100, Peter Zijlstra wrote:
> > On Wed, 2009-02-18 at 11:38 -0500, krh@bitplanet.net wrote:
> > > From: Kristian Høgsberg <krh@redhat.com>
> > >
> > > A number of GEM operations (and legacy drm ones) want to copy data to
> > > or from userspace while holding the struct_mutex lock. However, the
> > > fault handler calls us with the mmap_sem held and thus enforces the
> > > opposite locking order. This patch downs the mmap_sem up front for
> > > those operations that access userspace data under the struct_mutex
> > > lock to ensure the locking order is consistent.
> > >
> > > Signed-off-by: Kristian Høgsberg <krh@redhat.com>
> > > ---
> > >
> > > Here's a different and simpler attempt to fix the locking order
> > > problem. We can just down_read() the mmap_sem pre-emptively up-front,
> > > and the locking order is respected. It's simpler than the
> > > mutex_trylock() game, avoids introducing a new mutex.
> > >
>
> OK let me try that again -- my initial response was a tad curt :/
No that's fair, I was aware that the patch was probably borderline and
got the feedback I was looking for ;)
> While I appreciate your efforts in fixing GEM (I too have an interest in
> seeing it done), I cannot support your patch.
>
> Firstly, you're using mmap_sem well outside its problem domain, this is
> bad form. Furthermore, holding it for extended durations for no good
> reason affects all other users.
Yup, agree.
> Secondly, mmap_sem is not a recursive lock (very few kernel locks are,
> and we generally frown upon recursive locking schemes), this means that
> the fault handler still cannot function properly.
I understand, but we take it twice only as a read lock, so that should
work, right? We prevent the deadlock the lockdep validator warned about
and as far as I can see, the patch doesn't introduce a new one. But
other than that I agree with the frowning on recursive locking, it's too
often used to paper over badly thought out locking.
cheers,
Kristian
* Re: [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-19 14:49 ` Kristian Høgsberg
@ 2009-02-19 15:17 ` Nick Piggin
2009-02-19 15:21 ` Kristian Høgsberg
0 siblings, 1 reply; 20+ messages in thread
From: Nick Piggin @ 2009-02-19 15:17 UTC (permalink / raw)
To: Kristian Høgsberg
Cc: Peter Zijlstra, krh, eric, Wang Chen, dri-devel, linux-kernel,
Andrew Morton, Hugh Dickins
On Thu, Feb 19, 2009 at 09:49:40AM -0500, Kristian Høgsberg wrote:
>
> > Secondly, mmap_sem is not a recursive lock (very few kernel locks are,
> > and we generally frown upon recursive locking schemes), this means that
> > the fault handler still cannot function properly.
>
> I understand, but we take it twice only as a read lock, so that should
> work, right? We prevent the deadlock the lockdep validator warned about
> and as far as I can see, the patch doesn't introduce a new one. But
> other than that I agree with the frowning on recursive locking, it's too
> often used to paper over badly thought out locking.
It doesn't work. rwsems are fair (otherwise there are terrible starvation
properties), so if another process does an interleaved down_write, then
the 2nd down_read will block until the down_write is serviced.
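The fairness that avoids writer starvation is exactly what turns the recursive down_read() into a deadlock; a schematic interleaving (simplified):

```
T1: down_read(&mm->mmap_sem);   /* taken pre-emptively up front        */
T2: down_write(&mm->mmap_sem);  /* e.g. an mmap() in another thread:   */
                                /* queues, and blocks later readers    */
T1: down_read(&mm->mmap_sem);   /* the page fault's "recursive" read:  */
                                /* queues behind T2's waiting writer   */
=> T1 sleeps waiting for T2, T2 sleeps waiting for T1's first read
   lock to be released: deadlock.
```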
* Re: [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-19 15:17 ` Nick Piggin
@ 2009-02-19 15:21 ` Kristian Høgsberg
0 siblings, 0 replies; 20+ messages in thread
From: Kristian Høgsberg @ 2009-02-19 15:21 UTC (permalink / raw)
To: Nick Piggin
Cc: Peter Zijlstra, krh, eric, Wang Chen, dri-devel, linux-kernel,
Andrew Morton, Hugh Dickins
On Thu, 2009-02-19 at 16:17 +0100, Nick Piggin wrote:
> On Thu, Feb 19, 2009 at 09:49:40AM -0500, Kristian Høgsberg wrote:
> >
> > > Secondly, mmap_sem is not a recursive lock (very few kernel locks are,
> > > and we generally frown upon recursive locking schemes), this means that
> > > the fault handler still cannot function properly.
> >
> > I understand, but we take it twice only as a read lock, so that should
> > work, right? We prevent the deadlock the lockdep validator warned about
> > and as far as I can see, the patch doesn't introduce a new one. But
> > other than that I agree with the frowning on recursive locking, it's too
> > often used to paper over badly thought out locking.
>
> It doesn't work. rwsems are fair (otherwise there are terrible starvation
> properties), so if another process does an interleaved down_write, then
> the 2nd down_read will block until the down_write is serviced.
Ooh, right, yes of course, ouch.
thanks,
Kristian
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-18 15:08 ` [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex Peter Zijlstra
@ 2009-02-19 21:02 ` Thomas Hellstrom
2009-02-19 22:26 ` Peter Zijlstra
0 siblings, 1 reply; 20+ messages in thread
From: Thomas Hellstrom @ 2009-02-19 21:02 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Eric Anholt, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
Peter Zijlstra wrote:
> On Tue, 2009-02-17 at 16:59 -0800, Eric Anholt wrote:
>
>> The basic problem was
>> mmap_sem (do_mmap()) -> struct_mutex (drm_gem_mmap(), i915_gem_fault())
>> struct_mutex (i915_gem_execbuffer()) -> mmap_sem (copy_from/to_user())
>>
>
> That's not the only problem, there's also:
>
> dup_mmap()
> down_write(mmap_sem)
> vm_ops->open() -> drm_vm_open()
> mutex_lock(struct_mutex);
>
>
>> We have plenty of places where we want to hold device state the same
>> (struct_mutex) while we move a non-trivial amount of data
>> (copy_from/to_user()), such as i915_gem_pwrite(). Solve this by moving the
>> easy things that needed struct_mutex with mmap_sem held to using a lock to
>> cover just those data structures (offset hash and offset manager), and do
>> trylock and reschedule in fault.
>>
>
> So we establish,
>
> mmap_sem
> offset_mutex
>
> i915_gem_mmap_gtt_ioctl()
> mutex_lock(struct_mutex)
> i915_gem_create_mmap_offset()
> mutex_lock(offset_mutex)
>
> However we still have
>
> struct_mutex
> mmap_sem
>
> in basically every copy_*_user() case
>
> But you cannot seem to switch ->fault() to use offset_mutex, which would
> work out the inversion because you then have:
>
> struct_mutex
> mmap_sem
> offset_mutex
>
> So why bother with the offset_mutex? Instead you make your ->fault()
> fail randomly.
>
> I'm not sure what Wang Chen sees after this patch, but it should not be
> the exact same splat; still, it would not at all surprise me if there's
> plenty left.
>
> The locking looks very fragile and I don't think this patch is helping
> anything, sorry.
>
>
It looks to me like the driver's preferred locking order is
object_mutex (which happens to be the device global struct_mutex)
mmap_sem
offset_mutex.
So if one could avoid using the struct_mutex for object bookkeeping (a
separate lock), then
vm_open() and vm_close() would adhere to that locking order as well,
simply by not taking the struct_mutex at all.
So only fault() remains, in which that locking order is reversed.
Personally I think the trylock->reschedule->retry method with proper
commenting is a good solution. It will be the _only_ place where the
locking order is reversed, and it is done in a deadlock-safe manner. Note that
fault() doesn't really fail, but requests a retry from user-space with
rescheduling to give the process holding the struct_mutex time to
release it.
/Thomas
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-19 21:02 ` Thomas Hellstrom
@ 2009-02-19 22:26 ` Peter Zijlstra
2009-02-20 2:04 ` Eric Anholt
2009-02-20 8:31 ` Thomas Hellstrom
0 siblings, 2 replies; 20+ messages in thread
From: Peter Zijlstra @ 2009-02-19 22:26 UTC (permalink / raw)
To: Thomas Hellstrom
Cc: Eric Anholt, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
On Thu, 2009-02-19 at 22:02 +0100, Thomas Hellstrom wrote:
>
> It looks to me like the driver preferred locking order is
>
> object_mutex (which happens to be the device global struct_mutex)
> mmap_sem
> offset_mutex.
>
> So if one could avoid using the struct_mutex for object bookkeeping (A
> separate lock) then
> vm_open() and vm_close() would adhere to that locking order as well,
> simply by not taking the struct_mutex at all.
>
> So only fault() remains, in which that locking order is reversed.
> Personally I think the trylock ->reschedule->retry method with proper
> commenting is a good solution. It will be the _only_ place where locking
> order is reversed and it is done in a deadlock-safe manner. Note that
> fault() doesn't really fail, but requests a retry from user-space with
> rescheduling to give the process holding the struct_mutex time to
> release it.
It doesn't do the reschedule -- need_resched() only checks whether the
current task has already been marked to be scheduled away, it doesn't do
the marking. Furthermore, yield-based locking sucks chunks.
What's so very difficult about pulling the copy_*_user() out from under
the locks?
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-19 22:26 ` Peter Zijlstra
@ 2009-02-20 2:04 ` Eric Anholt
2009-02-20 7:36 ` Peter Zijlstra
2009-02-20 8:31 ` Thomas Hellstrom
1 sibling, 1 reply; 20+ messages in thread
From: Eric Anholt @ 2009-02-20 2:04 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Thomas Hellstrom, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
On Thu, 2009-02-19 at 23:26 +0100, Peter Zijlstra wrote:
> On Thu, 2009-02-19 at 22:02 +0100, Thomas Hellstrom wrote:
> >
> > It looks to me like the driver preferred locking order is
> >
> > object_mutex (which happens to be the device global struct_mutex)
> > mmap_sem
> > offset_mutex.
> >
> > So if one could avoid using the struct_mutex for object bookkeeping (A
> > separate lock) then
> > vm_open() and vm_close() would adhere to that locking order as well,
> > simply by not taking the struct_mutex at all.
> >
> > So only fault() remains, in which that locking order is reversed.
> > Personally I think the trylock ->reschedule->retry method with proper
> > commenting is a good solution. It will be the _only_ place where locking
> > order is reversed and it is done in a deadlock-safe manner. Note that
> > fault() doesn't really fail, but requests a retry from user-space with
> > rescheduling to give the process holding the struct_mutex time to
> > release it.
>
> It doesn't do the reschedule -- need_resched() will check if the current
> task was marked to be scheduled away, furthermore yield based locking
> sucks chunks.
>
> What's so very difficult about pulling the copy_*_user() out from under
> the locks?
That we're expecting the data movement to occur while holding device
state in place. For example, we write data through the GTT most of the
time so we:
lock struct_mutex
pin the object to the GTT
flushing caches as needed
copy_from_user
unpin object
unlock struct_mutex
If I'm to pull the copy_from_user out, that means I have to:
alloc temporary storage
for each block of temp storage size:
copy_from_user
lock struct_mutex
pin the object to the GTT
flush caches as needed
memcpy
unpin object
unlock struct_mutex
At this point of introducing our third copy of the user's data in our
hottest path, we should probably ditch the pwrite path entirely and go
to user mapping of the objects for performance. Requiring user mapping
(which has significant overhead) cuts the likelihood of moving from
user-space object caching to kernel object caching in the future, which
has the potential of saving steaming piles of memory.
--
Eric Anholt
eric@anholt.net eric.anholt@intel.com
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-20 2:04 ` Eric Anholt
@ 2009-02-20 7:36 ` Peter Zijlstra
2009-02-25 8:15 ` Eric Anholt
0 siblings, 1 reply; 20+ messages in thread
From: Peter Zijlstra @ 2009-02-20 7:36 UTC (permalink / raw)
To: Eric Anholt
Cc: Thomas Hellstrom, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
On Thu, 2009-02-19 at 18:04 -0800, Eric Anholt wrote:
> On Thu, 2009-02-19 at 23:26 +0100, Peter Zijlstra wrote:
> > On Thu, 2009-02-19 at 22:02 +0100, Thomas Hellstrom wrote:
> > >
> > > It looks to me like the driver preferred locking order is
> > >
> > > object_mutex (which happens to be the device global struct_mutex)
> > > mmap_sem
> > > offset_mutex.
> > >
> > > So if one could avoid using the struct_mutex for object bookkeeping (A
> > > separate lock) then
> > > vm_open() and vm_close() would adhere to that locking order as well,
> > > simply by not taking the struct_mutex at all.
> > >
> > > So only fault() remains, in which that locking order is reversed.
> > > Personally I think the trylock ->reschedule->retry method with proper
> > > commenting is a good solution. It will be the _only_ place where locking
> > > order is reversed and it is done in a deadlock-safe manner. Note that
> > > fault() doesn't really fail, but requests a retry from user-space with
> > > rescheduling to give the process holding the struct_mutex time to
> > > release it.
> >
> > It doesn't do the reschedule -- need_resched() will check if the current
> > task was marked to be scheduled away, furthermore yield based locking
> > sucks chunks.
Imagine what would happen if your faulting task was the highest RT prio
task in the system, you'd end up with a livelock.
> > What's so very difficult about pulling the copy_*_user() out from under
> > the locks?
>
> That we're expecting the data movement to occur while holding device
> state in place. For example, we write data through the GTT most of the
> time so we:
>
> lock struct_mutex
> pin the object to the GTT
> flushing caches as needed
> copy_from_user
> unpin object
> unlock struct_mutex
So you cannot drop the lock once you've pinned the dst object?
> If I'm to pull the copy_from_user out, that means I have to:
>
> alloc temporary storage
> for each block of temp storage size:
> copy_from_user
> lock struct_mutex
> pin the object to the GTT
> flush caches as needed
> memcpy
> unpin object
> unlock struct_mutex
>
> At this point of introducing our third copy of the user's data in our
> hottest path, we should probably ditch the pwrite path entirely and go
> to user mapping of the objects for performance. Requiring user mapping
> (which has significant overhead) cuts the likelihood of moving from
> user-space object caching to kernel object caching in the future, which
> has the potential of saving steaming piles of memory.
Or you could get_user_pages() to fault the user pages and pin them, and
then do pagefault_disable() and use copy_from_user_inatomic or such, and
release the pages again.
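Sketched in kernel terms (an untested fragment, not actual i915 code; signatures are the 2.6.29-era APIs and error handling is elided), that suggestion would look roughly like:

```c
/* Pin the user pages first, then copy under struct_mutex with page
 * faults disabled, so mmap_sem is never taken inside struct_mutex. */
down_read(&current->mm->mmap_sem);
ret = get_user_pages(current, current->mm, (unsigned long)user_data,
		     num_pages, 0, 0, pages, NULL);
up_read(&current->mm->mmap_sem);

mutex_lock(&dev->struct_mutex);
/* ... pin the object to the GTT, flush caches ... */
pagefault_disable();
/* Cannot sleep or recurse into mmap_sem; if a PTE was zapped in the
 * meantime this returns nonzero and the caller must fall back. */
unwritten = __copy_from_user_inatomic(dst, user_data, len);
pagefault_enable();
mutex_unlock(&dev->struct_mutex);

for (i = 0; i < num_pages; i++)
	page_cache_release(pages[i]);
```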
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-19 22:26 ` Peter Zijlstra
2009-02-20 2:04 ` Eric Anholt
@ 2009-02-20 8:31 ` Thomas Hellstrom
2009-02-20 8:47 ` Peter Zijlstra
1 sibling, 1 reply; 20+ messages in thread
From: Thomas Hellstrom @ 2009-02-20 8:31 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Eric Anholt, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
Peter Zijlstra wrote:
> On Thu, 2009-02-19 at 22:02 +0100, Thomas Hellstrom wrote:
>
>>
>> It looks to me like the driver preferred locking order is
>>
>> object_mutex (which happens to be the device global struct_mutex)
>> mmap_sem
>> offset_mutex.
>>
>> So if one could avoid using the struct_mutex for object bookkeeping (A
>> separate lock) then
>> vm_open() and vm_close() would adhere to that locking order as well,
>> simply by not taking the struct_mutex at all.
>>
>> So only fault() remains, in which that locking order is reversed.
>> Personally I think the trylock ->reschedule->retry method with proper
>> commenting is a good solution. It will be the _only_ place where locking
>> order is reversed and it is done in a deadlock-safe manner. Note that
>> fault() doesn't really fail, but requests a retry from user-space with
>> rescheduling to give the process holding the struct_mutex time to
>> release it.
>>
>
> It doesn't do the reschedule -- need_resched() will check if the current
> task was marked to be scheduled away,
Yes, my mistake. set_tsk_need_resched() would be the proper call. If I'm
correctly informed, that would kick in the scheduler _after_ the
mmap_sem is released, just before returning to user-space.
> furthermore yield based locking
> sucks chunks.
>
Yes, but AFAICT in this situation it is the only way to reverse the locking
order in a deadlock-safe manner. If there is a lot of contention it will
eat CPU. Unfortunately, since the struct_mutex is such a wide lock there
will probably be contention in some situations.
BTW isn't this quite common in distributed resource management, when you
can't ensure that all requestors will request resources in the same order?
Try to grab all resources you need for an operation. If you fail to get
one, release the resources you already have, sleep waiting for the
failing one to be available and then retry.
In this case we fail because we _may_ have a deadlock. Since we cannot
release the mmap_sem and wait, we do the second-best thing and tell the
kernel to reschedule when the mmap_sem is released.
> What's so very difficult about pulling the copy_*_user() out from under
> the locks?
>
>
Given Eric's comment this is a GEM performance-critical path, so even
from a CPU-usage perspective, the trylock solution may be preferred.
From a make-the-code-easy-to-understand perspective, I agree pulling
out copy_*_user() is a better solution. It might even be that the
trylock solution doesn't kill the warnings from the lock dependency tracker.
/Thomas
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-20 8:31 ` Thomas Hellstrom
@ 2009-02-20 8:47 ` Peter Zijlstra
0 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2009-02-20 8:47 UTC (permalink / raw)
To: Thomas Hellstrom
Cc: Eric Anholt, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
On Fri, 2009-02-20 at 09:31 +0100, Thomas Hellstrom wrote:
> Peter Zijlstra wrote:
> > On Thu, 2009-02-19 at 22:02 +0100, Thomas Hellstrom wrote:
> >
> >>
> >> It looks to me like the driver preferred locking order is
> >>
> >> object_mutex (which happens to be the device global struct_mutex)
> >> mmap_sem
> >> offset_mutex.
> >>
> >> So if one could avoid using the struct_mutex for object bookkeeping (A
> >> separate lock) then
> >> vm_open() and vm_close() would adhere to that locking order as well,
> >> simply by not taking the struct_mutex at all.
> >>
> >> So only fault() remains, in which that locking order is reversed.
> >> Personally I think the trylock ->reschedule->retry method with proper
> >> commenting is a good solution. It will be the _only_ place where locking
> >> order is reversed and it is done in a deadlock-safe manner. Note that
> >> fault() doesn't really fail, but requests a retry from user-space with
> >> rescheduling to give the process holding the struct_mutex time to
> >> release it.
> >>
> >
> > It doesn't do the reschedule -- need_resched() will check if the current
> > task was marked to be scheduled away,
> Yes. my mistake. set_tsk_need_resched() would be the proper call. If I'm
> correctly informed, that would kick in the scheduler _after_ the
> mmap_sem() is released, just before returning to user-space.
Yes, but it would still livelock in the RT example given in the other
email.
> > furthermore yield based locking
> > sucks chunks.
> >
> Yes, but AFAICT in this situation it is the only way to reverse locking
> order in a deadlock safe manner. If there is a lot of contention it will
> eat cpu. Unfortunately since the struct_mutex is such a wide lock there
> will probably be contention in some situations.
I'd be surprised if this were the only solution. Maybe its the easiest,
but not one I'll support.
> BTW isn't this quite common in distributed resource management, when you
> can't ensure that all requestors will request resources in the same order?
> Try to grab all resources you need for an operation. If you fail to get
> one, release the resources you already have, sleep waiting for the
> failing one to be available and then retry.
Not if you're building deterministic systems. Such constructs are highly
non-deterministic.
Furthermore, this isn't really a distributed system, is it?
* Re: [PATCH] drm: Take mmap_sem up front to avoid lock order violations.
2009-02-19 12:57 ` Nick Piggin
@ 2009-02-21 2:33 ` Eric Anholt
0 siblings, 0 replies; 20+ messages in thread
From: Eric Anholt @ 2009-02-21 2:33 UTC (permalink / raw)
To: Nick Piggin
Cc: Peter Zijlstra, krh, Wang Chen, dri-devel, linux-kernel,
Kristian Høgsberg, Andrew Morton, Hugh Dickins
On Thu, 2009-02-19 at 13:57 +0100, Nick Piggin wrote:
> On Thu, Feb 19, 2009 at 10:19:05AM +0100, Peter Zijlstra wrote:
> > On Wed, 2009-02-18 at 11:38 -0500, krh@bitplanet.net wrote:
> > > From: Kristian Høgsberg <krh@redhat.com>
> > >
> > > A number of GEM operations (and legacy drm ones) want to copy data to
> > > or from userspace while holding the struct_mutex lock. However, the
> > > fault handler calls us with the mmap_sem held and thus enforces the
> > > opposite locking order. This patch downs the mmap_sem up front for
> > > those operations that access userspace data under the struct_mutex
> > > lock to ensure the locking order is consistent.
> > >
> > > Signed-off-by: Kristian Høgsberg <krh@redhat.com>
> > > ---
> > >
> > > Here's a different and simpler attempt to fix the locking order
> > > problem. We can just down_read() the mmap_sem pre-emptively up-front,
> > > and the locking order is respected. It's simpler than the
> > > mutex_trylock() game, avoids introducing a new mutex.
>
> The "simple" way to fix this is to just allocate a temporary buffer
> to copy a snapshot of the data going to/from userspace. Then do the
> real usercopy to/from that buffer outside the locks.
>
> You don't have any performance critical bulk copies (ie. that will
> blow the L1 cache), do you?
16kb is the most common size (batchbuffers). 32k is popular on 915
(vertex), and varying between 0-128k on 965 (vertex). The pwrite path
generally represents 10-30% of CPU consumption in CPU-bound apps.
--
Eric Anholt
eric@anholt.net eric.anholt@intel.com
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-20 7:36 ` Peter Zijlstra
@ 2009-02-25 8:15 ` Eric Anholt
2009-02-25 8:54 ` Thomas Hellström
2009-02-25 9:07 ` Peter Zijlstra
0 siblings, 2 replies; 20+ messages in thread
From: Eric Anholt @ 2009-02-25 8:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Thomas Hellstrom, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
On Fri, 2009-02-20 at 08:36 +0100, Peter Zijlstra wrote:
> On Thu, 2009-02-19 at 18:04 -0800, Eric Anholt wrote:
> > On Thu, 2009-02-19 at 23:26 +0100, Peter Zijlstra wrote:
> > > On Thu, 2009-02-19 at 22:02 +0100, Thomas Hellstrom wrote:
> > > >
> > > > It looks to me like the driver preferred locking order is
> > > >
> > > > object_mutex (which happens to be the device global struct_mutex)
> > > > mmap_sem
> > > > offset_mutex.
> > > >
> > > > So if one could avoid using the struct_mutex for object bookkeeping (A
> > > > separate lock) then
> > > > vm_open() and vm_close() would adhere to that locking order as well,
> > > > simply by not taking the struct_mutex at all.
> > > >
> > > > So only fault() remains, in which that locking order is reversed.
> > > > Personally I think the trylock ->reschedule->retry method with proper
> > > > commenting is a good solution. It will be the _only_ place where locking
> > > > order is reversed and it is done in a deadlock-safe manner. Note that
> > > > fault() doesn't really fail, but requests a retry from user-space with
> > > > rescheduling to give the process holding the struct_mutex time to
> > > > release it.
> > >
> > > It doesn't do the reschedule -- need_resched() will check if the current
> > > task was marked to be scheduled away, furthermore yield based locking
> > > sucks chunks.
>
> Imagine what would happen if your faulting task was the highest RT prio
> task in the system, you'd end up with a life-lock.
>
> > > What's so very difficult about pulling the copy_*_user() out from under
> > > the locks?
> >
> > That we're expecting the data movement to occur while holding device
> > state in place. For example, we write data through the GTT most of the
> > time so we:
> >
> > lock struct_mutex
> > pin the object to the GTT
> > flushing caches as needed
> > copy_from_user
> > unpin object
> > unlock struct_mutex
>
> So you cannot drop the lock once you've pinned the dst object?
>
> > If I'm to pull the copy_from_user out, that means I have to:
> >
> > alloc temporary storage
> > for each block of temp storage size:
> > copy_from_user
> > lock struct_mutex
> > pin the object to the GTT
> > flush caches as needed
> > memcpy
> > unpin object
> > unlock struct_mutex
> >
> > At this point of introducing our third copy of the user's data in our
> > hottest path, we should probably ditch the pwrite path entirely and go
> > to user mapping of the objects for performance. Requiring user mapping
> > (which has significant overhead) cuts the likelihood of moving from
> > user-space object caching to kernel object caching in the future, which
> > has the potential of saving steaming piles of memory.
>
> Or you could get_user_pages() to fault the user pages and pin them, and
> then do pagefault_disable() and use copy_from_user_inatomic or such, and
> release the pages again.
I started poking at this today, since the get_user_pages sounded like
the solution. Only then I noticed: when we unbind an existing object,
we have to unmap_mapping_range to clear the clients' mappings to it in
the GTT, which needs to happen while the struct lock (protecting the gtt
structure and the gtt to object mappings) is held. So for fault we have
mmap_sem held to struct mutex taken for poking at the gtt structure, and
for unbind we have struct mutex held to mmap_sem taken to clear
mappings.
--
Eric Anholt
eric@anholt.net eric.anholt@intel.com
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-25 8:15 ` Eric Anholt
@ 2009-02-25 8:54 ` Thomas Hellström
2009-02-25 9:07 ` Peter Zijlstra
1 sibling, 0 replies; 20+ messages in thread
From: Thomas Hellström @ 2009-02-25 8:54 UTC (permalink / raw)
To: Eric Anholt
Cc: Peter Zijlstra, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
Eric Anholt wrote:
> On Fri, 2009-02-20 at 08:36 +0100, Peter Zijlstra wrote:
>
>> On Thu, 2009-02-19 at 18:04 -0800, Eric Anholt wrote:
>>
>>> On Thu, 2009-02-19 at 23:26 +0100, Peter Zijlstra wrote:
>>>
>>>> On Thu, 2009-02-19 at 22:02 +0100, Thomas Hellstrom wrote:
>>>>
>>>>>
>>>>> It looks to me like the driver preferred locking order is
>>>>>
>>>>> object_mutex (which happens to be the device global struct_mutex)
>>>>> mmap_sem
>>>>> offset_mutex.
>>>>>
>>>>> So if one could avoid using the struct_mutex for object bookkeeping (A
>>>>> separate lock) then
>>>>> vm_open() and vm_close() would adhere to that locking order as well,
>>>>> simply by not taking the struct_mutex at all.
>>>>>
>>>>> So only fault() remains, in which that locking order is reversed.
>>>>> Personally I think the trylock ->reschedule->retry method with proper
>>>>> commenting is a good solution. It will be the _only_ place where locking
>>>>> order is reversed and it is done in a deadlock-safe manner. Note that
>>>>> fault() doesn't really fail, but requests a retry from user-space with
>>>>> rescheduling to give the process holding the struct_mutex time to
>>>>> release it.
>>>>>
>>>> It doesn't do the reschedule -- need_resched() will check if the current
>>>> task was marked to be scheduled away, furthermore yield based locking
>>>> sucks chunks.
>>>>
>> Imagine what would happen if your faulting task was the highest RT prio
>> task in the system, you'd end up with a life-lock.
>>
>>
>>>> What's so very difficult about pulling the copy_*_user() out from under
>>>> the locks?
>>>>
>>> That we're expecting the data movement to occur while holding device
>>> state in place. For example, we write data through the GTT most of the
>>> time so we:
>>>
>>> lock struct_mutex
>>> pin the object to the GTT
>>> flushing caches as needed
>>> copy_from_user
>>> unpin object
>>> unlock struct_mutex
>>>
>> So you cannot drop the lock once you've pinned the dst object?
>>
>>
>>> If I'm to pull the copy_from_user out, that means I have to:
>>>
>>> alloc temporary storage
>>> for each block of temp storage size:
>>> copy_from_user
>>> lock struct_mutex
>>> pin the object to the GTT
>>> flush caches as needed
>>> memcpy
>>> unpin object
>>> unlock struct_mutex
>>>
>>> At this point of introducing our third copy of the user's data in our
>>> hottest path, we should probably ditch the pwrite path entirely and go
>>> to user mapping of the objects for performance. Requiring user mapping
>>> (which has significant overhead) cuts the likelihood of moving from
>>> user-space object caching to kernel object caching in the future, which
>>> has the potential of saving steaming piles of memory.
>>>
>> Or you could get_user_pages() to fault the user pages and pin them, and
>> then do pagefault_disable() and use copy_from_user_inatomic or such, and
>> release the pages again.
>>
>
> I started poking at this today, since the get_user_pages sounded like
> the solution. Only then I noticed: when we unbind an existing object,
> we have to unmap_mapping_range to clear the clients' mappings to it in
> the GTT, which needs to happen while the struct lock (protecting the gtt
> structure and the gtt to object mappings) is held. So for fault we have
> mmap_sem held to struct mutex taken for poking at the gtt structure, and
> for unbind we have struct mutex held to mmap_sem taken to clear
> mappings.
>
>
I don't think the mmap_sem is taken during unmap_mapping_range()?
/Thomas
* Re: [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex.
2009-02-25 8:15 ` Eric Anholt
2009-02-25 8:54 ` Thomas Hellström
@ 2009-02-25 9:07 ` Peter Zijlstra
1 sibling, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2009-02-25 9:07 UTC (permalink / raw)
To: Eric Anholt
Cc: Thomas Hellstrom, Wang Chen, Nick Piggin, Ingo Molnar, dri-devel,
linux-kernel
On Wed, 2009-02-25 at 00:15 -0800, Eric Anholt wrote:
> > Or you could get_user_pages() to fault the user pages and pin them, and
> > then do pagefault_disable() and use copy_from_user_inatomic or such, and
> > release the pages again.
>
> I started poking at this today, since the get_user_pages sounded like
> the solution. Only then I noticed: when we unbind an existing object,
> we have to unmap_mapping_range to clear the clients' mappings to it in
> the GTT, which needs to happen while the struct lock (protecting the gtt
> structure and the gtt to object mappings) is held. So for fault we have
> mmap_sem held to struct mutex taken for poking at the gtt structure, and
> for unbind we have struct mutex held to mmap_sem taken to clear
> mappings.
So it again comes down to the fact that you cannot pin a gtt object
without also holding this struct_mutex?
Normally such things are done by elevating a refcount so that both
regular frees and reclaim get delayed until you're done with the
object.
end of thread, other threads: [~2009-02-25 9:07 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-02-18 0:59 [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex Eric Anholt
2009-02-18 8:02 ` Wang Chen
2009-02-18 16:38 ` [PATCH] drm: Take mmap_sem up front to avoid lock order violations krh
2009-02-19 9:19 ` Peter Zijlstra
2009-02-19 10:33 ` Peter Zijlstra
2009-02-19 14:49 ` Kristian Høgsberg
2009-02-19 15:17 ` Nick Piggin
2009-02-19 15:21 ` Kristian Høgsberg
2009-02-19 12:57 ` Nick Piggin
2009-02-21 2:33 ` Eric Anholt
2009-02-18 15:08 ` [PATCH] drm: Fix lock order reversal between mmap_sem and struct_mutex Peter Zijlstra
2009-02-19 21:02 ` Thomas Hellstrom
2009-02-19 22:26 ` Peter Zijlstra
2009-02-20 2:04 ` Eric Anholt
2009-02-20 7:36 ` Peter Zijlstra
2009-02-25 8:15 ` Eric Anholt
2009-02-25 8:54 ` Thomas Hellström
2009-02-25 9:07 ` Peter Zijlstra
2009-02-20 8:31 ` Thomas Hellstrom
2009-02-20 8:47 ` Peter Zijlstra