linux-fbdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] OMAPDSS: DISPC: Fix IRQ unregister race
@ 2012-09-02 19:12 Dimitar Dimitrov
  2012-09-06 13:36 ` Tomi Valkeinen
  0 siblings, 1 reply; 7+ messages in thread
From: Dimitar Dimitrov @ 2012-09-02 19:12 UTC (permalink / raw)
  To: tomi.valkeinen; +Cc: linux-omap, linux-fbdev, Dimitar Dimitrov

Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
panics due to corrupted completion structure while executing
dispc_irq_wait_handler(). Excerpt from kernel log:

  Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
  Unable to handle kernel paging request at virtual address 00400130
  ...
  PC is at 0xebf205bc
  LR is at __wake_up_common+0x54/0x94
  ...
  (__wake_up_common+0x0/0x94)
  (complete+0x0/0x60)
  (dispc_irq_wait_handler.36902+0x0/0x14)
  (omap_dispc_irq_handler+0x0/0x354)
  (handle_irq_event_percpu+0x0/0x188)
  (handle_irq_event+0x0/0x64)
  (handle_fasteoi_irq+0x0/0x10c)
  (generic_handle_irq+0x0/0x48)
  (asm_do_IRQ+0x0/0xc0)

DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
unregister_isr() and DISPC IRQ might be running in parallel on different
CPUs. So there is a chance that a callback is executed even though it
has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
completion on stack, the dispc_irq_wait_handler() callback might try to
access a completion structure that is invalid. This leads to crashes and
hangs.

Solution is to divide unregister calls into two sets:
  1. Non-strict unregistering of callbacks. Callbacks could safely be
     executed after unregistering them. This is the case with unregister
     calls from the IRQ handler itself.
  2. Strict (synchronized) unregistering. Callbacks are not allowed
     after unregistering. This is the case with completion waiting.

The above solution should satisfy one of the original intentions of the
driver: callbacks should be able to unregister themselves.

Signed-off-by: Dimitar Dimitrov <dinuxbg@gmail.com>
---

WARNING: This bug is quite old. The patch has been tested on v3.0. No testing
has been done after rebasing to v3.6. Hence the RFC tag. Hopefully someone
will beat me in testing with latest linux-omap/master.

 drivers/media/video/omap/omap_vout.c |    4 +--
 drivers/staging/omapdrm/omap_plane.c |    2 +-
 drivers/video/omap2/dss/apply.c      |    2 +-
 drivers/video/omap2/dss/dispc.c      |   47 +++++++++++++++++++++++++++++-----
 drivers/video/omap2/dss/dsi.c        |    4 +--
 drivers/video/omap2/dss/rfbi.c       |    2 +-
 include/video/omapdss.h              |    3 ++-
 7 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/drivers/media/video/omap/omap_vout.c b/drivers/media/video/omap/omap_vout.c
index 88cf9d9..d06896e 100644
--- a/drivers/media/video/omap/omap_vout.c
+++ b/drivers/media/video/omap/omap_vout.c
@@ -976,7 +976,7 @@ static int omap_vout_release(struct file *file)
 
 		mask = DISPC_IRQ_VSYNC | DISPC_IRQ_EVSYNC_EVEN |
 			DISPC_IRQ_EVSYNC_ODD | DISPC_IRQ_VSYNC2;
-		omap_dispc_unregister_isr(omap_vout_isr, vout, mask);
+		omap_dispc_unregister_isr_sync(omap_vout_isr, vout, mask);
 		vout->streaming = 0;
 
 		videobuf_streamoff(q);
@@ -1722,7 +1722,7 @@ static int vidioc_streamoff(struct file *file, void *fh, enum v4l2_buf_type i)
 	mask = DISPC_IRQ_VSYNC | DISPC_IRQ_EVSYNC_EVEN | DISPC_IRQ_EVSYNC_ODD
 		| DISPC_IRQ_VSYNC2;
 
-	omap_dispc_unregister_isr(omap_vout_isr, vout, mask);
+	omap_dispc_unregister_isr_sync(omap_vout_isr, vout, mask);
 
 	for (j = 0; j < ovid->num_overlays; j++) {
 		struct omap_overlay *ovl = ovid->overlays[j];
diff --git a/drivers/staging/omapdrm/omap_plane.c b/drivers/staging/omapdrm/omap_plane.c
index 7997be7..8d8aa5b 100644
--- a/drivers/staging/omapdrm/omap_plane.c
+++ b/drivers/staging/omapdrm/omap_plane.c
@@ -82,7 +82,7 @@ static void dispc_isr(void *arg, uint32_t mask)
 	struct omap_plane *omap_plane = to_omap_plane(plane);
 	struct omap_drm_private *priv = plane->dev->dev_private;
 
-	omap_dispc_unregister_isr(dispc_isr, plane,
+	omap_dispc_unregister_isr_nosync(dispc_isr, plane,
 			id2irq[omap_plane->ovl->id]);
 
 	queue_work(priv->wq, &omap_plane->work);
diff --git a/drivers/video/omap2/dss/apply.c b/drivers/video/omap2/dss/apply.c
index 0fefc68..9386834 100644
--- a/drivers/video/omap2/dss/apply.c
+++ b/drivers/video/omap2/dss/apply.c
@@ -847,7 +847,7 @@ static void dss_unregister_vsync_isr(void)
 	for (i = 0; i < num_mgrs; ++i)
 		mask |= dispc_mgr_get_framedone_irq(i);
 
-	r = omap_dispc_unregister_isr(dss_apply_irq_handler, NULL, mask);
+	r = omap_dispc_unregister_isr_nosync(dss_apply_irq_handler, NULL, mask);
 	WARN_ON(r);
 
 	dss_data.irq_enabled = false;
diff --git a/drivers/video/omap2/dss/dispc.c b/drivers/video/omap2/dss/dispc.c
index 5b289c5..23d5881 100644
--- a/drivers/video/omap2/dss/dispc.c
+++ b/drivers/video/omap2/dss/dispc.c
@@ -2351,7 +2351,7 @@ static void dispc_mgr_enable_lcd_out(enum omap_channel channel, bool enable)
 					msecs_to_jiffies(100)))
 			DSSERR("timeout waiting for FRAME DONE\n");
 
-		r = omap_dispc_unregister_isr(dispc_disable_isr,
+		r = omap_dispc_unregister_isr_sync(dispc_disable_isr,
 				&frame_done_completion, irq);
 
 		if (r)
@@ -2420,8 +2420,8 @@ static void dispc_mgr_enable_digit_out(bool enable)
 					enable ? "start" : "stop");
 	}
 
-	r = omap_dispc_unregister_isr(dispc_disable_isr, &frame_done_completion,
-			irq_mask);
+	r = omap_dispc_unregister_isr_sync(dispc_disable_isr,
+			&frame_done_completion, irq_mask);
 	if (r)
 		DSSERR("failed to unregister %x isr\n", irq_mask);
 
@@ -3319,7 +3319,8 @@ err:
 }
 EXPORT_SYMBOL(omap_dispc_register_isr);
 
-int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
+/* WARNING: callback might be executed even after this function returns! */
+int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 mask)
 {
 	int i;
 	unsigned long flags;
@@ -3351,7 +3352,37 @@ int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
 
 	return ret;
 }
-EXPORT_SYMBOL(omap_dispc_unregister_isr);
+EXPORT_SYMBOL(omap_dispc_unregister_isr_nosync);
+
+/*
+ * Ensure that callback <isr> will NOT be executed after this function
+ * returns. Must be called from sleepable context, though!
+ */
+int omap_dispc_unregister_isr_sync(omap_dispc_isr_t isr, void *arg, u32 mask)
+{
+	int ret;
+
+	ret = omap_dispc_unregister_isr_nosync(isr, arg, mask);
+
+	/* Task context is not really needed. But if we're called from atomic
+	 * context, it is probably from DISPC IRQ, where we will deadlock.
+	 * So use might_sleep() to catch potential deadlocks.
+	 */
+	might_sleep();
+
+#if defined(CONFIG_SMP)
+	/* DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
+	 * unregister_isr() and DISPC IRQ might be running in parallel on
+	 * different CPUs. So there is a chance that a callback is executed
+	 * even though it has been unregistered. Add a barrier, in order to
+	 * ensure that after returning from this function, the new DISPC IRQ
+	 * will use an updated callback array, and NOT its cached copy.
+	 */
+	synchronize_irq(dispc.irq);
+#endif
+
+	return ret;
+}
 
 #ifdef DEBUG
 static void print_irq_status(u32 status)
@@ -3566,7 +3597,8 @@ int omap_dispc_wait_for_irq_timeout(u32 irqmask, unsigned long timeout)
 
 	timeout = wait_for_completion_timeout(&completion, timeout);
 
-	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
+	omap_dispc_unregister_isr_sync(dispc_irq_wait_handler, &completion,
+			irqmask);
 
 	if (timeout = 0)
 		return -ETIMEDOUT;
@@ -3597,7 +3629,8 @@ int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
 	timeout = wait_for_completion_interruptible_timeout(&completion,
 			timeout);
 
-	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
+	omap_dispc_unregister_isr_sync(dispc_irq_wait_handler, &completion,
+			irqmask);
 
 	if (timeout = 0)
 		return -ETIMEDOUT;
diff --git a/drivers/video/omap2/dss/dsi.c b/drivers/video/omap2/dss/dsi.c
index b07e886..46f6c6c 100644
--- a/drivers/video/omap2/dss/dsi.c
+++ b/drivers/video/omap2/dss/dsi.c
@@ -4430,7 +4430,7 @@ static int dsi_display_init_dispc(struct omap_dss_device *dssdev)
 	return 0;
 err1:
 	if (dssdev->panel.dsi_mode = OMAP_DSS_DSI_CMD_MODE)
-		omap_dispc_unregister_isr(dsi_framedone_irq_callback,
+		omap_dispc_unregister_isr_sync(dsi_framedone_irq_callback,
 			(void *) dssdev, irq);
 err:
 	return r;
@@ -4443,7 +4443,7 @@ static void dsi_display_uninit_dispc(struct omap_dss_device *dssdev)
 
 		irq = dispc_mgr_get_framedone_irq(dssdev->manager->id);
 
-		omap_dispc_unregister_isr(dsi_framedone_irq_callback,
+		omap_dispc_unregister_isr_sync(dsi_framedone_irq_callback,
 			(void *) dssdev, irq);
 	}
 }
diff --git a/drivers/video/omap2/dss/rfbi.c b/drivers/video/omap2/dss/rfbi.c
index 7c08742..31d167c 100644
--- a/drivers/video/omap2/dss/rfbi.c
+++ b/drivers/video/omap2/dss/rfbi.c
@@ -930,7 +930,7 @@ EXPORT_SYMBOL(omapdss_rfbi_display_enable);
 
 void omapdss_rfbi_display_disable(struct omap_dss_device *dssdev)
 {
-	omap_dispc_unregister_isr(framedone_callback, NULL,
+	omap_dispc_unregister_isr_sync(framedone_callback, NULL,
 			DISPC_IRQ_FRAMEDONE);
 	omap_dss_stop_device(dssdev);
 
diff --git a/include/video/omapdss.h b/include/video/omapdss.h
index a6267a2..2287368 100644
--- a/include/video/omapdss.h
+++ b/include/video/omapdss.h
@@ -707,7 +707,8 @@ void omapdss_default_get_timings(struct omap_dss_device *dssdev,
 
 typedef void (*omap_dispc_isr_t) (void *arg, u32 mask);
 int omap_dispc_register_isr(omap_dispc_isr_t isr, void *arg, u32 mask);
-int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask);
+int omap_dispc_unregister_isr_sync(omap_dispc_isr_t isr, void *arg, u32 mask);
+int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 mask);
 
 int omap_dispc_wait_for_irq_timeout(u32 irqmask, unsigned long timeout);
 int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] OMAPDSS: DISPC: Fix IRQ unregister race
  2012-09-02 19:12 [RFC PATCH] OMAPDSS: DISPC: Fix IRQ unregister race Dimitar Dimitrov
@ 2012-09-06 13:36 ` Tomi Valkeinen
  2012-09-07 14:42   ` Dimitar Dimitrov
  0 siblings, 1 reply; 7+ messages in thread
From: Tomi Valkeinen @ 2012-09-06 13:36 UTC (permalink / raw)
  To: Dimitar Dimitrov; +Cc: linux-omap, linux-fbdev

[-- Attachment #1: Type: text/plain, Size: 2447 bytes --]

Hi,

On Sun, 2012-09-02 at 22:12 +0300, Dimitar Dimitrov wrote:
> Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
> panics due to corrupted completion structure while executing
> dispc_irq_wait_handler(). Excerpt from kernel log:
> 
>   Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
>   Unable to handle kernel paging request at virtual address 00400130
>   ...
>   PC is at 0xebf205bc
>   LR is at __wake_up_common+0x54/0x94
>   ...
>   (__wake_up_common+0x0/0x94)
>   (complete+0x0/0x60)
>   (dispc_irq_wait_handler.36902+0x0/0x14)
>   (omap_dispc_irq_handler+0x0/0x354)
>   (handle_irq_event_percpu+0x0/0x188)
>   (handle_irq_event+0x0/0x64)
>   (handle_fasteoi_irq+0x0/0x10c)
>   (generic_handle_irq+0x0/0x48)
>   (asm_do_IRQ+0x0/0xc0)
> 
> DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> unregister_isr() and DISPC IRQ might be running in parallel on different
> CPUs. So there is a chance that a callback is executed even though it
> has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
> completion on stack, the dispc_irq_wait_handler() callback might try to
> access a completion structure that is invalid. This leads to crashes and
> hangs.
> 
> Solution is to divide unregister calls into two sets:
>   1. Non-strict unregistering of callbacks. Callbacks could safely be
>      executed after unregistering them. This is the case with unregister
>      calls from the IRQ handler itself.
>   2. Strict (synchronized) unregistering. Callbacks are not allowed
>      after unregistering. This is the case with completion waiting.
> 
> The above solution should satisfy one of the original intentions of the
> driver: callbacks should be able to unregister themselves.

I think it'd be better to create a new function for the nosync version,
and keep the old name for the sync version. The reason for this is to
minimize the amount of changes, as I think this one needs to be applied
to stable kernel trees also.

Also, I think we need similar one for dsi.c, as it has the same kind of
irq handling. But with a quick glance only sync version is needed there.

However, I'm not quite sure about this approach. The fix makes sense,
but it makes me think if the irq handling is designed the wrong way.

While debugging and fixing this, did you think some other irq handling
approach would be saner?

 Tomi


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] OMAPDSS: DISPC: Fix IRQ unregister race
  2012-09-06 13:36 ` Tomi Valkeinen
@ 2012-09-07 14:42   ` Dimitar Dimitrov
  2012-09-08 15:05     ` [RFC PATCH v2] OMAPDSS: " Dimitar Dimitrov
  0 siblings, 1 reply; 7+ messages in thread
From: Dimitar Dimitrov @ 2012-09-07 14:42 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: linux-omap, linux-fbdev

Hi,

On Thursday 06 September 2012 16:36:54 Tomi Valkeinen wrote:
> Hi,
> 
> On Sun, 2012-09-02 at 22:12 +0300, Dimitar Dimitrov wrote:
> > Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
> > panics due to corrupted completion structure while executing
> > 
> > dispc_irq_wait_handler(). Excerpt from kernel log:
> >   Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
> >   Unable to handle kernel paging request at virtual address 00400130
> >   ...
> >   PC is at 0xebf205bc
> >   LR is at __wake_up_common+0x54/0x94
> >   ...
> >   (__wake_up_common+0x0/0x94)
> >   (complete+0x0/0x60)
> >   (dispc_irq_wait_handler.36902+0x0/0x14)
> >   (omap_dispc_irq_handler+0x0/0x354)
> >   (handle_irq_event_percpu+0x0/0x188)
> >   (handle_irq_event+0x0/0x64)
> >   (handle_fasteoi_irq+0x0/0x10c)
> >   (generic_handle_irq+0x0/0x48)
> >   (asm_do_IRQ+0x0/0xc0)
> > 
> > DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> > unregister_isr() and DISPC IRQ might be running in parallel on different
> > CPUs. So there is a chance that a callback is executed even though it
> > has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
> > completion on stack, the dispc_irq_wait_handler() callback might try to
> > access a completion structure that is invalid. This leads to crashes and
> > hangs.
> > 
> > Solution is to divide unregister calls into two sets:
> >   1. Non-strict unregistering of callbacks. Callbacks could safely be
> >   
> >      executed after unregistering them. This is the case with unregister
> >      calls from the IRQ handler itself.
> >   
> >   2. Strict (synchronized) unregistering. Callbacks are not allowed
> >   
> >      after unregistering. This is the case with completion waiting.
> > 
> > The above solution should satisfy one of the original intentions of the
> > driver: callbacks should be able to unregister themselves.
> 
> I think it'd be better to create a new function for the nosync version,
> and keep the old name for the sync version. The reason for this is to
> minimize the amount of changes, as I think this one needs to be applied
> to stable kernel trees also.
My intention was to force all callers to pick sides. In case of rebase issues 
we get link errors instead of rare and subtle run-time races. Still, if you 
think we should leave the old name untouched then I'll change my 
patch.

> 
> Also, I think we need similar one for dsi.c, as it has the same kind of
> irq handling. But with a quick glance only sync version is needed there.
Thanks, I missed that. I'll try to fix and send again.

> 
> However, I'm not quite sure about this approach. The fix makes sense,
> but it makes me think if the irq handling is designed the wrong way.
> 
> While debugging and fixing this, did you think some other irq handling
> approach would be saner?
I tried but could not come up with better approach. The main difficulty is 
that there are two contradicting requirements:
 1. Some callbacks unregister other callbacks, including themselves, from 
DISPC IRQ context.
 2. Some functions expect that once a callback is unregistered it is never 
executed.

Hence it is natural to split callers into two sets. I'm open for suggestions.
> 
>  Tomi

Thanks,
Dimitar

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC PATCH v2] OMAPDSS: Fix IRQ unregister race
  2012-09-07 14:42   ` Dimitar Dimitrov
@ 2012-09-08 15:05     ` Dimitar Dimitrov
  2012-09-10  7:49       ` Tomi Valkeinen
  0 siblings, 1 reply; 7+ messages in thread
From: Dimitar Dimitrov @ 2012-09-08 15:05 UTC (permalink / raw)
  To: tomi.valkeinen; +Cc: linux-omap, linux-fbdev, Dimitar Dimitrov

Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
panics due to corrupted completion structure while executing
dispc_irq_wait_handler(). Excerpt from kernel log:

  Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
  Unable to handle kernel paging request at virtual address 00400130
  ...
  PC is at 0xebf205bc
  LR is at __wake_up_common+0x54/0x94
  ...
  (__wake_up_common+0x0/0x94)
  (complete+0x0/0x60)
  (dispc_irq_wait_handler.36902+0x0/0x14)
  (omap_dispc_irq_handler+0x0/0x354)
  (handle_irq_event_percpu+0x0/0x188)
  (handle_irq_event+0x0/0x64)
  (handle_fasteoi_irq+0x0/0x10c)
  (generic_handle_irq+0x0/0x48)
  (asm_do_IRQ+0x0/0xc0)

DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
unregister_isr() and DISPC IRQ might be running in parallel on different
CPUs. So there is a chance that a callback is executed even though it
has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
completion on stack, the dispc_irq_wait_handler() callback might try to
access a completion structure that is invalid. This leads to crashes and
hangs.

Solution is to divide unregister calls into two sets:
  1. Non-strict unregistering of callbacks. Callbacks could safely be
     executed after unregistering them. This is the case with unregister
     calls from the IRQ handler itself.
  2. Strict (synchronized) unregistering. Callbacks are not allowed
     after unregistering. This is the case with completion waiting.

The above solution should satisfy one of the original intentions of the
driver: callbacks should be able to unregister themselves.

Also fix DSI IRQ unregister which has similar logic to DISPC IRQ handling.

Signed-off-by: Dimitar Dimitrov <dinuxbg@gmail.com>
---

WARNING: This bug is quite old. The patch has been tested on v3.0. No testing
has been done after rebasing to v3.6. Hence the RFC tag. Hopefully someone
will beat me in testing with latest linux-omap/master.

Changes since v1 per Tomi Valkeinen's comments:
  - Don't rename omap_dispc_unregister_isr, just introduce nosync variant.
  - Apply the same fix for DSI IRQ which suffers from the same race condition.

 drivers/staging/omapdrm/omap_plane.c |    2 +-
 drivers/video/omap2/dss/apply.c      |    2 +-
 drivers/video/omap2/dss/dispc.c      |   45 +++++++++++++++++++++++++++++-----
 drivers/video/omap2/dss/dsi.c        |   19 ++++++++++++++
 include/video/omapdss.h              |    1 +
 5 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/drivers/staging/omapdrm/omap_plane.c b/drivers/staging/omapdrm/omap_plane.c
index 7997be7..8d8aa5b 100644
--- a/drivers/staging/omapdrm/omap_plane.c
+++ b/drivers/staging/omapdrm/omap_plane.c
@@ -82,7 +82,7 @@ static void dispc_isr(void *arg, uint32_t mask)
 	struct omap_plane *omap_plane = to_omap_plane(plane);
 	struct omap_drm_private *priv = plane->dev->dev_private;
 
-	omap_dispc_unregister_isr(dispc_isr, plane,
+	omap_dispc_unregister_isr_nosync(dispc_isr, plane,
 			id2irq[omap_plane->ovl->id]);
 
 	queue_work(priv->wq, &omap_plane->work);
diff --git a/drivers/video/omap2/dss/apply.c b/drivers/video/omap2/dss/apply.c
index 0fefc68..9386834 100644
--- a/drivers/video/omap2/dss/apply.c
+++ b/drivers/video/omap2/dss/apply.c
@@ -847,7 +847,7 @@ static void dss_unregister_vsync_isr(void)
 	for (i = 0; i < num_mgrs; ++i)
 		mask |= dispc_mgr_get_framedone_irq(i);
 
-	r = omap_dispc_unregister_isr(dss_apply_irq_handler, NULL, mask);
+	r = omap_dispc_unregister_isr_nosync(dss_apply_irq_handler, NULL, mask);
 	WARN_ON(r);
 
 	dss_data.irq_enabled = false;
diff --git a/drivers/video/omap2/dss/dispc.c b/drivers/video/omap2/dss/dispc.c
index ee9e296..a67d92c 100644
--- a/drivers/video/omap2/dss/dispc.c
+++ b/drivers/video/omap2/dss/dispc.c
@@ -2421,8 +2421,8 @@ static void dispc_mgr_enable_digit_out(bool enable)
 					enable ? "start" : "stop");
 	}
 
-	r = omap_dispc_unregister_isr(dispc_disable_isr, &frame_done_completion,
-			irq_mask);
+	r = omap_dispc_unregister_isr(dispc_disable_isr,
+			&frame_done_completion, irq_mask);
 	if (r)
 		DSSERR("failed to unregister %x isr\n", irq_mask);
 
@@ -3320,7 +3320,8 @@ err:
 }
 EXPORT_SYMBOL(omap_dispc_register_isr);
 
-int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
+/* WARNING: callback might be executed even after this function returns! */
+int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 mask)
 {
 	int i;
 	unsigned long flags;
@@ -3352,7 +3353,37 @@ int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
 
 	return ret;
 }
-EXPORT_SYMBOL(omap_dispc_unregister_isr);
+EXPORT_SYMBOL(omap_dispc_unregister_isr_nosync);
+
+/*
+ * Ensure that callback <isr> will NOT be executed after this function
+ * returns. Must be called from sleepable context, though!
+ */
+int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
+{
+	int ret;
+
+	ret = omap_dispc_unregister_isr_nosync(isr, arg, mask);
+
+	/* Task context is not really needed. But if we're called from atomic
+	 * context, it is probably from DISPC IRQ, where we will deadlock.
+	 * So use might_sleep() to catch potential deadlocks.
+	 */
+	might_sleep();
+
+#if defined(CONFIG_SMP)
+	/* DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
+	 * unregister_isr() and DISPC IRQ might be running in parallel on
+	 * different CPUs. So there is a chance that a callback is executed
+	 * even though it has been unregistered. Add a barrier, in order to
+	 * ensure that after returning from this function, the new DISPC IRQ
+	 * will use an updated callback array, and NOT its cached copy.
+	 */
+	synchronize_irq(dispc.irq);
+#endif
+
+	return ret;
+}
 
 #ifdef DEBUG
 static void print_irq_status(u32 status)
@@ -3567,7 +3598,8 @@ int omap_dispc_wait_for_irq_timeout(u32 irqmask, unsigned long timeout)
 
 	timeout = wait_for_completion_timeout(&completion, timeout);
 
-	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
+	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
+			irqmask);
 
 	if (timeout = 0)
 		return -ETIMEDOUT;
@@ -3598,7 +3630,8 @@ int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
 	timeout = wait_for_completion_interruptible_timeout(&completion,
 			timeout);
 
-	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
+	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
+			irqmask);
 
 	if (timeout = 0)
 		return -ETIMEDOUT;
diff --git a/drivers/video/omap2/dss/dsi.c b/drivers/video/omap2/dss/dsi.c
index b07e886..24b4a3e 100644
--- a/drivers/video/omap2/dss/dsi.c
+++ b/drivers/video/omap2/dss/dsi.c
@@ -960,6 +960,13 @@ static int dsi_unregister_isr(struct platform_device *dsidev,
 
 	spin_unlock_irqrestore(&dsi->irq_lock, flags);
 
+	might_sleep();
+
+#if defined(CONFIG_SMP)
+	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
+	synchronize_irq(dsi->irq);
+#endif
+
 	return r;
 }
 
@@ -1002,6 +1009,12 @@ static int dsi_unregister_isr_vc(struct platform_device *dsidev, int channel,
 
 	spin_unlock_irqrestore(&dsi->irq_lock, flags);
 
+	might_sleep();
+
+#if defined(CONFIG_SMP)
+	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
+	synchronize_irq(dsi->irq);
+#endif
 	return r;
 }
 
@@ -1042,6 +1055,12 @@ static int dsi_unregister_isr_cio(struct platform_device *dsidev,
 
 	spin_unlock_irqrestore(&dsi->irq_lock, flags);
 
+	might_sleep();
+
+#if defined(CONFIG_SMP)
+	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
+	synchronize_irq(dsi->irq);
+#endif
 	return r;
 }
 
diff --git a/include/video/omapdss.h b/include/video/omapdss.h
index a6267a2..769f981 100644
--- a/include/video/omapdss.h
+++ b/include/video/omapdss.h
@@ -708,6 +708,7 @@ void omapdss_default_get_timings(struct omap_dss_device *dssdev,
 typedef void (*omap_dispc_isr_t) (void *arg, u32 mask);
 int omap_dispc_register_isr(omap_dispc_isr_t isr, void *arg, u32 mask);
 int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask);
+int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 mask);
 
 int omap_dispc_wait_for_irq_timeout(u32 irqmask, unsigned long timeout);
 int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH v2] OMAPDSS: Fix IRQ unregister race
  2012-09-08 15:05     ` [RFC PATCH v2] OMAPDSS: " Dimitar Dimitrov
@ 2012-09-10  7:49       ` Tomi Valkeinen
  2012-09-10 19:19         ` Dimitar Dimitrov
  0 siblings, 1 reply; 7+ messages in thread
From: Tomi Valkeinen @ 2012-09-10  7:49 UTC (permalink / raw)
  To: Dimitar Dimitrov; +Cc: linux-omap, linux-fbdev

[-- Attachment #1: Type: text/plain, Size: 8260 bytes --]

On Sat, 2012-09-08 at 18:05 +0300, Dimitar Dimitrov wrote:
> Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
> panics due to corrupted completion structure while executing
> dispc_irq_wait_handler(). Excerpt from kernel log:
> 
>   Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
>   Unable to handle kernel paging request at virtual address 00400130
>   ...
>   PC is at 0xebf205bc
>   LR is at __wake_up_common+0x54/0x94
>   ...
>   (__wake_up_common+0x0/0x94)
>   (complete+0x0/0x60)
>   (dispc_irq_wait_handler.36902+0x0/0x14)
>   (omap_dispc_irq_handler+0x0/0x354)
>   (handle_irq_event_percpu+0x0/0x188)
>   (handle_irq_event+0x0/0x64)
>   (handle_fasteoi_irq+0x0/0x10c)
>   (generic_handle_irq+0x0/0x48)
>   (asm_do_IRQ+0x0/0xc0)
> 
> DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> unregister_isr() and DISPC IRQ might be running in parallel on different
> CPUs. So there is a chance that a callback is executed even though it
> has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
> completion on stack, the dispc_irq_wait_handler() callback might try to
> access a completion structure that is invalid. This leads to crashes and
> hangs.
> 
> Solution is to divide unregister calls into two sets:
>   1. Non-strict unregistering of callbacks. Callbacks could safely be
>      executed after unregistering them. This is the case with unregister
>      calls from the IRQ handler itself.
>   2. Strict (synchronized) unregistering. Callbacks are not allowed
>      after unregistering. This is the case with completion waiting.
> 
> The above solution should satisfy one of the original intentions of the
> driver: callbacks should be able to unregister themselves.
> 
> Also fix DSI IRQ unregister which has similar logic to DISPC IRQ handling.
> 
> Signed-off-by: Dimitar Dimitrov <dinuxbg@gmail.com>
> ---
> 
> WARNING: This bug is quite old. The patch has been tested on v3.0. No testing
> has been done after rebasing to v3.6. Hence the RFC tag. Hopefully someone
> will beat me in testing with latest linux-omap/master.
> 
> Changes since v1 per Tomi Valkeinen's comments:
>   - Don't rename omap_dispc_unregister_isr, just introduce nosync variant.
>   - Apply the same fix for DSI IRQ which suffers from the same race condition.

I made a quick test and works for me, although I haven't encountered the
problem itself.

Some mostly cosmetic comments below.

This seems to apply cleanly to v3.4+ kernels, but not earlier ones. Do
you want to make the needed modifications and mail this and the modified
patches for stable kernels also? I can do that also if you don't want
to.

>  drivers/staging/omapdrm/omap_plane.c |    2 +-
>  drivers/video/omap2/dss/apply.c      |    2 +-
>  drivers/video/omap2/dss/dispc.c      |   45 +++++++++++++++++++++++++++++-----
>  drivers/video/omap2/dss/dsi.c        |   19 ++++++++++++++
>  include/video/omapdss.h              |    1 +
>  5 files changed, 61 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/staging/omapdrm/omap_plane.c b/drivers/staging/omapdrm/omap_plane.c
> index 7997be7..8d8aa5b 100644
> --- a/drivers/staging/omapdrm/omap_plane.c
> +++ b/drivers/staging/omapdrm/omap_plane.c
> @@ -82,7 +82,7 @@ static void dispc_isr(void *arg, uint32_t mask)
>  	struct omap_plane *omap_plane = to_omap_plane(plane);
>  	struct omap_drm_private *priv = plane->dev->dev_private;
>  
> -	omap_dispc_unregister_isr(dispc_isr, plane,
> +	omap_dispc_unregister_isr_nosync(dispc_isr, plane,
>  			id2irq[omap_plane->ovl->id]);
>  
>  	queue_work(priv->wq, &omap_plane->work);
> diff --git a/drivers/video/omap2/dss/apply.c b/drivers/video/omap2/dss/apply.c
> index 0fefc68..9386834 100644
> --- a/drivers/video/omap2/dss/apply.c
> +++ b/drivers/video/omap2/dss/apply.c
> @@ -847,7 +847,7 @@ static void dss_unregister_vsync_isr(void)
>  	for (i = 0; i < num_mgrs; ++i)
>  		mask |= dispc_mgr_get_framedone_irq(i);
>  
> -	r = omap_dispc_unregister_isr(dss_apply_irq_handler, NULL, mask);
> +	r = omap_dispc_unregister_isr_nosync(dss_apply_irq_handler, NULL, mask);
>  	WARN_ON(r);
>  
>  	dss_data.irq_enabled = false;
> diff --git a/drivers/video/omap2/dss/dispc.c b/drivers/video/omap2/dss/dispc.c
> index ee9e296..a67d92c 100644
> --- a/drivers/video/omap2/dss/dispc.c
> +++ b/drivers/video/omap2/dss/dispc.c
> @@ -2421,8 +2421,8 @@ static void dispc_mgr_enable_digit_out(bool enable)
>  					enable ? "start" : "stop");
>  	}
>  
> -	r = omap_dispc_unregister_isr(dispc_disable_isr, &frame_done_completion,
> -			irq_mask);
> +	r = omap_dispc_unregister_isr(dispc_disable_isr,
> +			&frame_done_completion, irq_mask);

This change is not needed.

>  	if (r)
>  		DSSERR("failed to unregister %x isr\n", irq_mask);
>  
> @@ -3320,7 +3320,8 @@ err:
>  }
>  EXPORT_SYMBOL(omap_dispc_register_isr);
>  
> -int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
> +/* WARNING: callback might be executed even after this function returns! */
> +int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 mask)
>  {
>  	int i;
>  	unsigned long flags;
> @@ -3352,7 +3353,37 @@ int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
>  
>  	return ret;
>  }
> -EXPORT_SYMBOL(omap_dispc_unregister_isr);
> +EXPORT_SYMBOL(omap_dispc_unregister_isr_nosync);
> +
> +/*
> + * Ensure that callback <isr> will NOT be executed after this function
> + * returns. Must be called from sleepable context, though!
> + */
> +int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
> +{
> +	int ret;
> +
> +	ret = omap_dispc_unregister_isr_nosync(isr, arg, mask);
> +
> +	/* Task context is not really needed. But if we're called from atomic
> +	 * context, it is probably from DISPC IRQ, where we will deadlock.
> +	 * So use might_sleep() to catch potential deadlocks.
> +	 */

Use the kernel multi-line comment style, i.e.:

/*
 * foobar
 */

> +	might_sleep();
> +
> +#if defined(CONFIG_SMP)
> +	/* DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> +	 * unregister_isr() and DISPC IRQ might be running in parallel on
> +	 * different CPUs. So there is a chance that a callback is executed
> +	 * even though it has been unregistered. Add a barrier, in order to
> +	 * ensure that after returning from this function, the new DISPC IRQ
> +	 * will use an updated callback array, and NOT its cached copy.
> +	 */

Comment style here also.

> +	synchronize_irq(dispc.irq);
> +#endif

Why do you use #if defined(CONFIG_SMP)? synchronize_irq is defined
with !CONFIG_SMP also. In that case it becomes just barrier().

> +
> +	return ret;
> +}
>  
>  #ifdef DEBUG
>  static void print_irq_status(u32 status)
> @@ -3567,7 +3598,8 @@ int omap_dispc_wait_for_irq_timeout(u32 irqmask, unsigned long timeout)
>  
>  	timeout = wait_for_completion_timeout(&completion, timeout);
>  
> -	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
> +	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
> +			irqmask);

Change not needed.

>  
>  	if (timeout == 0)
>  		return -ETIMEDOUT;
> @@ -3598,7 +3630,8 @@ int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
>  	timeout = wait_for_completion_interruptible_timeout(&completion,
>  			timeout);
>  
> -	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
> +	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
> +			irqmask);

Change not needed.

>  
>  	if (timeout == 0)
>  		return -ETIMEDOUT;
> diff --git a/drivers/video/omap2/dss/dsi.c b/drivers/video/omap2/dss/dsi.c
> index b07e886..24b4a3e 100644
> --- a/drivers/video/omap2/dss/dsi.c
> +++ b/drivers/video/omap2/dss/dsi.c
> @@ -960,6 +960,13 @@ static int dsi_unregister_isr(struct platform_device *dsidev,
>  
>  	spin_unlock_irqrestore(&dsi->irq_lock, flags);
>  
> +	might_sleep();
> +
> +#if defined(CONFIG_SMP)
> +	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
> +	synchronize_irq(dsi->irq);
> +#endif
> +

Same SMP comment as with dispc.

 Tomi


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH v2] OMAPDSS: Fix IRQ unregister race
  2012-09-10  7:49       ` Tomi Valkeinen
@ 2012-09-10 19:19         ` Dimitar Dimitrov
  2012-09-10 19:34           ` [PATCH v3] " Dimitar Dimitrov
  0 siblings, 1 reply; 7+ messages in thread
From: Dimitar Dimitrov @ 2012-09-10 19:19 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: linux-omap, linux-fbdev

On Monday 10 September 2012 10:49:05 Tomi Valkeinen wrote:
> On Sat, 2012-09-08 at 18:05 +0300, Dimitar Dimitrov wrote:
> > Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
> > panics due to corrupted completion structure while executing
> > 
> > dispc_irq_wait_handler(). Excerpt from kernel log:
> >   Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
> >   Unable to handle kernel paging request at virtual address 00400130
> >   ...
> >   PC is at 0xebf205bc
> >   LR is at __wake_up_common+0x54/0x94
> >   ...
> >   (__wake_up_common+0x0/0x94)
> >   (complete+0x0/0x60)
> >   (dispc_irq_wait_handler.36902+0x0/0x14)
> >   (omap_dispc_irq_handler+0x0/0x354)
> >   (handle_irq_event_percpu+0x0/0x188)
> >   (handle_irq_event+0x0/0x64)
> >   (handle_fasteoi_irq+0x0/0x10c)
> >   (generic_handle_irq+0x0/0x48)
> >   (asm_do_IRQ+0x0/0xc0)
> > 
> > DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> > unregister_isr() and DISPC IRQ might be running in parallel on different
> > CPUs. So there is a chance that a callback is executed even though it
> > has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
> > completion on stack, the dispc_irq_wait_handler() callback might try to
> > access a completion structure that is invalid. This leads to crashes and
> > hangs.
> > 
> > Solution is to divide unregister calls into two sets:
> >   1. Non-strict unregistering of callbacks. Callbacks could safely be
> >   
> >      executed after unregistering them. This is the case with unregister
> >      calls from the IRQ handler itself.
> >   
> >   2. Strict (synchronized) unregistering. Callbacks are not allowed
> >   
> >      after unregistering. This is the case with completion waiting.
> > 
> > The above solution should satisfy one of the original intentions of the
> > driver: callbacks should be able to unregister themselves.
> > 
> > Also fix DSI IRQ unregister which has similar logic to DISPC IRQ
> > handling.
> > 
> > Signed-off-by: Dimitar Dimitrov <dinuxbg@gmail.com>
> > ---
> > 
> > WARNING: This bug is quite old. The patch has been tested on v3.0. No
> > testing has been done after rebasing to v3.6. Hence the RFC tag.
> > Hopefully someone will beat me in testing with latest linux-omap/master.
> > 
> > Changes since v1 per Tomi Valkeinen's comments:
> >   - Don't rename omap_dispc_unregister_isr, just introduce nosync
> >   variant. - Apply the same fix for DSI IRQ which suffers from the same
> >   race condition.
> 
> I made a quick test and works for me, although I haven't encountered the
> problem itself.
This issue is very rarely seen during normal operation. You may not hit it 
even after days of stress testing. To speed up reproducing an artificial delay 
in the IRQ routine could be used while stress-testing DSS, e.g.:

@@ -3462,6 +3462,8 @@ static irqreturn_t omap_dispc_irq_handler(int irq, void 
*a
 
        spin_unlock(&dispc.irq_lock);
 
+       { static int iii; if ((iii++ % 10) = 0) mdelay(100); }
+
        for (i = 0; i < DISPC_MAX_NR_ISRS; i++) {
                isr_data = &registered_isr[i];

> 
> Some mostly cosmetic comments below.
> 
> This seems to apply cleanly to v3.4+ kernels, but not earlier ones. Do
> you want to make the needed modifications and mail this and the modified
> patches for stable kernels also? I can do that also if you don't want
> to.
I can do the rebase and cleanup. I'll send the modified patch per your 
comments in the next hour. 

Currently I cannot test, though. I hope to have a working setup by end of week 
in order to send sanity-tested patches for stable kernels.

> 
> >  drivers/staging/omapdrm/omap_plane.c |    2 +-
> >  drivers/video/omap2/dss/apply.c      |    2 +-
> >  drivers/video/omap2/dss/dispc.c      |   45
> >  +++++++++++++++++++++++++++++----- drivers/video/omap2/dss/dsi.c       
> >  |   19 ++++++++++++++
> >  include/video/omapdss.h              |    1 +
> >  5 files changed, 61 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/staging/omapdrm/omap_plane.c
> > b/drivers/staging/omapdrm/omap_plane.c index 7997be7..8d8aa5b 100644
> > --- a/drivers/staging/omapdrm/omap_plane.c
> > +++ b/drivers/staging/omapdrm/omap_plane.c
> > @@ -82,7 +82,7 @@ static void dispc_isr(void *arg, uint32_t mask)
> > 
> >  	struct omap_plane *omap_plane = to_omap_plane(plane);
> >  	struct omap_drm_private *priv = plane->dev->dev_private;
> > 
> > -	omap_dispc_unregister_isr(dispc_isr, plane,
> > +	omap_dispc_unregister_isr_nosync(dispc_isr, plane,
> > 
> >  			id2irq[omap_plane->ovl->id]);
> >  	
> >  	queue_work(priv->wq, &omap_plane->work);
> > 
> > diff --git a/drivers/video/omap2/dss/apply.c
> > b/drivers/video/omap2/dss/apply.c index 0fefc68..9386834 100644
> > --- a/drivers/video/omap2/dss/apply.c
> > +++ b/drivers/video/omap2/dss/apply.c
> > @@ -847,7 +847,7 @@ static void dss_unregister_vsync_isr(void)
> > 
> >  	for (i = 0; i < num_mgrs; ++i)
> >  	
> >  		mask |= dispc_mgr_get_framedone_irq(i);
> > 
> > -	r = omap_dispc_unregister_isr(dss_apply_irq_handler, NULL, mask);
> > +	r = omap_dispc_unregister_isr_nosync(dss_apply_irq_handler, NULL,
> > mask);
> > 
> >  	WARN_ON(r);
> >  	
> >  	dss_data.irq_enabled = false;
> > 
> > diff --git a/drivers/video/omap2/dss/dispc.c
> > b/drivers/video/omap2/dss/dispc.c index ee9e296..a67d92c 100644
> > --- a/drivers/video/omap2/dss/dispc.c
> > +++ b/drivers/video/omap2/dss/dispc.c
> > @@ -2421,8 +2421,8 @@ static void dispc_mgr_enable_digit_out(bool enable)
> > 
> >  					enable ? "start" : "stop");
> >  	
> >  	}
> > 
> > -	r = omap_dispc_unregister_isr(dispc_disable_isr,
> > &frame_done_completion, -			irq_mask);
> > +	r = omap_dispc_unregister_isr(dispc_disable_isr,
> > +			&frame_done_completion, irq_mask);
> 
> This change is not needed.
Sorry, I missed those. Will remove and resend.
> 
> >  	if (r)
> >  	
> >  		DSSERR("failed to unregister %x isr\n", irq_mask);
> > 
> > @@ -3320,7 +3320,8 @@ err:
> >  }
> >  EXPORT_SYMBOL(omap_dispc_register_isr);
> > 
> > -int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
> > +/* WARNING: callback might be executed even after this function returns!
> > */ +int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void
> > *arg, u32 mask)
> > 
> >  {
> >  
> >  	int i;
> >  	unsigned long flags;
> > 
> > @@ -3352,7 +3353,37 @@ int omap_dispc_unregister_isr(omap_dispc_isr_t
> > isr, void *arg, u32 mask)
> > 
> >  	return ret;
> >  
> >  }
> > 
> > -EXPORT_SYMBOL(omap_dispc_unregister_isr);
> > +EXPORT_SYMBOL(omap_dispc_unregister_isr_nosync);
> > +
> > +/*
> > + * Ensure that callback <isr> will NOT be executed after this function
> > + * returns. Must be called from sleepable context, though!
> > + */
> > +int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
> > +{
> > +	int ret;
> > +
> > +	ret = omap_dispc_unregister_isr_nosync(isr, arg, mask);
> > +
> > +	/* Task context is not really needed. But if we're called from atomic
> > +	 * context, it is probably from DISPC IRQ, where we will deadlock.
> > +	 * So use might_sleep() to catch potential deadlocks.
> > +	 */
> 
> Use the kernel multi-line comment style, i.e.:
> 
> /*
>  * foobar
>  */
Will fix.
> 
> > +	might_sleep();
> > +
> > +#if defined(CONFIG_SMP)
> > +	/* DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> > +	 * unregister_isr() and DISPC IRQ might be running in parallel on
> > +	 * different CPUs. So there is a chance that a callback is executed
> > +	 * even though it has been unregistered. Add a barrier, in order to
> > +	 * ensure that after returning from this function, the new DISPC IRQ
> > +	 * will use an updated callback array, and NOT its cached copy.
> > +	 */
> 
> Comment style here also.
Will fix.

> 
> > +	synchronize_irq(dispc.irq);
> > +#endif
> 
> Why do you use #if defined(CONFIG_SMP)? synchronize_irq is defined
> with !CONFIG_SMP also. In that case it becomes just barrier().
> 
You're correct. I'll remove "defined(CONFIG_SMP)".

> > +
> > +	return ret;
> > +}
I forgot to keep EXPORT_SYMBOL for omap_dispc_unregister_isr. Will add it.

> > 
> >  #ifdef DEBUG
> >  static void print_irq_status(u32 status)
> > 
> > @@ -3567,7 +3598,8 @@ int omap_dispc_wait_for_irq_timeout(u32 irqmask,
> > unsigned long timeout)
> > 
> >  	timeout = wait_for_completion_timeout(&completion, timeout);
> > 
> > -	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
> > irqmask); +	omap_dispc_unregister_isr(dispc_irq_wait_handler,
> > &completion, +			irqmask);
> 
> Change not needed.
Will remove.
> 
> >  	if (timeout = 0)
> >  	
> >  		return -ETIMEDOUT;
> > 
> > @@ -3598,7 +3630,8 @@ int
> > omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
> > 
> >  	timeout = wait_for_completion_interruptible_timeout(&completion,
> >  	
> >  			timeout);
> > 
> > -	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
> > irqmask); +	omap_dispc_unregister_isr(dispc_irq_wait_handler,
> > &completion, +			irqmask);
> 
> Change not needed.
Will remove.
> 
> >  	if (timeout = 0)
> >  	
> >  		return -ETIMEDOUT;
> > 
> > diff --git a/drivers/video/omap2/dss/dsi.c
> > b/drivers/video/omap2/dss/dsi.c index b07e886..24b4a3e 100644
> > --- a/drivers/video/omap2/dss/dsi.c
> > +++ b/drivers/video/omap2/dss/dsi.c
> > @@ -960,6 +960,13 @@ static int dsi_unregister_isr(struct platform_device
> > *dsidev,
> > 
> >  	spin_unlock_irqrestore(&dsi->irq_lock, flags);
> > 
> > +	might_sleep();
> > +
> > +#if defined(CONFIG_SMP)
> > +	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
> > +	synchronize_irq(dsi->irq);
> > +#endif
> > +
> 
> Same SMP comment as with dispc.
> 
>  Tomi

Thanks,
Dimitar

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v3] OMAPDSS: Fix IRQ unregister race
  2012-09-10 19:19         ` Dimitar Dimitrov
@ 2012-09-10 19:34           ` Dimitar Dimitrov
  0 siblings, 0 replies; 7+ messages in thread
From: Dimitar Dimitrov @ 2012-09-10 19:34 UTC (permalink / raw)
  To: tomi.valkeinen; +Cc: linux-omap, linux-fbdev, Dimitar Dimitrov, stable

Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
panics due to corrupted completion structure while executing
dispc_irq_wait_handler(). Excerpt from kernel log:

  Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
  Unable to handle kernel paging request at virtual address 00400130
  ...
  PC is at 0xebf205bc
  LR is at __wake_up_common+0x54/0x94
  ...
  (__wake_up_common+0x0/0x94)
  (complete+0x0/0x60)
  (dispc_irq_wait_handler.36902+0x0/0x14)
  (omap_dispc_irq_handler+0x0/0x354)
  (handle_irq_event_percpu+0x0/0x188)
  (handle_irq_event+0x0/0x64)
  (handle_fasteoi_irq+0x0/0x10c)
  (generic_handle_irq+0x0/0x48)
  (asm_do_IRQ+0x0/0xc0)

DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
unregister_isr() and DISPC IRQ might be running in parallel on different
CPUs. So there is a chance that a callback is executed even though it
has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
completion on stack, the dispc_irq_wait_handler() callback might try to
access a completion structure that is invalid. This leads to crashes and
hangs.

Solution is to divide unregister calls into two sets:
  1. Non-strict unregistering of callbacks. Callbacks could safely be
     executed after unregistering them. This is the case with unregister
     calls from the IRQ handler itself.
  2. Strict (synchronized) unregistering. Callbacks are not allowed
     after unregistering. This is the case with completion waiting.

The above solution should satisfy one of the original intentions of the
driver: callbacks should be able to unregister themselves.

Also fix DSI IRQ unregister which has similar logic to DISPC IRQ handling.

Cc: stable <stable@vger.kernel.org>
Signed-off-by: Dimitar Dimitrov <dinuxbg@gmail.com>
---
Changes since v2:
  - Fix formatting per Tomi Valkeinen's comments.
  - Restored accidental removal of EXPORT_SYMBOL for omap_dispc_register_isr.

 drivers/staging/omapdrm/omap_plane.c |    2 +-
 drivers/video/omap2/dss/apply.c      |    2 +-
 drivers/video/omap2/dss/dispc.c      |   34 +++++++++++++++++++++++++++++++++-
 drivers/video/omap2/dss/dsi.c        |   15 +++++++++++++++
 include/video/omapdss.h              |    1 +
 5 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/omapdrm/omap_plane.c b/drivers/staging/omapdrm/omap_plane.c
index 7997be7..8d8aa5b 100644
--- a/drivers/staging/omapdrm/omap_plane.c
+++ b/drivers/staging/omapdrm/omap_plane.c
@@ -82,7 +82,7 @@ static void dispc_isr(void *arg, uint32_t mask)
 	struct omap_plane *omap_plane = to_omap_plane(plane);
 	struct omap_drm_private *priv = plane->dev->dev_private;
 
-	omap_dispc_unregister_isr(dispc_isr, plane,
+	omap_dispc_unregister_isr_nosync(dispc_isr, plane,
 			id2irq[omap_plane->ovl->id]);
 
 	queue_work(priv->wq, &omap_plane->work);
diff --git a/drivers/video/omap2/dss/apply.c b/drivers/video/omap2/dss/apply.c
index 0fefc68..9386834 100644
--- a/drivers/video/omap2/dss/apply.c
+++ b/drivers/video/omap2/dss/apply.c
@@ -847,7 +847,7 @@ static void dss_unregister_vsync_isr(void)
 	for (i = 0; i < num_mgrs; ++i)
 		mask |= dispc_mgr_get_framedone_irq(i);
 
-	r = omap_dispc_unregister_isr(dss_apply_irq_handler, NULL, mask);
+	r = omap_dispc_unregister_isr_nosync(dss_apply_irq_handler, NULL, mask);
 	WARN_ON(r);
 
 	dss_data.irq_enabled = false;
diff --git a/drivers/video/omap2/dss/dispc.c b/drivers/video/omap2/dss/dispc.c
index ee9e296..6032252 100644
--- a/drivers/video/omap2/dss/dispc.c
+++ b/drivers/video/omap2/dss/dispc.c
@@ -3320,7 +3320,8 @@ err:
 }
 EXPORT_SYMBOL(omap_dispc_register_isr);
 
-int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
+/* WARNING: callback might be executed even after this function returns! */
+int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 mask)
 {
 	int i;
 	unsigned long flags;
@@ -3352,6 +3353,37 @@ int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
 
 	return ret;
 }
+EXPORT_SYMBOL(omap_dispc_unregister_isr_nosync);
+
+/*
+ * Ensure that callback <isr> will NOT be executed after this function
+ * returns. Must be called from sleepable context, though!
+ */
+int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
+{
+	int ret;
+
+	ret = omap_dispc_unregister_isr_nosync(isr, arg, mask);
+
+	/*
+	 * Task context is not really needed. But if we're called from atomic
+	 * context, it is probably from DISPC IRQ, where we will deadlock.
+	 * So use might_sleep() to catch potential deadlocks.
+	 */
+	might_sleep();
+
+	/*
+	 * DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
+	 * unregister_isr() and DISPC IRQ might be running in parallel on
+	 * different CPUs. So there is a chance that a callback is executed
+	 * even though it has been unregistered. Add a barrier, in order to
+	 * ensure that after returning from this function, the new DISPC IRQ
+	 * will use an updated callback array, and NOT its cached copy.
+	 */
+	synchronize_irq(dispc.irq);
+
+	return ret;
+}
 EXPORT_SYMBOL(omap_dispc_unregister_isr);
 
 #ifdef DEBUG
diff --git a/drivers/video/omap2/dss/dsi.c b/drivers/video/omap2/dss/dsi.c
index b07e886..c0fcb68 100644
--- a/drivers/video/omap2/dss/dsi.c
+++ b/drivers/video/omap2/dss/dsi.c
@@ -960,6 +960,11 @@ static int dsi_unregister_isr(struct platform_device *dsidev,
 
 	spin_unlock_irqrestore(&dsi->irq_lock, flags);
 
+	might_sleep();
+
+	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
+	synchronize_irq(dsi->irq);
+
 	return r;
 }
 
@@ -1002,6 +1007,11 @@ static int dsi_unregister_isr_vc(struct platform_device *dsidev, int channel,
 
 	spin_unlock_irqrestore(&dsi->irq_lock, flags);
 
+	might_sleep();
+
+	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
+	synchronize_irq(dsi->irq);
+
 	return r;
 }
 
@@ -1042,6 +1052,11 @@ static int dsi_unregister_isr_cio(struct platform_device *dsidev,
 
 	spin_unlock_irqrestore(&dsi->irq_lock, flags);
 
+	might_sleep();
+
+	/* See notes for dispc.c:omap_dispc_unregister_isr() . */
+	synchronize_irq(dsi->irq);
+
 	return r;
 }
 
diff --git a/include/video/omapdss.h b/include/video/omapdss.h
index a6267a2..769f981 100644
--- a/include/video/omapdss.h
+++ b/include/video/omapdss.h
@@ -708,6 +708,7 @@ void omapdss_default_get_timings(struct omap_dss_device *dssdev,
 typedef void (*omap_dispc_isr_t) (void *arg, u32 mask);
 int omap_dispc_register_isr(omap_dispc_isr_t isr, void *arg, u32 mask);
 int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask);
+int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 mask);
 
 int omap_dispc_wait_for_irq_timeout(u32 irqmask, unsigned long timeout);
 int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2012-09-10 19:34 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-09-02 19:12 [RFC PATCH] OMAPDSS: DISPC: Fix IRQ unregister race Dimitar Dimitrov
2012-09-06 13:36 ` Tomi Valkeinen
2012-09-07 14:42   ` Dimitar Dimitrov
2012-09-08 15:05     ` [RFC PATCH v2] OMAPDSS: " Dimitar Dimitrov
2012-09-10  7:49       ` Tomi Valkeinen
2012-09-10 19:19         ` Dimitar Dimitrov
2012-09-10 19:34           ` [PATCH v3] " Dimitar Dimitrov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).