[PATCH v5 0/1] audio/jack: fix use after free segfault

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v5 0/1] audio/jack: fix use after free segfault
@ 2020-08-19  6:29 Geoffrey McRae
  2020-08-19  6:29 ` [PATCH v5 1/1] " Geoffrey McRae
  0 siblings, 1 reply; 15+ messages in thread
From: Geoffrey McRae @ 2020-08-19  6:29 UTC (permalink / raw)
  To: qemu-devel; +Cc: Geoffrey McRae, kraxel

v5:
  * removed hanging dlfcn include from v3

Geoffrey McRae (1):
  audio/jack: fix use after free segfault

 audio/jackaudio.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

-- 
2.20.1



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-19  6:29 [PATCH v5 0/1] audio/jack: fix use after free segfault Geoffrey McRae
@ 2020-08-19  6:29 ` Geoffrey McRae
  2020-08-19 15:21   ` Christian Schoenebeck
  0 siblings, 1 reply; 15+ messages in thread
From: Geoffrey McRae @ 2020-08-19  6:29 UTC (permalink / raw)
  To: qemu-devel; +Cc: Geoffrey McRae, kraxel

This change registers a bottom handler to close the JACK client
connection when a server shutdown signal is recieved. Without this
libjack2 attempts to "clean up" old clients and causes a use after free
segfault.

Signed-off-by: Geoffrey McRae <geoff@hostfission.com>
---
 audio/jackaudio.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/audio/jackaudio.c b/audio/jackaudio.c
index 72ed7c4929..b0da5cd00b 100644
--- a/audio/jackaudio.c
+++ b/audio/jackaudio.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "qemu/module.h"
 #include "qemu/atomic.h"
+#include "qemu/main-loop.h"
 #include "qemu-common.h"
 #include "audio.h"
 
@@ -63,6 +64,7 @@ typedef struct QJackClient {
     QJackState      state;
     jack_client_t  *client;
     jack_nframes_t  freq;
+    QEMUBH         *shutdown_bh;
 
     struct QJack   *j;
     int             nchannels;
@@ -306,21 +308,27 @@ static int qjack_xrun(void *arg)
     return 0;
 }
 
+static void qjack_shutdown_bh(void *opaque)
+{
+    QJackClient *c = (QJackClient *)opaque;
+    qjack_client_fini(c);
+}
+
 static void qjack_shutdown(void *arg)
 {
     QJackClient *c = (QJackClient *)arg;
-    c->state = QJACK_STATE_SHUTDOWN;
+    c->state       = QJACK_STATE_SHUTDOWN;
+    qemu_bh_schedule(c->shutdown_bh);
 }
 
 static void qjack_client_recover(QJackClient *c)
 {
-    if (c->state == QJACK_STATE_SHUTDOWN) {
-        qjack_client_fini(c);
+    if (c->state != QJACK_STATE_DISCONNECTED) {
+        return;
     }
 
     /* packets is used simply to throttle this */
-    if (c->state == QJACK_STATE_DISCONNECTED &&
-        c->packets % 100 == 0) {
+    if (c->packets % 100 == 0) {
 
         /* if enabled then attempt to recover */
         if (c->enabled) {
@@ -417,6 +425,10 @@ static int qjack_client_init(QJackClient *c)
         options |= JackServerName;
     }
 
+    if (!c->shutdown_bh) {
+        c->shutdown_bh = qemu_bh_new(qjack_shutdown_bh, c);
+    }
+
     c->client = jack_client_open(client_name, options, &status,
       c->opt->server_name);
 
@@ -489,8 +501,6 @@ static int qjack_init_out(HWVoiceOut *hw, struct audsettings *as,
     QJackOut *jo  = (QJackOut *)hw;
     Audiodev *dev = (Audiodev *)drv_opaque;
 
-    qjack_client_fini(&jo->c);
-
     jo->c.out       = true;
     jo->c.enabled   = false;
     jo->c.nchannels = as->nchannels;
@@ -525,8 +535,6 @@ static int qjack_init_in(HWVoiceIn *hw, struct audsettings *as,
     QJackIn  *ji  = (QJackIn *)hw;
     Audiodev *dev = (Audiodev *)drv_opaque;
 
-    qjack_client_fini(&ji->c);
-
     ji->c.out       = false;
     ji->c.enabled   = false;
     ji->c.nchannels = as->nchannels;
@@ -557,6 +565,8 @@ static int qjack_init_in(HWVoiceIn *hw, struct audsettings *as,
 
 static void qjack_client_fini(QJackClient *c)
 {
+    qemu_bh_cancel(c->shutdown_bh);
+
     switch (c->state) {
     case QJACK_STATE_RUNNING:
         jack_deactivate(c->client);
@@ -564,6 +574,7 @@ static void qjack_client_fini(QJackClient *c)
 
     case QJACK_STATE_SHUTDOWN:
         jack_client_close(c->client);
+        c->client = NULL;
         /* fallthrough */
 
     case QJACK_STATE_DISCONNECTED:
-- 
2.20.1



^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-19  6:29 ` [PATCH v5 1/1] " Geoffrey McRae
@ 2020-08-19 15:21   ` Christian Schoenebeck
  2020-08-19 15:27     ` Geoffrey McRae
  2020-08-20  5:37     ` Gerd Hoffmann
  0 siblings, 2 replies; 15+ messages in thread
From: Christian Schoenebeck @ 2020-08-19 15:21 UTC (permalink / raw)
  To: qemu-devel; +Cc: Geoffrey McRae, kraxel

On Mittwoch, 19. August 2020 08:29:39 CEST Geoffrey McRae wrote:
> This change registers a bottom handler to close the JACK client
> connection when a server shutdown signal is recieved. Without this
> libjack2 attempts to "clean up" old clients and causes a use after free
> segfault.
> 
> Signed-off-by: Geoffrey McRae <geoff@hostfission.com>
> ---

Looks much better now, but ...

>  audio/jackaudio.c | 29 ++++++++++++++++++++---------
>  1 file changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/audio/jackaudio.c b/audio/jackaudio.c
> index 72ed7c4929..b0da5cd00b 100644
> --- a/audio/jackaudio.c
> +++ b/audio/jackaudio.c
> @@ -25,6 +25,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/module.h"
>  #include "qemu/atomic.h"
> +#include "qemu/main-loop.h"
>  #include "qemu-common.h"
>  #include "audio.h"
> 
> @@ -63,6 +64,7 @@ typedef struct QJackClient {
>      QJackState      state;
>      jack_client_t  *client;
>      jack_nframes_t  freq;
> +    QEMUBH         *shutdown_bh;
> 
>      struct QJack   *j;
>      int             nchannels;
> @@ -306,21 +308,27 @@ static int qjack_xrun(void *arg)
>      return 0;
>  }
> 
> +static void qjack_shutdown_bh(void *opaque)
> +{
> +    QJackClient *c = (QJackClient *)opaque;
> +    qjack_client_fini(c);
> +}
> +
>  static void qjack_shutdown(void *arg)
>  {
>      QJackClient *c = (QJackClient *)arg;
> -    c->state = QJACK_STATE_SHUTDOWN;
> +    c->state       = QJACK_STATE_SHUTDOWN;

White space changes are not much embraced on high traffic projects BTW.

> +    qemu_bh_schedule(c->shutdown_bh);
>  }
> 
>  static void qjack_client_recover(QJackClient *c)
>  {
> -    if (c->state == QJACK_STATE_SHUTDOWN) {
> -        qjack_client_fini(c);
> +    if (c->state != QJACK_STATE_DISCONNECTED) {
> +        return;
>      }
> 
>      /* packets is used simply to throttle this */
> -    if (c->state == QJACK_STATE_DISCONNECTED &&
> -        c->packets % 100 == 0) {
> +    if (c->packets % 100 == 0) {
> 
>          /* if enabled then attempt to recover */
>          if (c->enabled) {
> @@ -417,6 +425,10 @@ static int qjack_client_init(QJackClient *c)
>          options |= JackServerName;
>      }
> 
> +    if (!c->shutdown_bh) {
> +        c->shutdown_bh = qemu_bh_new(qjack_shutdown_bh, c);
> +    }
> +

Where is qemu_bh_delete() ?

>      c->client = jack_client_open(client_name, options, &status,
>        c->opt->server_name);
> 
> @@ -489,8 +501,6 @@ static int qjack_init_out(HWVoiceOut *hw, struct
> audsettings *as, QJackOut *jo  = (QJackOut *)hw;
>      Audiodev *dev = (Audiodev *)drv_opaque;
> 
> -    qjack_client_fini(&jo->c);
> -
>      jo->c.out       = true;
>      jo->c.enabled   = false;
>      jo->c.nchannels = as->nchannels;
> @@ -525,8 +535,6 @@ static int qjack_init_in(HWVoiceIn *hw, struct
> audsettings *as, QJackIn  *ji  = (QJackIn *)hw;
>      Audiodev *dev = (Audiodev *)drv_opaque;
> 
> -    qjack_client_fini(&ji->c);
> -
>      ji->c.out       = false;
>      ji->c.enabled   = false;
>      ji->c.nchannels = as->nchannels;
> @@ -557,6 +565,8 @@ static int qjack_init_in(HWVoiceIn *hw, struct
> audsettings *as,
> 
>  static void qjack_client_fini(QJackClient *c)
>  {
> +    qemu_bh_cancel(c->shutdown_bh);
> +

Looks like a potential race. Quote from the API doc of qemu_bh_cancel():

	"While cancellation itself is also wait-free and thread-safe, it can of 	
	course race with the loop that executes bottom halves unless you are 
	holding the iothread mutex.  This makes it mostly useless if you are not 
	holding the mutex."

>      switch (c->state) {
>      case QJACK_STATE_RUNNING:
>          jack_deactivate(c->client);
> @@ -564,6 +574,7 @@ static void qjack_client_fini(QJackClient *c)
> 
>      case QJACK_STATE_SHUTDOWN:
>          jack_client_close(c->client);
> +        c->client = NULL;
>          /* fallthrough */
> 
>      case QJACK_STATE_DISCONNECTED:

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-19 15:21   ` Christian Schoenebeck
@ 2020-08-19 15:27     ` Geoffrey McRae
  2020-08-20  5:37     ` Gerd Hoffmann
  1 sibling, 0 replies; 15+ messages in thread
From: Geoffrey McRae @ 2020-08-19 15:27 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: qemu-devel, kraxel



On 2020-08-20 01:21, Christian Schoenebeck wrote:
> On Mittwoch, 19. August 2020 08:29:39 CEST Geoffrey McRae wrote:
>> This change registers a bottom handler to close the JACK client
>> connection when a server shutdown signal is recieved. Without this
>> libjack2 attempts to "clean up" old clients and causes a use after 
>> free
>> segfault.
>> 
>> Signed-off-by: Geoffrey McRae <geoff@hostfission.com>
>> ---
> 
> Looks much better now, but ...
> 
>>  audio/jackaudio.c | 29 ++++++++++++++++++++---------
>>  1 file changed, 20 insertions(+), 9 deletions(-)
>> 
>> diff --git a/audio/jackaudio.c b/audio/jackaudio.c
>> index 72ed7c4929..b0da5cd00b 100644
>> --- a/audio/jackaudio.c
>> +++ b/audio/jackaudio.c
>> @@ -25,6 +25,7 @@
>>  #include "qemu/osdep.h"
>>  #include "qemu/module.h"
>>  #include "qemu/atomic.h"
>> +#include "qemu/main-loop.h"
>>  #include "qemu-common.h"
>>  #include "audio.h"
>> 
>> @@ -63,6 +64,7 @@ typedef struct QJackClient {
>>      QJackState      state;
>>      jack_client_t  *client;
>>      jack_nframes_t  freq;
>> +    QEMUBH         *shutdown_bh;
>> 
>>      struct QJack   *j;
>>      int             nchannels;
>> @@ -306,21 +308,27 @@ static int qjack_xrun(void *arg)
>>      return 0;
>>  }
>> 
>> +static void qjack_shutdown_bh(void *opaque)
>> +{
>> +    QJackClient *c = (QJackClient *)opaque;
>> +    qjack_client_fini(c);
>> +}
>> +
>>  static void qjack_shutdown(void *arg)
>>  {
>>      QJackClient *c = (QJackClient *)arg;
>> -    c->state = QJACK_STATE_SHUTDOWN;
>> +    c->state       = QJACK_STATE_SHUTDOWN;
> 
> White space changes are not much embraced on high traffic projects BTW.

This change is unintentional and was missed in the rollback from the 
prior patch version.

> 
>> +    qemu_bh_schedule(c->shutdown_bh);
>>  }
>> 
>>  static void qjack_client_recover(QJackClient *c)
>>  {
>> -    if (c->state == QJACK_STATE_SHUTDOWN) {
>> -        qjack_client_fini(c);
>> +    if (c->state != QJACK_STATE_DISCONNECTED) {
>> +        return;
>>      }
>> 
>>      /* packets is used simply to throttle this */
>> -    if (c->state == QJACK_STATE_DISCONNECTED &&
>> -        c->packets % 100 == 0) {
>> +    if (c->packets % 100 == 0) {
>> 
>>          /* if enabled then attempt to recover */
>>          if (c->enabled) {
>> @@ -417,6 +425,10 @@ static int qjack_client_init(QJackClient *c)
>>          options |= JackServerName;
>>      }
>> 
>> +    if (!c->shutdown_bh) {
>> +        c->shutdown_bh = qemu_bh_new(qjack_shutdown_bh, c);
>> +    }
>> +
> 
> Where is qemu_bh_delete() ?

Whoops... I will correct this :)

> 
>>      c->client = jack_client_open(client_name, options, &status,
>>        c->opt->server_name);
>> 
>> @@ -489,8 +501,6 @@ static int qjack_init_out(HWVoiceOut *hw, struct
>> audsettings *as, QJackOut *jo  = (QJackOut *)hw;
>>      Audiodev *dev = (Audiodev *)drv_opaque;
>> 
>> -    qjack_client_fini(&jo->c);
>> -
>>      jo->c.out       = true;
>>      jo->c.enabled   = false;
>>      jo->c.nchannels = as->nchannels;
>> @@ -525,8 +535,6 @@ static int qjack_init_in(HWVoiceIn *hw, struct
>> audsettings *as, QJackIn  *ji  = (QJackIn *)hw;
>>      Audiodev *dev = (Audiodev *)drv_opaque;
>> 
>> -    qjack_client_fini(&ji->c);
>> -
>>      ji->c.out       = false;
>>      ji->c.enabled   = false;
>>      ji->c.nchannels = as->nchannels;
>> @@ -557,6 +565,8 @@ static int qjack_init_in(HWVoiceIn *hw, struct
>> audsettings *as,
>> 
>>  static void qjack_client_fini(QJackClient *c)
>>  {
>> +    qemu_bh_cancel(c->shutdown_bh);
>> +
> 
> Looks like a potential race. Quote from the API doc of 
> qemu_bh_cancel():
> 
> 	"While cancellation itself is also wait-free and thread-safe, it can 
> of
> 	course race with the loop that executes bottom halves unless you are
> 	holding the iothread mutex.  This makes it mostly useless if you are 
> not
> 	holding the mutex."
> 

Noted. How does one go about holding the iothread mutex?

>>      switch (c->state) {
>>      case QJACK_STATE_RUNNING:
>>          jack_deactivate(c->client);
>> @@ -564,6 +574,7 @@ static void qjack_client_fini(QJackClient *c)
>> 
>>      case QJACK_STATE_SHUTDOWN:
>>          jack_client_close(c->client);
>> +        c->client = NULL;
>>          /* fallthrough */
>> 
>>      case QJACK_STATE_DISCONNECTED:
> 
> Best regards,
> Christian Schoenebeck


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-19 15:21   ` Christian Schoenebeck
  2020-08-19 15:27     ` Geoffrey McRae
@ 2020-08-20  5:37     ` Gerd Hoffmann
  2020-08-20 10:06       ` Christian Schoenebeck
  1 sibling, 1 reply; 15+ messages in thread
From: Gerd Hoffmann @ 2020-08-20  5:37 UTC (permalink / raw)
  To: Christian Schoenebeck; +Cc: Geoffrey McRae, qemu-devel

  Hi,

> > +    qemu_bh_cancel(c->shutdown_bh);
> 
> Looks like a potential race. Quote from the API doc of qemu_bh_cancel():
> 
> 	"While cancellation itself is also wait-free and thread-safe, it can of 	
> 	course race with the loop that executes bottom halves unless you are 
> 	holding the iothread mutex.  This makes it mostly useless if you are not 
> 	holding the mutex."

Should not be a problem, all auto backend code should only be called
while qemu holds the iothread mutex.  With the exception of the shutdown
handler which jack might call from signal context (which is why we need
the BH in the first place).

take care,
  Gerd



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-20  5:37     ` Gerd Hoffmann
@ 2020-08-20 10:06       ` Christian Schoenebeck
  2020-08-20 10:54         ` Paolo Bonzini
  0 siblings, 1 reply; 15+ messages in thread
From: Christian Schoenebeck @ 2020-08-20 10:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: Gerd Hoffmann, Geoffrey McRae, Paolo Bonzini

On Donnerstag, 20. August 2020 07:37:28 CEST Gerd Hoffmann wrote:
>   Hi,
> 
> > > +    qemu_bh_cancel(c->shutdown_bh);
> > 
> > Looks like a potential race. Quote from the API doc of qemu_bh_cancel():
> > 	"While cancellation itself is also wait-free and thread-safe, it can of
> > 	course race with the loop that executes bottom halves unless you are
> > 	holding the iothread mutex.  This makes it mostly useless if you are not
> > 	holding the mutex."
> 
> Should not be a problem, all auto backend code should only be called
> while qemu holds the iothread mutex.  With the exception of the shutdown
> handler which jack might call from signal context (which is why we need
> the BH in the first place).

Hmmm, as Geoffrey already added a lock today, I noticed that QEMU's main IO 
thread mutex is not initialized as 'recursive' lock type. Does that make 
sense? I.e. shouldn't there be a

	qemu_rec_mutex_init(&qemu_global_mutex);

in softmmu/cpus.c for safety reasons to prevent nested locks from same thread 
causing misbehaviour?

CCing Paolo to clarify.

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-20 10:06       ` Christian Schoenebeck
@ 2020-08-20 10:54         ` Paolo Bonzini
  2020-08-20 12:00           ` Christian Schoenebeck
                             ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Paolo Bonzini @ 2020-08-20 10:54 UTC (permalink / raw)
  To: Christian Schoenebeck, qemu-devel; +Cc: Geoffrey McRae, Gerd Hoffmann

On 20/08/20 12:06, Christian Schoenebeck wrote:
> Hmmm, as Geoffrey already added a lock today, I noticed that QEMU's main IO 
> thread mutex is not initialized as 'recursive' lock type. Does that make 
> sense? I.e. shouldn't there be a
> 
> 	qemu_rec_mutex_init(&qemu_global_mutex);
> 
> in softmmu/cpus.c for safety reasons to prevent nested locks from same thread 
> causing misbehaviour?
> 
> CCing Paolo to clarify.

atexit functions (in this case
audio_cleanup->free_audio_state->qjack_fini_out) might be called both
with or without the BQL taken, so v7 of this series is likely wrong as
you point out.

However, recursive locks are pure evil. :)

More seriously: programming with concurrency is first and foremost about
understanding invariants[1].  Locks are relatively simple to reason
about because they enforce invariants at the points where they are
acquired or released.

If you have something like

static void do_it_locked(struct foo *foo)
{
    /* ... */
}

static void do_it(struct foo *foo)
{
    mutex_lock(&foo->lock);
    do_it_locked(foo);
    mutex_unlock(&foo->lock);
}

then you can immediately know that foo_locked() calls requires more
care, because the invariants around "foo" have to be provided by the
caller of foo_locked().  Instead, on a call to foo() the invariants are
guaranteed just because the caller must not have locked foo().

Instead, recursive locks allow you to slip into a different mindset
where locks protect the _code_ instead of the _data_ (and the invariants
of that data).  Things look simpler because you can just have

static void do_it(struct foo *foo)
{
    rec_mutex_lock(&foo->lock);
    /* ... */
    rec_mutex_unlock(&foo->lock);
}

but callers have no clue about what invariants are provided around calls
to do_it().  So, understanding complex code that uses recursive mutexes
is effectively impossible.

More on the practical side, recursive mutex are an easy way to get a
deadlock.  It's a common idiom to do

    /* Need to take foo->lock outside bar->lock.  */
    mutex_unlock(&bar->lock);
    mutex_lock(&foo->lock);
    mutex_lock(&bar->lock);

With a recursive mutex, there's no guarantee that foo->lock is *really*
taken outside bar->lock, because the first unlock could have done
nothing except decreasing the lock-nesting count.  This is also why QEMU
uses differently-named functions to lock/unlock recursive mutexes,
instead of having a flag like pthread mutexes do; it makes code like
this stand out as wrong:

    /* Meant to take foo->lock outside bar->lock, but really doesn't  */
    rec_mutex_unlock(&bar->lock);
    mutex_lock(&foo->lock);
    rec_mutex_lock(&bar->lock);

A variant of this applies to callbacks: the golden rule is that
callbacks (e.g. from a function that iterates over a data structure)
should be called without any lock taken in order to avoid deadlocks.
However, this rule is most often ignored in code that uses a recursive
mutex, because that code is written around the (incorrect) paradigm that
mutual exclusion makes code safe.  Which technically is true in this
case, as deadlocks are not a safety problem, but that's not a great
consolation.

My suggestion is to work towards protecting the audio code with its own
mutex(es) and ignore the existence of the BQL for subsystems that can do
so (audio is a prime candidate).  Also please add comments to
audio_int.h about which functions are called from other threads than the
QEMU main thread.

Thanks,

Paolo

[1] https://lamport.azurewebsites.net/pubs/teaching-concurrency.pdf

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-20 10:54         ` Paolo Bonzini
@ 2020-08-20 12:00           ` Christian Schoenebeck
  2020-08-21 13:13             ` Paolo Bonzini
  2020-08-21 11:12           ` recursive locks (in general) Christian Schoenebeck
  2020-08-21 11:28           ` [PATCH v5 1/1] audio/jack: fix use after free segfault Geoffrey McRae
  2 siblings, 1 reply; 15+ messages in thread
From: Christian Schoenebeck @ 2020-08-20 12:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini, Geoffrey McRae, Gerd Hoffmann

On Donnerstag, 20. August 2020 12:54:49 CEST Paolo Bonzini wrote:
> More on the practical side, recursive mutex are an easy way to get a
> deadlock.  It's a common idiom to do
> 
>     /* Need to take foo->lock outside bar->lock.  */
>     mutex_unlock(&bar->lock);
>     mutex_lock(&foo->lock);
>     mutex_lock(&bar->lock);

The general theoretical implications about recursive locks was clear to me. 
AFAICS your point is that a recursive lock could mislead poeple taking things 
easy and running into a deadlock scenario like outlined by you.

My point was if it happens for whatever reason that a main IO mutex lock was 
accidentally introduced, i.e. without knowing it was already locked on a 
higher level, wouldn't it make sense to deal with this in some kind of 
defensive way?

One way would be a recursive type and logging a warning, which you obviously 
don't like; another option would be an assertion fault instead to make 
developers immediately aware about the double lock on early testing. Because 
on a large scale project like this, it is almost impossible for all developers 
to be aware about all implied locks. Don't you think so?

At least IMO the worst case would be a double unlock on a non-recursive main 
thread mutex and running silently into undefined behaviour.

> My suggestion is to work towards protecting the audio code with its own
> mutex(es) and ignore the existence of the BQL for subsystems that can do
> so (audio is a prime candidate).  Also please add comments to
> audio_int.h about which functions are called from other threads than the
> QEMU main thread.

That main thread lock came up here because I noticed this API comment on 
qemu_bh_cancel():

  "While cancellation itself is also wait-free and thread-safe, it can of         
   course race with the loop that executes bottom halves unless you are 
   holding the iothread mutex.  This makes it mostly useless if you are not 
   holding the mutex."

So this lock was not about driver internal data protection, but rather about 
dealing with the BH API correctly.

Best regards,
Christian Schoenebeck

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-20 12:00           ` Christian Schoenebeck
@ 2020-08-21 13:13             ` Paolo Bonzini
  2020-08-26 13:48               ` PTHREAD_MUTEX_ERRORCHECK and fork() Christian Schoenebeck
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2020-08-21 13:13 UTC (permalink / raw)
  To: Christian Schoenebeck, qemu-devel; +Cc: Geoffrey McRae, Gerd Hoffmann

On 20/08/20 14:00, Christian Schoenebeck wrote:
> One way would be a recursive type and logging a warning, which you obviously 
> don't like; another option would be an assertion fault instead to make 
> developers immediately aware about the double lock on early testing. Because 
> on a large scale project like this, it is almost impossible for all developers 
> to be aware about all implied locks. Don't you think so?
> 
> At least IMO the worst case would be a double unlock on a non-recursive main 
> thread mutex and running silently into undefined behaviour.

Yes, more assertions are always fine.

We were using errorcheck mutexes until a few years ago, unfortunately we
couldn't because they are broken with respect to fork (commit 24fa90499,
"qemu-thread: do not use PTHREAD_MUTEX_ERRORCHECK", 2015-03-05).

> That main thread lock came up here because I noticed this API comment on 
> qemu_bh_cancel():
> 
>   "While cancellation itself is also wait-free and thread-safe, it can of         
>    course race with the loop that executes bottom halves unless you are 
>    holding the iothread mutex.  This makes it mostly useless if you are not 
>    holding the mutex."
> 
> So this lock was not about driver internal data protection, but rather about 
> dealing with the BH API correctly.

Yes, on the other hand that is not a problem if the BH is idempotent.
For example something like this is okay:

bh_body_locked()
{
	free(foo);
	foo = NULL;
}

bh_body()
{
	qemu_mutex_lock(&foo_lock);
	bh_body_locked();
	qemu_mutex_unlock(&foo_lock);
}

...

	qemu_mutex_lock(&foo_lock);
	qemu_bh_delete(foo_bh);		// also calls qemu_bh_cancel
	bh_body_locked();
	qemu_mutex_unlock(&foo_lock);

Paolo



^ permalink raw reply	[flat|nested] 15+ messages in thread

* PTHREAD_MUTEX_ERRORCHECK and fork()
  2020-08-21 13:13             ` Paolo Bonzini
@ 2020-08-26 13:48               ` Christian Schoenebeck
  0 siblings, 0 replies; 15+ messages in thread
From: Christian Schoenebeck @ 2020-08-26 13:48 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini, Geoffrey McRae, Gerd Hoffmann

On Freitag, 21. August 2020 15:13:35 CEST Paolo Bonzini wrote:
> On 20/08/20 14:00, Christian Schoenebeck wrote:
> > One way would be a recursive type and logging a warning, which you
> > obviously don't like; another option would be an assertion fault instead
> > to make developers immediately aware about the double lock on early
> > testing. Because on a large scale project like this, it is almost
> > impossible for all developers to be aware about all implied locks. Don't
> > you think so?
> > 
> > At least IMO the worst case would be a double unlock on a non-recursive
> > main thread mutex and running silently into undefined behaviour.
> 
> Yes, more assertions are always fine.
> 
> We were using errorcheck mutexes until a few years ago, unfortunately we
> couldn't because they are broken with respect to fork (commit 24fa90499,
> "qemu-thread: do not use PTHREAD_MUTEX_ERRORCHECK", 2015-03-05).

I had a go on this one; you still get EPERM when trying to 
pthread_mutex_unlock() from a forked child process on a 
PTHREAD_MUTEX_ERRORCHECK mutex locked by parent process.

The common opinion though seems to be that unlocking parent's lock(s) by child 
process was illegal:
https://groups.google.com/forum/#!topic/comp.programming.threads/ywMInaZjmqo
https://sourceware.org/bugzilla/show_bug.cgi?id=2745

The relevant section from POSIX:

	fork - create a new process
	...
	A process shall be created with a single thread. If a multi-threaded 
	process calls fork(), the new process shall contain a replica of the 
	calling thread and its entire address space, possibly including the states 
	of mutexes and other resources. Consequently, to avoid errors, the child 
	process may only execute async-signal-safe operations until such time as 
	one of the exec functions is called.

https://pubs.opengroup.org/onlinepubs/9699919799/

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 15+ messages in thread

* recursive locks (in general)
  2020-08-20 10:54         ` Paolo Bonzini
  2020-08-20 12:00           ` Christian Schoenebeck
@ 2020-08-21 11:12           ` Christian Schoenebeck
  2020-08-21 13:08             ` Paolo Bonzini
  2020-08-21 11:28           ` [PATCH v5 1/1] audio/jack: fix use after free segfault Geoffrey McRae
  2 siblings, 1 reply; 15+ messages in thread
From: Christian Schoenebeck @ 2020-08-21 11:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini, Geoffrey McRae, Gerd Hoffmann

On Donnerstag, 20. August 2020 12:54:49 CEST Paolo Bonzini wrote:
> More seriously: programming with concurrency is first and foremost about
> understanding invariants[1].  Locks are relatively simple to reason
> about because they enforce invariants at the points where they are
> acquired or released.

As a side note, independent of the actual QBL issue discussed, while I agree 
with you that nested locks should be avoided as much as possible, especially 
on heterogenous large scale projects like QEMU; please let me correct some of 
the things you said about recursive locks in general:

> However, recursive locks are pure evil. :)

That's a common overgeneralization of the potential issues when dealing with 
recursive locks. Especially this claim ...

> but callers have no clue about what invariants are provided around calls
> to do_it().  So, understanding complex code that uses recursive mutexes
> is effectively impossible.

... is incorrect.

It is correct that you can run into deadlocks if you don't know how to deal 
with nested recursive locks appropriately. It is incorrect though to say they 
were not managable.

There is a golden rule with recursive locks: You always have to preserve a 
clear hierarchy. Say you have the following recursive mutexes:

RecursiveMutex mutex0;
RecursiveMutex mutex1;
RecursiveMutex mutex2;
...
RecursiveMutex mutexN;

where the suffix shall identify the hierarchy, i.e. h(mutex0) = 0,
h(mutex1) = 1, ... h(mutexN) = N. Then the golden rule is that in any call 
stack the nested locks must always preserve the same transitive hierarchy, 
e.g.:

	h(lock1) <= h(lock2) <= ... <= h(lockK)

Example (using lock-guard notation), let's say ascending transitivity is 
chosen, then the following is Ok, as it does not violate chosen transitivity:

	synchronized(mutex0) {
		synchronized(mutex1) {
			...
			synchronized(mutexN) {
			}
		}
	}

Likewise, the following is Ok as well:

	synchronized(mutex3) {
		synchronized(mutex8) {
			...
			synchronized(mutexN) {
			}
		}
	}

whereas this would be illegal:

	synchronized(mutex3) {
		synchronized(mutex2) { // < violates chosen transitivity
			...
			synchronized(mutexN) {
			}
		}
	}

Now let's say one day somebody accidentally broke that rule and ran into a 
deadlock. What you can do to resolve the situation is capturing the call stack 
of each mutex's [last] lock. And when you triggered the deadlock, you know 
that at least one of the threads violated the lock hierarchy. So you look at 
the individual call stacks and correct the program flow appropriately to 
restore the hierarchy. And the latter is BTW independent of (i.e. any side 
effects) of other threads, so it is really just about looking at what exactly 
ONE thread is doing.

And for the latter reason, there are also more systematic approaches to ensure 
correctness: for instance a built-in hierarchy check in the individual Mutex 
implementation which would then e.g. raise an assertaion fault on early 
testing whenever a developer broke the hierarchy in a nested lock sequence.

Another solution would be integrating this hierarchy check into a (e.g. 
static) sanatizer, as this information can already be derived directly from 
the AST in most cases. But unfortunatley this does not exist yet in any 
sanatizer yet AFAIK.

For me, a non-recursive mutex makes sense for one use case: if the intention 
is to lock the mutex on one thread while allowing to unlock it on another 
thread. For all other use cases I would (personally) prefer a recursive type, 
as it guards a clear ownership relation and hence allows to guard and prevent 
many mistakes.

Best regards,
Christian Schoenebeck

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: recursive locks (in general)
  2020-08-21 11:12           ` recursive locks (in general) Christian Schoenebeck
@ 2020-08-21 13:08             ` Paolo Bonzini
  2020-08-21 15:25               ` Christian Schoenebeck
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2020-08-21 13:08 UTC (permalink / raw)
  To: Christian Schoenebeck, qemu-devel; +Cc: Geoffrey McRae, Gerd Hoffmann

On 21/08/20 13:12, Christian Schoenebeck wrote:
> There is a golden rule with recursive locks: You always have to preserve a 
> clear hierarchy. Say you have the following recursive mutexes:
> 
> RecursiveMutex mutex0;
> RecursiveMutex mutex1;
> RecursiveMutex mutex2;
> ...
> RecursiveMutex mutexN;
> 
> where the suffix shall identify the hierarchy, i.e. h(mutex0) = 0,
> h(mutex1) = 1, ... h(mutexN) = N. Then the golden rule is that in any call 
> stack the nested locks must always preserve the same transitive hierarchy, 
> e.g.:

That's also what you do with regular locks.

But the difference is that with regular locks you can always do

void bar(std::unique_lock<std::mutex> &mutex3_guard)
{
	...
	mutex3_guard.unlock();
	synchronized(mutex2) {
	}
	mutex3_guard.lock();
	...
}

while with recursive locks you cannot, because you never know if
mutex3_guard.unlock() is really going to unlock mutex3.  So a simple
reasoning on the invariants guaranteed by mutex3 has turned into
interprocedural reasoning on all the callers of bar(), including callers
of callers and so on.

> For me, a non-recursive mutex makes sense for one use case: if the intention 
> is to lock the mutex on one thread while allowing to unlock it on another 
> thread.

Then you want a semaphore, not a non-recursive mutex.  Doing what you
suggest with pthread_mutex or C++ std::mutex is undefined behavior.

Paolo

> For all other use cases I would (personally) prefer a recursive type, 
> as it guards a clear ownership relation and hence allows to guard and prevent 
> many mistakes.
> 
> Best regards,
> Christian Schoenebeck
> 
> 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: recursive locks (in general)
  2020-08-21 13:08             ` Paolo Bonzini
@ 2020-08-21 15:25               ` Christian Schoenebeck
  0 siblings, 0 replies; 15+ messages in thread
From: Christian Schoenebeck @ 2020-08-21 15:25 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel, Geoffrey McRae, Gerd Hoffmann

On Freitag, 21. August 2020 15:08:09 CEST Paolo Bonzini wrote:
> On 21/08/20 13:12, Christian Schoenebeck wrote:
> > There is a golden rule with recursive locks: You always have to preserve a
> > clear hierarchy. Say you have the following recursive mutexes:
> > 
> > RecursiveMutex mutex0;
> > RecursiveMutex mutex1;
> > RecursiveMutex mutex2;
> > ...
> > RecursiveMutex mutexN;
> > 
> > where the suffix shall identify the hierarchy, i.e. h(mutex0) = 0,
> > h(mutex1) = 1, ... h(mutexN) = N. Then the golden rule is that in any call
> > stack the nested locks must always preserve the same transitive hierarchy,
> 
> > e.g.:
> That's also what you do with regular locks.

You can't do that with non-recursive mutexes if you have cyclic dependencies.

> But the difference is that with regular locks you can always do
> 
> void bar(std::unique_lock<std::mutex> &mutex3_guard)
> {
> 	...
> 	mutex3_guard.unlock();
> 	synchronized(mutex2) {
> 	}
> 	mutex3_guard.lock();
> 	...
> }

Right, if you are able to clearly judge that this unlock is really safe for 
all layers involved.

Okay never mind, I see that we'll keep split on this recursive lock issue 
anyway. Sorry for the noise Paolo! :)

Best regards,
Christian Schoenebeck




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-20 10:54         ` Paolo Bonzini
  2020-08-20 12:00           ` Christian Schoenebeck
  2020-08-21 11:12           ` recursive locks (in general) Christian Schoenebeck
@ 2020-08-21 11:28           ` Geoffrey McRae
  2020-08-21 13:13             ` Paolo Bonzini
  2 siblings, 1 reply; 15+ messages in thread
From: Geoffrey McRae @ 2020-08-21 11:28 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Christian Schoenebeck, qemu-devel, Gerd Hoffmann


> My suggestion is to work towards protecting the audio code with its own
> mutex(es) and ignore the existence of the BQL for subsystems that can 
> do
> so (audio is a prime candidate).  Also please add comments to
> audio_int.h about which functions are called from other threads than 
> the
> QEMU main thread.

Ok, so to get back on topic, what exactly is the best way forward to fix 
this issue in this patchset? Should I be managing a local mutex instead?

> 
> Thanks,
> 
> Paolo
> 
> [1] https://lamport.azurewebsites.net/pubs/teaching-concurrency.pdf


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v5 1/1] audio/jack: fix use after free segfault
  2020-08-21 11:28           ` [PATCH v5 1/1] audio/jack: fix use after free segfault Geoffrey McRae
@ 2020-08-21 13:13             ` Paolo Bonzini
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2020-08-21 13:13 UTC (permalink / raw)
  To: Geoffrey McRae; +Cc: Christian Schoenebeck, qemu-devel, Gerd Hoffmann

On 21/08/20 13:28, Geoffrey McRae wrote:
> 
>> My suggestion is to work towards protecting the audio code with its own
>> mutex(es) and ignore the existence of the BQL for subsystems that can do
>> so (audio is a prime candidate).  Also please add comments to
>> audio_int.h about which functions are called from other threads than the
>> QEMU main thread.
> 
> Ok, so to get back on topic, what exactly is the best way forward to fix
> this issue in this patchset? Should I be managing a local mutex instead?

I think adding a local mutex for audio stuff would be best, if it's not
too hard.

Paolo



^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2020-08-26 13:49 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-08-19  6:29 [PATCH v5 0/1] audio/jack: fix use after free segfault Geoffrey McRae
2020-08-19  6:29 ` [PATCH v5 1/1] " Geoffrey McRae
2020-08-19 15:21   ` Christian Schoenebeck
2020-08-19 15:27     ` Geoffrey McRae
2020-08-20  5:37     ` Gerd Hoffmann
2020-08-20 10:06       ` Christian Schoenebeck
2020-08-20 10:54         ` Paolo Bonzini
2020-08-20 12:00           ` Christian Schoenebeck
2020-08-21 13:13             ` Paolo Bonzini
2020-08-26 13:48               ` PTHREAD_MUTEX_ERRORCHECK and fork() Christian Schoenebeck
2020-08-21 11:12           ` recursive locks (in general) Christian Schoenebeck
2020-08-21 13:08             ` Paolo Bonzini
2020-08-21 15:25               ` Christian Schoenebeck
2020-08-21 11:28           ` [PATCH v5 1/1] audio/jack: fix use after free segfault Geoffrey McRae
2020-08-21 13:13             ` Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).