From: Linda Knippers <linda.knippers@hpe.com>
To: Dan Williams <dan.j.williams@intel.com>, linux-nvdimm@lists.01.org
Cc: linux-acpi@vger.kernel.org
Subject: Re: [PATCH] acpi, nfit: fix module unload vs workqueue shutdown race
Date: Tue, 18 Apr 2017 18:03:12 -0400 [thread overview]
Message-ID: <58F68D20.5090607@hpe.com> (raw)
In-Reply-To: <149253869300.4222.11825248275497461939.stgit@dwillia2-desk3.amr.corp.intel.com>
On 04/18/2017 02:06 PM, Dan Williams wrote:
> The workqueue may still be running when the devres callbacks start
> firing to deallocate an acpi_nfit_desc instance. Stop and flush the
> workqueue before letting any other devres de-allocations proceed.
>
> Reported-by: Linda Knippers <linda.knippers@hpe.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>
> I was able to produce a reliable nfit_test crash failure by running the
> tests on hardware *and* disabling all debug messages. That last detail
> may be why I have been seeing much less frequent failures. With this
> change, in addition to the previous fix [1], I was again able to
> complete a successful run.
It seems a bit better because I can sometimes get a test to complete
but then I'll get a panic when I try again. The footprints are the same
as before, sometimes when unloading the module but sometimes in a kfree
or related function.
I'd be interested in your .config file and exactly how you're building
your bare metal kernel.
-- ljk
>
> [1]: https://patchwork.kernel.org/patch/9681861/
>
> drivers/acpi/nfit/core.c | 76 +++++++++++++++++++++++---------------
> drivers/acpi/nfit/nfit.h | 1 +
> tools/testing/nvdimm/test/nfit.c | 4 ++
> 3 files changed, 51 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
> index 69c6cc77130c..261eea1d2906 100644
> --- a/drivers/acpi/nfit/core.c
> +++ b/drivers/acpi/nfit/core.c
> @@ -2604,7 +2604,8 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
> return rc;
> }
>
> - queue_work(nfit_wq, &acpi_desc->work);
> + if (!acpi_desc->cancel)
> + queue_work(nfit_wq, &acpi_desc->work);
> return 0;
> }
>
> @@ -2650,32 +2651,11 @@ static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc)
> return 0;
> }
>
> -static void acpi_nfit_destruct(void *data)
> +static void acpi_nfit_unregister(void *data)
> {
> struct acpi_nfit_desc *acpi_desc = data;
> - struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
>
> - /*
> - * Destruct under acpi_desc_lock so that nfit_handle_mce does not
> - * race teardown
> - */
> - mutex_lock(&acpi_desc_lock);
> - acpi_desc->cancel = 1;
> - /*
> - * Bounce the nvdimm bus lock to make sure any in-flight
> - * acpi_nfit_ars_rescan() submissions have had a chance to
> - * either submit or see ->cancel set.
> - */
> - device_lock(bus_dev);
> - device_unlock(bus_dev);
> -
> - flush_workqueue(nfit_wq);
> - if (acpi_desc->scrub_count_state)
> - sysfs_put(acpi_desc->scrub_count_state);
> nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
> - acpi_desc->nvdimm_bus = NULL;
> - list_del(&acpi_desc->list);
> - mutex_unlock(&acpi_desc_lock);
> }
>
> int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
> @@ -2693,7 +2673,7 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
> if (!acpi_desc->nvdimm_bus)
> return -ENOMEM;
>
> - rc = devm_add_action_or_reset(dev, acpi_nfit_destruct,
> + rc = devm_add_action_or_reset(dev, acpi_nfit_unregister,
> acpi_desc);
> if (rc)
> return rc;
> @@ -2787,9 +2767,10 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
>
> /* bounce the init_mutex to make init_complete valid */
> mutex_lock(&acpi_desc->init_mutex);
> - mutex_unlock(&acpi_desc->init_mutex);
> - if (acpi_desc->init_complete)
> + if (acpi_desc->cancel || acpi_desc->init_complete) {
> + mutex_unlock(&acpi_desc->init_mutex);
> return 0;
> + }
>
> /*
> * Scrub work could take 10s of seconds, userspace may give up so we
> @@ -2798,6 +2779,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
> INIT_WORK_ONSTACK(&flush.work, flush_probe);
> COMPLETION_INITIALIZER_ONSTACK(flush.cmp);
> queue_work(nfit_wq, &flush.work);
> + mutex_unlock(&acpi_desc->init_mutex);
>
> rc = wait_for_completion_interruptible(&flush.cmp);
> cancel_work_sync(&flush.work);
> @@ -2834,10 +2816,12 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
> if (work_busy(&acpi_desc->work))
> return -EBUSY;
>
> - if (acpi_desc->cancel)
> + mutex_lock(&acpi_desc->init_mutex);
> + if (acpi_desc->cancel) {
> + mutex_unlock(&acpi_desc->init_mutex);
> return 0;
> + }
>
> - mutex_lock(&acpi_desc->init_mutex);
> list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
> struct acpi_nfit_system_address *spa = nfit_spa->spa;
>
> @@ -2886,6 +2870,35 @@ static void acpi_nfit_put_table(void *table)
> acpi_put_table(table);
> }
>
> +void acpi_nfit_shutdown(void *data)
> +{
> + struct acpi_nfit_desc *acpi_desc = data;
> + struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
> +
> + /*
> + * Destruct under acpi_desc_lock so that nfit_handle_mce does not
> + * race teardown
> + */
> + mutex_lock(&acpi_desc_lock);
> + list_del(&acpi_desc->list);
> + mutex_unlock(&acpi_desc_lock);
> +
> + mutex_lock(&acpi_desc->init_mutex);
> + acpi_desc->cancel = 1;
> + mutex_unlock(&acpi_desc->init_mutex);
> +
> + /*
> + * Bounce the nvdimm bus lock to make sure any in-flight
> + * acpi_nfit_ars_rescan() submissions have had a chance to
> + * either submit or see ->cancel set.
> + */
> + device_lock(bus_dev);
> + device_unlock(bus_dev);
> +
> + flush_workqueue(nfit_wq);
> +}
> +EXPORT_SYMBOL_GPL(acpi_nfit_shutdown);
> +
> static int acpi_nfit_add(struct acpi_device *adev)
> {
> struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
> @@ -2933,12 +2946,15 @@ static int acpi_nfit_add(struct acpi_device *adev)
> rc = acpi_nfit_init(acpi_desc, (void *) tbl
> + sizeof(struct acpi_table_nfit),
> sz - sizeof(struct acpi_table_nfit));
> - return rc;
> +
> + if (rc)
> + return rc;
> + return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc);
> }
>
> static int acpi_nfit_remove(struct acpi_device *adev)
> {
> - /* see acpi_nfit_destruct */
> + /* see acpi_nfit_unregister */
> return 0;
> }
>
> diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
> index fac098bfa585..58fb7d68e04a 100644
> --- a/drivers/acpi/nfit/nfit.h
> +++ b/drivers/acpi/nfit/nfit.h
> @@ -239,6 +239,7 @@ static inline struct acpi_nfit_desc *to_acpi_desc(
>
> const u8 *to_nfit_uuid(enum nfit_uuids id);
> int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz);
> +void acpi_nfit_shutdown(void *data);
> void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event);
> void __acpi_nvdimm_notify(struct device *dev, u32 event);
> int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
> diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
> index d7fb1b894128..c2187178fb13 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -1851,6 +1851,10 @@ static int nfit_test_probe(struct platform_device *pdev)
> if (rc)
> return rc;
>
> + rc = devm_add_action_or_reset(&pdev->dev, acpi_nfit_shutdown, acpi_desc);
> + if (rc)
> + return rc;
> +
> if (nfit_test->setup != nfit_test0_setup)
> return 0;
>
>
WARNING: multiple messages have this Message-ID (diff)
From: Linda Knippers <linda.knippers@hpe.com>
To: Dan Williams <dan.j.williams@intel.com>, linux-nvdimm@lists.01.org
Cc: linux-acpi@vger.kernel.org
Subject: Re: [PATCH] acpi, nfit: fix module unload vs workqueue shutdown race
Date: Tue, 18 Apr 2017 18:03:12 -0400 [thread overview]
Message-ID: <58F68D20.5090607@hpe.com> (raw)
In-Reply-To: <149253869300.4222.11825248275497461939.stgit@dwillia2-desk3.amr.corp.intel.com>
On 04/18/2017 02:06 PM, Dan Williams wrote:
> The workqueue may still be running when the devres callbacks start
> firing to deallocate an acpi_nfit_desc instance. Stop and flush the
> workqueue before letting any other devres de-allocations proceed.
>
> Reported-by: Linda Knippers <linda.knippers@hpe.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>
> I was able to produce a reliable nfit_test crash failure by running the
> tests on hardware *and* disabling all debug messages. That last detail
> may be why I have been seeing much less frequent failures. With this
> change, in addition to the previous fix [1], I was again able to
> complete a successful run.
It seems a bit better because I can sometimes get a test to complete
but then I'll get a panic when I try again. The footprints are the same
as before, sometimes when unloading the module but sometimes in a kfree
or related function.
I'd be interested in your .config file and exactly how you're building
your bare metal kernel.
-- ljk
>
> [1]: https://patchwork.kernel.org/patch/9681861/
>
> drivers/acpi/nfit/core.c | 76 +++++++++++++++++++++++---------------
> drivers/acpi/nfit/nfit.h | 1 +
> tools/testing/nvdimm/test/nfit.c | 4 ++
> 3 files changed, 51 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
> index 69c6cc77130c..261eea1d2906 100644
> --- a/drivers/acpi/nfit/core.c
> +++ b/drivers/acpi/nfit/core.c
> @@ -2604,7 +2604,8 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
> return rc;
> }
>
> - queue_work(nfit_wq, &acpi_desc->work);
> + if (!acpi_desc->cancel)
> + queue_work(nfit_wq, &acpi_desc->work);
> return 0;
> }
>
> @@ -2650,32 +2651,11 @@ static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc)
> return 0;
> }
>
> -static void acpi_nfit_destruct(void *data)
> +static void acpi_nfit_unregister(void *data)
> {
> struct acpi_nfit_desc *acpi_desc = data;
> - struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
>
> - /*
> - * Destruct under acpi_desc_lock so that nfit_handle_mce does not
> - * race teardown
> - */
> - mutex_lock(&acpi_desc_lock);
> - acpi_desc->cancel = 1;
> - /*
> - * Bounce the nvdimm bus lock to make sure any in-flight
> - * acpi_nfit_ars_rescan() submissions have had a chance to
> - * either submit or see ->cancel set.
> - */
> - device_lock(bus_dev);
> - device_unlock(bus_dev);
> -
> - flush_workqueue(nfit_wq);
> - if (acpi_desc->scrub_count_state)
> - sysfs_put(acpi_desc->scrub_count_state);
> nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
> - acpi_desc->nvdimm_bus = NULL;
> - list_del(&acpi_desc->list);
> - mutex_unlock(&acpi_desc_lock);
> }
>
> int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
> @@ -2693,7 +2673,7 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
> if (!acpi_desc->nvdimm_bus)
> return -ENOMEM;
>
> - rc = devm_add_action_or_reset(dev, acpi_nfit_destruct,
> + rc = devm_add_action_or_reset(dev, acpi_nfit_unregister,
> acpi_desc);
> if (rc)
> return rc;
> @@ -2787,9 +2767,10 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
>
> /* bounce the init_mutex to make init_complete valid */
> mutex_lock(&acpi_desc->init_mutex);
> - mutex_unlock(&acpi_desc->init_mutex);
> - if (acpi_desc->init_complete)
> + if (acpi_desc->cancel || acpi_desc->init_complete) {
> + mutex_unlock(&acpi_desc->init_mutex);
> return 0;
> + }
>
> /*
> * Scrub work could take 10s of seconds, userspace may give up so we
> @@ -2798,6 +2779,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
> INIT_WORK_ONSTACK(&flush.work, flush_probe);
> COMPLETION_INITIALIZER_ONSTACK(flush.cmp);
> queue_work(nfit_wq, &flush.work);
> + mutex_unlock(&acpi_desc->init_mutex);
>
> rc = wait_for_completion_interruptible(&flush.cmp);
> cancel_work_sync(&flush.work);
> @@ -2834,10 +2816,12 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
> if (work_busy(&acpi_desc->work))
> return -EBUSY;
>
> - if (acpi_desc->cancel)
> + mutex_lock(&acpi_desc->init_mutex);
> + if (acpi_desc->cancel) {
> + mutex_unlock(&acpi_desc->init_mutex);
> return 0;
> + }
>
> - mutex_lock(&acpi_desc->init_mutex);
> list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
> struct acpi_nfit_system_address *spa = nfit_spa->spa;
>
> @@ -2886,6 +2870,35 @@ static void acpi_nfit_put_table(void *table)
> acpi_put_table(table);
> }
>
> +void acpi_nfit_shutdown(void *data)
> +{
> + struct acpi_nfit_desc *acpi_desc = data;
> + struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
> +
> + /*
> + * Destruct under acpi_desc_lock so that nfit_handle_mce does not
> + * race teardown
> + */
> + mutex_lock(&acpi_desc_lock);
> + list_del(&acpi_desc->list);
> + mutex_unlock(&acpi_desc_lock);
> +
> + mutex_lock(&acpi_desc->init_mutex);
> + acpi_desc->cancel = 1;
> + mutex_unlock(&acpi_desc->init_mutex);
> +
> + /*
> + * Bounce the nvdimm bus lock to make sure any in-flight
> + * acpi_nfit_ars_rescan() submissions have had a chance to
> + * either submit or see ->cancel set.
> + */
> + device_lock(bus_dev);
> + device_unlock(bus_dev);
> +
> + flush_workqueue(nfit_wq);
> +}
> +EXPORT_SYMBOL_GPL(acpi_nfit_shutdown);
> +
> static int acpi_nfit_add(struct acpi_device *adev)
> {
> struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
> @@ -2933,12 +2946,15 @@ static int acpi_nfit_add(struct acpi_device *adev)
> rc = acpi_nfit_init(acpi_desc, (void *) tbl
> + sizeof(struct acpi_table_nfit),
> sz - sizeof(struct acpi_table_nfit));
> - return rc;
> +
> + if (rc)
> + return rc;
> + return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc);
> }
>
> static int acpi_nfit_remove(struct acpi_device *adev)
> {
> - /* see acpi_nfit_destruct */
> + /* see acpi_nfit_unregister */
> return 0;
> }
>
> diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
> index fac098bfa585..58fb7d68e04a 100644
> --- a/drivers/acpi/nfit/nfit.h
> +++ b/drivers/acpi/nfit/nfit.h
> @@ -239,6 +239,7 @@ static inline struct acpi_nfit_desc *to_acpi_desc(
>
> const u8 *to_nfit_uuid(enum nfit_uuids id);
> int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz);
> +void acpi_nfit_shutdown(void *data);
> void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event);
> void __acpi_nvdimm_notify(struct device *dev, u32 event);
> int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
> diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
> index d7fb1b894128..c2187178fb13 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -1851,6 +1851,10 @@ static int nfit_test_probe(struct platform_device *pdev)
> if (rc)
> return rc;
>
> + rc = devm_add_action_or_reset(&pdev->dev, acpi_nfit_shutdown, acpi_desc);
> + if (rc)
> + return rc;
> +
> if (nfit_test->setup != nfit_test0_setup)
> return 0;
>
>
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm
next prev parent reply other threads:[~2017-04-18 22:03 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-04-18 18:06 [PATCH] acpi, nfit: fix module unload vs workqueue shutdown race Dan Williams
2017-04-18 18:06 ` Dan Williams
2017-04-18 22:03 ` Linda Knippers [this message]
2017-04-18 22:03 ` Linda Knippers
[not found] ` <58F68D20.5090607-ZPxbGqLxI0U@public.gmane.org>
2017-04-18 23:05 ` Dan Williams
2017-04-18 23:05 ` Dan Williams
2017-04-18 23:16 ` Linda Knippers
2017-04-18 23:16 ` Linda Knippers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=58F68D20.5090607@hpe.com \
--to=linda.knippers@hpe.com \
--cc=dan.j.williams@intel.com \
--cc=linux-acpi@vger.kernel.org \
--cc=linux-nvdimm@lists.01.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.