[PATCH] edac, poll timeout cannot be zero

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] edac, poll timeout cannot be zero
@ 2014-02-03 20:05 Prarit Bhargava
  2014-02-12 15:07 ` Borislav Petkov
  0 siblings, 1 reply; 7+ messages in thread
From: Prarit Bhargava @ 2014-02-03 20:05 UTC (permalink / raw)
  To: linux-edac; +Cc: Prarit Bhargava, Doug Thompson, linux-kernel

If you do

echo 0 > /sys/module/edac_core/parameters/edac_mc_poll_msec

the following stack trace is output because the edac module is not
designed to poll with a timeout of zero.

------------[ cut here ]------------
WARNING: CPU: 12 PID: 0 at lib/list_debug.c:33 __list_add+0xac/0xc0()
list_add corruption. prev->next should be next (ffff8808291dd1b8), but was           (null). (prev=ffff8808286fe3f8).
Modules linked in: sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache cfg80211 rfkill x86_pkg_temp_thermal coretemp kvm_intel kvm ixgbe e1000e crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt ptp sb_edac iTCO_vendor_support pps_core mdio ipmi_devintf edac_core ioatdma microcode shpchp lpc_ich pcspkr i2c_i801 dca mfd_core ipmi_si wmi ipmi_msghandler nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sd_mod sr_mod cdrom crc_t10dif crct10dif_common mgag200 syscopyarea sysfillrect sysimgblt isci i2c_algo_bit drm_kms_helper ttm drm libsas ahci libahci scsi_transport_sas libata i2c_core dm_mirror dm_region_hash dm_log dm_mod
CPU: 12 PID: 0 Comm: swapper/12 Not tainted 3.13.0+ #1
Hardware name: Intel Corporation LH Pass ........../SVRBD-ROW_T, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
 0000000000000009 ffff88082f683cc8 ffffffff815f1945 ffff88082f683d10
 ffff88082f683d00 ffffffff81069dad ffff880826f9b248 ffff8808291dd1b8
 ffff8808286fe3f8 ffff8808291dc000 0000000000000000 ffff88082f683d60
Call Trace:
 <IRQ>  [<ffffffff815f1945>] dump_stack+0x45/0x56
 [<ffffffff81069dad>] warn_slowpath_common+0x7d/0xa0
 [<ffffffff81069e1c>] warn_slowpath_fmt+0x4c/0x50
 [<ffffffff812dd0fc>] __list_add+0xac/0xc0
 [<ffffffff8107570b>] __internal_add_timer+0xab/0x130
 [<ffffffff81075907>] internal_add_timer+0x17/0x40
 [<ffffffff81077dfa>] mod_timer_pinned+0xca/0x170
 [<ffffffff8149f7f0>] ? pid_param_set+0x130/0x130
 [<ffffffff8149fa7a>] intel_pstate_timer_func+0x28a/0x380
 [<ffffffff8149f7f0>] ? pid_param_set+0x130/0x130
 [<ffffffff810757d6>] call_timer_fn+0x36/0x100
 [<ffffffff8149f7f0>] ? pid_param_set+0x130/0x130
 [<ffffffff810767ff>] run_timer_softirq+0x1ff/0x2f0
 [<ffffffff8106f3b5>] __do_softirq+0xf5/0x2e0
 [<ffffffff8106f87d>] irq_exit+0x10d/0x120
 [<ffffffff816042b5>] smp_apic_timer_interrupt+0x45/0x60
 [<ffffffff81602c1d>] apic_timer_interrupt+0x6d/0x80
 <EOI>  [<ffffffff814a022f>] ? cpuidle_enter_state+0x4f/0xc0
 [<ffffffff814a0228>] ? cpuidle_enter_state+0x48/0xc0
 [<ffffffff814a0359>] cpuidle_idle_call+0xb9/0x1f0
 [<ffffffff8101e44e>] arch_cpu_idle+0xe/0x30
 [<ffffffff810c327e>] cpu_startup_entry+0x9e/0x240
 [<ffffffff81043544>] start_secondary+0x1e4/0x290
---[ end trace 078b214fc68689e6 ]---
------------[ cut here ]------------
------------[ cut here ]------------
kernel BUG at kernel/timer.c:1084!
invalid opcode: 0000 [#1] SMP
Modules linked in: sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache cfg80211 rfkill x86_pkg_temp_thermal coretemp kvm_intel kvm ixgbe e1000e crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt ptp sb_edac iTCO_vendor_support pps_core mdio ipmi_devintf edac_core ioatdma microcode shpchp lpc_ich pcspkr i2c_i801 dca mfd_core ipmi_si wmi ipmi_msghandler nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sd_mod sr_mod cdrom crc_t10dif crct10dif_common mgag200 syscopyarea sysfillrect sysimgblt isci i2c_algo_bit drm_kms_helper ttm drm libsas ahci libahci scsi_transport_sas libata i2c_core dm_mirror dm_region_hash dm_log dm_mod
CPU: 12 PID: 0 Comm: swapper/12 Tainted: G        W    3.13.0+ #1
Hardware name: Intel Corporation LH Pass ........../SVRBD-ROW_T, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
task: ffff880428eadd30 ti: ffff880428eb8000 task.ti: ffff880428eb8000
RIP: 0010:[<ffffffff810759c3>]  [<ffffffff810759c3>] cascade+0x93/0xa0
RSP: 0018:ffff88082f683e70  EFLAGS: 00010083
RAX: ffffffff81c64580 RBX: 0000000000000000 RCX: ffff88082f683ed8
RDX: 0000000000000019 RSI: ffff8808286f83f8 RDI: ffff8808291dc000
RBP: ffff88082f683ea0 R08: ffff8808291dd3b8 R09: 0000000000000100
R10: 0000000000000000 R11: ffff88082f6839fe R12: ffff8808291dc000
R13: ffff88082f683e70 R14: 0000000000000019 R15: ffff88082f683ed8
FS:  0000000000000000(0000) GS:ffff88082f680000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004a4fb0 CR3: 00000000018f8000 CR4: 00000000000407e0
Stack:
 ffff8808286f83f8 ffff880826f9b248 ffff8808291dc000 0000000000000000
 ffffffff81083c10 0000000000000002 ffff88082f683f10 ffffffff81076845
 ffff8808291ddc28 ffff8808291dd828 ffff8808291dd428 ffff8808291dd028
Call Trace:
 <IRQ>
 [<ffffffff81083c10>] ? __queue_work+0x320/0x320
 [<ffffffff81076845>] run_timer_softirq+0x245/0x2f0
 [<ffffffff8106f3b5>] __do_softirq+0xf5/0x2e0
 [<ffffffff8106f87d>] irq_exit+0x10d/0x120
 [<ffffffff816042b5>] smp_apic_timer_interrupt+0x45/0x60
 [<ffffffff81602c1d>] apic_timer_interrupt+0x6d/0x80
 <EOI>
 [<ffffffff814a022f>] ? cpuidle_enter_state+0x4f/0xc0
 [<ffffffff814a0228>] ? cpuidle_enter_state+0x48/0xc0
 [<ffffffff814a0359>] cpuidle_idle_call+0xb9/0x1f0
 [<ffffffff8101e44e>] arch_cpu_idle+0xe/0x30
 [<ffffffff810c327e>] cpu_startup_entry+0x9e/0x240
 [<ffffffff81043544>] start_secondary+0x1e4/0x290
Code: 49 39 cc 75 26 48 89 de 48 89 c3 4c 89 e7 e8 b5 fc ff ff 4c 39 eb 48 8b 03 75 dd 48 83 c4 10 44 89 f0 5b 41 5c 41 5d 41 5e 5d c3 <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55 48 89
RIP  [<ffffffff810759c3>] cascade+0x93/0xa0
 RSP <ffff88082f683e70>
WARNING: CPU: 36 PID: 1154 at kernel/workqueue.c:1461 __queue_delayed_work+0xed/0x1a0()
[ 3108.665556] Modules linked in: sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache cfg80211 rfkill x86_pkg_temp_thermal coretemp kvm_intel kvm ixgbe e1000e crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt ptp sb_edac iTCO_vendor_support pps_core mdio ipmi_devintf edac_core ioatdma microcode shpchp lpc_ich pcspkr i2c_i801 dca mfd_core ipmi_si wmi ipmi_msghandler nfsd auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sd_mod sr_mod cdrom crc_t10dif crct10dif_common mgag200 syscopyarea sysfillrect sysimgblt isci i2c_algo_bit drm_kms_helper ttm drm libsas ahci libahci scsi_transport_sas libata i2c_core dm_mirror dm_region_hash dm_log dm_mod
[ 3108.737843] CPU: 36 PID: 1154 Comm: kworker/u481:3 Tainted: G        W    3.13.0+ #1
[ 3108.746480] Hardware name: Intel Corporation LH Pass ........../SVRBD-ROW_T, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013
[ 3108.759435] Workqueue: edac-poller edac_mc_workq_function [edac_core]
[ 3108.766630]  0000000000000009 ffff880425137d70 ffffffff815f1945 0000000000000000
[ 3108.774932]  ffff880425137da8 ffffffff81069dad 0000000000002000 ffff880826bb1200
[ 3108.783230]  ffff8808286fe3f8 ffff880036371000 0000000000003c20 ffff880425137db8
[ 3108.791521] Call Trace:
[ 3108.794265]  [<ffffffff815f1945>] dump_stack+0x45/0x56
[ 3108.800015]  [<ffffffff81069dad>] warn_slowpath_common+0x7d/0xa0
[ 3108.806732]  [<ffffffff81069e8a>] warn_slowpath_null+0x1a/0x20
[ 3108.813238]  [<ffffffff81083d1d>] __queue_delayed_work+0xed/0x1a0
[ 3108.820034]  [<ffffffff81084077>] queue_delayed_work_on+0x27/0x50
[ 3108.826831]  [<ffffffffa14b0392>] edac_mc_workq_function+0x72/0xa0 [edac_core]
[ 3108.834908]  [<ffffffff81085c3b>] process_one_work+0x17b/0x460
[ 3108.841414]  [<ffffffff810869db>] worker_thread+0x11b/0x400
[ 3108.847635]  [<ffffffff810868c0>] ? rescuer_thread+0x3e0/0x3e0
[ 3108.854143]  [<ffffffff8108d612>] kthread+0xd2/0xf0
[ 3108.859584]  [<ffffffff8108d540>] ? kthread_create_on_node+0x180/0x180
[ 3108.866873]  [<ffffffff81601efc>] ret_from_fork+0x7c/0xb0
[ 3108.872896]  [<ffffffff8108d540>] ? kthread_create_on_node+0x180/0x180
[ 3108.880176] ---[ end trace 078b214fc68689e7 ]---

This patch adds a range check in the edac_mc_poll_msec code to check for 0.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Cc: Doug Thompson <dougthompson@xmission.com>
Cc: linux-kernel@vger.kernel.org
---
 drivers/edac/edac_mc_sysfs.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 51c0362..8ec1747 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -61,7 +61,7 @@ static int edac_set_poll_msec(const char *val, struct kernel_param *kp)
 	ret = kstrtol(val, 0, &l);
 	if (ret)
 		return ret;
-	if ((int)l != l)
+	if (!l || ((int)l != l))
 		return -EINVAL;
 	*((int *)kp->arg) = l;
 
-- 
1.7.9.3


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] edac, poll timeout cannot be zero
  2014-02-03 20:05 [PATCH] edac, poll timeout cannot be zero Prarit Bhargava
@ 2014-02-12 15:07 ` Borislav Petkov
  2014-02-12 15:16   ` Tejun Heo
  0 siblings, 1 reply; 7+ messages in thread
From: Borislav Petkov @ 2014-02-12 15:07 UTC (permalink / raw)
  To: Prarit Bhargava, Tejun Heo; +Cc: linux-edac, Doug Thompson, linux-kernel

On Mon, Feb 03, 2014 at 03:05:13PM -0500, Prarit Bhargava wrote:
> If you do
> 
> echo 0 > /sys/module/edac_core/parameters/edac_mc_poll_msec
> 
> the following stack trace is output because the edac module is not
> designed to poll with a timeout of zero.

Ok, I took your patch and extended it a bit, see bottom of mail. We're
not allowing intervals lower than a second now because it doesn't make
any sense, IMO.

While testing, however, I keep seeing the splat below and that's:

        if (WARN_ON(!list_empty(&work->entry))) {
                spin_unlock(&pwq->pool->lock);
                return;
        }

and there seems to be some interference with edac_mc_workq_setup()
which does mod_delayed_work() and then the workqueue callback
edac_mc_workq_function() which does queue_delayed_work().

What I'm seeing in the splat is that when the timer fires to run the
delayed work, __queue_work() complains that the work list is not empty
even though we've done mod_delayed_work() which is supposed to cancel
any pending work.

Tejun, any ideas what's happening? Do we need synchronization here or do
you have a _sync version of mod_delayed_work() which makes sure any work
is cancelled?

Or does this mean that once the work is getting queued from the timer
callback delayed_work_timer_fn, it cannot be cancelled anymore? Or
something else I'm missing...?

Thanks.

[ 4143.086470] ------------[ cut here ]------------
[ 4143.094342] WARNING: CPU: 1 PID: 0 at kernel/workqueue.c:1393 __queue_work+0x1d7/0x340()
[ 4143.105683] Modules linked in: sb_edac edac_core ext2 vfat fat fuse loop dm_crypt dm_mod usbhid x86_pkg_temp_thermal coretemp kvm_intel kvm crc32_pclmul crc32c_intel ghash_clmulni_intel ehci_pci aesni_intel ehci_hcd aes_x86_64 xhci_hcd glue_helper snd_hda_codec_hdmi lrw usbcore gf128mul ablk_helper cryptd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec microcode snd_hwdep snd_pcm iTCO_wdt snd_timer pcspkr iTCO_vendor_support evdev i2c_i801 lpc_ich usb_common button snd dcdbas mfd_core acpi_cpufreq soundcore processor [last unloaded: edac_core]
[ 4143.167227] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.14.0-rc2+ #3
[ 4143.177066] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A08 01/24/2013
[ 4143.187959]  0000000000000009 ffff88044ec43da8 ffffffff8163a5e0 0000000000000000
[ 4143.198898]  ffff88044ec43de0 ffffffff8104ae9d ffff88043a866100 ffff8804338b6428
[ 4143.209844]  0000000000000008 ffff88043a39f200 000000000000f128 ffff88044ec43df0
[ 4143.220808] Call Trace:
[ 4143.226686]  <IRQ>  [<ffffffff8163a5e0>] dump_stack+0x4d/0x66
[ 4143.235942]  [<ffffffff8104ae9d>] warn_slowpath_common+0x7d/0xa0
[ 4143.245407]  [<ffffffff8104af7a>] warn_slowpath_null+0x1a/0x20
[ 4143.254662]  [<ffffffff81066707>] __queue_work+0x1d7/0x340
[ 4143.263546]  [<ffffffff81066ed0>] ? execute_in_process_context+0xa0/0xa0
[ 4143.273625]  [<ffffffff81066eee>] delayed_work_timer_fn+0x1e/0x20
[ 4143.283091]  [<ffffffff81056cef>] call_timer_fn+0x7f/0x180
[ 4143.291939]  [<ffffffff81056c75>] ? call_timer_fn+0x5/0x180
[ 4143.300833]  [<ffffffff81066ed0>] ? execute_in_process_context+0xa0/0xa0
[ 4143.310866]  [<ffffffff81056f52>] run_timer_softirq+0x162/0x2b0
[ 4143.320117]  [<ffffffff810503ae>] __do_softirq+0x12e/0x300
[ 4143.328889]  [<ffffffff81050835>] irq_exit+0xa5/0xb0
[ 4143.337114]  [<ffffffff8164d925>] smp_apic_timer_interrupt+0x45/0x60
[ 4143.346711]  [<ffffffff8164c6ef>] apic_timer_interrupt+0x6f/0x80
[ 4143.355944]  <EOI>  [<ffffffff815202a4>] ? cpuidle_enter_state+0x54/0xc0
[ 4143.365927]  [<ffffffff815203d2>] cpuidle_idle_call+0xc2/0x220
[ 4143.374979]  [<ffffffff8100c00e>] arch_cpu_idle+0xe/0x30
[ 4143.383499]  [<ffffffff810a655a>] cpu_startup_entry+0xea/0x2c0
[ 4143.392524]  [<ffffffff8102f946>] start_secondary+0x1e6/0x240
[ 4143.401423] ---[ end trace a140660262786ef9 ]---

---
From: Prarit Bhargava <prarit@redhat.com>
Subject: [PATCH] EDAC: Poll timeout cannot be zero

Filter out 0 as it is an invalid poll timeout.

Boris: sanitize code even more to accept unsigned longs only and to
not allow polling intervals below 1 second as this is unnecessary and
doesn't make much sense for polling errors.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Link: http://lkml.kernel.org/r/1391457913-881-1-git-send-email-prarit@redhat.com
Cc: Doug Thompson <dougthompson@xmission.com>
Cc: stable@vger.kernel.org
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/edac/edac_mc.c       |  4 ++--
 drivers/edac/edac_mc_sysfs.c | 10 ++++++----
 drivers/edac/edac_module.h   |  2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index e8c9ef03495b..aef5ec24908e 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -601,7 +601,7 @@ static void edac_mc_workq_teardown(struct mem_ctl_info *mci)
  *	user space has updated our poll period value, need to
  *	reset our workq delays
  */
-void edac_mc_reset_delay_period(int value)
+void edac_mc_reset_delay_period(unsigned long value)
 {
 	struct mem_ctl_info *mci;
 	struct list_head *item;
@@ -611,7 +611,7 @@ void edac_mc_reset_delay_period(int value)
 	list_for_each(item, &mc_devices) {
 		mci = list_entry(item, struct mem_ctl_info, link);
 
-		edac_mc_workq_setup(mci, (unsigned long) value);
+		edac_mc_workq_setup(mci, value);
 	}
 
 	mutex_unlock(&mem_ctls_mutex);
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 51c0362acf5c..b335c6ab5efe 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -52,18 +52,20 @@ int edac_mc_get_poll_msec(void)
 
 static int edac_set_poll_msec(const char *val, struct kernel_param *kp)
 {
-	long l;
+	unsigned long l;
 	int ret;
 
 	if (!val)
 		return -EINVAL;
 
-	ret = kstrtol(val, 0, &l);
+	ret = kstrtoul(val, 0, &l);
 	if (ret)
 		return ret;
-	if ((int)l != l)
+
+	if (l < 1000)
 		return -EINVAL;
-	*((int *)kp->arg) = l;
+
+	*((unsigned long *)kp->arg) = l;
 
 	/* notify edac_mc engine to reset the poll period */
 	edac_mc_reset_delay_period(l);
diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h
index 3d139c6e7fe3..f2118bfcf8df 100644
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -52,7 +52,7 @@ extern void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev,
 extern void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev);
 extern void edac_device_reset_delay_period(struct edac_device_ctl_info
 					   *edac_dev, unsigned long value);
-extern void edac_mc_reset_delay_period(int value);
+extern void edac_mc_reset_delay_period(unsigned long value);
 
 extern void *edac_align_ptr(void **p, unsigned size, int n_elems);
 
-- 
1.8.5.2.192.g7794a68

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] edac, poll timeout cannot be zero
  2014-02-12 15:07 ` Borislav Petkov
@ 2014-02-12 15:16   ` Tejun Heo
  2014-02-12 15:57     ` Borislav Petkov
  0 siblings, 1 reply; 7+ messages in thread
From: Tejun Heo @ 2014-02-12 15:16 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Prarit Bhargava, linux-edac, Doug Thompson, linux-kernel

Hello,

On Wed, Feb 12, 2014 at 04:07:48PM +0100, Borislav Petkov wrote:
> While testing, however, I keep seeing the splat below and that's:
> 
>         if (WARN_ON(!list_empty(&work->entry))) {
>                 spin_unlock(&pwq->pool->lock);
>                 return;
>         }
> 
> and there seems to be some interference with edac_mc_workq_setup()
> which does mod_delayed_work() and then the workqueue callback
> edac_mc_workq_function() which does queue_delayed_work().
> 
> What I'm seeing in the splat is that when the timer fires to run the
> delayed work, __queue_work() complains that the work list is not empty
> even though we've done mod_delayed_work() which is supposed to cancel
> any pending work.
> 
> Tejun, any ideas what's happening? Do we need synchronization here or do
> you have a _sync version of mod_delayed_work() which makes sure any work
> is cancelled?

No, you don't need to.  All workqueue operations should be able to
synchronize with each other.

> Or does this mean that once the work is getting queued from the timer
> callback delayed_work_timer_fn, it cannot be cancelled anymore? Or
> something else I'm missing...?

Looking at edac_mc_workq_setup().... it contains INIT_DELAYED_WORK().
Does this race with other workqueue operations on the work item?  If
so, it of course breaks.  It's like doing spin_lock_init() while other
spinlock operations are in progress.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] edac, poll timeout cannot be zero
  2014-02-12 15:16   ` Tejun Heo
@ 2014-02-12 15:57     ` Borislav Petkov
  2014-02-12 16:23       ` Tejun Heo
  0 siblings, 1 reply; 7+ messages in thread
From: Borislav Petkov @ 2014-02-12 15:57 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Prarit Bhargava, linux-edac, Doug Thompson, linux-kernel

On Wed, Feb 12, 2014 at 10:16:57AM -0500, Tejun Heo wrote:
> No, you don't need to.  All workqueue operations should be able to
> synchronize with each other.

Ok.

> > Or does this mean that once the work is getting queued from the timer
> > callback delayed_work_timer_fn, it cannot be cancelled anymore? Or
> > something else I'm missing...?
> 
> Looking at edac_mc_workq_setup().... it contains INIT_DELAYED_WORK().
> Does this race with other workqueue operations on the work item?  If
> so, it of course breaks.  It's like doing spin_lock_init() while other
> spinlock operations are in progress.

Ha, so this sounds like the issue because the splat happens
when updating /sys/.../edac_mc_poll_msec and it goes and calls
edac_mc_workq_setup()...

And this code is pretty old and edac_mc_workq_setup() is used both
when one inits an edac driver and also when one wants to change the
polling period (edac_mc_reset_delay_period) and thus needs to mod the
workqueue's timeout.

So I'm guessing a simple fix would be to differentiate between the two
paths, something like the diff below.

Or is there a reliable way to check whether a workqueue has been
initialized already, say, something like

	if (work->func)

or so... I.e., what PREPARE_WORK() does?

Thanks!

---
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index e8c9ef03495b..17c51d3a1143 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -559,7 +559,8 @@ static void edac_mc_workq_function(struct work_struct *work_req)
  *
  *		called with the mem_ctls_mutex held
  */
-static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
+static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec,
+				bool init)
 {
 	edac_dbg(0, "\n");
 
@@ -567,7 +568,9 @@ static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
 	if (mci->op_state != OP_RUNNING_POLL)
 		return;
 
-	INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
+	if (init)
+		INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
+
 	mod_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
 }
 
@@ -611,7 +614,7 @@ void edac_mc_reset_delay_period(int value)
 	list_for_each(item, &mc_devices) {
 		mci = list_entry(item, struct mem_ctl_info, link);
 
-		edac_mc_workq_setup(mci, (unsigned long) value);
+		edac_mc_workq_setup(mci, (unsigned long) value, false);
 	}
 
 	mutex_unlock(&mem_ctls_mutex);
@@ -782,7 +785,7 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 		/* This instance is NOW RUNNING */
 		mci->op_state = OP_RUNNING_POLL;
 
-		edac_mc_workq_setup(mci, edac_mc_get_poll_msec());
+		edac_mc_workq_setup(mci, edac_mc_get_poll_msec(), true);
 	} else {
 		mci->op_state = OP_RUNNING_INTERRUPT;
 	}

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] edac, poll timeout cannot be zero
  2014-02-12 15:57     ` Borislav Petkov
@ 2014-02-12 16:23       ` Tejun Heo
  2014-02-12 16:36         ` Borislav Petkov
  0 siblings, 1 reply; 7+ messages in thread
From: Tejun Heo @ 2014-02-12 16:23 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: Prarit Bhargava, linux-edac, Doug Thompson, linux-kernel

Hello,

On Wed, Feb 12, 2014 at 04:57:53PM +0100, Borislav Petkov wrote:
> Ha, so this sounds like the issue because the splat happens
> when updating /sys/.../edac_mc_poll_msec and it goes and calls
> edac_mc_workq_setup()...
> 
> And this code is pretty old and edac_mc_workq_setup() is used both
> when one inits an edac driver and also when one wants to change the
> polling period (edac_mc_reset_delay_period) and thus needs to mod the
> workqueue's timeout.
> 
> So I'm guessing a simple fix would be to differentiate between the two
> paths, something like the diff below.
> 
> Or is there a reliable way to check whether a workqueue has been
> initialized already, say, something like
> 
> 	if (work->func)

Hah... can't you just do it on object initialization?  It's a bit
nasty and fragile to game initialization state.

> or so... I.e., what PREPARE_WORK() does?

It allows you to change the callback without breaking the
synchronization.  Of course, if there's a pending work item, it may or
may not execute the newly assigned function.  It's also a bit nasty
and has only few users in the whole kernel and I think we might wanna
get rid of them.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] edac, poll timeout cannot be zero
  2014-02-12 16:23       ` Tejun Heo
@ 2014-02-12 16:36         ` Borislav Petkov
  2014-02-12 17:22           ` [PATCH] EDAC: Correct workqueue setup path Borislav Petkov
  0 siblings, 1 reply; 7+ messages in thread
From: Borislav Petkov @ 2014-02-12 16:36 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Prarit Bhargava, linux-edac, Doug Thompson, linux-kernel

On Wed, Feb 12, 2014 at 11:23:11AM -0500, Tejun Heo wrote:
> Hah... can't you just do it on object initialization?  It's a bit
> nasty and fragile to game initialization state.
> 
> > or so... I.e., what PREPARE_WORK() does?
> 
> It allows you to change the callback without breaking the
> synchronization.  Of course, if there's a pending work item, it may or
> may not execute the newly assigned function.  It's also a bit nasty
> and has only few users in the whole kernel and I think we might wanna
> get rid of them.

Yeah, no, I certainly don't have to touch wq inner organs - I was
wondering whether there's such functionality already. The simple patch I
sent earlier should be good enough then.

Thanks!

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] EDAC: Correct workqueue setup path
  2014-02-12 16:36         ` Borislav Petkov
@ 2014-02-12 17:22           ` Borislav Petkov
  0 siblings, 0 replies; 7+ messages in thread
From: Borislav Petkov @ 2014-02-12 17:22 UTC (permalink / raw)
  To: linux-edac; +Cc: Tejun Heo, Prarit Bhargava, Doug Thompson, linux-kernel

From: Borislav Petkov <bp@suse.de>

We're using edac_mc_workq_setup() both on the init path, when
we load an edac driver and when we change the polling period
(edac_mc_reset_delay_period) through /sys/.../edac_mc_poll_msec.

On that second path we don't need to init the workqueue which has been
initialized already.

Thanks to Tejun for workqueue insights.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
---
 drivers/edac/edac_mc.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index aef5ec24908e..33edd6766344 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -559,7 +559,8 @@ static void edac_mc_workq_function(struct work_struct *work_req)
  *
  *		called with the mem_ctls_mutex held
  */
-static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
+static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec,
+				bool init)
 {
 	edac_dbg(0, "\n");
 
@@ -567,7 +568,9 @@ static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec)
 	if (mci->op_state != OP_RUNNING_POLL)
 		return;
 
-	INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
+	if (init)
+		INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function);
+
 	mod_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec));
 }
 
@@ -611,7 +614,7 @@ void edac_mc_reset_delay_period(unsigned long value)
 	list_for_each(item, &mc_devices) {
 		mci = list_entry(item, struct mem_ctl_info, link);
 
-		edac_mc_workq_setup(mci, value);
+		edac_mc_workq_setup(mci, value, false);
 	}
 
 	mutex_unlock(&mem_ctls_mutex);
@@ -782,7 +785,7 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 		/* This instance is NOW RUNNING */
 		mci->op_state = OP_RUNNING_POLL;
 
-		edac_mc_workq_setup(mci, edac_mc_get_poll_msec());
+		edac_mc_workq_setup(mci, edac_mc_get_poll_msec(), true);
 	} else {
 		mci->op_state = OP_RUNNING_INTERRUPT;
 	}
-- 
1.8.5.2.192.g7794a68

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2014-02-12 17:23 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-02-03 20:05 [PATCH] edac, poll timeout cannot be zero Prarit Bhargava
2014-02-12 15:07 ` Borislav Petkov
2014-02-12 15:16   ` Tejun Heo
2014-02-12 15:57     ` Borislav Petkov
2014-02-12 16:23       ` Tejun Heo
2014-02-12 16:36         ` Borislav Petkov
2014-02-12 17:22           ` [PATCH] EDAC: Correct workqueue setup path Borislav Petkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.