From: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
To: tglx@linutronix.de
Cc: mkshah@codeaurora.org, linux-doc@vger.kernel.org,
peterz@infradead.org, linus.walleij@linaro.org,
kernelfans@gmail.com, afzal.mohd.ma@gmail.com,
Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>,
corbet@lwn.net, maz@kernel.org, gustavo@embeddedor.com,
helgaas@kernel.org, pawan.kumar.gupta@linux.intel.com,
pmladek@suse.com, gpiccoli@canonical.com,
viro@zeniv.linux.org.uk, oneukum@suse.com,
kexec@lists.infradead.org, linux-kernel@vger.kernel.org,
ilina@codeaurora.org, Jisheng.Zhang@synaptics.com,
akpm@linux-foundation.org, mike.kravetz@oracle.com
Subject: Re: [PATCH 0/3] warn and suppress irqflood
Date: Tue, 2 Mar 2021 13:15:56 +0530 [thread overview]
Message-ID: <20210302074556.23998-1-saiprakash.ranjan@codeaurora.org> (raw)
In-Reply-To: <87tuueftou.fsf@nanos.tec.linutronix.de>
Hi Thomas,
> On Wed, Oct 28, 2020 at 12:58:41PM +0100, Thomas Gleixner wrote:
>
<snip>...
> Something like the completly untested below should work independent of
> config options.
>
> Thanks,
>
> tglx
> ---
> include/linux/irqdesc.h | 4 ++
> kernel/irq/manage.c | 3 +
> kernel/irq/spurious.c | 74 +++++++++++++++++++++++++++++++++++-------------
> 3 files changed, 61 insertions(+), 20 deletions(-)
>
> --- a/include/linux/irqdesc.h
> +++ b/include/linux/irqdesc.h
> @@ -30,6 +30,8 @@ struct pt_regs;
> * @tot_count: stats field for non-percpu irqs
> * @irq_count: stats field to detect stalled irqs
> * @last_unhandled: aging timer for unhandled count
> + * @storm_count: Counter for irq storm detection
> + * @storm_checked: Timestamp for irq storm detection
> * @irqs_unhandled: stats field for spurious unhandled interrupts
> * @threads_handled: stats field for deferred spurious detection of threaded handlers
> * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers
> @@ -65,6 +67,8 @@ struct irq_desc {
> unsigned int tot_count;
> unsigned int irq_count; /* For detecting broken IRQs */
> unsigned long last_unhandled; /* Aging timer for unhandled count */
> + unsigned long storm_count;
> + unsigned long storm_checked;
> unsigned int irqs_unhandled;
> atomic_t threads_handled;
> int threads_handled_last;
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -1581,6 +1581,9 @@ static int
> if (!shared) {
> init_waitqueue_head(&desc->wait_for_threads);
>
> + /* Take a timestamp for interrupt storm detection */
> + desc->storm_checked = jiffies;
> +
> /* Setup the type (level, edge polarity) if configured: */
> if (new->flags & IRQF_TRIGGER_MASK) {
> ret = __irq_set_trigger(desc,
> --- a/kernel/irq/spurious.c
> +++ b/kernel/irq/spurious.c
> @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
> static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
> static int irq_poll_cpu;
> static atomic_t irq_poll_active;
> +static unsigned long irqstorm_limit __ro_after_init;
>
> /*
> * We wait here for a poller to finish.
> @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
> * (The other 100-of-100,000 interrupts may have been a correctly
> * functioning device sharing an IRQ with the failing one)
> */
> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret,
> + bool storm)
> {
> unsigned int irq = irq_desc_get_irq(desc);
> struct irqaction *action;
> unsigned long flags;
>
> - if (bad_action_ret(action_ret)) {
> - printk(KERN_ERR "irq event %d: bogus return value %x\n",
> - irq, action_ret);
> - } else {
> - printk(KERN_ERR "irq %d: nobody cared (try booting with "
> + if (!storm) {
> + if (bad_action_ret(action_ret)) {
> + pr_err("irq event %d: bogus return value %x\n",
> + irq, action_ret);
> + } else {
> + pr_err("irq %d: nobody cared (try booting with "
> "the \"irqpoll\" option)\n", irq);
> + }
> }
> dump_stack();
> printk(KERN_ERR "handlers:\n");
> @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
>
> if (count > 0) {
> count--;
> - __report_bad_irq(desc, action_ret);
> + __report_bad_irq(desc, action_ret, false);
> }
> }
>
> @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
> return action && (action->flags & IRQF_IRQPOLL);
> }
>
> +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t action_ret,
> + const char *reason, bool storm)
> +{
> + __report_bad_irq(desc, action_ret, storm);
> + pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
> + desc->istate |= IRQS_SPURIOUS_DISABLED;
> + desc->depth++;
> + irq_disable(desc);
> +}
> +
> +/* Interrupt storm detector for runaway interrupts (handled or not). */
> +static bool irqstorm_detected(struct irq_desc *desc)
> +{
> + unsigned long now = jiffies;
> +
> + if (++desc->storm_count < irqstorm_limit) {
> + if (time_after(now, desc->storm_checked + HZ)) {
> + desc->storm_count = 0;
> + desc->storm_checked = now;
> + }
> + return false;
> + }
> +
> + disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
> + return true;
> +}
> +
> #define SPURIOUS_DEFERRED 0x80000000
>
> void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
> @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
> desc->irqs_unhandled -= ok;
> }
>
> + if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
> + return;
> +
> desc->irq_count++;
> if (likely(desc->irq_count < 100000))
> return;
>
> desc->irq_count = 0;
> if (unlikely(desc->irqs_unhandled > 99900)) {
> - /*
> - * The interrupt is stuck
> - */
> - __report_bad_irq(desc, action_ret);
> - /*
> - * Now kill the IRQ
> - */
> - printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
> - desc->istate |= IRQS_SPURIOUS_DISABLED;
> - desc->depth++;
> - irq_disable(desc);
> -
> + disable_stuck_irq(desc, action_ret, "unhandled", false);
> mod_timer(&poll_spurious_irq_timer,
> jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
> }
> @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
> "performance\n");
> return 1;
> }
> -
> __setup("irqpoll", irqpoll_setup);
> +
> +static int __init irqstorm_setup(char *arg)
> +{
> + int res = kstrtoul(arg, 0, &irqstorm_limit);
> +
> + if (!res) {
> + pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
> + irqstorm_limit);
> + }
> + return !!res;
> +}
> +__setup("irqstorm_limit", irqstorm_setup);
This irq storm detection feature is very useful, any chance to get this merged?
We will be happy to test. People seem to be having their own copy of such feature
out-of-tree [1].
[1] https://elinux.org/images/d/de/Oct28_InterruptStormDetectionFeature_KentoKobayashi.pdf
Thanks,
Sai
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
next prev parent reply other threads:[~2021-03-02 7:46 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-22 5:56 [PATCH 0/3] warn and suppress irqflood Pingfan Liu
2020-10-22 5:56 ` [PATCH 1/3] kernel/watchdog: show irq percentage if irq floods Pingfan Liu
2020-10-22 5:56 ` [PATCH 2/3] kernel/watchdog: suppress max irq when " Pingfan Liu
2020-10-22 5:56 ` [PATCH 3/3] Documentation: introduce a param "irqflood_suppress" Pingfan Liu
2020-10-22 8:37 ` [PATCH 0/3] warn and suppress irqflood Thomas Gleixner
2020-10-25 11:12 ` Pingfan Liu
2020-10-25 12:21 ` [Skiboot] " Oliver O'Halloran
2020-10-25 13:11 ` Pingfan Liu
2020-10-25 13:51 ` Oliver O'Halloran
2020-10-26 15:06 ` Guilherme Piccoli
2020-10-26 19:59 ` Thomas Gleixner
2020-10-26 20:28 ` Guilherme Piccoli
2020-10-26 21:21 ` Thomas Gleixner
2020-10-27 12:28 ` Guilherme Piccoli
2020-10-28 6:02 ` Pingfan Liu
2020-10-28 11:58 ` Thomas Gleixner
2020-10-29 6:26 ` Pingfan Liu
2020-11-06 5:53 ` Pingfan Liu
2020-11-18 3:36 ` [PATCH 0/3] use soft lockup to detect irq flood Pingfan Liu
2020-11-18 3:36 ` [PATCH 1/3] x86/irq: account the unused irq Pingfan Liu
2020-11-18 3:36 ` [PATCH 2/3] kernel/watchdog: make watchdog_touch_ts more accurate by using nanosecond Pingfan Liu
2020-11-18 3:36 ` [PATCH 3/3] kernel/watchdog: use soft lockup to detect irq flood Pingfan Liu
2021-03-02 7:45 ` Sai Prakash Ranjan [this message]
2021-06-05 2:32 ` [PATCH 0/3] warn and suppress irqflood Sai Prakash Ranjan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210302074556.23998-1-saiprakash.ranjan@codeaurora.org \
--to=saiprakash.ranjan@codeaurora.org \
--cc=Jisheng.Zhang@synaptics.com \
--cc=afzal.mohd.ma@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=corbet@lwn.net \
--cc=gpiccoli@canonical.com \
--cc=gustavo@embeddedor.com \
--cc=helgaas@kernel.org \
--cc=ilina@codeaurora.org \
--cc=kernelfans@gmail.com \
--cc=kexec@lists.infradead.org \
--cc=linus.walleij@linaro.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=maz@kernel.org \
--cc=mike.kravetz@oracle.com \
--cc=mkshah@codeaurora.org \
--cc=oneukum@suse.com \
--cc=pawan.kumar.gupta@linux.intel.com \
--cc=peterz@infradead.org \
--cc=pmladek@suse.com \
--cc=tglx@linutronix.de \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox