From: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
To: tglx@linutronix.de
Cc: Jisheng.Zhang@synaptics.com, afzal.mohd.ma@gmail.com,
akpm@linux-foundation.org, corbet@lwn.net,
gpiccoli@canonical.com, gustavo@embeddedor.com,
helgaas@kernel.org, ilina@codeaurora.org, kernelfans@gmail.com,
kexec@lists.infradead.org, linus.walleij@linaro.org,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
maz@kernel.org, mike.kravetz@oracle.com, mkshah@codeaurora.org,
oneukum@suse.com, pawan.kumar.gupta@linux.intel.com,
peterz@infradead.org, pmladek@suse.com, viro@zeniv.linux.org.uk
Subject: Re: [PATCH 0/3] warn and suppress irqflood
Date: Sat, 05 Jun 2021 08:02:13 +0530 [thread overview]
Message-ID: <5a547a75b621e9919cd7f4706306c0e0@codeaurora.org> (raw)
In-Reply-To: <20210302074556.23998-1-saiprakash.ranjan@codeaurora.org>
Hi Thomas,
On 2021-03-02 13:15, Sai Prakash Ranjan wrote:
> Hi Thomas,
>
>> On Wed, Oct 28, 2020 at 12:58:41PM +0100, Thomas Gleixner wrote:
>>
>
> <snip>...
>
>> Something like the completely untested below should work independent of
>> config options.
>>
>> Thanks,
>>
>> tglx
>> ---
>> include/linux/irqdesc.h | 4 ++
>> kernel/irq/manage.c | 3 +
>> kernel/irq/spurious.c | 74
>> +++++++++++++++++++++++++++++++++++-------------
>> 3 files changed, 61 insertions(+), 20 deletions(-)
>>
>> --- a/include/linux/irqdesc.h
>> +++ b/include/linux/irqdesc.h
>> @@ -30,6 +30,8 @@ struct pt_regs;
>> * @tot_count: stats field for non-percpu irqs
>> * @irq_count: stats field to detect stalled irqs
>> * @last_unhandled: aging timer for unhandled count
>> + * @storm_count: Counter for irq storm detection
>> + * @storm_checked: Timestamp for irq storm detection
>> * @irqs_unhandled: stats field for spurious unhandled interrupts
>> * @threads_handled: stats field for deferred spurious detection of
>> threaded handlers
>> * @threads_handled_last: comparator field for deferred spurious
>> detection of theraded handlers
>> @@ -65,6 +67,8 @@ struct irq_desc {
>> unsigned int tot_count;
>> unsigned int irq_count; /* For detecting broken IRQs */
>> unsigned long last_unhandled; /* Aging timer for unhandled count */
>> + unsigned long storm_count;
>> + unsigned long storm_checked;
>> unsigned int irqs_unhandled;
>> atomic_t threads_handled;
>> int threads_handled_last;
>> --- a/kernel/irq/manage.c
>> +++ b/kernel/irq/manage.c
>> @@ -1581,6 +1581,9 @@ static int
>> if (!shared) {
>> init_waitqueue_head(&desc->wait_for_threads);
>>
>> + /* Take a timestamp for interrupt storm detection */
>> + desc->storm_checked = jiffies;
>> +
>> /* Setup the type (level, edge polarity) if configured: */
>> if (new->flags & IRQF_TRIGGER_MASK) {
>> ret = __irq_set_trigger(desc,
>> --- a/kernel/irq/spurious.c
>> +++ b/kernel/irq/spurious.c
>> @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
>> static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
>> static int irq_poll_cpu;
>> static atomic_t irq_poll_active;
>> +static unsigned long irqstorm_limit __ro_after_init;
>>
>> /*
>> * We wait here for a poller to finish.
>> @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
>> * (The other 100-of-100,000 interrupts may have been a correctly
>> * functioning device sharing an IRQ with the failing one)
>> */
>> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t
>> action_ret)
>> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t
>> action_ret,
>> + bool storm)
>> {
>> unsigned int irq = irq_desc_get_irq(desc);
>> struct irqaction *action;
>> unsigned long flags;
>>
>> - if (bad_action_ret(action_ret)) {
>> - printk(KERN_ERR "irq event %d: bogus return value %x\n",
>> - irq, action_ret);
>> - } else {
>> - printk(KERN_ERR "irq %d: nobody cared (try booting with "
>> + if (!storm) {
>> + if (bad_action_ret(action_ret)) {
>> + pr_err("irq event %d: bogus return value %x\n",
>> + irq, action_ret);
>> + } else {
>> + pr_err("irq %d: nobody cared (try booting with "
>> "the \"irqpoll\" option)\n", irq);
>> + }
>> }
>> dump_stack();
>> printk(KERN_ERR "handlers:\n");
>> @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
>>
>> if (count > 0) {
>> count--;
>> - __report_bad_irq(desc, action_ret);
>> + __report_bad_irq(desc, action_ret, false);
>> }
>> }
>>
>> @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
>> return action && (action->flags & IRQF_IRQPOLL);
>> }
>>
>> +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t
>> action_ret,
>> + const char *reason, bool storm)
>> +{
>> + __report_bad_irq(desc, action_ret, storm);
>> + pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
>> + desc->istate |= IRQS_SPURIOUS_DISABLED;
>> + desc->depth++;
>> + irq_disable(desc);
>> +}
>> +
>> +/* Interrupt storm detector for runaway interrupts (handled or not).
>> */
>> +static bool irqstorm_detected(struct irq_desc *desc)
>> +{
>> + unsigned long now = jiffies;
>> +
>> + if (++desc->storm_count < irqstorm_limit) {
>> + if (time_after(now, desc->storm_checked + HZ)) {
>> + desc->storm_count = 0;
>> + desc->storm_checked = now;
>> + }
>> + return false;
>> + }
>> +
>> + disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
>> + return true;
>> +}
>> +
>> #define SPURIOUS_DEFERRED 0x80000000
>>
>> void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
>> @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
>> desc->irqs_unhandled -= ok;
>> }
>>
>> + if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
>> + return;
>> +
>> desc->irq_count++;
>> if (likely(desc->irq_count < 100000))
>> return;
>>
>> desc->irq_count = 0;
>> if (unlikely(desc->irqs_unhandled > 99900)) {
>> - /*
>> - * The interrupt is stuck
>> - */
>> - __report_bad_irq(desc, action_ret);
>> - /*
>> - * Now kill the IRQ
>> - */
>> - printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
>> - desc->istate |= IRQS_SPURIOUS_DISABLED;
>> - desc->depth++;
>> - irq_disable(desc);
>> -
>> + disable_stuck_irq(desc, action_ret, "unhandled", false);
>> mod_timer(&poll_spurious_irq_timer,
>> jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
>> }
>> @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
>> "performance\n");
>> return 1;
>> }
>> -
>> __setup("irqpoll", irqpoll_setup);
>> +
>> +static int __init irqstorm_setup(char *arg)
>> +{
>> + int res = kstrtoul(arg, 0, &irqstorm_limit);
>> +
>> + if (!res) {
>> + pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
>> + irqstorm_limit);
>> + }
>> + return !!res;
>> +}
>> +__setup("irqstorm_limit", irqstorm_setup);
>
> This irq storm detection feature is very useful. Is there any chance of
> getting this merged?
> We will be happy to test. People seem to be keeping their own copies of
> such a feature out-of-tree [1].
>
> [1]
> https://elinux.org/images/d/de/Oct28_InterruptStormDetectionFeature_KentoKobayashi.pdf
>
Any chance of having this useful debug feature in the upstream kernel?
Thanks,
Sai
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member
of Code Aurora Forum, hosted by The Linux Foundation
prev parent reply other threads:[~2021-06-05 2:32 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-22 5:56 [PATCH 0/3] warn and suppress irqflood Pingfan Liu
2020-10-22 5:56 ` [PATCH 1/3] kernel/watchdog: show irq percentage if irq floods Pingfan Liu
2020-10-22 5:56 ` [PATCH 2/3] kernel/watchdog: suppress max irq when " Pingfan Liu
2020-10-22 5:56 ` [PATCH 3/3] Documentation: introduce a param "irqflood_suppress" Pingfan Liu
2020-10-22 8:37 ` [PATCH 0/3] warn and suppress irqflood Thomas Gleixner
2020-10-25 11:12 ` Pingfan Liu
2020-10-25 12:21 ` [Skiboot] " Oliver O'Halloran
2020-10-25 13:11 ` Pingfan Liu
2020-10-25 13:51 ` Oliver O'Halloran
2020-10-26 15:06 ` Guilherme Piccoli
2020-10-26 19:59 ` Thomas Gleixner
2020-10-26 20:28 ` Guilherme Piccoli
2020-10-26 21:21 ` Thomas Gleixner
2020-10-27 12:28 ` Guilherme Piccoli
2020-10-28 6:02 ` Pingfan Liu
2020-10-28 11:58 ` Thomas Gleixner
2020-10-29 6:26 ` Pingfan Liu
2020-11-06 5:53 ` Pingfan Liu
2021-03-02 7:45 ` Sai Prakash Ranjan
2021-06-05 2:32 ` Sai Prakash Ranjan [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5a547a75b621e9919cd7f4706306c0e0@codeaurora.org \
--to=saiprakash.ranjan@codeaurora.org \
--cc=Jisheng.Zhang@synaptics.com \
--cc=afzal.mohd.ma@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=corbet@lwn.net \
--cc=gpiccoli@canonical.com \
--cc=gustavo@embeddedor.com \
--cc=helgaas@kernel.org \
--cc=ilina@codeaurora.org \
--cc=kernelfans@gmail.com \
--cc=kexec@lists.infradead.org \
--cc=linus.walleij@linaro.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=maz@kernel.org \
--cc=mike.kravetz@oracle.com \
--cc=mkshah@codeaurora.org \
--cc=oneukum@suse.com \
--cc=pawan.kumar.gupta@linux.intel.com \
--cc=peterz@infradead.org \
--cc=pmladek@suse.com \
--cc=tglx@linutronix.de \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).