All of lore.kernel.org
 help / color / mirror / Atom feed
From: Borislav Petkov <bp@alien8.de>
To: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Cc: tony.luck@intel.com, ananth@in.ibm.com,
	masbock@linux.vnet.ibm.com, lcm@linux.vnet.ibm.com,
	linux-kernel@vger.kernel.org, linux-acpi@vger.kernel.org,
	ying.huang@intel.com
Subject: Re: [PATCH v3 3/3] mce, acpi/apei: Soft-offline a page on firmware GHES notification
Date: Tue, 2 Jul 2013 01:08:00 +0200	[thread overview]
Message-ID: <20130701230800.GO23515@pd.tnic> (raw)
In-Reply-To: <20130701153859.6197.59186.stgit@localhost.localdomain>

On Mon, Jul 01, 2013 at 09:08:59PM +0530, Naveen N. Rao wrote:
> If the firmware indicates in GHES error data entry that the error threshold
> has exceeded for a corrected error event, then we try to soft-offline the
> page. This could be called in interrupt context, so we queue this up similar
> to how we handle memory failure scenarios.
> 
> 
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  drivers/acpi/apei/ghes.c |   12 ++++++++++
>  include/linux/mm.h       |    1 +
>  mm/memory-failure.c      |   53 ++++++++++++++++++++++++++++++----------------
>  3 files changed, 48 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index fcd7d91..5a630ed 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -429,6 +429,18 @@ static void ghes_do_proc(struct ghes *ghes,
>  						  mem_err);
>  #endif
>  #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
> +			if (sec_sev == GHES_SEV_CORRECTED &&
> +			    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
> +			    (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
> +				unsigned long pfn;
> +				pfn = mem_err->physical_addr >> PAGE_SHIFT;
> +				if (pfn_valid(pfn))
> +					soft_memory_failure_queue(pfn, 0, 0);
> +				else
> +					pr_warning(FW_WARN GHES_PFX
> +					"Invalid address in generic error data: %#lx\n",
> +					mem_err->physical_addr);
> +			}

Yuck, this looks like BIOS code.

Can we carve out this into a function and do

void function(.. )
{
#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE

	<code at 1st indentation, much more readable>

#endif
}

so that we can nicely call it from ghes_do_proc()?

>  			if (sev == GHES_SEV_RECOVERABLE &&
>  			    sec_sev == GHES_SEV_RECOVERABLE &&
>  			    mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e0c8528..f9907d2 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1787,6 +1787,7 @@ enum mf_flags {
>  };
>  extern int memory_failure(unsigned long pfn, int trapno, int flags);
>  extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
> +extern void soft_memory_failure_queue(unsigned long pfn, int trapno, int flags);
>  extern int unpoison_memory(unsigned long pfn);
>  extern int sysctl_memory_failure_early_kill;
>  extern int sysctl_memory_failure_recovery;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index ceb0c7f..50caefd 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1222,6 +1222,7 @@ struct memory_failure_entry {
>  	unsigned long pfn;
>  	int trapno;
>  	int flags;
> +	bool soft_offline;

Why a new bool? This flags int looks nice above. :)

>  };
>  
>  struct memory_failure_cpu {
> @@ -1233,6 +1234,28 @@ struct memory_failure_cpu {
>  
>  static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
>  
> +void __memory_failure_queue(unsigned long pfn, int trapno, int flags, int soft_offline)
> +{
> +	struct memory_failure_cpu *mf_cpu;
> +	unsigned long proc_flags;
> +	struct memory_failure_entry entry = {
> +		.pfn =		pfn,
> +		.trapno =	trapno,
> +		.flags =	flags,
> +		.soft_offline = soft_offline
> +	};
> +
> +	mf_cpu = &get_cpu_var(memory_failure_cpu);
> +	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
> +	if (kfifo_put(&mf_cpu->fifo, &entry))
> +		schedule_work_on(smp_processor_id(), &mf_cpu->work);
> +	else
> +		pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
> +		       pfn);
> +	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
> +	put_cpu_var(memory_failure_cpu);
> +}
> +
>  /**
>   * memory_failure_queue - Schedule handling memory failure of a page.
>   * @pfn: Page Number of the corrupted page
> @@ -1250,28 +1273,19 @@ static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
>   *
>   * Can run in IRQ context.
>   */
> +
>  void memory_failure_queue(unsigned long pfn, int trapno, int flags)
>  {
> -	struct memory_failure_cpu *mf_cpu;
> -	unsigned long proc_flags;
> -	struct memory_failure_entry entry = {
> -		.pfn =		pfn,
> -		.trapno =	trapno,
> -		.flags =	flags,
> -	};
> -
> -	mf_cpu = &get_cpu_var(memory_failure_cpu);
> -	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
> -	if (kfifo_put(&mf_cpu->fifo, &entry))
> -		schedule_work_on(smp_processor_id(), &mf_cpu->work);
> -	else
> -		pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
> -		       pfn);
> -	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
> -	put_cpu_var(memory_failure_cpu);
> +	__memory_failure_queue(pfn, trapno, flags, false);
>  }
>  EXPORT_SYMBOL_GPL(memory_failure_queue);
>  
> +void soft_memory_failure_queue(unsigned long pfn, int trapno, int flags)
> +{
> +	__memory_failure_queue(pfn, trapno, flags, true);

... which can save us this "true" thing there and maybe a bunch of code
churn around here too.

> +}
> +EXPORT_SYMBOL_GPL(soft_memory_failure_queue);
> +
>  static void memory_failure_work_func(struct work_struct *work)
>  {
>  	struct memory_failure_cpu *mf_cpu;
> @@ -1286,7 +1300,10 @@ static void memory_failure_work_func(struct work_struct *work)
>  		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
>  		if (!gotten)
>  			break;
> -		memory_failure(entry.pfn, entry.trapno, entry.flags);
> +		if (entry.soft_offline)
> +			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
> +		else
> +			memory_failure(entry.pfn, entry.trapno, entry.flags);
>  	}
>  }
>  
> 
> 

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

  reply	other threads:[~2013-07-01 23:08 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-07-01 15:38 [PATCH v3 0/3] Firmware first mode for corrected errors Naveen N. Rao
2013-07-01 15:38 ` [PATCH v3 1/3] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Naveen N. Rao
2013-07-01 22:45   ` Borislav Petkov
2013-07-01 15:38 ` [PATCH v3 2/3] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors Naveen N. Rao
2013-07-01 22:49   ` Borislav Petkov
2013-07-01 15:38 ` [PATCH v3 3/3] mce, acpi/apei: Soft-offline a page on firmware GHES notification Naveen N. Rao
2013-07-01 23:08   ` Borislav Petkov [this message]
2013-07-02 11:05     ` Naveen N. Rao
2013-07-02 11:32     ` Naveen N. Rao
2013-07-03 14:44       ` Borislav Petkov
2013-07-03 15:40         ` Naveen N. Rao
2013-07-08 19:00           ` Tony Luck
2013-07-10  9:27             ` Naveen N. Rao
2013-07-10  9:38               ` Borislav Petkov
2013-07-10 20:05                 ` Tony Luck
2013-07-01 20:08 ` [PATCH v3 0/3] Firmware first mode for corrected errors Borislav Petkov
2013-07-02 12:54 ` [PATCH 4] mce: acpi/apei: Add a sysctl to control page offlining on firmware report Naveen N. Rao
2013-07-03 14:46   ` Borislav Petkov
2013-07-03 15:46     ` Naveen N. Rao
2013-07-08 20:26       ` Luck, Tony
2013-07-08 20:26         ` Luck, Tony
2013-07-10  9:17         ` Naveen N. Rao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130701230800.GO23515@pd.tnic \
    --to=bp@alien8.de \
    --cc=ananth@in.ibm.com \
    --cc=lcm@linux.vnet.ibm.com \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=masbock@linux.vnet.ibm.com \
    --cc=naveen.n.rao@linux.vnet.ibm.com \
    --cc=tony.luck@intel.com \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.