[RFC][PATCH v2 2/3] Hold multiple logs

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC][PATCH v2 2/3] Hold multiple logs
@ 2012-07-19 21:13 Seiji Aguchi
  2012-07-19 21:22 ` Luck, Tony
  0 siblings, 1 reply; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-19 21:13 UTC (permalink / raw)
  To: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	Luck, Tony (tony.luck@intel.com), mikew@google.com,
	dzickus@redhat.com, Matthew Garrett (mjg@redhat.com)
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

[Problem]
 When efi_pstore holds just one log and it doesn't overwrite an exisiting entry,
 we lose a critical message if kernel panics while system is rebooting.

 [Solution]
  With this patch, efi_pstore can hold multiple logs with a new kernel parameter, efi_pstore_log_num.
  We can simply avoid losing a critical message in case mutiple events happen.

[Patch Description]
 - Introduce a new kernel parameter specifying the number of logs efi_pstore holds.
 - Pass ctime to an argument of erase callback.
    - Current variable name consists of type, id and ctime. So, when handling mutiple logs,
      pstore should pass ctime to erase callback to avoid erasing invisible entries via /dev/pstore.

Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
---
 Documentation/kernel-parameters.txt |    6 ++++++
 drivers/acpi/apei/erst.c            |    4 ++--
 drivers/firmware/efivars.c          |   33 +++++++++++++++++++++++++--------
 fs/pstore/inode.c                   |    2 +-
 fs/pstore/ram.c                     |    2 +-
 include/linux/pstore.h              |    2 +-
 6 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a92c5eb..9d38561 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -786,6 +786,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	edd=		[EDD]
 			Format: {"off" | "on" | "skip[mbr]"}
 
+	efivars.efi_pstore_log_num=
+			Set the maximum number of logs efi_pstore saves into
+			NVRAM. n >= 1 limits the number of logs. n <= 0 is
+			invalid.
+			default: 1
+
 	eisa_irq_edge=	[PARISC,HW]
 			See header of drivers/parisc/eisa.c.
 
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index e4d9d24..0bd6ae4 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -938,7 +938,7 @@ static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason,
 		       u64 *id, unsigned int part,
 		       size_t size, struct pstore_info *psi);
 static int erst_clearer(enum pstore_type_id type, u64 id,
-			struct pstore_info *psi);
+			struct timespec time, struct pstore_info *psi);
 
 static struct pstore_info erst_info = {
 	.owner		= THIS_MODULE,
@@ -1102,7 +1102,7 @@ static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason,
 }
 
 static int erst_clearer(enum pstore_type_id type, u64 id,
-			struct pstore_info *psi)
+			struct timespec time, struct pstore_info *psi)
 {
 	return erst_clear(id);
 }
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 75a7c82..55188d7 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -628,6 +628,27 @@ efivar_unregister(struct efivar_entry *var)
 
 #ifdef CONFIG_PSTORE
 
+static unsigned long efi_pstore_log_num = 1;
+static int param_set_efi_pstore_log_num(const char *val,
+					struct kernel_param *kp)
+{
+	int ret;
+	unsigned long l;
+
+	ret = kstrtoul(val, 0, &l);
+	if (ret || l == 0)
+		return -EINVAL;
+
+	ret = param_set_ulong(val, kp);
+	if (ret)
+		return -EINVAL;
+
+	return 0;
+}
+
+module_param_call(efi_pstore_log_num, param_set_efi_pstore_log_num,
+		  param_get_ulong, &efi_pstore_log_num, S_IRUGO | S_IWUSR);
+
 static int efi_pstore_open(struct pstore_info *psi)
 {
 	struct efivars *efivars = psi->data;
@@ -731,7 +752,7 @@ static int efi_pstore_write(enum pstore_type_id type,
 
 	current_log_num = get_current_log_num(efi_name, efivars);
 
-	if (current_log_num >= 1) {
+	if (current_log_num >= efi_pstore_log_num) {
 		spin_unlock(&efivars->lock);
 		*id = part;
 		return -EEXIST;
@@ -756,7 +777,7 @@ static int efi_pstore_write(enum pstore_type_id type,
 };
 
 static int efi_pstore_erase(enum pstore_type_id type, u64 id,
-			    struct pstore_info *psi)
+			    struct timespec time, struct pstore_info *psi)
 {
 	char stub_name[DUMP_NAME_LEN];
 	efi_char16_t efi_name[DUMP_NAME_LEN];
@@ -765,7 +786,7 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id,
 	struct efivar_entry *entry, *found = NULL;
 	int i;
 
-	sprintf(stub_name, "dump-type%u-%llu-", type, id);
+	sprintf(stub_name, "dump-type%u-%llu-%lu", type, id, time.tv_sec);
 
 	spin_lock(&efivars->lock);
 
@@ -783,10 +804,6 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id,
 		if (utf16_strncmp(entry->var.VariableName, efi_name,
 				  utf16_strlen(efi_name)))
 			continue;
-		/* Needs to be a prefix */
-		if (entry->var.VariableName[utf16_strlen(efi_name)]
-		    == 0)
-			continue;
 
 		/* found */
 		found = entry;
@@ -832,7 +849,7 @@ static int efi_pstore_write(enum pstore_type_id type,
 }
 
 static int efi_pstore_erase(enum pstore_type_id type, u64 id,
-			    struct pstore_info *psi)
+			    struct timespec time, struct pstore_info *psi)
 {
 	return 0;
 }
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 11a2aa2..9acd703 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -75,7 +75,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
 	struct pstore_private *p = dentry->d_inode->i_private;
 
 	if (p->psi->erase)
-		p->psi->erase(p->type, p->id, p->psi);
+		p->psi->erase(p->type, p->id, dentry->d_inode->i_ctime, p->psi);
 
 	return simple_unlink(dir, dentry);
 }
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 453030f..06357c9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -178,7 +178,7 @@ static int ramoops_pstore_write(enum pstore_type_id type,
 }
 
 static int ramoops_pstore_erase(enum pstore_type_id type, u64 id,
-				struct pstore_info *psi)
+				timespec time, struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
 
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index e1461e1..92cb90e 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -48,7 +48,7 @@ struct pstore_info {
 			enum kmsg_dump_reason reason, u64 *id,
 			unsigned int part, size_t size, struct pstore_info *psi);
 	int		(*erase)(enum pstore_type_id type, u64 id,
-			struct pstore_info *psi);
+			struct timespec time, struct pstore_info *psi);
 	void		*data;
 };
 
-- 1.7.1 


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-19 21:13 [RFC][PATCH v2 2/3] Hold multiple logs Seiji Aguchi
@ 2012-07-19 21:22 ` Luck, Tony
  2012-07-19 21:43   ` Seiji Aguchi
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-19 21:22 UTC (permalink / raw)
  To: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dzickus@redhat.com, Matthew Garrett (mjg@redhat.com)
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

>  With this patch, efi_pstore can hold multiple logs with a new kernel parameter, efi_pstore_log_num.

How big a value for efi_pstore_log_num have you tried? Did you see any problems with EFI running
out of space? Do you get some helpful error message if you pick a number that is too big?

-Tony

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-19 21:22 ` Luck, Tony
@ 2012-07-19 21:43   ` Seiji Aguchi
  2012-07-19 22:10     ` Luck, Tony
  0 siblings, 1 reply; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-19 21:43 UTC (permalink / raw)
  To: Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dzickus@redhat.com, Matthew Garrett (mjg@redhat.com)
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

> How big a value for efi_pstore_log_num have you tried? Did you see any problems with EFI running out of space? Do you get some
> helpful error message if you pick a number that is too big?

Please calm down... This is RFC patch.

If users specify a number of that is too big, the message will be meaningless.
I just couldn't decide the appropriate number by myself.
Then, I make it tunable.

So, I would like to hear about your opinion.

Do you agree with an idea holding multiple logs?
If so, which number is appropriate? 

Seiji

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-19 21:43   ` Seiji Aguchi
@ 2012-07-19 22:10     ` Luck, Tony
  2012-07-19 23:08       ` Seiji Aguchi
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-19 22:10 UTC (permalink / raw)
  To: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dzickus@redhat.com, Matthew Garrett (mjg@redhat.com)
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

> If users specify a number of that is too big, the message will be meaningless.
> I just couldn't decide the appropriate number by myself.
> Then, I make it tunable.

I think that 3 or 4 logs should be plenty to cover almost all situations. E.g.
with 3 logs you could capture 2 OOPS (and perhaps miss some other OOPS)
and then get the final panic that kills the system.  Messier crashes are of
course possible ... but that would give lots of clues on where the problems
lie.

I was just wondering whether you had successfully stored 2, or 3, or
more logs - and if you'd seen any problems doing so.  Matthew seemed
very worried about the amount of available space for EFI.

If you don't know what is the appropriate number ... then how will
users decide? We should really give them some guidance ... especially
if there are odd problems[1] if they pick a number that is too big.

-Tony

[1] I don't know if there will be problems ... I don't know what else
EFI will store here, and what would happen if it ran out of space.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-19 22:10     ` Luck, Tony
@ 2012-07-19 23:08       ` Seiji Aguchi
  2012-07-19 23:42         ` Luck, Tony
  0 siblings, 1 reply; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-19 23:08 UTC (permalink / raw)
  To: Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dzickus@redhat.com, Matthew Garrett (mjg@redhat.com)
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

>
> I think that 3 or 4 logs should be plenty to cover almost all situations. E.g.
> with 3 logs you could capture 2 OOPS (and perhaps miss some other OOPS) and then get the final panic that kills the system.  Messier
> crashes are of course possible ... but that would give lots of clues on where the problems lie.
>

Thank you for letting my know your idea.

Let me explain my opinion.
If you are concerned about multiple OOPS case, I think an user app which logs from /dev/pstore to /var/log should be developed.
Once it is developed, we don't need to care about multiple oops case and the appropriate number is two.

- In case where system is workable after oops.
The user app will erase an entry in NVRAM.
And we can get the message via /var/log.

- In case where system hangs up or panics due to the oops.
Oops is the critical message and we don't need care about subsequent events.
 
What do you think?

> If you don't know what is the appropriate number ... then how will users decide? We should really give them some guidance ...
> especially if there are odd problems[1] if they pick a number that is too big.

You are right.
There is no user app above right now. So, I was in stuck...
But I understand I shouldn't have introduce efi_pstore_log_num.

Seiji



^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-19 23:08       ` Seiji Aguchi
@ 2012-07-19 23:42         ` Luck, Tony
  2012-07-20  0:39           ` Seiji Aguchi
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-19 23:42 UTC (permalink / raw)
  To: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dzickus@redhat.com, Matthew Garrett (mjg@redhat.com)
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya

> If you are concerned about multiple OOPS case, I think an user app which logs from /dev/pstore to /var/log should be developed.

Agreed - we need an app/daemon to do this.

> Once it is developed, we don't need to care about multiple oops case and the appropriate number is two.

Only if you can guarantee that the app/daemon will run and save the first OOPS before the next
occurs. Even if the system were running normally this might be difficult to achieve ... but in this
case we know the system isn't running normally (it just OOPSed twice!).

However - there is progressively less value in collecting additional consecutive OOPS. Perhaps
one is enough 90% or even 99% of the time. I'm naturally paranoid so having two or three
would make me feel happy that most of the remaining 10% or 1% of the cases were covered.

> - In case where system is workable after oops.
> The user app will erase an entry in NVRAM.
> And we can get the message via /var/log.

Yes - the system can keep running after many types of OOPs - so the OOPS will be logged in /var/log (or by the app/daemon
copying from pstore, or both).

> - In case where system hangs up or panics due to the oops.
> Oops is the critical message and we don't need care about subsequent events.

Yes - if the OOPs is instrumental in the path leading to the hang/panic - then the OOPS is the
first place to look for the root cause of the problem. But it will be a case by case analysis.
Sometimes the OOPS might be unconnected. If possible we'd like to log more information
to allow detective work to decide whether there is a connection. But as I mentioned above
there are severe limits to how much better things are by storing more information.

-Tony

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-19 23:42         ` Luck, Tony
@ 2012-07-20  0:39           ` Seiji Aguchi
  2012-07-20  3:03             ` Don Zickus
  0 siblings, 1 reply; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-20  0:39 UTC (permalink / raw)
  To: Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dzickus@redhat.com, Matthew Garrett (mjg@redhat.com)
  Cc: dle-develop@lists.sourceforge.net, Satoru Moriya


Thank you for describing this in detail.

> Yes - if the OOPs is instrumental in the path leading to the hang/panic - then the OOPS is the first place to look for the root cause of
> the problem. But it will be a case by case analysis.
> Sometimes the OOPS might be unconnected. If possible we'd like to log more information to allow detective work to decide whether
> there is a connection. But as I mentioned above there are severe limits to how much better things are by storing more information.

I understand the reason why you think 3 or 4 logs are reasonable.
There are some cases  2nd or 3rd oops is critical....

I have some enterprise customers who are sensitive for a software failure  and specify panic_on_oops=1.
In this case, they don't need 3,4 logs. 2 logs  are enough.

So, kernel parameter should be as follows.

Log_num =1
  - For users who want to hold just one log.

Log_num=2
  - For users who can handle multiple logs and 1st oops is concerned. (by specifying panic_on_oops=1)

Log_num=3,4
 -  for users who care about 2nd or 3rd oops.

Log_num=5 or more
Invalid value.

If there is misunderstanding, please let me know.

Seiji

> -----Original Message-----
> From: Luck, Tony [mailto:tony.luck@intel.com]
> Sent: Thursday, July 19, 2012 7:42 PM
> To: Seiji Aguchi; linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org; mikew@google.com; dzickus@redhat.com; Matthew
> Garrett (mjg@redhat.com)
> Cc: dle-develop@lists.sourceforge.net; Satoru Moriya
> Subject: RE: [RFC][PATCH v2 2/3] Hold multiple logs
> 
> > If you are concerned about multiple OOPS case, I think an user app which logs from /dev/pstore to /var/log should be developed.
> 
> Agreed - we need an app/daemon to do this.
> 
> > Once it is developed, we don't need to care about multiple oops case and the appropriate number is two.
> 
> Only if you can guarantee that the app/daemon will run and save the first OOPS before the next occurs. Even if the system were
> running normally this might be difficult to achieve.. but in this case we know the system isn't running normally (it just OOPSed twice!).
> 
> However - there is progressively less value in collecting additional consecutive OOPS. Perhaps one is enough 90% or even 99% of the
> time. I'm naturally paranoid so having two or three would make me feel happy that most of the remaining 10% or 1% of the cases
> were covered.
> 
> > - In case where system is workable after oops.
> > The user app will erase an entry in NVRAM.
> > And we can get the message via /var/log.
> 
> Yes - the system can keep running after many types of OOPs - so the OOPS will be logged in /var/log (or by the app/daemon copying
> from pstore, or both).
> 
> > - In case where system hangs up or panics due to the oops.
> > Oops is the critical message and we don't need care about subsequent events.
> 
> Yes - if the OOPs is instrumental in the path leading to the hang/panic - then the OOPS is the first place to look for the root cause of
> the problem. But it will be a case by case analysis.
> Sometimes the OOPS might be unconnected. If possible we'd like to log more information to allow detective work to decide whether
> there is a connection. But as I mentioned above there are severe limits to how much better things are by storing more information.
> 
> -Tony

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-20  0:39           ` Seiji Aguchi
@ 2012-07-20  3:03             ` Don Zickus
  2012-07-20 13:24               ` Seiji Aguchi
  2012-07-23 14:16               ` Matthew Garrett
  0 siblings, 2 replies; 23+ messages in thread
From: Don Zickus @ 2012-07-20  3:03 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	Matthew Garrett (mjg@redhat.com),
	dle-develop@lists.sourceforge.net, Satoru Moriya

On Fri, Jul 20, 2012 at 12:39:24AM +0000, Seiji Aguchi wrote:
> 
> Thank you for describing this in detail.
> 
> > Yes - if the OOPs is instrumental in the path leading to the hang/panic - then the OOPS is the first place to look for the root cause of
> > the problem. But it will be a case by case analysis.
> > Sometimes the OOPS might be unconnected. If possible we'd like to log more information to allow detective work to decide whether
> > there is a connection. But as I mentioned above there are severe limits to how much better things are by storing more information.
> 
> I understand the reason why you think 3 or 4 logs are reasonable.
> There are some cases  2nd or 3rd oops is critical....
> 
> I have some enterprise customers who are sensitive for a software failure  and specify panic_on_oops=1.
> In this case, they don't need 3,4 logs. 2 logs  are enough.
> 
> So, kernel parameter should be as follows.
> 
> Log_num =1
>   - For users who want to hold just one log.
> 
> Log_num=2
>   - For users who can handle multiple logs and 1st oops is concerned. (by specifying panic_on_oops=1)
> 
> Log_num=3,4
>  -  for users who care about 2nd or 3rd oops.
> 
> Log_num=5 or more
> Invalid value.

What is the harm of not using this and just letting the number be infinite
(or until EFI runs out of space)?  Is it a big deal if extra failures are
logged?

The hope would be a daemon would clear the old logs out and you never run
out of space.

Cheers,
Don

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-20  3:03             ` Don Zickus
@ 2012-07-20 13:24               ` Seiji Aguchi
  2012-07-20 13:42                 ` Seiji Aguchi
  2012-07-20 14:29                 ` Don Zickus
  2012-07-23 14:16               ` Matthew Garrett
  1 sibling, 2 replies; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-20 13:24 UTC (permalink / raw)
  To: Don Zickus
  Cc: Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	Matthew Garrett (mjg@redhat.com),
	dle-develop@lists.sourceforge.net, Satoru Moriya


> What is the harm of not using this and just letting the number be infinite (or until EFI runs out of space)?  Is it a big deal if extra failures
> are logged?


There may be someone using NVRAM for other purposes.
Actually, we have the user interface, /sys/firmware/efi/vars/new_vars, del_vars.

In this case, they want to avoid filling with unneeded logs. 

> The hope would be a daemon would clear the old logs out and you never run out of space.

In most case
But as Tony mentioned, NVRAM may be filled with multiple oops even if we have the daemon.

Seiji

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-20 13:24               ` Seiji Aguchi
@ 2012-07-20 13:42                 ` Seiji Aguchi
  2012-07-20 14:29                 ` Don Zickus
  1 sibling, 0 replies; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-20 13:42 UTC (permalink / raw)
  To: Seiji Aguchi, Don Zickus
  Cc: Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	Matthew Garrett (mjg@redhat.com),
	dle-develop@lists.sourceforge.net, Satoru Moriya

> > The hope would be a daemon would clear the old logs out and you never run out of space.
> 
> In most case
> But as Tony mentioned, NVRAM may be filled with multiple oops even if we have the daemon.

Oops...
I meant that in most case, the daemon can erase entries.
But  as Tony mentioned, NVRAM may be filled with multiple oops even if we have the daemon.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-20 13:24               ` Seiji Aguchi
  2012-07-20 13:42                 ` Seiji Aguchi
@ 2012-07-20 14:29                 ` Don Zickus
  2012-07-20 16:56                   ` Luck, Tony
  1 sibling, 1 reply; 23+ messages in thread
From: Don Zickus @ 2012-07-20 14:29 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	Matthew Garrett (mjg@redhat.com),
	dle-develop@lists.sourceforge.net, Satoru Moriya

On Fri, Jul 20, 2012 at 01:24:38PM +0000, Seiji Aguchi wrote:
> 
> > What is the harm of not using this and just letting the number be infinite (or until EFI runs out of space)?  Is it a big deal if extra failures
> > are logged?
> 
> 
> There may be someone using NVRAM for other purposes.
> Actually, we have the user interface, /sys/firmware/efi/vars/new_vars, del_vars.
> 
> In this case, they want to avoid filling with unneeded logs. 

Sure, which is why it needs to be cleaned out regularly.  This isn't much
different than with /var/logs or even /boot.  The former can suck up lots
of disk space for no reason and the later can use up limited space if you
install to many kernels.

I don't know, I am not a fan of this policy in the kernel, I would rather
keep it simple, like an on/off switch.  If you decide you want to use efi
for logging than you better make sure you have an app to clean it up.
Otherwise the user shouldn't turn it on.

Creating rules like this seems to be complicating things to me IMHO.  But
I will gladly defer to someone else's judgement.

Cheers,
Don

> 
> > The hope would be a daemon would clear the old logs out and you never run out of space.
> 
> In most case
> But as Tony mentioned, NVRAM may be filled with multiple oops even if we have the daemon.
> 
> Seiji

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-20 14:29                 ` Don Zickus
@ 2012-07-20 16:56                   ` Luck, Tony
  2012-07-20 18:49                     ` Seiji Aguchi
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-20 16:56 UTC (permalink / raw)
  To: Don Zickus, Seiji Aguchi
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	mikew@google.com, Matthew Garrett (mjg@redhat.com),
	dle-develop@lists.sourceforge.net, Satoru Moriya

> > What is the harm of not using this and just letting the number be infinite (or until EFI runs out of space)?  Is it a big deal if extra failures
> > are logged?

The big question is what happens when EFI runs out of space. Matthew avoided
the question by implementing the "just one record" policy. Was this because he
has some specific knowledge about what happens, or does he just have an
irrational[1] fear that the EFI implementation will handle this poorly?

Without some tests on at least a couple of different platforms that show that
EFI handles out of space conditions gracefully, I continue to have concerns.

-Tony

[1] Perhaps his fears are rational given how many other places he has found
EFI not doing what we'd expect or want.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-20 16:56                   ` Luck, Tony
@ 2012-07-20 18:49                     ` Seiji Aguchi
  0 siblings, 0 replies; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-20 18:49 UTC (permalink / raw)
  To: Luck, Tony, Don Zickus
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	mikew@google.com, Matthew Garrett (mjg@redhat.com),
	dle-develop@lists.sourceforge.net, Satoru Moriya

Tony,

> The big question is what happens when EFI runs out of space.

According to EFI specification, set_variable service returns EFI_OUT_OF_RESOURCES.
If it doesn't work, the firmware should be fixed.

>Matthew avoided the question by implementing the "just one record" policy.

I think we need to ask Matthew the reason why he introduced "just one record" policy 
rather than guessing it by reading his original source code.

Seiji

> -----Original Message-----
> From: Luck, Tony [mailto:tony.luck@intel.com]
> Sent: Friday, July 20, 2012 12:56 PM
> To: Don Zickus; Seiji Aguchi
> Cc: linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org; mikew@google.com; Matthew Garrett (mjg@redhat.com); dle-
> develop@lists.sourceforge.net; Satoru Moriya
> Subject: RE: [RFC][PATCH v2 2/3] Hold multiple logs
> 
> > > What is the harm of not using this and just letting the number be
> > > infinite (or until EFI runs out of space)?  Is it a big deal if extra failures are logged?
> 
> The big question is what happens when EFI runs out of space. Matthew avoided the question by implementing the "just one record"
> policy. Was this because he has some specific knowledge about what happens, or does he just have an irrational[1] fear that the EFI
> implementation will handle this poorly?
> 
> Without some tests on at least a couple of different platforms that show that EFI handles out of space conditions gracefully, I continue
> to have concerns.
> 
> -Tony
> 
> [1] Perhaps his fears are rational given how many other places he has found EFI not doing what we'd expect or want.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-20  3:03             ` Don Zickus
  2012-07-20 13:24               ` Seiji Aguchi
@ 2012-07-23 14:16               ` Matthew Garrett
  2012-07-24 17:23                 ` Seiji Aguchi
  1 sibling, 1 reply; 23+ messages in thread
From: Matthew Garrett @ 2012-07-23 14:16 UTC (permalink / raw)
  To: Don Zickus
  Cc: Seiji Aguchi, Luck, Tony, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dle-develop@lists.sourceforge.net, Satoru Moriya

On Thu, Jul 19, 2012 at 11:03:28PM -0400, Don Zickus wrote:

> What is the harm of not using this and just letting the number be infinite
> (or until EFI runs out of space)?  Is it a big deal if extra failures are
> logged?

Running out of space in EFI isn't a well-tested scenario, and I wouldn't 
expect all firmware to handle it gracefully. This is made worse by EFI 1 
not providing any information about available storage. I'd be fine with 
changing the default number of entries on systems where we can obtain 
the appropriate information to make that decision, but otherwise I think 
it should be limited to 1.

-- 
Matthew Garrett | mjg59@srcf.ucam.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-23 14:16               ` Matthew Garrett
@ 2012-07-24 17:23                 ` Seiji Aguchi
  2012-07-24 17:52                   ` Luck, Tony
  0 siblings, 1 reply; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-24 17:23 UTC (permalink / raw)
  To: Luck, Tony (tony.luck@intel.com)
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	mikew@google.com, dle-develop@lists.sourceforge.net,
	Satoru Moriya, Matthew Garrett (mjg@redhat.com),
	dzickus@redhat.com

Tony,

I think all guys agree to hold multiple logs.
On the other hand, we have different opinions on overwriting policy.

So, I would like to find a way to fix this issue ,losing critical message, without overwriting policy at first.

I talked with Matthew a bit privately and he suggested to use QueryVariableInfo service which is supported in EFI 2.0 or later.
If we can use it, we know the remaining NVRAM space before calling SetVariable.

This mean that we can avoid the situation which efi_pstore have to handle out of space conditions.
Also, we don't need to introduce a new kernel parameter by just holding multiple logs.

What do you think?

Seiji

> -----Original Message-----
> From: Matthew Garrett [mailto:mjg@redhat.com]
> Sent: Monday, July 23, 2012 10:17 AM
> To: Don Zickus
> Cc: Seiji Aguchi; Luck, Tony; linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org; mikew@google.com; dle-
> develop@lists.sourceforge.net; Satoru Moriya
> Subject: Re: [RFC][PATCH v2 2/3] Hold multiple logs
> 
> On Thu, Jul 19, 2012 at 11:03:28PM -0400, Don Zickus wrote:
> 
> > What is the harm of not using this and just letting the number be
> > infinite (or until EFI runs out of space)?  Is it a big deal if extra
> > failures are logged?
> 
> Running out of space in EFI isn't a well-tested scenario, and I wouldn't expect all firmware to handle it gracefully. This is made worse by
> EFI 1 not providing any information about available storage. I'd be fine with changing the default number of entries on systems where
> we can obtain the appropriate information to make that decision, but otherwise I think it should be limited to 1.
> 
> --
> Matthew Garrett | mjg59@srcf.ucam.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 17:23                 ` Seiji Aguchi
@ 2012-07-24 17:52                   ` Luck, Tony
  2012-07-24 18:18                     ` Matthew Garrett
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-24 17:52 UTC (permalink / raw)
  To: Seiji Aguchi
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	mikew@google.com, dle-develop@lists.sourceforge.net,
	Satoru Moriya, Matthew Garrett (mjg@redhat.com),
	dzickus@redhat.com

> I talked with Matthew a bit privately and he suggested to use QueryVariableInfo service which is supported in EFI 2.0 or later.
> If we can use it, we know the remaining NVRAM space before calling SetVariable.

So we can have (pseudo)-code like this:


	if (QueryVariableInfo says enough space)
		pstore saves log as new record
	else
		we consider over-write options to re-use an existing record, or just drop this one

That looks like a good solution - platforms that provide enough non-volatile memory for
multiple records make use of it for better diagnosis. Those that don't have available memory
don't get to test their "what do we do when we run out of space" code.

-Tony

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 17:52                   ` Luck, Tony
@ 2012-07-24 18:18                     ` Matthew Garrett
  2012-07-24 19:57                       ` Luck, Tony
  0 siblings, 1 reply; 23+ messages in thread
From: Matthew Garrett @ 2012-07-24 18:18 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dle-develop@lists.sourceforge.net, Satoru Moriya,
	dzickus@redhat.com

On Tue, Jul 24, 2012 at 05:52:25PM +0000, Luck, Tony wrote:
> 	if (QueryVariableInfo says enough space)
> 		pstore saves log as new record
> 	else
> 		we consider over-write options to re-use an existing record, or just drop this one

I'd lean towards saying drop, and rely on userspace to do something 
useful. Personal experience is that if two oopses are unrelated then 
there's enough time for userspace to do something and remove the 
existing record, and if they're related it's the first one that tells 
you where the problem actually is.

One thing that's worth noting - UEFI systems will typically only recover 
deleted space on reset. create->delete->create->delete will reduce 
available space until the platform is rebooted, at which point the 
deleted portion will become available again.

-- 
Matthew Garrett | mjg59@srcf.ucam.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 18:18                     ` Matthew Garrett
@ 2012-07-24 19:57                       ` Luck, Tony
  2012-07-24 20:39                         ` Seiji Aguchi
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-24 19:57 UTC (permalink / raw)
  To: Matthew Garrett
  Cc: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dle-develop@lists.sourceforge.net, Satoru Moriya,
	dzickus@redhat.com

> One thing that's worth noting - UEFI systems will typically only recover 
> deleted space on reset. create->delete->create->delete will reduce 
> available space until the platform is rebooted, at which point the 
> deleted portion will become available again.

Some ACPI/ERST systems do this too :-(

-Tony

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 19:57                       ` Luck, Tony
@ 2012-07-24 20:39                         ` Seiji Aguchi
  2012-07-24 20:54                           ` Luck, Tony
  0 siblings, 1 reply; 23+ messages in thread
From: Seiji Aguchi @ 2012-07-24 20:39 UTC (permalink / raw)
  To: Luck, Tony, Matthew Garrett
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	mikew@google.com, dle-develop@lists.sourceforge.net,
	Satoru Moriya, dzickus@redhat.com

> > One thing that's worth noting - UEFI systems will typically only
> > recover deleted space on reset. create->delete->create->delete will
> > reduce available space until the platform is rebooted, at which point
> > the deleted portion will become available again.
> 
> Some ACPI/ERST systems do this too :-(

So, we don't need to introduce a overwriting policy.
I will make a patch using QueryVariableInfo and just writing multiple logs.

Seiji

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 20:39                         ` Seiji Aguchi
@ 2012-07-24 20:54                           ` Luck, Tony
  2012-07-24 21:07                             ` Matthew Garrett
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-24 20:54 UTC (permalink / raw)
  To: Seiji Aguchi, Matthew Garrett
  Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	mikew@google.com, dle-develop@lists.sourceforge.net,
	Satoru Moriya, dzickus@redhat.com

> So, we don't need to introduce a overwriting policy.
> I will make a patch using QueryVariableInfo and just writing multiple logs.

I don't think that's what Matthew said.

Here's the bad scenario he envisions:

System is running. It has an OOPs, which gets logged by pstore, and the system carries on running.

The new daemon that we said we needed earlier in this e-mail thread finds the entry in pstore and
copies it to some place in /var/log and removes the pstore entry - causing pstore to ask EFI (or ERST)
backend to erase the record. Firmware does the erase, but doesn't put the space back into the
free pool for use.

Repeat with more OOPs until all the EFI (or ERST) space has been allocated and then lost into
firmware limbo waiting for a reset.

Now we panic. Pstore asks EFI "Do you have any space?" EFI replies "No". Pstore can't even overwrite
one of the old OOPs records ... because it thinks they have all been erased. So we lose the panic
log.

-Tony

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 20:54                           ` Luck, Tony
@ 2012-07-24 21:07                             ` Matthew Garrett
  2012-07-24 21:12                               ` Luck, Tony
  0 siblings, 1 reply; 23+ messages in thread
From: Matthew Garrett @ 2012-07-24 21:07 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dle-develop@lists.sourceforge.net, Satoru Moriya,
	dzickus@redhat.com

On Tue, Jul 24, 2012 at 08:54:59PM +0000, Luck, Tony wrote:

> Now we panic. Pstore asks EFI "Do you have any space?" EFI replies "No". Pstore can't even overwrite
> one of the old OOPs records ... because it thinks they have all been erased. So we lose the panic
> log.

I think we inevitably lose in that scenario. I'd need to verify, but my 
recollection is that overwriting existing variables may be equivalent to 
a delete/create cycle.

-- 
Matthew Garrett | mjg59@srcf.ucam.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 21:07                             ` Matthew Garrett
@ 2012-07-24 21:12                               ` Luck, Tony
  2012-07-24 21:26                                 ` Matthew Garrett
  0 siblings, 1 reply; 23+ messages in thread
From: Luck, Tony @ 2012-07-24 21:12 UTC (permalink / raw)
  To: Matthew Garrett
  Cc: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dle-develop@lists.sourceforge.net, Satoru Moriya,
	dzickus@redhat.com

> I think we inevitably lose in that scenario. I'd need to verify, but my 
> recollection is that overwriting existing variables may be equivalent to 
> a delete/create cycle.

This would mean that EFI really wants the OS to treat EFI variables as pretty much
exclusively read-only.  Any activity which periodically updates a variable would
eventually run into problems in an EFI implementation that loses the old space
until a reset.

Sad.

-Tony

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC][PATCH v2 2/3] Hold multiple logs
  2012-07-24 21:12                               ` Luck, Tony
@ 2012-07-24 21:26                                 ` Matthew Garrett
  0 siblings, 0 replies; 23+ messages in thread
From: Matthew Garrett @ 2012-07-24 21:26 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Seiji Aguchi, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, mikew@google.com,
	dle-develop@lists.sourceforge.net, Satoru Moriya,
	dzickus@redhat.com

On Tue, Jul 24, 2012 at 09:12:25PM +0000, Luck, Tony wrote:
> > I think we inevitably lose in that scenario. I'd need to verify, but my 
> > recollection is that overwriting existing variables may be equivalent to 
> > a delete/create cycle.
> 
> This would mean that EFI really wants the OS to treat EFI variables as pretty much
> exclusively read-only.  Any activity which periodically updates a variable would
> eventually run into problems in an EFI implementation that loses the old space
> until a reset.

Sure. I'll test with a few implementations and see what I can figure 
out. We may just want to reserve some space for pstore, then have delete 
in pstore simply map to hiding the entries rather than deleting them.

-- 
Matthew Garrett | mjg59@srcf.ucam.org

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2012-07-24 21:26 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-07-19 21:13 [RFC][PATCH v2 2/3] Hold multiple logs Seiji Aguchi
2012-07-19 21:22 ` Luck, Tony
2012-07-19 21:43   ` Seiji Aguchi
2012-07-19 22:10     ` Luck, Tony
2012-07-19 23:08       ` Seiji Aguchi
2012-07-19 23:42         ` Luck, Tony
2012-07-20  0:39           ` Seiji Aguchi
2012-07-20  3:03             ` Don Zickus
2012-07-20 13:24               ` Seiji Aguchi
2012-07-20 13:42                 ` Seiji Aguchi
2012-07-20 14:29                 ` Don Zickus
2012-07-20 16:56                   ` Luck, Tony
2012-07-20 18:49                     ` Seiji Aguchi
2012-07-23 14:16               ` Matthew Garrett
2012-07-24 17:23                 ` Seiji Aguchi
2012-07-24 17:52                   ` Luck, Tony
2012-07-24 18:18                     ` Matthew Garrett
2012-07-24 19:57                       ` Luck, Tony
2012-07-24 20:39                         ` Seiji Aguchi
2012-07-24 20:54                           ` Luck, Tony
2012-07-24 21:07                             ` Matthew Garrett
2012-07-24 21:12                               ` Luck, Tony
2012-07-24 21:26                                 ` Matthew Garrett

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox