From: Sam Bobroff <sbobroff@linux.ibm.com>
To: linuxppc-dev@lists.ozlabs.org
Cc: oohall@gmail.com
Subject: [PATCH RFC 03/15] powerpc/eeh: Track orphaned struct eeh_pe
Date: Wed, 2 Oct 2019 16:02:41 +1000 [thread overview]
Message-ID: <63e68b88d6fdc779abbb03869e4f5e33c7c214ed.1569996166.git.sbobroff@linux.ibm.com> (raw)
In-Reply-To: <cover.1569996166.git.sbobroff@linux.ibm.com>
To facilitate debugging of (the inevitable) refcounting problems with
struct eeh_pes, detect when a struct eeh_pe has been removed from it's
global PHB list but not yet freed ("orphaned"), and collect these
PEs in a list.
They should only remain in the list briefly during processing, so any
PEs that stay longer will be the result of bugs.
The list can be examined by reading from the "eeh_pe_debug" file in
debugfs.
Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
---
arch/powerpc/include/asm/eeh.h | 4 +++
arch/powerpc/kernel/eeh.c | 21 ++++++++++++++
arch/powerpc/kernel/eeh_pe.c | 53 +++++++++++++++++++++++++++++++++-
3 files changed, 77 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index e6f461d07426..df843d04268d 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -14,6 +14,7 @@
#include <linux/time.h>
#include <linux/atomic.h>
#include <linux/kref.h>
+#include <linux/seq_file.h>
#include <uapi/asm/eeh.h>
@@ -66,6 +67,7 @@ struct pci_dn;
#define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */
#define EEH_PE_CFG_BLOCKED (1 << 2) /* Block config access */
#define EEH_PE_RESET (1 << 3) /* PE reset in progress */
+#define EEH_PE_ORPHAN (1 << 4) /* Removal pending */
#define EEH_PE_KEEP (1 << 8) /* Keep PE on hotplug */
#define EEH_PE_CFG_RESTRICTED (1 << 9) /* Block config on error */
@@ -103,6 +105,7 @@ struct eeh_pe {
unsigned long stack_trace[64];
int trace_entries;
#endif /* CONFIG_STACKTRACE */
+ struct list_head orphan_entry; /* Memb. eeh_orphan_pes */
};
#define eeh_pe_for_each_dev(pe, edev, tmp) \
@@ -280,6 +283,7 @@ typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
void eeh_lock_pes(unsigned long *flags);
void eeh_unlock_pes(unsigned long flags);
+void eeh_pe_seq_show_orphans(struct seq_file *s);
void eeh_set_pe_aux_size(int size);
void eeh_get_pe(struct eeh_pe *pe);
void eeh_put_pe(struct eeh_pe *pe);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 7e36ad0cfa2b..7eb6ca1ab72b 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -2153,6 +2153,24 @@ static const struct file_operations eeh_dev_break_fops = {
.read = eeh_debugfs_dev_usage,
};
+static int eeh_pe_debug_seq_file(struct seq_file *s, void *unused)
+{
+ eeh_pe_seq_show_orphans(s);
+ return 0;
+}
+
+static int eeh_pe_debug_dbgfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, eeh_pe_debug_seq_file, inode->i_private);
+}
+
+static const struct file_operations eeh_pe_debug_fops = {
+ .owner = THIS_MODULE,
+ .open = eeh_pe_debug_dbgfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
static int __init eeh_init_proc(void)
@@ -2177,6 +2195,9 @@ static int __init eeh_init_proc(void)
debugfs_create_file_unsafe("eeh_force_recover", 0600,
powerpc_debugfs_root, NULL,
&eeh_force_recover_fops);
+ debugfs_create_file_unsafe("eeh_pe_debug", 0400,
+ powerpc_debugfs_root, NULL,
+ &eeh_pe_debug_fops);
eeh_cache_debugfs_init();
#endif
}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index fda52d1989c3..aa279474a928 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -21,6 +21,7 @@
static int eeh_pe_aux_size = 0;
static LIST_HEAD(eeh_phb_pe);
+static LIST_HEAD(eeh_orphan_pes);
/*
* Lock protecting the parent and child list fields of struct eeh_pe (this does
@@ -30,6 +31,7 @@ static LIST_HEAD(eeh_phb_pe);
* via it's parent or child fields will be freed.
*/
static DEFINE_RAW_SPINLOCK(eeh_pe_lock);
+static DEFINE_RAW_SPINLOCK(eeh_orphan_lock);
void eeh_lock_pes(unsigned long *flags)
{
@@ -89,9 +91,24 @@ static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
return pe;
}
+void eeh_pe_seq_show_orphans(struct seq_file *s)
+{
+ unsigned long flags;
+ struct eeh_pe *pe;
+
+ seq_puts(s, "Orphaned PEs awaiting free:\n");
+ raw_spin_lock_irqsave(&eeh_orphan_lock, flags);
+ list_for_each_entry(pe, &eeh_orphan_pes, orphan_entry)
+ seq_printf(s, "PHB#%x PE#%x Refcount:%d\n",
+ pe->phb->global_number, pe->addr,
+ kref_read(&pe->kref));
+ raw_spin_unlock_irqrestore(&eeh_orphan_lock, flags);
+}
+
static void eeh_pe_free(struct kref *kref)
{
struct eeh_pe *pe = container_of(kref, struct eeh_pe, kref);
+ unsigned long flags;
if (WARN_ONCE(pe->type & EEH_PE_PHB,
"EEH: PHB#%x-PE#%x: Attempt to free PHB PE!\n",
@@ -103,6 +120,13 @@ static void eeh_pe_free(struct kref *kref)
pe->phb->global_number, pe->addr))
return;
+ raw_spin_lock_irqsave(&eeh_orphan_lock, flags);
+ if (pe->state & EEH_PE_ORPHAN) {
+ pr_warn("EEH: PHB#%x-PE#%x: Orphan freed\n",
+ pe->phb->global_number, pe->addr);
+ list_del(&pe->orphan_entry);
+ }
+ raw_spin_unlock_irqrestore(&eeh_orphan_lock, flags);
kfree(pe);
}
@@ -117,9 +141,36 @@ void eeh_get_pe(struct eeh_pe *pe)
void eeh_put_pe(struct eeh_pe *pe)
{
+ int rv, global_number, addr;
+ unsigned long flags;
+
if (!pe)
return;
- kref_put(&pe->kref, eeh_pe_free);
+
+ global_number = pe->phb->global_number;
+ addr = pe->addr;
+ rv = kref_put(&pe->kref, eeh_pe_free);
+ raw_spin_lock_irqsave(&eeh_orphan_lock, flags);
+ if (!rv && !pe->parent && !(pe->type & EEH_PE_PHB) &&
+ !(pe->state & EEH_PE_ORPHAN)) {
+ /* A PE's parent pointer is only set to NULL when it's removed
+ * from it's PHB's list (in eeh_rmv_from_parent_pe()), and that
+ * is followed by a call to eeh_put_pe(). If eeh_put_pe()
+ * doesn't cause the PE to be free'd, then a ref is being held
+ * elsewhere that should be released quickly. Track these PEs
+ * so that if they are not released, they can be found
+ * to assist debugging.
+ */
+ pr_info("EEH: PHB#%x-PE#%x: Orphaned (unlinked from global list but still referenced)\n",
+ global_number, addr);
+
+ pe->state |= EEH_PE_ORPHAN;
+ list_add_tail(&pe->orphan_entry, &eeh_orphan_pes);
+ }
+ raw_spin_unlock_irqrestore(&eeh_orphan_lock, flags);
+ if (rv)
+ pr_debug("EEH: PHB#%x-PE#%x: Final reference released.\n",
+ global_number, addr);
}
void eeh_pe_move_to_parent(struct eeh_pe **pe)
--
2.22.0.216.g00a2a96fc9
next prev parent reply other threads:[~2019-10-02 6:23 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-10-02 6:02 [PATCH RFC 00/15] powerpc/eeh: Synchronize access to struct eeh_pe Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 01/15] powerpc/eeh: Introduce refcounting for " Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 02/15] powerpc/eeh: Rename eeh_pe_get() to eeh_pe_find() Sam Bobroff
2019-11-13 2:32 ` Oliver O'Halloran
2019-10-02 6:02 ` Sam Bobroff [this message]
2019-10-02 6:02 ` [PATCH RFC 04/15] powerpc/eeh: Sync eeh_pe_next(), eeh_pe_find() and early-out traversals Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 05/15] powerpc/eeh: Sync eeh_pe_get_parent() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 06/15] powerpc/eeh: Sync eeh_phb_pe_get() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 07/15] powerpc/eeh: Sync eeh_add_to_parent_pe() and eeh_rmv_from_parent_pe() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 08/15] powerpc/eeh: Sync eeh_handle_normal_event() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 09/15] powerpw/eeh: Sync eeh_handle_special_event(), pnv_eeh_get_pe(), pnv_eeh_next_error() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 10/15] powerpc/eeh: Sync eeh_phb_check_failure() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 11/15] powerpc/eeh: Sync eeh_dev_check_failure() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 12/15] powerpc/eeh: Sync eeh_pe_get_state() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 13/15] powerpc/eeh: Sync pnv_eeh_ei_write() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 14/15] powerpc/eeh: Sync eeh_force_recover_write() Sam Bobroff
2019-10-02 6:02 ` [PATCH RFC 15/15] powerpc/eeh: Sync pcibios_set_pcie_reset_state() Sam Bobroff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=63e68b88d6fdc779abbb03869e4f5e33c7c214ed.1569996166.git.sbobroff@linux.ibm.com \
--to=sbobroff@linux.ibm.com \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=oohall@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).