* [RFC v2 1/2] proc/crashdd: add API to collect hardware dump in second kernel
2018-03-16 11:12 [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel Rahul Lakkireddy
@ 2018-03-16 11:12 ` Rahul Lakkireddy
2018-03-16 11:12 ` [RFC v2 2/2] cxgb4: " Rahul Lakkireddy
2018-03-19 7:55 ` [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel Rahul Lakkireddy
2 siblings, 0 replies; 6+ messages in thread
From: Rahul Lakkireddy @ 2018-03-16 11:12 UTC (permalink / raw)
To: linux-kernel, netdev, kexec
Cc: davem, ebiederm, akpm, torvalds, ganeshgr, nirranjan, indranil,
Rahul Lakkireddy
Add a new module, crashdd, that exports the /proc/crashdd/ directory
in the second (crash recovery) kernel. The directory contains the
collected hardware/firmware dumps.
The sequence of actions performed by device drivers to append their
device-specific hardware/firmware logs to the /proc/crashdd/ directory
is as follows:
1. During probe (before hardware is initialized), device drivers
register to the crashdd module (via crashdd_add_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.
2. Crashdd creates a driver's directory under /proc/crashdd/<driver>.
Then, it allocates the buffer with requested size and invokes the
device driver's registered callback function.
3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to crashdd.
4. Crashdd exposes the buffer as a file via
/proc/crashdd/<driver>/<dump_file>.
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
v2:
- Patch added in this series.
fs/proc/Kconfig | 11 ++
fs/proc/Makefile | 1 +
fs/proc/crashdd.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/crashdd.h | 43 ++++++++
4 files changed, 318 insertions(+)
create mode 100644 fs/proc/crashdd.c
create mode 100644 include/linux/crashdd.h
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..c378edffe7b3 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -43,6 +43,17 @@ config PROC_VMCORE
help
Exports the dump image of crashed kernel in ELF format.
+config PROC_CRASH_DRIVER_DUMP
+ bool "/proc/crashdd support"
+ depends on PROC_FS && CRASH_DUMP
+ default y
+ ---help---
+ Device drivers can collect the device specific snapshot of
+ their hardware or firmware before they are initialized in
+ crash recovery kernel. If you say Y here a tree of device
+ specific dumps will be made available under /proc/crashdd/
+ directory.
+
config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ead487e80510..73883bc857b5 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -33,3 +33,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_PROC_CRASH_DRIVER_DUMP) += crashdd.o
diff --git a/fs/proc/crashdd.c b/fs/proc/crashdd.c
new file mode 100644
index 000000000000..c7585031541e
--- /dev/null
+++ b/fs/proc/crashdd.c
@@ -0,0 +1,263 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/crash_dump.h>
+#include <linux/crashdd.h>
+#include "internal.h"
+
+static LIST_HEAD(crashdd_list);
+static DEFINE_MUTEX(crashdd_mutex);
+
+#define CRASHDD_PROC_PERM 0400 /* S_IRUSR; must be octal (decimal 400 == 0620) */
+static struct proc_dir_entry *proc_crashdd;
+
+/**
+ * crashdd_read_data - read handler for a dump file under /proc/crashdd/.
+ * @file: proc file being read; its inode's PDE data is the dump context.
+ * @buffer: user buffer to copy the dump contents into.
+ * @buflen: maximum number of bytes to copy.
+ * @fpos: in/out file offset into the dump buffer.
+ *
+ * Copies at most @buflen bytes of the dump, starting at *@fpos, to
+ * @buffer and advances *@fpos by the number of bytes copied. Returns the
+ * number of bytes copied, 0 when there is no dump context, @buflen is 0
+ * or *@fpos is at/after the end of the dump, -EINVAL for a negative
+ * offset, and -EFAULT when the copy to user space fails.
+ */
+static ssize_t crashdd_read_data(struct file *file, char __user *buffer,
+				 size_t buflen, loff_t *fpos)
+{
+	struct crashdd_dump_node *dump = PDE_DATA(file->f_inode);
+	loff_t pos = *fpos;
+	unsigned long nbytes;
+
+	if (pos < 0)
+		return -EINVAL;
+
+	/* Nothing to hand out: no dump, empty request, or offset past end */
+	if (!dump || !buflen || pos >= dump->size)
+		return 0;
+
+	/* Clamp the request to what is left of the dump */
+	nbytes = dump->size - pos;
+	if (nbytes > buflen)
+		nbytes = buflen;
+
+	if (copy_to_user(buffer, (char *)dump->buf + pos, nbytes))
+		return -EFAULT;
+
+	*fpos = pos + nbytes;
+	return nbytes;
+}
+
+/* File operations shared by every dump file under /proc/crashdd/<driver>/ */
+static const struct file_operations proc_crashdd_ops = {
+	.open	= simple_open,
+	.llseek	= default_llseek,
+	.read	= crashdd_read_data,
+};
+
+/**
+ * crashdd_proc_mkdir - create a driver directory under /proc/crashdd/.
+ * @name: Name of the directory.
+ *
+ * Returns whatever proc_mkdir_mode() returns; callers treat NULL as
+ * failure. The return type is struct proc_dir_entry * rather than
+ * void * so that assignments at the call sites are type checked.
+ */
+static struct proc_dir_entry *crashdd_proc_mkdir(const char *name)
+{
+	return proc_mkdir_mode(name, CRASHDD_PROC_PERM, proc_crashdd);
+}
+
+/**
+ * crashdd_proc_add - create a dump file inside a driver's proc directory.
+ * @parent: driver directory under /proc/crashdd/.
+ * @name: Name of the dump file.
+ * @dump: dump context stored as the proc entry's private data; the read
+ *        handler fetches it back with PDE_DATA().
+ *
+ * Returns whatever proc_create_data() returns; callers treat NULL as
+ * failure. The return type is struct proc_dir_entry * rather than
+ * void * so that assignments at the call sites are type checked.
+ */
+static struct proc_dir_entry *crashdd_proc_add(struct proc_dir_entry *parent,
+					       const char *name, void *dump)
+{
+	return proc_create_data(name, CRASHDD_PROC_PERM, parent,
+				&proc_crashdd_ops, dump);
+}
+
+/* Remove a proc directory or dump file created by the helpers above. */
+static void crashdd_proc_del(struct proc_dir_entry *entry)
+{
+	proc_remove(entry);
+}
+
+/**
+ * crashdd_init_driver - set up a new per-driver proc context.
+ * @name: Name of the directory to create under /proc/crashdd/.
+ *
+ * Allocates a zeroed driver context, creates the /proc/crashdd/@name
+ * directory for it and links the context into the global crashdd_list.
+ * The context starts with a reference count of 1, which belongs to the
+ * caller. Returns the new context, or NULL if the allocation or the
+ * directory creation fails.
+ */
+static struct crashdd_driver_node *crashdd_init_driver(const char *name)
+{
+	struct crashdd_driver_node *drv = vzalloc(sizeof(*drv));
+
+	if (!drv)
+		return NULL;
+
+	/* Each driver gets its own directory under /proc/crashdd/ */
+	drv->proc_node = crashdd_proc_mkdir(name);
+	if (!drv->proc_node)
+		goto free_drv;
+
+	/* No dumps yet; the single reference is the caller's */
+	INIT_LIST_HEAD(&drv->dump_list);
+	atomic_set(&drv->refcnt, 1);
+
+	/* Publish the context so later lookups by name can find it */
+	mutex_lock(&crashdd_mutex);
+	list_add_tail(&drv->list, &crashdd_list);
+	mutex_unlock(&crashdd_mutex);
+
+	return drv;
+
+free_drv:
+	vfree(drv);
+	return NULL;
+}
+
+/**
+ * crashdd_get_driver - get an existing proc driver context.
+ * @name: Name of the directory.
+ *
+ * Walks the global list under crashdd_mutex looking for a driver context
+ * whose proc directory is called @name. On a match, takes a reference on
+ * the context and returns it; otherwise returns NULL.
+ */
+static struct crashdd_driver_node *crashdd_get_driver(const char *name)
+{
+	struct crashdd_driver_node *cur, *match = NULL;
+
+	mutex_lock(&crashdd_mutex);
+	list_for_each_entry(cur, &crashdd_list, list) {
+		if (strcmp(cur->proc_node->name, name))
+			continue;
+
+		/* Found it: hand out a reference while still holding the lock */
+		atomic_inc(&cur->refcnt);
+		match = cur;
+		break;
+	}
+	mutex_unlock(&crashdd_mutex);
+
+	/* NULL when no driver with @name is registered */
+	return match;
+}
+
+/**
+ * crashdd_put_driver - put an existing proc driver context.
+ * @node: driver proc context
+ *
+ * Decrements @node's reference count under crashdd_mutex. When the count
+ * drops to zero, the driver's proc directory is removed, @node is
+ * unlinked from the global list and its memory is released. The caller
+ * must not touch @node after this function returns.
+ */
+static void crashdd_put_driver(struct crashdd_driver_node *node)
+{
+	mutex_lock(&crashdd_mutex);
+	if (atomic_dec_and_test(&node->refcnt)) {
+		/* Last reference gone: tear the driver context down */
+		crashdd_proc_del(node->proc_node);
+		list_del(&node->list);
+		/* The context comes from vzalloc() in crashdd_init_driver();
+		 * once unlinked nothing can reach it, so free it here
+		 * instead of leaking it.
+		 */
+		vfree(node);
+	}
+	mutex_unlock(&crashdd_mutex);
+}
+
+/**
+ * crashdd_add_dump - collect a driver dump and expose it under
+ * /proc/crashdd/.
+ * @driver_name: directory name under which the dump should be added.
+ * @data: dump info (dump name, buffer size and collection callback).
+ *
+ * Search for @driver_name directory under /proc/crashdd/. If not found,
+ * create a new directory under /proc/crashdd/ with @driver_name.
+ * Allocate the dump context and a zeroed buffer of @data->size bytes and
+ * invoke the calling driver's dump collect routine on it. Only once the
+ * collection has succeeded and the dump context is fully populated is
+ * the dump published as /proc/crashdd/@driver_name/@data->name, so a
+ * reader can never observe a half-initialized dump.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments, -ENODEV when the
+ * /proc/crashdd/ directory does not exist (crashdd_proc_init() creates it
+ * only in the kdump kernel), -ENOMEM when an allocation or proc entry
+ * creation fails, or the non-zero value returned by the driver callback.
+ */
+int crashdd_add_dump(const char *driver_name, struct crashdd_data *data)
+{
+	struct crashdd_driver_node *node;
+	struct crashdd_dump_node *dump;
+	void *buf = NULL;
+	int ret;
+
+	if (!driver_name || !strlen(driver_name) ||
+	    !data || !strlen(data->name) ||
+	    !data->crashdd_callback || !data->size)
+		return -EINVAL;
+
+	/* Without the /proc/crashdd/ root the proc helpers would be handed a
+	 * NULL parent, which (as the NULL-parent call in crashdd_proc_init()
+	 * shows) places the entry directly under /proc/.
+	 */
+	if (!proc_crashdd)
+		return -ENODEV;
+
+	/* Get a driver proc context with specified name. */
+	node = crashdd_get_driver(driver_name);
+	if (!node) {
+		/* No driver proc context found with specified name.
+		 * So create a new one
+		 */
+		node = crashdd_init_driver(driver_name);
+		if (!node)
+			return -ENOMEM;
+	}
+
+	dump = vzalloc(sizeof(*dump));
+	if (!dump) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Allocate buffer for drivers to write their dumps */
+	buf = vzalloc(data->size);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Invoke the driver's dump collection routine */
+	ret = data->crashdd_callback(data, buf);
+	if (ret)
+		goto out_err;
+
+	/* Fully populate the dump before it becomes visible to readers */
+	dump->buf = buf;
+	dump->size = data->size;
+
+	/* Allocate a proc file under /proc/crashdd/@driver_name/.
+	 * Also set the dump as proc file's data
+	 */
+	dump->proc_node = crashdd_proc_add(node->proc_node, data->name, dump);
+	if (!dump->proc_node) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+	dump->proc_node->size = dump->size;
+
+	/* Add the dump to driver proc context list */
+	mutex_lock(&crashdd_mutex);
+	list_add_tail(&dump->list, &node->dump_list);
+	atomic_inc(&node->refcnt);
+	mutex_unlock(&crashdd_mutex);
+
+	/* Return back the driver proc context reference */
+	crashdd_put_driver(node);
+	return 0;
+
+out_err:
+	/* vfree(NULL) is a no-op. The proc file is created last, so no
+	 * failure path reaches here with a proc entry left to remove.
+	 */
+	vfree(buf);
+	vfree(dump);
+
+	crashdd_put_driver(node);
+	return ret;
+}
+EXPORT_SYMBOL(crashdd_add_dump);
+
+/* Init function for crash driver dump module. The /proc/crashdd/
+ * directory is only exported in the 2nd (kdump) kernel; in any other
+ * kernel this is a successful no-op.
+ */
+static int __init crashdd_proc_init(void)
+{
+	if (is_kdump_kernel()) {
+		/* Create /proc/crashdd/ directory */
+		proc_crashdd = proc_mkdir_mode("crashdd", CRASHDD_PROC_PERM,
+					       NULL);
+		if (!proc_crashdd)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+fs_initcall(crashdd_proc_init);
diff --git a/include/linux/crashdd.h b/include/linux/crashdd.h
new file mode 100644
index 000000000000..1f1ec280a2bb
--- /dev/null
+++ b/include/linux/crashdd.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef CRASH_DRIVER_DUMP_H
+#define CRASH_DRIVER_DUMP_H
+
+/* Max driver/dump name length */
+#define CRASHDD_NAME_LENGTH 32
+
+/* Dump proc context internal to crashdd: one per dump file exposed under
+ * a driver's /proc/crashdd/ directory.
+ */
+struct crashdd_dump_node {
+ /* Link in the owning driver proc context's dump_list */
+ struct list_head list;
+ void *buf; /* Buffer containing device's dump */
+ unsigned long size; /* Size of the buffer, in bytes */
+ /* Dump's proc file entry in the driver directory */
+ struct proc_dir_entry *proc_node;
+};
+
+/* Driver proc context internal to crashdd: one per driver directory
+ * under /proc/crashdd/.
+ */
+struct crashdd_driver_node {
+ /* Link in the global list of driver proc contexts */
+ struct list_head list;
+ struct list_head dump_list; /* List of dumps under this driver */
+ atomic_t refcnt; /* Dumps under this directory plus in-flight users */
+ /* Driver's directory entry under /proc/crashdd/ */
+ struct proc_dir_entry *proc_node;
+};
+
+/* Driver Dump information to be filled by drivers before calling
+ * crashdd_add_dump().
+ */
+struct crashdd_data {
+ char name[CRASHDD_NAME_LENGTH]; /* Unique, NUL-terminated dump file name */
+ unsigned long size; /* Size of the dump buffer to allocate, in bytes */
+ /* Driver's registered callback to be invoked to collect dump.
+  * @buf is a zeroed buffer of @size bytes; a non-zero return value
+  * aborts the dump and is returned from crashdd_add_dump().
+  */
+ int (*crashdd_callback)(struct crashdd_data *data, void *buf);
+};
+
+#ifdef CONFIG_PROC_CRASH_DRIVER_DUMP
+int crashdd_add_dump(const char *driver_name, struct crashdd_data *data);
+#else
+/* Stub used when /proc/crashdd support is compiled out: nothing is
+ * collected and success is reported. A static inline (rather than a
+ * macro) keeps the arguments type checked and evaluated.
+ */
+static inline int crashdd_add_dump(const char *driver_name,
+				   struct crashdd_data *data)
+{
+	return 0;
+}
+#endif /* CONFIG_PROC_CRASH_DRIVER_DUMP */
+
+#endif /* CRASH_DRIVER_DUMP_H */
--
2.14.1
^ permalink raw reply related [flat|nested] 6+ messages in thread* [RFC v2 2/2] cxgb4: collect hardware dump in second kernel
2018-03-16 11:12 [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel Rahul Lakkireddy
2018-03-16 11:12 ` [RFC v2 1/2] proc/crashdd: add API to collect hardware dump in second kernel Rahul Lakkireddy
@ 2018-03-16 11:12 ` Rahul Lakkireddy
2018-03-19 7:55 ` [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel Rahul Lakkireddy
2 siblings, 0 replies; 6+ messages in thread
From: Rahul Lakkireddy @ 2018-03-16 11:12 UTC (permalink / raw)
To: linux-kernel, netdev, kexec
Cc: davem, ebiederm, akpm, torvalds, ganeshgr, nirranjan, indranil,
Rahul Lakkireddy
Register callback to collect hardware/firmware dumps in second kernel
before hardware/firmware is initialized. The dumps for each device
will be available under /proc/crashdd/cxgb4/ directory in second
kernel.
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
v2:
- Updated dump registration to the new API in patch 1.
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 4 ++++
drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 25 ++++++++++++++++++++++++
drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h | 3 +++
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 12 ++++++++++++
4 files changed, 44 insertions(+)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index b2df0ffb7c94..00201e98ad19 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -50,6 +50,7 @@
#include <linux/net_tstamp.h>
#include <linux/ptp_clock_kernel.h>
#include <linux/ptp_classify.h>
+#include <linux/crashdd.h>
#include <asm/io.h>
#include "t4_chip_type.h"
#include "cxgb4_uld.h"
@@ -959,6 +960,9 @@ struct adapter {
/* HMA */
struct hma_data hma;
+
+ /* Dump buffer for collecting logs in kdump kernel */
+ struct crashdd_data crashdd_buf;
};
/* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
index 143686c60234..ce9f544781af 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
@@ -488,3 +488,28 @@ void cxgb4_init_ethtool_dump(struct adapter *adapter)
adapter->eth_dump.version = adapter->params.fw_vers;
adapter->eth_dump.len = 0;
}
+
+/* crashdd callback: look up the adapter that embeds @data and collect
+ * its dumps (CXGB4_ETH_DUMP_ALL) into @buf.
+ */
+static int cxgb4_cudbg_crashdd_collect(struct crashdd_data *data, void *buf)
+{
+	struct adapter *adap;
+	u32 buf_len;
+
+	adap = container_of(data, struct adapter, crashdd_buf);
+	buf_len = data->size;
+
+	return cxgb4_cudbg_collect(adap, buf, &buf_len, CXGB4_ETH_DUMP_ALL);
+}
+
+/* Fill in the adapter's crashdd descriptor (callback, dump name and
+ * buffer size) and hand it to crashdd under the cxgb4 driver directory.
+ * Returns the result of crashdd_add_dump().
+ */
+int cxgb4_cudbg_crashdd_add_dump(struct adapter *adap)
+{
+	struct crashdd_data *data = &adap->crashdd_buf;
+	u32 dump_len;
+
+	/* cudbg header + CUDBG_MAX_ENTITY entity headers + dump buffer */
+	dump_len = sizeof(struct cudbg_hdr) +
+		   sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY;
+	dump_len += CUDBG_DUMP_BUFF_SIZE;
+
+	data->crashdd_callback = cxgb4_cudbg_crashdd_collect;
+	data->size = dump_len;
+	snprintf(data->name, sizeof(data->name), "%s_%s", cxgb4_driver_name,
+		 adap->name);
+
+	return crashdd_add_dump(cxgb4_driver_name, data);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
index ce1ac9a1c878..095c6f04357e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
@@ -41,8 +41,11 @@ enum CXGB4_ETHTOOL_DUMP_FLAGS {
CXGB4_ETH_DUMP_HW = (1 << 1), /* various FW and HW dumps */
};
+#define CXGB4_ETH_DUMP_ALL (CXGB4_ETH_DUMP_MEM | CXGB4_ETH_DUMP_HW)
+
u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag);
int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size,
u32 flag);
void cxgb4_init_ethtool_dump(struct adapter *adapter);
+int cxgb4_cudbg_crashdd_add_dump(struct adapter *adap);
#endif /* __CXGB4_CUDBG_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 5a349e1576cb..6a380bee85b8 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5484,6 +5484,18 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err)
goto out_free_adapter;
+ if (is_kdump_kernel()) {
+ /* Collect hardware state and append to /proc/crashdd/cxgb4/
+ * directory
+ */
+ err = cxgb4_cudbg_crashdd_add_dump(adapter);
+ if (err) {
+ dev_warn(adapter->pdev_dev,
+ "Fail collecting crash driver dump, err: %d. Continuing\n",
+ err);
+ err = 0;
+ }
+ }
if (!is_t4(adapter->params.chip)) {
s_qpp = (QUEUESPERPAGEPF0_S +
--
2.14.1
^ permalink raw reply related [flat|nested] 6+ messages in thread* Re: [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel
2018-03-16 11:12 [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel Rahul Lakkireddy
2018-03-16 11:12 ` [RFC v2 1/2] proc/crashdd: add API to collect hardware dump in second kernel Rahul Lakkireddy
2018-03-16 11:12 ` [RFC v2 2/2] cxgb4: " Rahul Lakkireddy
@ 2018-03-19 7:55 ` Rahul Lakkireddy
2018-03-19 15:22 ` Stephen Hemminger
2 siblings, 1 reply; 6+ messages in thread
From: Rahul Lakkireddy @ 2018-03-19 7:55 UTC (permalink / raw)
To: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
kexec@lists.infradead.org
Cc: davem@davemloft.net, ebiederm@xmission.com,
akpm@linux-foundation.org, torvalds@linux-foundation.org,
Ganesh GR, Nirranjan Kirubaharan, Indranil Choudhury
On Friday, March 03/16/18, 2018 at 16:42:03 +0530, Rahul Lakkireddy wrote:
> On production servers running variety of workloads over time, kernel
> panic can happen sporadically after days or even months. It is
> important to collect as much debug logs as possible to root cause
> and fix the problem, that may not be easy to reproduce. Snapshot of
> underlying hardware/firmware state (like register dump, firmware
> logs, adapter memory, etc.), at the time of kernel panic will be very
> helpful while debugging the culprit device driver.
>
> This series of patches add new generic framework that enable device
> drivers to collect device specific snapshot of the hardware/firmware
> state of the underlying device in the crash recovery kernel. In crash
> recovery kernel, the collected logs are exposed via /proc/crashdd/
> directory, which is copied by user space scripts for post-analysis.
>
> A kernel module crashdd is newly added. In crash recovery kernel,
> crashdd exposes /proc/crashdd/ directory containing device specific
> hardware/firmware logs.
>
> The sequence of actions done by device drivers to append their device
> specific hardware/firmware logs to /proc/crashdd/ directory are as
> follows:
>
> 1. During probe (before hardware is initialized), device drivers
> register to the crashdd module (via crashdd_add_dump()), with
> callback function, along with buffer size and log name needed for
> firmware/hardware log collection.
>
> 2. Crashdd creates a driver's directory under /proc/crashdd/<driver>.
> Then, it allocates the buffer with requested size and invokes the
> device driver's registered callback function.
>
> 3. Device driver collects all hardware/firmware logs into the buffer
> and returns control back to crashdd.
>
> 4. Crashdd exposes the buffer as a file via
> /proc/crashdd/<driver>/<dump_file>.
>
> 5. User space script (/usr/lib/kdump/kdump-lib-initramfs.sh) copies
> the entire /proc/crashdd/ directory to /var/crash/ directory.
>
> Patch 1 adds crashdd module to allow drivers to register callback to
> collect the device specific hardware/firmware logs. The module also
> exports /proc/crashdd/ directory containing the hardware/firmware logs.
>
> Patch 2 shows a cxgb4 driver example using the API to collect
> hardware/firmware logs in crash recovery kernel, before hardware is
> initialized. The logs for the devices are made available under
> /proc/crashdd/cxgb4/ directory.
>
> Suggestions and feedback will be much appreciated.
>
> Thanks,
> Rahul
>
> RFC v1: https://www.spinics.net/lists/netdev/msg486562.html
>
> ---
> v2:
> - Added new crashdd module that exports /proc/crashdd/ containing
> driver's registered hardware/firmware logs in patch 1.
> - Replaced the API to allow drivers to register their hardware/firmware
> log collect routine in crash recovery kernel in patch 1.
> - Updated patch 2 to use the new API in patch 1.
>
> Rahul Lakkireddy (2):
> proc/crashdd: add API to collect hardware dump in second kernel
> cxgb4: collect hardware dump in second kernel
>
> drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 4 +
> drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 25 +++
> drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h | 3 +
> drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 12 ++
> fs/proc/Kconfig | 11 +
> fs/proc/Makefile | 1 +
> fs/proc/crashdd.c | 263 +++++++++++++++++++++++
> include/linux/crashdd.h | 43 ++++
> 8 files changed, 362 insertions(+)
> create mode 100644 fs/proc/crashdd.c
> create mode 100644 include/linux/crashdd.h
>
> --
> 2.14.1
>
Does anyone have any comments with this approach? If there are no
comments, then I'll re-spin this RFC to Patch series.
Thanks,
Rahul
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel
2018-03-19 7:55 ` [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel Rahul Lakkireddy
@ 2018-03-19 15:22 ` Stephen Hemminger
2018-03-20 13:30 ` Rahul Lakkireddy
0 siblings, 1 reply; 6+ messages in thread
From: Stephen Hemminger @ 2018-03-19 15:22 UTC (permalink / raw)
To: Rahul Lakkireddy
Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
kexec@lists.infradead.org, davem@davemloft.net,
ebiederm@xmission.com, akpm@linux-foundation.org,
torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
Indranil Choudhury
On Mon, 19 Mar 2018 13:25:56 +0530
Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
> On Friday, March 03/16/18, 2018 at 16:42:03 +0530, Rahul Lakkireddy wrote:
> > On production servers running variety of workloads over time, kernel
> > panic can happen sporadically after days or even months. It is
> > important to collect as much debug logs as possible to root cause
> > and fix the problem, that may not be easy to reproduce. Snapshot of
> > underlying hardware/firmware state (like register dump, firmware
> > logs, adapter memory, etc.), at the time of kernel panic will be very
> > helpful while debugging the culprit device driver.
> >
> > This series of patches add new generic framework that enable device
> > drivers to collect device specific snapshot of the hardware/firmware
> > state of the underlying device in the crash recovery kernel. In crash
> > recovery kernel, the collected logs are exposed via /proc/crashdd/
> > directory, which is copied by user space scripts for post-analysis.
> >
> > A kernel module crashdd is newly added. In crash recovery kernel,
> > crashdd exposes /proc/crashdd/ directory containing device specific
> > hardware/firmware logs.
> >
> > The sequence of actions done by device drivers to append their device
> > specific hardware/firmware logs to /proc/crashdd/ directory are as
> > follows:
> >
> > 1. During probe (before hardware is initialized), device drivers
> > register to the crashdd module (via crashdd_add_dump()), with
> > callback function, along with buffer size and log name needed for
> > firmware/hardware log collection.
> >
> > 2. Crashdd creates a driver's directory under /proc/crashdd/<driver>.
> > Then, it allocates the buffer with requested size and invokes the
> > device driver's registered callback function.
> >
> > 3. Device driver collects all hardware/firmware logs into the buffer
> > and returns control back to crashdd.
> >
> > 4. Crashdd exposes the buffer as a file via
> > /proc/crashdd/<driver>/<dump_file>.
> >
> > 5. User space script (/usr/lib/kdump/kdump-lib-initramfs.sh) copies
> > the entire /proc/crashdd/ directory to /var/crash/ directory.
> >
> > Patch 1 adds crashdd module to allow drivers to register callback to
> > collect the device specific hardware/firmware logs. The module also
> > exports /proc/crashdd/ directory containing the hardware/firmware logs.
> >
> > Patch 2 shows a cxgb4 driver example using the API to collect
> > hardware/firmware logs in crash recovery kernel, before hardware is
> > initialized. The logs for the devices are made available under
> > /proc/crashdd/cxgb4/ directory.
> >
> > Suggestions and feedback will be much appreciated.
> >
> > Thanks,
> > Rahul
> >
> > RFC v1: https://www.spinics.net/lists/netdev/msg486562.html
> >
> > ---
> > v2:
> > - Added new crashdd module that exports /proc/crashdd/ containing
> > driver's registered hardware/firmware logs in patch 1.
> > - Replaced the API to allow drivers to register their hardware/firmware
> > log collect routine in crash recovery kernel in patch 1.
> > - Updated patch 2 to use the new API in patch 1.
> >
> > Rahul Lakkireddy (2):
> > proc/crashdd: add API to collect hardware dump in second kernel
> > cxgb4: collect hardware dump in second kernel
> >
> > drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 4 +
> > drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 25 +++
> > drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h | 3 +
> > drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 12 ++
> > fs/proc/Kconfig | 11 +
> > fs/proc/Makefile | 1 +
> > fs/proc/crashdd.c | 263 +++++++++++++++++++++++
> > include/linux/crashdd.h | 43 ++++
> > 8 files changed, 362 insertions(+)
> > create mode 100644 fs/proc/crashdd.c
> > create mode 100644 include/linux/crashdd.h
> >
> > --
> > 2.14.1
> >
>
> Does anyone have any comments with this approach? If there are no
> comments, then I'll re-spin this RFC to Patch series.
>
> Thanks,
> Rahul
This does look like it gives useful data, but it is not clear that this can
not already be done with existing API's or small extensions.
Introducing a new /proc interface and one that is mostly device specific is
unlikely to be greeted with a warm reception by the current Linux kernel community.
For example, getting firmware logs seems like something more related to
ethtool or sysfs.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC v2 0/2] kernel: add support to collect hardware logs in crash recovery kernel
2018-03-19 15:22 ` Stephen Hemminger
@ 2018-03-20 13:30 ` Rahul Lakkireddy
0 siblings, 0 replies; 6+ messages in thread
From: Rahul Lakkireddy @ 2018-03-20 13:30 UTC (permalink / raw)
To: Stephen Hemminger
Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
kexec@lists.infradead.org, davem@davemloft.net,
ebiederm@xmission.com, akpm@linux-foundation.org,
torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
Indranil Choudhury
On Monday, March 03/19/18, 2018 at 20:52:11 +0530, Stephen Hemminger wrote:
> On Mon, 19 Mar 2018 13:25:56 +0530
> Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:
>
> > On Friday, March 03/16/18, 2018 at 16:42:03 +0530, Rahul Lakkireddy wrote:
> > > On production servers running variety of workloads over time, kernel
> > > panic can happen sporadically after days or even months. It is
> > > important to collect as much debug logs as possible to root cause
> > > and fix the problem, that may not be easy to reproduce. Snapshot of
> > > underlying hardware/firmware state (like register dump, firmware
> > > logs, adapter memory, etc.), at the time of kernel panic will be very
> > > helpful while debugging the culprit device driver.
> > >
> > > This series of patches add new generic framework that enable device
> > > drivers to collect device specific snapshot of the hardware/firmware
> > > state of the underlying device in the crash recovery kernel. In crash
> > > recovery kernel, the collected logs are exposed via /proc/crashdd/
> > > directory, which is copied by user space scripts for post-analysis.
> > >
> > > A kernel module crashdd is newly added. In crash recovery kernel,
> > > crashdd exposes /proc/crashdd/ directory containing device specific
> > > hardware/firmware logs.
> > >
> > > The sequence of actions done by device drivers to append their device
> > > specific hardware/firmware logs to /proc/crashdd/ directory are as
> > > follows:
> > >
> > > 1. During probe (before hardware is initialized), device drivers
> > > register to the crashdd module (via crashdd_add_dump()), with
> > > callback function, along with buffer size and log name needed for
> > > firmware/hardware log collection.
> > >
> > > 2. Crashdd creates a driver's directory under /proc/crashdd/<driver>.
> > > Then, it allocates the buffer with requested size and invokes the
> > > device driver's registered callback function.
> > >
> > > 3. Device driver collects all hardware/firmware logs into the buffer
> > > and returns control back to crashdd.
> > >
> > > 4. Crashdd exposes the buffer as a file via
> > > /proc/crashdd/<driver>/<dump_file>.
> > >
> > > 5. User space script (/usr/lib/kdump/kdump-lib-initramfs.sh) copies
> > > the entire /proc/crashdd/ directory to /var/crash/ directory.
> > >
> > > Patch 1 adds crashdd module to allow drivers to register callback to
> > > collect the device specific hardware/firmware logs. The module also
> > > exports /proc/crashdd/ directory containing the hardware/firmware logs.
> > >
> > > Patch 2 shows a cxgb4 driver example using the API to collect
> > > hardware/firmware logs in crash recovery kernel, before hardware is
> > > initialized. The logs for the devices are made available under
> > > /proc/crashdd/cxgb4/ directory.
> > >
> > > Suggestions and feedback will be much appreciated.
> > >
> > > Thanks,
> > > Rahul
> > >
> > > RFC v1: https://www.spinics.net/lists/netdev/msg486562.html
> > >
> > > ---
> > > v2:
> > > - Added new crashdd module that exports /proc/crashdd/ containing
> > > driver's registered hardware/firmware logs in patch 1.
> > > - Replaced the API to allow drivers to register their hardware/firmware
> > > log collect routine in crash recovery kernel in patch 1.
> > > - Updated patch 2 to use the new API in patch 1.
> > >
> > > Rahul Lakkireddy (2):
> > > proc/crashdd: add API to collect hardware dump in second kernel
> > > cxgb4: collect hardware dump in second kernel
> > >
> > > drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 4 +
> > > drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 25 +++
> > > drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h | 3 +
> > > drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 12 ++
> > > fs/proc/Kconfig | 11 +
> > > fs/proc/Makefile | 1 +
> > > fs/proc/crashdd.c | 263 +++++++++++++++++++++++
> > > include/linux/crashdd.h | 43 ++++
> > > 8 files changed, 362 insertions(+)
> > > create mode 100644 fs/proc/crashdd.c
> > > create mode 100644 include/linux/crashdd.h
> > >
> > > --
> > > 2.14.1
> > >
> >
> > Does anyone have any comments with this approach? If there are no
> > comments, then I'll re-spin this RFC to Patch series.
> >
> > Thanks,
> > Rahul
>
> This does look like it gives useful data, but it is not clear that this can
> not already be done with existing API's or small extensions.
>
> Introducing a new /proc interface and one that is mostly device specific is
> unlikely to be greeted with a warm reception by the current Linux kernel community.
>
> For example, getting firmware logs seems like something more related to
> ethtool or sysfs.
The /proc/crashdd/ directory is only exposed in the 2nd (crash recovery)
kernel. This is similar to /proc/vmcore. Since vmcore is exported via
the /proc/ interface, I've exported the device-specific logs via
/proc/crashdd/ as well.
I can change to sysfs if that is the preferred approach. Does
/sys/crashdd/ sound good?
Thanks,
Rahul
^ permalink raw reply [flat|nested] 6+ messages in thread