* [PATCH 2/5] crash: add kernel crash watchdog library
2025-04-20 5:48 [PATCH 0/5] crash: provide a crash watchdog Luis Chamberlain
2025-04-20 5:48 ` [PATCH 1/5] systemd-remote: use ip address for systemd-remote journal Luis Chamberlain
@ 2025-04-20 5:48 ` Luis Chamberlain
2025-04-20 5:48 ` [PATCH 3/5] fstests_watchdog.py: use the new " Luis Chamberlain
` (3 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Luis Chamberlain @ 2025-04-20 5:48 UTC (permalink / raw)
To: Chuck Lever, Daniel Gomez, kdevops; +Cc: Luis Chamberlain
This adds a robust kernel watchdog library which can be used by
workflows in their own customized watchdogs. We will later implement
a generic watchdog for simple workflows which don't need much
information on CIs.
We use the crash/ directory to place snippets of:
* kernel crashes
* unexpected filesystem corruptions
* kernel warnings
And if we can, we provide the decoded version of the file.
This can be used later for commit logs into kdevops-results-archive.
This watchdog also implements support for automatically resetting
a host if it can, and so it provides a complete alternative to the
old kernel-ci bash shell stuff we had. For now we only support resetting
hosts if they are libvirt / guestfs. After we reset a guest we also
wait for them to come back up.
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
scripts/workflows/lib/crash.py | 724 +++++++++++++++++++++++++++++++++
1 file changed, 724 insertions(+)
create mode 100755 scripts/workflows/lib/crash.py
diff --git a/scripts/workflows/lib/crash.py b/scripts/workflows/lib/crash.py
new file mode 100755
index 000000000000..82c2b0bb8372
--- /dev/null
+++ b/scripts/workflows/lib/crash.py
@@ -0,0 +1,724 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: copyleft-next-0.3.1
+
+"""
+This module implements a kernel crash watchdog to detect kernel crashes in hosts
+and collect crash information using journalctl.
+"""
+
+import os
+import sys
+import subprocess
+import re
+import logging
+import argparse
+import yaml
+from datetime import datetime, timedelta
+from pathlib import Path
+import glob
+import pwd
+import getpass
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger("crash_watchdog")
+
+EXTRA_VARS_FILE = "extra_vars.yaml"
+REMOTE_JOURNAL_DIR = "/var/log/journal/remote"
+
+
+class KernelCrashWatchdog:
+ CRASH_PATTERNS = [
+ r"Kernel panic",
+ r"BUG:",
+ r"Oops:",
+ r"general protection fault",
+ r"Unable to handle kernel",
+ r"divide error",
+ r"kernel BUG at",
+ r"UBSAN:",
+ r"kernel stack overflow",
+ r"Kernel offset leak",
+ r"RIP:",
+ r"segfault at",
+ r"kernel thread",
+ r"detected stall on CPU",
+ r"soft lockup",
+ r"watchdog: BUG: soft lockup",
+ r"hung_task: blocked tasks",
+ r"NMI backtrace",
+ r"Call Trace",
+ r"Stack:",
+ r"nfs: server .* not responding",
+ r"INFO: task .* blocked for more than \\d+ seconds",
+ ]
+
+ BENIGN_WARNINGS = [
+ r"Spectre V2 : WARNING: Unprivileged eBPF is enabled",
+ r"WARNING: CPU: \d+ PID: \d+ at (net|drivers|security|arch)/.*spectre",
+ r"WARNING: You are running an unsupported configuration",
+ r"WARNING: Support for unprivileged eBPF will be removed soon",
+ ]
+
+ FILESYSTEM_CORRUPTION_PATTERNS = [
+ # General
+ r"Filesystem corruption detected",
+ r"Corrupted directory entry",
+ r"bad inode",
+ r"I/O error",
+ r"failed to read block",
+ r"journal commit I/O error",
+ # XFS
+ r"XFS: Internal error",
+ r"XFS \(.+\): Corruption detected",
+ r"XFS \(.+\): Metadata corruption",
+ r"XFS \(.+\): bad magic number",
+ r"XFS \(.+\): Unrecoverable I/O failure",
+ r"XFS \(.+\): Attempted to access beyond EOF",
+ r"XFS \(.+\): Log inconsistent",
+ r"XFS \(.+\): Inode .+ has inconsistent extent state",
+ r"XFS \(.+\): AGF has mismatched freelist count",
+ r"XFS \(.+\): Log recovery failed",
+ r"XFS: Assertion failed:",
+ # Btrfs
+ r"BTRFS error",
+ r"BTRFS critical",
+ r"BTRFS: corruption",
+ r"BTRFS: device label .+ lost",
+ r"BTRFS: unable to find logical",
+ r"BTRFS: failed to read",
+ r"BTRFS: parent transid verify failed",
+ r"BTRFS: inode corruption",
+ r"BTRFS: checksum verify failed",
+ r"BTRFS: block group .+ bad",
+ r"BTRFS: Transaction aborted",
+ r"BTRFS: tree block corruption detected",
+ ]
+
+ # List of xfstests that use _require_scratch_nocheck (intentional corruption)
+ INTENTIONAL_CORRUPTION_TESTS = [
+ "btrfs/011",
+ "btrfs/012",
+ "btrfs/027",
+ "btrfs/060",
+ "btrfs/061",
+ "btrfs/062",
+ "btrfs/063",
+ "btrfs/064",
+ "btrfs/065",
+ "btrfs/066",
+ "btrfs/067",
+ "btrfs/068",
+ "btrfs/069",
+ "btrfs/070",
+ "btrfs/071",
+ "btrfs/072",
+ "btrfs/073",
+ "btrfs/074",
+ "btrfs/080",
+ "btrfs/136",
+ "btrfs/196",
+ "btrfs/207",
+ "btrfs/254",
+ "btrfs/290",
+ "btrfs/321",
+ "ext4/002",
+ "ext4/025",
+ "ext4/033",
+ "ext4/037",
+ "ext4/040",
+ "ext4/041",
+ "ext4/054",
+ "ext4/055",
+ "generic/050",
+ "generic/311",
+ "generic/321",
+ "generic/322",
+ "generic/338",
+ "generic/347",
+ "generic/405",
+ "generic/455",
+ "generic/461",
+ "generic/464",
+ "generic/466",
+ "generic/482",
+ "generic/484",
+ "generic/487",
+ "generic/500",
+ "generic/520",
+ "generic/556",
+ "generic/563",
+ "generic/570",
+ "generic/590",
+ "generic/623",
+ "generic/740",
+ "generic/749",
+ "generic/757",
+ "overlay/005",
+ "overlay/010",
+ "overlay/014",
+ "overlay/019",
+ "overlay/031",
+ "overlay/035",
+ "overlay/036",
+ "overlay/037",
+ "overlay/038",
+ "overlay/041",
+ "overlay/043",
+ "overlay/044",
+ "overlay/045",
+ "overlay/046",
+ "overlay/049",
+ "overlay/051",
+ "overlay/053",
+ "overlay/055",
+ "overlay/056",
+ "overlay/057",
+ "overlay/059",
+ "overlay/060",
+ "overlay/065",
+ "overlay/067",
+ "overlay/069",
+ "overlay/070",
+ "overlay/071",
+ "overlay/077",
+ "overlay/079",
+ "overlay/080",
+ "overlay/083",
+ "overlay/084",
+ "overlay/085",
+ "overlay/086",
+ "overlay/087",
+ "xfs/001",
+ "xfs/002",
+ "xfs/005",
+ "xfs/045",
+ "xfs/049",
+ "xfs/058",
+ "xfs/070",
+ "xfs/076",
+ "xfs/081",
+ "xfs/115",
+ "xfs/132",
+ "xfs/133",
+ "xfs/134",
+ "xfs/154",
+ "xfs/155",
+ "xfs/157",
+ "xfs/162",
+ "xfs/179",
+ "xfs/202",
+ "xfs/205",
+ "xfs/270",
+ "xfs/306",
+ "xfs/310",
+ "xfs/424",
+ "xfs/438",
+ "xfs/439",
+ "xfs/448",
+ "xfs/449",
+ "xfs/490",
+ "xfs/493",
+ "xfs/495",
+ "xfs/500",
+ "xfs/503",
+ "xfs/504",
+ "xfs/506",
+ "xfs/516",
+ "xfs/520",
+ "xfs/521",
+ "xfs/522",
+ "xfs/523",
+ "xfs/524",
+ "xfs/525",
+ "xfs/526",
+ "xfs/528",
+ "xfs/530",
+ "xfs/533",
+ "xfs/546",
+ "xfs/569",
+ "xfs/601",
+ "xfs/602",
+ "xfs/603",
+ "xfs/608",
+ "xfs/798",
+ ]
+
+ def __init__(
+ self,
+ host_name=None,
+ output_dir="crashes",
+ full_log=False,
+ decode_crash=True,
+ reset_host=True,
+ save_warnings=False,
+ ):
+ self.host_name = host_name
+ self.output_dir = os.path.join(output_dir, host_name)
+ self.save_warnings = save_warnings
+ self.full_log = full_log
+ self.decode_crash = decode_crash
+ self.should_reset_host = reset_host
+ self.topdir_path = None
+ self.libvirt_provider = False
+ self.libvirt_uri_system = False
+ self.config = {}
+ self.devconfig_enable_systemd_journal_remote = False
+ self.kdevops_enable_guestfs = False
+
+ self.is_an_fstests = False
+ self.current_test_id = None
+ self.unexpected_corrupting_tests = set()
+ self.test_logs = {}
+ self.intentional_corruption_tests_seen = set()
+
+ try:
+ with open(EXTRA_VARS_FILE, "r") as f:
+ self.config = yaml.safe_load(f)
+ self.devconfig_enable_systemd_journal_remote = self.config.get(
+ "devconfig_enable_systemd_journal_remote", False
+ )
+ self.kdevops_enable_guestfs = self.config.get(
+ "kdevops_enable_guestfs", False
+ )
+ self.topdir_path = self.config.get("topdir_path")
+ self.libvirt_provider = self.config.get("libvirt_provider", False)
+ self.libvirt_uri_system = self.config.get("libvirt_uri_system", False)
+ except Exception as e:
+ logger.warning(f"Failed to read {EXTRA_VARS_FILE}: {e}")
+
+ def get_host_ip(self):
+ try:
+ result = subprocess.run(
+ ["ssh", "-G", self.host_name],
+ capture_output=True,
+ text=True,
+ check=True,
+ )
+ for line in result.stdout.splitlines():
+ if line.startswith("hostname "):
+ return line.split()[1]
+ except subprocess.SubprocessError as e:
+ logger.warning(f"Failed to resolve IP for {self.host_name}: {e}")
+ return None
+
+ def try_remote_journal(self):
+ ip = self.get_host_ip()
+ if not ip:
+ return None
+
+ journal_file = os.path.join(REMOTE_JOURNAL_DIR, f"remote-{ip}.journal")
+ if not os.path.exists(journal_file):
+ logger.info(
+ f"Remote journal not found for {self.host_name} at {journal_file}"
+ )
+ return None
+
+ try:
+ result = subprocess.run(
+ ["journalctl", "-k", f"--file={journal_file}"],
+ capture_output=True,
+ text=True,
+ timeout=15,
+ )
+ if result.returncode == 0:
+ return result.stdout
+ except subprocess.SubprocessError as e:
+ logger.warning(f"Failed to read remote journal for {self.host_name}: {e}")
+
+ return None
+
+ def convert_console_log(self):
+ ip = self.get_host_ip()
+ if not ip:
+ return None
+
+ console_dir = Path(f"guestfs/{self.host_name}")
+ if not console_dir.exists():
+ return None
+
+ log_files = sorted(console_dir.glob("console.log*"), key=os.path.getmtime)
+ if not log_files:
+ return None
+
+ all_lines = []
+
+ for log_file in log_files:
+ try:
+ # Try reading file, fallback to sudo chown if permission denied
+ try:
+ with open(log_file, "rb") as f:
+ raw = f.readlines()
+ except PermissionError:
+ if getattr(self, "libvirt_uri_system", False):
+ logger.info(f"Fixing permissions for {log_file}")
+ subprocess.run(
+ [
+ "sudo",
+ "chown",
+ f"{getpass.getuser()}:{getpass.getuser()}",
+ str(log_file),
+ ],
+ check=True,
+ )
+ with open(log_file, "rb") as f:
+ raw = f.readlines()
+ else:
+ raise
+
+ all_lines.extend(raw)
+ except Exception as e:
+ logger.warning(f"Failed to read {log_file}: {e}")
+
+ # Decode all lines safely
+ decoded_lines = [
+ l.decode("utf-8", errors="replace").rstrip() for l in all_lines
+ ]
+
+ # Find last Linux version line
+ linux_indices = [
+ i for i, line in enumerate(decoded_lines) if "Linux version" in line
+ ]
+ if not linux_indices:
+ logger.warning(
+ f"No 'Linux version' line found in console logs for {self.host_name}"
+ )
+ return None
+
+ start_index = linux_indices[-1]
+
+ try:
+ btime_output = subprocess.run(
+ ["awk", "/^btime/ {print $2}", "/proc/stat"],
+ capture_output=True,
+ text=True,
+ check=True,
+ )
+ btime = int(btime_output.stdout.strip())
+ boot_time = datetime.fromtimestamp(btime)
+ except Exception as e:
+ logger.warning(f"Failed to get boot time: {e}")
+ return None
+
+ # Convert logs from last boot only
+ converted_lines = []
+ for line in decoded_lines[start_index:]:
+ match = re.match(r"\[\s*(\d+\.\d+)\] (.*)", line)
+ if match:
+ seconds = float(match.group(1))
+ wall_time = boot_time + timedelta(seconds=seconds)
+ timestamp = wall_time.strftime("%b %d %H:%M:%S")
+ converted_lines.append(
+ f"{timestamp} {self.host_name} kernel: {match.group(2)}"
+ )
+ else:
+ converted_lines.append(line)
+
+ return "\n".join(converted_lines)
+
+ def check_host_reachable(self):
+ try:
+ result = subprocess.run(
+ ["ssh", self.host_name, "true"], capture_output=True, timeout=10
+ )
+ return result.returncode == 0
+ except (subprocess.TimeoutExpired, subprocess.SubprocessError):
+ return False
+
+ def collect_journal(self):
+ try:
+ result = subprocess.run(
+ ["ssh", self.host_name, "sudo journalctl -k -b"],
+ capture_output=True,
+ text=True,
+ timeout=30,
+ )
+ if result.returncode == 0:
+ return result.stdout
+ else:
+ logger.error(f"Failed to collect journal: {result.stderr}")
+ return None
+ except (subprocess.TimeoutExpired, subprocess.SubprocessError) as e:
+ logger.error(f"Error collecting journal: {e}")
+ return None
+
+ def detect_warnings(self, log_content):
+ if not log_content:
+ return False
+ benign_regexes = [re.compile(p) for p in self.BENIGN_WARNINGS]
+ detected = []
+ for line in log_content:
+ if "WARNING:" in line:
+ if any(p.search(line) for p in benign_regexes):
+ continue
+ detected.append(line)
+
+ return detected
+
+ def detect_crash(self, log_content):
+ if not log_content:
+ return False
+ for pattern in self.CRASH_PATTERNS:
+ if re.search(pattern, log_content):
+ return True
+ return False
+
+ def detect_filesystem_corruption(self, log_content):
+ if not log_content:
+ return False
+ for pattern in self.FILESYSTEM_CORRUPTION_PATTERNS:
+ if re.search(pattern, log_content):
+ return True
+ return False
+
+ def infer_fstests_state(self, log_content):
+ current_test = None
+ in_fstests = False
+ lines = log_content.split("\n")
+
+ for line in lines:
+ if "run fstests fstestsstart/000" in line:
+ in_fstests = True
+ continue
+ elif "run fstests fstestsdone/000" in line:
+ in_fstests = False
+ current_test = None
+ continue
+ elif in_fstests and "run fstests" in line:
+ match = re.search(r"run fstests (\S+/\d+) ", line)
+ if match:
+ current_test = match.group(1)
+ self.test_logs.setdefault(current_test, [])
+ continue
+
+ if in_fstests and current_test:
+ self.test_logs[current_test].append(line)
+
+ for test, logs in self.test_logs.items():
+ if test in INTENTIONAL_CORRUPTION_TESTS:
+ self.intentional_corruption_tests_seen.add(test)
+ else:
+ for pattern in self.FILESYSTEM_CORRUPTION_PATTERNS:
+ for line in logs:
+ if re.search(pattern, line):
+ self.unexpected_corrupting_tests.add(test)
+ break
+
+ self.is_an_fstests = bool(self.test_logs)
+ if self.test_logs:
+ self.current_test_id = list(self.test_logs.keys())[-1]
+
+ def get_fstests_log(self, test_id):
+ if test_id in self.test_logs:
+ return "\n".join(self.test_logs[test_id])
+ logger.warning(f"Test ID {test_id} not found in log records")
+ return None
+
+ def extract_kernel_snippet(self, log_content):
+ if not log_content:
+ return None
+ if self.full_log:
+ return log_content
+
+ crash_line_idx = -1
+ crash_pattern = None
+ lines = log_content.split("\n")
+
+ for pattern in self.CRASH_PATTERNS + self.FILESYSTEM_CORRUPTION_PATTERNS:
+ for i, line in enumerate(lines):
+ if re.search(pattern, line):
+ crash_line_idx = i
+ crash_pattern = pattern
+ break
+ if crash_line_idx != -1:
+ break
+
+ if crash_line_idx == -1:
+ return None
+
+ start_idx = max(0, crash_line_idx - 5)
+ end_idx = min(len(lines), crash_line_idx + 100)
+ crash_context = "\n".join(lines[start_idx:end_idx])
+ return f"Detected kernel crash ({crash_pattern}):\n\n{crash_context}"
+
+ def decode_log_output(self, log_file):
+ if not self.decode_crash:
+ return
+
+ if not self.topdir_path:
+ return
+
+ decode_script = os.path.join(
+ self.topdir_path, "linux/scripts/decode_stacktrace.sh"
+ )
+ vmlinux_path = os.path.join(self.topdir_path, "linux/vmlinux")
+
+ if not (os.path.exists(decode_script) and os.path.exists(vmlinux_path)):
+ logger.info("Skipping crash decode: required files not found")
+ return
+
+ try:
+ logger.info("Decoding crash log with decode_stacktrace.sh...")
+ base, ext = os.path.splitext(log_file)
+ decoded_file = f"{base}.decoded{ext}"
+ with open(log_file, "r") as log_input, open(
+ decoded_file, "w"
+ ) as log_output:
+ subprocess.run(
+ [decode_script, vmlinux_path],
+ stdin=log_input,
+ stdout=log_output,
+ stderr=subprocess.STDOUT,
+ check=True,
+ )
+ logger.info(f"Decoded kernel log saved to: {decoded_file}")
+ except subprocess.SubprocessError as e:
+ logger.warning(f"Failed to decode kernel log output: {e}")
+
+ def save_log(self, log, context):
+ if not log:
+ return None
+
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+ log_file = os.path.join(self.output_dir, f"journal-{timestamp}.{context}")
+
+ os.makedirs(self.output_dir, exist_ok=True)
+ with open(log_file, "w") as f:
+ f.write(log)
+
+ logger.info(f"{context} log saved to {log_file}")
+ return log_file
+
+ def reset_host_now(self):
+ if not self.should_reset_host:
+ logger.info("Host reset disabled by user")
+ return False
+
+ if self.libvirt_provider:
+ virsh_cmd = ["virsh", "reset", self.host_name]
+ if self.libvirt_uri_system:
+ virsh_cmd.insert(0, "sudo")
+
+ try:
+ result = subprocess.run(virsh_cmd, capture_output=True, text=True)
+ if result.returncode == 0:
+ logger.info(f"Successfully reset host {self.host_name}")
+ return True
+ else:
+ logger.error(f"Failed to reset host: {result.stderr}")
+ return False
+ except subprocess.SubprocessError as e:
+ logger.error(f"Error resetting host: {e}")
+ return False
+ else:
+ logger.warning("Reset for non-libvirt providers is not yet implemented")
+ return False
+
+ def wait_for_ssh(self):
+ logger.info(f"Waiting for {self.host_name} to become reachable via SSH...")
+ try:
+ subprocess.run(
+ [
+ "ansible",
+ "-i",
+ "hosts",
+ "all",
+ "-m",
+ "wait_for_connection",
+ "-l",
+ self.host_name,
+ ],
+ check=True,
+ )
+ logger.info(f"{self.host_name} is now reachable.")
+ except subprocess.CalledProcessError as e:
+ logger.warning(f"Failed to wait for SSH on {self.host_name}: {e}")
+
+ def check_and_reset_host(self, method="auto", get_fstests_log=None):
+ crash_file = None
+ warnings_file = None
+ journal_logs = None
+
+ # 1. Try console log first if guestfs is enabled
+ if method == "console" or (method == "auto" and self.kdevops_enable_guestfs):
+ logger.info(f"Trying console.log fallback for {self.host_name}")
+ journal_logs = self.convert_console_log()
+
+ # 2. Try remote journal if that didn’t work and it's enabled.
+ # If you are using a cloud provider try to get systemd remote journal
+ # devconfig_enable_systemd_journal_remote working so you can leverage
+ # this. Experience seems to be that it may not capture all crashes.
+ if not journal_logs and (
+ method == "remote"
+ or (method == "auto" and self.devconfig_enable_systemd_journal_remote)
+ ):
+ journal_logs = self.try_remote_journal()
+ if journal_logs:
+ logger.info(f"Using remote journal logs for {self.host_name}")
+
+ # 3. Fallback to SSH-based journal access if nothing worked yet
+ if (
+ not journal_logs
+ and (method == "ssh" or method == "auto")
+ and self.check_host_reachable()
+ ):
+ logger.info(f"Trying SSH-based journalctl access for {self.host_name}")
+ journal_logs = self.collect_journal()
+
+ if not journal_logs:
+ logger.warning(f"Unable to collect logs for {self.host_name}, resetting")
+ self.reset_host_now()
+ self.wait_for_ssh()
+ return None, None
+
+ self.infer_fstests_state(journal_logs)
+ if self.save_warnings:
+ warnings = self.detect_warnings(journal_logs)
+ if warnings:
+ os.makedirs(self.output_dir, exist_ok=True)
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+ warnings_file = os.path.join(
+ self.output_dir, f"journal-{timestamp}.warning"
+ )
+ logger.info(
+ f"Saving kernel warnings found for {self.host_name} on {warnings_file}"
+ )
+ with open(warnings_file, "w") as out:
+ out.writelines(warnings)
+ else:
+ logger.info(f"No kernel warnings found for {self.host_name}")
+
+ if get_fstests_log:
+ log_output = self.get_fstests_log(get_fstests_log)
+ if log_output:
+ print(log_output)
+ return None, None
+
+ crash_detected = self.detect_crash(journal_logs)
+ fs_corruption_detected = self.detect_filesystem_corruption(journal_logs)
+
+ if (
+ fs_corruption_detected
+ and self.is_an_fstests
+ and not self.unexpected_corrupting_tests
+ ):
+ fs_corruption_detected = False
+
+ if crash_detected and fs_corruption_detected:
+ issue_context = "crash_and_corruption"
+ elif crash_detected:
+ issue_context = "crash"
+ elif fs_corruption_detected:
+ issue_context = "corruption"
+ else:
+ return None, None
+
+ kernel_snippet = self.extract_kernel_snippet(journal_logs)
+ log_file = self.save_log(kernel_snippet, issue_context)
+ self.decode_log_output(log_file)
+ self.reset_host_now()
+ self.wait_for_ssh()
+
+ return log_file, warnings_file
--
2.47.2
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 3/5] fstests_watchdog.py: use the new crash watchdog library
2025-04-20 5:48 [PATCH 0/5] crash: provide a crash watchdog Luis Chamberlain
2025-04-20 5:48 ` [PATCH 1/5] systemd-remote: use ip address for systemd-remote journal Luis Chamberlain
2025-04-20 5:48 ` [PATCH 2/5] crash: add kernel crash watchdog library Luis Chamberlain
@ 2025-04-20 5:48 ` Luis Chamberlain
2025-04-20 5:48 ` [PATCH 4/5] crash_watchdog.py: add generic crash watchdog Luis Chamberlain
` (2 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Luis Chamberlain @ 2025-04-20 5:48 UTC (permalink / raw)
To: Chuck Lever, Daniel Gomez, kdevops; +Cc: Luis Chamberlain
Make the fstests_watchdog.py use the new crash watchdog library.
Since filesystem CIs are already using this, they will immediately
benefit from gathering the crashes / corruptions / kernel warnings
into the crash/ directory, and *also* resetting the hosts once hosed.
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
scripts/workflows/fstests/fstests_watchdog.py | 89 +++++++++----------
1 file changed, 44 insertions(+), 45 deletions(-)
diff --git a/scripts/workflows/fstests/fstests_watchdog.py b/scripts/workflows/fstests/fstests_watchdog.py
index f7408a659e56..3fef54843b9a 100755
--- a/scripts/workflows/fstests/fstests_watchdog.py
+++ b/scripts/workflows/fstests/fstests_watchdog.py
@@ -11,6 +11,7 @@ from datetime import datetime
from lib import kssh
from lib import fstests
from lib import systemd_remote
+from lib.crash import KernelCrashWatchdog
import sys, os, grp
import configparser
import argparse
@@ -22,23 +23,23 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config
if "CONFIG_WORKFLOW_LINUX_DISTRO" in config:
configured_kernel = "Distro-kernel"
elif "CONFIG_BOOTLINUX_TREE_REF" in config:
- configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('\"')
+ configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('"')
remote_path = "/var/log/journal/remote/"
kernel = systemd_remote.get_uname(remote_path, host, configured_kernel)
if kernel == configured_kernel:
- kernel += " (inferred)"
+ kernel += " (custom)"
if kernel is None:
sys.stderr.write("No kernel could be identified for host: %s\n" % host)
sys.exit(1)
else:
kernel = kssh.get_uname(host).rstrip()
+
section = fstests.get_section(host, config)
- (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config)
- checktime = fstests.get_checktime(host, basedir, kernel, section, last_test)
+ (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = \
+ fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config)
- percent_done = 0
- if checktime > 0:
- percent_done = delta_seconds * 100 / checktime
+ checktime = fstests.get_checktime(host, basedir, kernel, section, last_test)
+ percent_done = (delta_seconds * 100 / checktime) if checktime > 0 else 0
stall_str = "OK"
if stall_suspect:
@@ -47,44 +48,43 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config
else:
stall_str = "Hung-Stalled"
- if last_test is None:
- if verbose:
- sys.stdout.write("Host : %s\n" % (host))
- sys.stdout.write("Last test : None\n")
- else:
- percent_done_str = "%.0f%%" % (0)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, "None", percent_done_str, 0, 0, stall_str, kernel))
- return
+ crash_state = "OK"
+ watchdog = KernelCrashWatchdog(host_name=host,
+ decode_crash=True,
+ reset_host=True,
+ save_warnings=True)
+ crash_file, warning_file = watchdog.check_and_reset_host()
+ if crash_file:
+ crash_state = "CRASH"
+ elif warning_file:
+ crash_state = "WARNING"
if not verbose:
- soak_duration_seconds = 0
- if "CONFIG_FSTESTS_SOAK_DURATION" in config:
- soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"')
- soak_duration_seconds = int(soak_duration_seconds)
- uses_soak = fstests.fstests_test_uses_soak_duration(last_test)
+ soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"'))
+ uses_soak = fstests.fstests_test_uses_soak_duration(last_test or "")
is_soaking = uses_soak and soak_duration_seconds != 0
- soaking_str = ""
- if is_soaking:
- soaking_str = "(soak)"
+ soaking_str = "(soak)" if is_soaking else ""
percent_done_str = "%.0f%% %s" % (percent_done, soaking_str)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, last_test, percent_done_str, str(delta_seconds), str(checktime), stall_str, kernel))
+ if delta_seconds is None:
+ delta_seconds = 0
+ if checktime is None:
+ checktime = 0
+ sys.stdout.write(
+ f"{host:>25} {last_test or 'None':>15} {percent_done_str:>15} "
+ f"{delta_seconds:>12} {checktime:>17} {stall_str:>13} "
+ f"{kernel:<38} {crash_state:<10}\n"
+ )
return
sys.stdout.write("Host : %s\n" % (host))
sys.stdout.write("Last test : %s\n" % (last_test))
sys.stdout.write("Last test time: %s\n" % (last_test_time))
sys.stdout.write("Current system time: %s\n" % (current_time_str))
-
sys.stdout.write("Delta: %d total second\n" % (delta_seconds))
sys.stdout.write("\t%d minutes\n" % (delta_seconds / 60))
sys.stdout.write("\t%d seconds\n" % (delta_seconds % 60))
- sys.stdout.write("Timeout-status: ")
-
- if stall_suspect:
- sys.stdout.write("POSSIBLE-STALL")
- else:
- sys.stdout.write("OK")
- sys.stdout.write("\n")
+ sys.stdout.write("Timeout-status: %s\n" % ("POSSIBLE-STALL" if stall_suspect else "OK"))
+ sys.stdout.write("Crash-status : %s\n" % crash_state)
def _main():
parser = argparse.ArgumentParser(description='fstest-watchdog')
@@ -95,11 +95,11 @@ def _main():
default='baseline',
help='The name of the section to read hosts from')
parser.add_argument('--verbose', const=True, default=False, action="store_const",
- help='Be verbose on otput.')
+ help='Be verbose on output.')
parser.add_argument('--use-systemd-remote', const=True, default=True, action="store_const",
- help='Use use systemd-remote uploaded journals if available')
+ help='Use systemd-remote uploaded journals if available')
parser.add_argument('--use-ssh', const=True, default=False, action="store_const",
- help='Force to only use use ssh for journals.')
+ help='Force to only use ssh for journals.')
args = parser.parse_args()
if not os.path.isfile(args.hostfile):
@@ -114,33 +114,32 @@ def _main():
basedir = os.path.dirname(dotconfig)
remote_group = "systemd-journal-remote"
-
if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh:
group = grp.getgrnam(remote_group)
if group is not None:
remote_gid = group[2]
if remote_gid not in os.getgrouplist(os.getlogin(), os.getgid()):
- sys.stderr.write("Your username is not part of the group %s\n" %
- remote_group)
+ sys.stderr.write("Your username is not part of the group %s\n" % remote_group)
sys.stderr.write("Fix this and try again")
sys.exit(1)
else:
- sys.stderr.write("The group %s was not found, add Kconfig support for the systemd-remote-journal group used" % remote_group)
- sys.exit(1)
+ sys.stderr.write("The group %s was not found, add Kconfig support for the systemd-remote-journal group used" % remote_group)
+ sys.exit(1)
hosts = fstests.get_hosts(args.hostfile, args.hostsection)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % ("Hostname", "Test-name", "Completion %", "runtime(s)", "last-runtime(s)", "Stall-status", "Kernel"))
+ sys.stdout.write(
+ f"{'Hostname':>25} {'Test-name':>15} {'Completion %':>15} "
+ f"{'runtime(s)':>12} {'last-runtime(s)':>17} {'Stall-status':>13} "
+ f"{'Kernel':<38} {'Crash-status':<10}\n"
+ )
for h in hosts:
print_fstest_host_status(h, args.verbose,
args.use_systemd_remote,
args.use_ssh,
basedir,
config)
- soak_duration_seconds = 0
- if "CONFIG_FSTESTS_SOAK_DURATION" in config:
- soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"')
- soak_duration_seconds = int(soak_duration_seconds)
+ soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"'))
journal_method = "ssh"
if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh:
journal_method = "systemd-journal-remote"
--
2.47.2
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 4/5] crash_watchdog.py: add generic crash watchdog
2025-04-20 5:48 [PATCH 0/5] crash: provide a crash watchdog Luis Chamberlain
` (2 preceding siblings ...)
2025-04-20 5:48 ` [PATCH 3/5] fstests_watchdog.py: use the new " Luis Chamberlain
@ 2025-04-20 5:48 ` Luis Chamberlain
2025-04-20 5:48 ` [PATCH 5/5] crash_report.py: add a crash report Luis Chamberlain
2025-04-20 15:19 ` [PATCH 0/5] crash: provide a crash watchdog Chuck Lever
5 siblings, 0 replies; 9+ messages in thread
From: Luis Chamberlain @ 2025-04-20 5:48 UTC (permalink / raw)
To: Chuck Lever, Daniel Gomez, kdevops; +Cc: Luis Chamberlain
This can be used by any workflow. Specialized workflows can use the
library and customize it as they see fit to provide CIs more output.
It's easy to forget where the kernel logs are, so this also
provides a symlink helper which can be used to get the kernel logs
from a host.
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
scripts/workflows/generic/crash_watchdog.py | 186 ++++++++++++++++++++
scripts/workflows/generic/get_console.py | 1 +
scripts/workflows/generic/lib | 1 +
3 files changed, 188 insertions(+)
create mode 100755 scripts/workflows/generic/crash_watchdog.py
create mode 120000 scripts/workflows/generic/get_console.py
create mode 120000 scripts/workflows/generic/lib
diff --git a/scripts/workflows/generic/crash_watchdog.py b/scripts/workflows/generic/crash_watchdog.py
new file mode 100755
index 000000000000..3860de9d5592
--- /dev/null
+++ b/scripts/workflows/generic/crash_watchdog.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: copyleft-next-0.3.1
+
+"""
+This script is intended to run as a kernel-ci agent. It monitors for crashes
+and kernel warnings and resets the host after capturing essential information.
+It can also be invoked as 'get_console.py' to retrieve the entire kernel log.
+"""
+
+import os
+import sys
+import subprocess
+import re
+import logging
+import argparse
+import yaml
+from datetime import datetime, timedelta
+from pathlib import Path
+from lib.crash import KernelCrashWatchdog
+
# Configure logging
# basicConfig sets up the root logger (INFO level, timestamped format) so
# output from this script is uniform on stdout/stderr.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger used by every function in this script.
logger = logging.getLogger("crash_watchdog")
+
def get_active_hosts():
    """Return the sorted, de-duplicated list of baseline hosts.

    Queries the ansible inventory ("hosts" file) and extracts the hosts of
    the "baseline" group. Any failure (missing binary, bad output, missing
    group) is logged and an empty list is returned so callers can proceed.
    """
    cmd = ["ansible-inventory", "-i", "hosts", "--list"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
        inventory = yaml.safe_load(proc.stdout)
        baseline = inventory.get("baseline", {})
        return sorted(set(baseline.get("hosts", [])))
    except Exception as e:
        logger.error(f"Error getting active hosts: {e}")
        return []
+
def run_crash_watchdog_on_host(args, this_host_name):
    """Run the crash watchdog against a single host.

    Builds a KernelCrashWatchdog for this_host_name from the parsed CLI
    arguments, checks the host (resetting it if allowed), and logs what was
    found.

    Returns a tuple (crashed, crash_file, warnings_found, warning_file)
    where crash_file / warning_file are the saved log paths or falsy when
    nothing was detected.
    """
    watchdog = KernelCrashWatchdog(
        host_name=this_host_name,
        output_dir=args.output_dir,
        full_log=args.full_log,
        decode_crash=not args.no_decode,
        reset_host=not args.no_reset,
        save_warnings=args.save_warnings,
    )

    crashed = False
    warnings_found = False

    crash_file, warning_file = watchdog.check_and_reset_host(
        method=args.method, get_fstests_log=args.fstests_log
    )

    if warning_file:
        # Fix log message grammar: "warning and logged" -> "warning detected".
        logger.warning(f"Kernel warning detected and logged to {warning_file}")
        warnings_found = True
    elif args.save_warnings:
        # Log the host we actually checked, not args.host_name, which is the
        # literal string "all" when iterating over every host.
        logger.info(f"No kernel warnings detected for host {this_host_name}")
    if crash_file:
        crashed = True
        logger.warning(f"Crash detected and logged to {crash_file}")
    else:
        logger.info(f"No crash detected for host {this_host_name}")
    # Return the bare crash_file path (not a one-element list) so the
    # all-hosts caller accumulates a flat list of paths.
    return crashed, crash_file, warnings_found, warning_file
+
def run_crash_watchdog_all_hosts(args):
    """Check every active host for kernel crashes and kernel warnings.

    Returns a tuple (crash_detected, crash_files, warnings_detected,
    warning_files) aggregating the per-host results.
    """
    hosts = get_active_hosts()
    crash_detected = False
    crash_files = []
    warnings_detected = False
    warning_files = []

    logger.info(
        f"Checking {len(hosts)} hosts for kernel crashes: {', '.join(hosts)}"
    )

    for host in hosts:
        # NOTE: the original unpacked "warnings_file" but then referenced the
        # undefined name "warning_file" below, raising NameError the first
        # time a warning was actually detected. Use one consistent name.
        host_crash_detected, crash_file, host_warnings_detected, warning_file = run_crash_watchdog_on_host(args, host)
        if host_crash_detected and crash_file:
            crash_detected = True
            crash_files.append(crash_file)
            logger.info(f"Crash detected in host {host}, logs saved to {crash_file}")
        if host_warnings_detected and warning_file:
            warnings_detected = True
            warning_files.append(warning_file)
            logger.info(f"Kernel warning found on host {host}, logs saved to {warning_file}")

    return crash_detected, crash_files, warnings_detected, warning_files
+
def write_log_section(f, title, files, label):
    """Write a markdown section summarizing crash/warning log files to f.

    Emits a "# title" header, then for each path in files a bullet line
    ("- <label> detected: <path>") followed by a fenced snippet of the
    file's first 10 lines. Files that cannot be read are reported inline
    instead of raising, so one bad path does not abort the report.
    """
    f.write(f"# {title}\n\n")
    for path in files:
        f.write(f"- {label} detected: {path}\n")
        try:
            with open(path, "r") as content_file:
                lines = content_file.readlines()
            snippet = "".join(lines[:10])
            # Only flag truncation when lines were actually dropped; the
            # original appended the marker even for files of <= 10 lines.
            if len(lines) > 10:
                snippet += "\n...(truncated)..."
            f.write("\n```\n" + snippet + "\n```\n\n")
        except Exception as e:
            f.write(f"\nError reading {label.lower()} file: {e}\n\n")
+
def main():
    """Parse arguments, run the crash watchdog, and set the exit status.

    Checks a single host (--host-name) or all active hosts. Exits 1 when a
    kernel crash was detected on any host, 0 otherwise. When invoked via
    the get_console.py symlink it only fetches the full kernel log.
    """
    parser = argparse.ArgumentParser(
        description="Detect and handle kernel crashes or kernel warnings in hosts.",
        epilog="""
Examples:
  Detect and reset all hosts a crash was found (default):
    ./crash_watchdog.py

  Detect and reset host crash only on e3-ext4-2k guest:
    ./crash_watchdog.py --host-name e3-ext4-2k

  Detect using systemd-remote journal and show full kernel log:
    ./crash_watchdog.py e3-ext4-2k --method remote --full-log

  Skip decoding and skip reset:
    ./crash_watchdog.py e3-ext4-2k --no-decode --no-reset

  Just fetch the full kernel log using symlinked name:
    ln -s crash_watchdog.py get_console.py
    ./get_console.py e3-ext4-2k

  Use guestfs console log and do not decode:
    ./crash_watchdog.py e3-ext4-2k --method console --no-decode

  Use SSH to query the live journalctl output:
    ./crash_watchdog.py e3-ext4-2k --method ssh

  Disable guest reset when using libvirt:
    ./crash_watchdog.py e3-ext4-2k --no-reset

  Print full kernel logs for a specific fstest (all tests run with it):
    ./crash_watchdog.py e3-ext4-2k --fstests-log generic/750

  Get all kernel warnings only:
    ./crash_watchdog.py e3-ext4-2k --method remote --save-warnings sad.warn
    """,
        formatter_class=argparse.RawTextHelpFormatter
    )

    parser.add_argument("--host-name", help="Optional name of the host to check", default="all")
    parser.add_argument("--output-dir", help="Directory to store crash logs", default="crashes")
    parser.add_argument(
        "--method",
        choices=["auto", "remote", "console", "ssh"],
        default="auto",
        help="Choose method to collect logs: auto, remote, console, or ssh"
    )
    parser.add_argument("--full-log", action="store_true", help="Get full kernel log instead of only crash context")
    parser.add_argument("--no-decode", action="store_true", help="Disable decoding crash logs with decode_stacktrace.sh")
    parser.add_argument("--no-reset", action="store_true", help="Do not reset the guest even if a crash is detected")
    parser.add_argument("--fstests-log", help="Show all kernel log lines for a specific fstests test ID (e.g., generic/750)")
    parser.add_argument("--save-warnings", help="Detect and save kernel warnings", default=True)
    args = parser.parse_args()
    crash_files = []
    warnings_files = []

    # When symlinked as get_console.py we only fetch the full kernel log:
    # never reset the guest and skip warning collection.
    invoked_name = os.path.basename(sys.argv[0])
    if invoked_name == "get_console.py":
        args.no_reset = True
        args.save_warnings = False
        # Bug fix: the watchdog reads args.full_log; the old code set the
        # unused attribute args.full_log_mode, so the get_console.py mode
        # never actually returned the full log.
        args.full_log = True

    if args.host_name != "all":
        crash_detected, crash_files, warnings_detected, warnings_files = run_crash_watchdog_on_host(args, args.host_name)
    else:
        crash_detected, crash_files, warnings_detected, warnings_files = run_crash_watchdog_all_hosts(args)

    if warnings_detected:
        logger.warning("Kernel warnings detected in one or more hosts")
    else:
        logger.info("No kernel warnings detected")

    if crash_detected:
        logger.warning("Kernel crashes detected in one or more hosts")
        sys.exit(1)
    else:
        logger.info("No kernel crashes detected")
        sys.exit(0)


if __name__ == "__main__":
    main()
diff --git a/scripts/workflows/generic/get_console.py b/scripts/workflows/generic/get_console.py
new file mode 120000
index 000000000000..0169b0dd6188
--- /dev/null
+++ b/scripts/workflows/generic/get_console.py
@@ -0,0 +1 @@
+crash_watchdog.py
\ No newline at end of file
diff --git a/scripts/workflows/generic/lib b/scripts/workflows/generic/lib
new file mode 120000
index 000000000000..5bf80bf1392c
--- /dev/null
+++ b/scripts/workflows/generic/lib
@@ -0,0 +1 @@
+../lib/
\ No newline at end of file
--
2.47.2
^ permalink raw reply related [flat|nested] 9+ messages in thread