From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from bombadil.infradead.org (bombadil.infradead.org [198.137.202.133]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0EE8D14F9D6 for ; Sun, 20 Apr 2025 05:48:26 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.137.202.133 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1745128109; cv=none; b=QTv8HWh8Bv4POk/kO/RMosX4cs6FXJExo0/LDTL5toLj86q7Fc4hs4PJAGJTzJJ+ZSLU5rHNA0wF7LpnTcV4JEVo+WAId7qVSbV5IGKqlC61cUe9aa2b+frXDYRRIaIMP8pA4ZS7wtHMEPizFF6cPlLb/9MWUofCMwnCjgWIWiw= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1745128109; c=relaxed/simple; bh=j7VMcmF1F1q66FgPFHbuyGX2811SWolNOdwe4ZWABgY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=PJAxx2fDY3MRsvuGnqKQUOFsdIB34wd3xdvcAlyUIQMfyfxWauQY4ualBOPPk9y4nioLeHGvkzZWsj38qiGXBTjFG4OEAZTjwc1t3Qd/DWUwlk2etfLzOHOg+A1MSciZKmbJEHi2ZWCrZCzTBEMXk6Woj+ZiysS3yDuq2JT5RgE= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=MrEumVxY; arc=none smtp.client-ip=198.137.202.133 Authentication-Results: smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="MrEumVxY" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=bombadil.20210309; h=Sender:Content-Transfer-Encoding: MIME-Version:References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From: Reply-To:Content-Type:Content-ID:Content-Description; 
bh=qCI2Hch5ad0pbY9QCMC46De5i2tgu7hJRiZhoHidXEY=; b=MrEumVxYMWPbpsRAivDJ/DPUlF qDhlHwHZlTdloGFLDrEYw+isdukfF0HUZ7tueITYm2ER4e5nw/Sw/mbh//eCArrdi9qN9uJHGDCHU hFLCwoBHfDzh354w+zCiN+QcU/EDLYVEuZnwPamlqekjqZYVQ0aUgzXdDlhDYgUw01euamkhfvNUb HRcoXNsLETQ5TW0R03W3GaaJhueq/YLAu2xBFBqoflK29ds3CDJDW7AHlI6FsOJuBRndbUOwQxsbI qOGdZzWsiy9MejXS0KUSfJY5Dlbf0/Iw4UQ23mfXzzZiA8wuuntxGXW2JAwpgpE0BsXDTooN7UCOz 1gps9KNw==; Received: from mcgrof by bombadil.infradead.org with local (Exim 4.98.2 #2 (Red Hat Linux)) id 1u6NXW-00000002EvC-2mDy; Sun, 20 Apr 2025 05:48:26 +0000 From: Luis Chamberlain To: Chuck Lever , Daniel Gomez , kdevops@lists.linux.dev Cc: Luis Chamberlain Subject: [PATCH 4/5] crash_watchdog.py: add generic crash watchdog Date: Sat, 19 Apr 2025 22:48:20 -0700 Message-ID: <20250420054822.533987-5-mcgrof@kernel.org> X-Mailer: git-send-email 2.49.0 In-Reply-To: <20250420054822.533987-1-mcgrof@kernel.org> References: <20250420054822.533987-1-mcgrof@kernel.org> Precedence: bulk X-Mailing-List: kdevops@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: Luis Chamberlain This can be used by any workflow. Specialized workflows can use the library and customize it as they see fit to provide CIs more output. It's easy to forget where the hell the kernel logs are, so this also provides a symlink helper which can be used to get the kernel logs from a host. 
Signed-off-by: Luis Chamberlain
---
 scripts/workflows/generic/crash_watchdog.py | 242 ++++++++++++++++++++
 scripts/workflows/generic/get_console.py    |   1 +
 scripts/workflows/generic/lib               |   1 +
 3 files changed, 244 insertions(+)
 create mode 100755 scripts/workflows/generic/crash_watchdog.py
 create mode 120000 scripts/workflows/generic/get_console.py
 create mode 120000 scripts/workflows/generic/lib

diff --git a/scripts/workflows/generic/crash_watchdog.py b/scripts/workflows/generic/crash_watchdog.py
new file mode 100755
index 000000000000..3860de9d5592
--- /dev/null
+++ b/scripts/workflows/generic/crash_watchdog.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: copyleft-next-0.3.1
+
+"""
+This script is intended to run as a kernel-ci agent, monitoring for crashes
+and kernel warnings and resetting the host after capturing essential
+information. It can also be invoked as 'get_console.py' to retrieve the
+entire kernel log.
+"""
+
+import os
+import sys
+import subprocess
+import re
+import logging
+import argparse
+import yaml
+from datetime import datetime, timedelta
+from itertools import islice
+from pathlib import Path
+from lib.crash import KernelCrashWatchdog
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger("crash_watchdog")
+
+
+def get_active_hosts():
+    """
+    Return the sorted, de-duplicated hosts of the 'baseline' inventory group.
+
+    The list is read from 'ansible-inventory -i hosts --list'. Any failure
+    (missing tool, non-zero exit, unparsable output) is logged and an empty
+    list is returned, so callers simply have nothing to check.
+    """
+    try:
+        result = subprocess.run(
+            ["ansible-inventory", "-i", "hosts", "--list"],
+            capture_output=True, text=True, check=True
+        )
+        inventory = yaml.safe_load(result.stdout)
+        hosts = inventory.get("baseline", {}).get("hosts", [])
+        return sorted(set(hosts))
+    except Exception as e:
+        logger.error(f"Error getting active hosts: {e}")
+        return []
+
+
+def run_crash_watchdog_on_host(args, this_host_name):
+    """
+    Run the crash watchdog against a single host.
+
+    Args:
+        args: Parsed command line options (see main()).
+        this_host_name: Name of the host to check.
+
+    Returns:
+        A tuple (crashed, crash_files, warnings_found, warning_file).
+        crash_files is a list holding the crash log path, or an empty list
+        when no crash was found; warning_file is passed through as returned
+        by KernelCrashWatchdog.check_and_reset_host().
+    """
+    watchdog = KernelCrashWatchdog(
+        host_name=this_host_name,
+        output_dir=args.output_dir,
+        full_log=args.full_log,
+        decode_crash=not args.no_decode,
+        reset_host=not args.no_reset,
+        save_warnings=args.save_warnings,
+    )
+
+    crash_file, warning_file = watchdog.check_and_reset_host(
+        method=args.method, get_fstests_log=args.fstests_log
+    )
+
+    # Report this_host_name rather than args.host_name: when every host is
+    # scanned, args.host_name is the literal "all".
+    if warning_file:
+        logger.warning(f"Kernel warning and logged to {warning_file}")
+    elif args.save_warnings:
+        logger.info(f"No kernel warnings detected for host {this_host_name}")
+
+    if crash_file:
+        logger.warning(f"Crash detected and logged to {crash_file}")
+    else:
+        logger.info(f"No crash detected for host {this_host_name}")
+
+    crash_files = [crash_file] if crash_file else []
+    return bool(crash_file), crash_files, bool(warning_file), warning_file
+
+
+def run_crash_watchdog_all_hosts(args):
+    """
+    Check all active hosts for kernel crashes and kernel warnings.
+
+    Returns:
+        A tuple (crash_detected, crash_files, warnings_detected,
+        warning_files) aggregated over the hosts from get_active_hosts().
+        Both file collections are flat lists of paths.
+    """
+    hosts = get_active_hosts()
+    crash_files = []
+    warning_files = []
+
+    logger.info(
+        f"Checking {len(hosts)} hosts for kernel crashes: {', '.join(hosts)}"
+    )
+
+    for host in hosts:
+        _, host_crash_files, _, host_warning_file = run_crash_watchdog_on_host(
+            args, host
+        )
+        for crash_file in host_crash_files:
+            crash_files.append(crash_file)
+            logger.info(f"Crash detected in host {host}, logs saved to {crash_file}")
+        if host_warning_file:
+            warning_files.append(host_warning_file)
+            logger.info(
+                f"Kernel warning found on host {host}, logs saved to {host_warning_file}"
+            )
+
+    return bool(crash_files), crash_files, bool(warning_files), warning_files
+
+
+def write_log_section(f, title, files, label, max_lines=10):
+    """
+    Write a markdown section listing log files with a preview of each one.
+
+    Args:
+        f: Writable text stream receiving the section.
+        title: Heading text for the section.
+        files: Iterable of log file paths to list.
+        label: Kind of finding (for example "Crash"), used in each entry.
+        max_lines: Number of leading lines of each file to include.
+    """
+    f.write(f"# {title}\n\n")
+    for path in files:
+        f.write(f"- {label} detected: {path}\n")
+        try:
+            with open(path, "r") as content_file:
+                # islice stops reading after max_lines instead of loading a
+                # potentially huge kernel log into memory.
+                head = "".join(islice(content_file, max_lines))
+            snippet = head + "\n...(truncated)..."
+            f.write("\n```\n" + snippet + "\n```\n\n")
+        except Exception as e:
+            f.write(f"\nError reading {label.lower()} file: {e}\n\n")
+
+
+def main():
+    """
+    Parse the command line, run the watchdog and exit with the result.
+
+    Exits with status 1 when a kernel crash was detected and with status 0
+    otherwise. When invoked as 'get_console.py' the host is never reset,
+    warnings are not saved and the full kernel log is requested.
+    """
+    parser = argparse.ArgumentParser(
+        description="Detect and handle kernel crashes or kernel warnings in hosts.",
+        epilog="""
+Examples:
+  Detect and reset all hosts a crash was found (default):
+    ./crash_watchdog.py
+
+  Detect and reset host crash only on e3-ext4-2k guest:
+    ./crash_watchdog.py --host-name e3-ext4-2k
+
+  Detect using systemd-remote journal and show full kernel log:
+    ./crash_watchdog.py --host-name e3-ext4-2k --method remote --full-log
+
+  Skip decoding and skip reset:
+    ./crash_watchdog.py --host-name e3-ext4-2k --no-decode --no-reset
+
+  Just fetch the full kernel log using symlinked name:
+    ln -s crash_watchdog.py get_console.py
+    ./get_console.py --host-name e3-ext4-2k
+
+  Use guestfs console log and do not decode:
+    ./crash_watchdog.py --host-name e3-ext4-2k --method console --no-decode
+
+  Use SSH to query the live journalctl output:
+    ./crash_watchdog.py --host-name e3-ext4-2k --method ssh
+
+  Disable guest reset when using libvirt:
+    ./crash_watchdog.py --host-name e3-ext4-2k --no-reset
+
+  Print full kernel logs for a specific fstest (all tests run with it):
+    ./crash_watchdog.py --host-name e3-ext4-2k --fstests-log generic/750
+
+  Get all kernel warnings only:
+    ./crash_watchdog.py --host-name e3-ext4-2k --method remote --save-warnings sad.warn
+        """,
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+
+    parser.add_argument("--host-name", help="Optional name of the host to check", default="all")
+    parser.add_argument("--output-dir", help="Directory to store crash logs", default="crashes")
+    parser.add_argument(
+        "--method",
+        choices=["auto", "remote", "console", "ssh"],
+        default="auto",
+        help="Choose method to collect logs: auto, remote, console, or ssh"
+    )
+    parser.add_argument("--full-log", action="store_true", help="Get full kernel log instead of only crash context")
+    parser.add_argument("--no-decode", action="store_true", help="Disable decoding crash logs with decode_stacktrace.sh")
+    parser.add_argument("--no-reset", action="store_true", help="Do not reset the guest even if a crash is detected")
+    parser.add_argument("--fstests-log", help="Show all kernel log lines for a specific fstests test ID (e.g., generic/750)")
+    parser.add_argument("--save-warnings", help="Do you want detected and save kernel warnings", default=True)
+    args = parser.parse_args()
+
+    invoked_name = os.path.basename(sys.argv[0])
+    if invoked_name == "get_console.py":
+        args.no_reset = True
+        args.save_warnings = False
+        # run_crash_watchdog_on_host() reads args.full_log, so that is the
+        # attribute which must be set to request the entire kernel log.
+        args.full_log = True
+
+    if args.host_name != "all":
+        crash_detected, _, warnings_detected, _ = run_crash_watchdog_on_host(
+            args, args.host_name
+        )
+    else:
+        crash_detected, _, warnings_detected, _ = run_crash_watchdog_all_hosts(args)
+
+    if warnings_detected:
+        logger.warning("Kernel warnings detected in one or more hosts")
+    else:
+        logger.info("No kernel warnings detected")
+
+    if crash_detected:
+        logger.warning("Kernel crashes detected in one or more hosts")
+        sys.exit(1)
+    else:
+        logger.info("No kernel crashes detected")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/workflows/generic/get_console.py b/scripts/workflows/generic/get_console.py
new file mode 120000
index 000000000000..0169b0dd6188
--- /dev/null
+++ b/scripts/workflows/generic/get_console.py
@@ -0,0 +1 @@
+crash_watchdog.py
\ No newline at end of file
diff --git a/scripts/workflows/generic/lib b/scripts/workflows/generic/lib
new file mode 120000
index 000000000000..5bf80bf1392c
--- /dev/null
+++ b/scripts/workflows/generic/lib
@@ -0,0 +1 @@
+../lib/
\ No newline at end of file
-- 
2.47.2