From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from bombadil.infradead.org (bombadil.infradead.org [198.137.202.133]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0EE3F149C7D for ; Sun, 20 Apr 2025 05:48:26 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.137.202.133 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1745128109; cv=none; b=IqWFb55G1uw4cSZ6F0Lr2c0dK1NhxeYiDKPRhYC9eNY+Y8Zx5CG1hcviSATq36NnVmXMqq34LnxrN7m5Q/a5U9hZMsUpWkeQ6aUkyhexnGbhYamoJjGUqNH7REC6bdCoEID/HUutPkfdo/nygf/yYlKSnb/GhKNNfcg3k05PA68= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1745128109; c=relaxed/simple; bh=MKqDc25cK7tpJ8Ewde7pvx+IiikRDN4uYdeNhwklUpI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=apbr7dkZExYjAoEtsUxZJ5HGFwllj7yGTZihoz5z4/YTjegmsljfMOuIrYskDVHK8P4LJqNw5nsk9LZSKuDUDX8hnv3pon2FQ+g6osybBtDKU4vm9N7luKUd78cmG+OCgy8LN9qYo5KhZjC1F6+PRJPQNm1wICNR1FTVcFVNwR0= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=a87P52Yy; arc=none smtp.client-ip=198.137.202.133 Authentication-Results: smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="a87P52Yy" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=bombadil.20210309; h=Sender:Content-Transfer-Encoding: MIME-Version:References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From: Reply-To:Content-Type:Content-ID:Content-Description; 
bh=h1sg6Q0wzRsXOPQDCH8VFE8RSgnz4G0b5b2OtEuYyqM=; b=a87P52YyvGJ4AYEiheCIUWdUAl F1pLbtLbK9q7+YLEWQspQfeOzATqbJ9RZ8xfAG9u+uQyXYFAuC9+aEPa5pea+rGI7lS9U3Pzj9ECJ wvD76CLt1P/fa5wau7pkbRE6euHLutzvHhm2Gx8e9YbM4M3KJwoqdhC8udRpmEO+2XufRO6d2Za5j cTjrFiZYa1YBTrAZQ1OoW3C74rrHZfOkCXPRu3C51laP7/cxjI4pM1J+ObD/bcjInxM9c8/IMZVnK febDLLJO7b6ap1gLVqY3NrRFL5YyjqBeedFLCjcz6gDfq6itphIuCX9YJqS6FmNJva7X7vEMyfOqr OJEDnkKw==; Received: from mcgrof by bombadil.infradead.org with local (Exim 4.98.2 #2 (Red Hat Linux)) id 1u6NXW-00000002Ev7-2eOz; Sun, 20 Apr 2025 05:48:26 +0000 From: Luis Chamberlain To: Chuck Lever , Daniel Gomez , kdevops@lists.linux.dev Cc: Luis Chamberlain Subject: [PATCH 3/5] fstests_watchdog.py: use the new crash watchdog library Date: Sat, 19 Apr 2025 22:48:19 -0700 Message-ID: <20250420054822.533987-4-mcgrof@kernel.org> X-Mailer: git-send-email 2.49.0 In-Reply-To: <20250420054822.533987-1-mcgrof@kernel.org> References: <20250420054822.533987-1-mcgrof@kernel.org> Precedence: bulk X-Mailing-List: kdevops@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: Luis Chamberlain Make the fstests_watchdog.py use the new crash watchdog library. Since filesystem CIs are already using this, they will immediately benefit from gathering the crashes / corruptions / kernel warnings into the crash/ directory, and *also* resetting the hosts once hosed. 
Signed-off-by: Luis Chamberlain --- scripts/workflows/fstests/fstests_watchdog.py | 89 +++++++++---------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/scripts/workflows/fstests/fstests_watchdog.py b/scripts/workflows/fstests/fstests_watchdog.py index f7408a659e56..3fef54843b9a 100755 --- a/scripts/workflows/fstests/fstests_watchdog.py +++ b/scripts/workflows/fstests/fstests_watchdog.py @@ -11,6 +11,7 @@ from datetime import datetime from lib import kssh from lib import fstests from lib import systemd_remote +from lib.crash import KernelCrashWatchdog import sys, os, grp import configparser import argparse @@ -22,23 +23,23 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config if "CONFIG_WORKFLOW_LINUX_DISTRO" in config: configured_kernel = "Distro-kernel" elif "CONFIG_BOOTLINUX_TREE_REF" in config: - configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('\"') + configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('"') remote_path = "/var/log/journal/remote/" kernel = systemd_remote.get_uname(remote_path, host, configured_kernel) if kernel == configured_kernel: - kernel += " (inferred)" + kernel += " (custom)" if kernel is None: sys.stderr.write("No kernel could be identified for host: %s\n" % host) sys.exit(1) else: kernel = kssh.get_uname(host).rstrip() + section = fstests.get_section(host, config) - (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config) - checktime = fstests.get_checktime(host, basedir, kernel, section, last_test) + (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = \ + fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config) - percent_done = 0 - if checktime > 0: - percent_done = delta_seconds * 100 / checktime + checktime = fstests.get_checktime(host, basedir, kernel, section, last_test) + percent_done = (delta_seconds * 
100 / checktime) if checktime > 0 else 0 stall_str = "OK" if stall_suspect: @@ -47,44 +48,43 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config else: stall_str = "Hung-Stalled" - if last_test is None: - if verbose: - sys.stdout.write("Host : %s\n" % (host)) - sys.stdout.write("Last test : None\n") - else: - percent_done_str = "%.0f%%" % (0) - sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, "None", percent_done_str, 0, 0, stall_str, kernel)) - return + crash_state = "OK" + watchdog = KernelCrashWatchdog(host_name=host, + decode_crash=True, + reset_host=True, + save_warnings=True) + crash_file, warning_file = watchdog.check_and_reset_host() + if crash_file: + crash_state = "CRASH" + elif warning_file: + crash_state = "WARNING" if not verbose: - soak_duration_seconds = 0 - if "CONFIG_FSTESTS_SOAK_DURATION" in config: - soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"') - soak_duration_seconds = int(soak_duration_seconds) - uses_soak = fstests.fstests_test_uses_soak_duration(last_test) + soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"')) + uses_soak = fstests.fstests_test_uses_soak_duration(last_test or "") is_soaking = uses_soak and soak_duration_seconds != 0 - soaking_str = "" - if is_soaking: - soaking_str = "(soak)" + soaking_str = "(soak)" if is_soaking else "" percent_done_str = "%.0f%% %s" % (percent_done, soaking_str) - sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, last_test, percent_done_str, str(delta_seconds), str(checktime), stall_str, kernel)) + if delta_seconds is None: + delta_seconds = 0 + if checktime is None: + checktime = 0 + sys.stdout.write( + f"{host:>25} {last_test or 'None':>15} {percent_done_str:>15} " + f"{delta_seconds:>12} {checktime:>17} {stall_str:>13} " + f"{kernel:<38} {crash_state:<10}\n" + ) return sys.stdout.write("Host : %s\n" % (host)) sys.stdout.write("Last test : %s\n" % (last_test)) sys.stdout.write("Last test 
time: %s\n" % (last_test_time)) sys.stdout.write("Current system time: %s\n" % (current_time_str)) - sys.stdout.write("Delta: %d total second\n" % (delta_seconds)) sys.stdout.write("\t%d minutes\n" % (delta_seconds / 60)) sys.stdout.write("\t%d seconds\n" % (delta_seconds % 60)) - sys.stdout.write("Timeout-status: ") - - if stall_suspect: - sys.stdout.write("POSSIBLE-STALL") - else: - sys.stdout.write("OK") - sys.stdout.write("\n") + sys.stdout.write("Timeout-status: %s\n" % ("POSSIBLE-STALL" if stall_suspect else "OK")) + sys.stdout.write("Crash-status : %s\n" % crash_state) def _main(): parser = argparse.ArgumentParser(description='fstest-watchdog') @@ -95,11 +95,11 @@ def _main(): default='baseline', help='The name of the section to read hosts from') parser.add_argument('--verbose', const=True, default=False, action="store_const", - help='Be verbose on otput.') + help='Be verbose on output.') parser.add_argument('--use-systemd-remote', const=True, default=True, action="store_const", - help='Use use systemd-remote uploaded journals if available') + help='Use systemd-remote uploaded journals if available') parser.add_argument('--use-ssh', const=True, default=False, action="store_const", - help='Force to only use use ssh for journals.') + help='Force to only use ssh for journals.') args = parser.parse_args() if not os.path.isfile(args.hostfile): @@ -114,33 +114,32 @@ def _main(): basedir = os.path.dirname(dotconfig) remote_group = "systemd-journal-remote" - if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh: group = grp.getgrnam(remote_group) if group is not None: remote_gid = group[2] if remote_gid not in os.getgrouplist(os.getlogin(), os.getgid()): - sys.stderr.write("Your username is not part of the group %s\n" % - remote_group) + sys.stderr.write("Your username is not part of the group %s\n" % remote_group) sys.stderr.write("Fix this and try again") sys.exit(1) else: - sys.stderr.write("The group %s was not found, add Kconfig 
support for the systemd-remote-journal group used" % remote_group) - sys.exit(1) + sys.stderr.write("The group %s was not found, add Kconfig support for the systemd-remote-journal group used" % remote_group) + sys.exit(1) hosts = fstests.get_hosts(args.hostfile, args.hostsection) - sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % ("Hostname", "Test-name", "Completion %", "runtime(s)", "last-runtime(s)", "Stall-status", "Kernel")) + sys.stdout.write( + f"{'Hostname':>25} {'Test-name':>15} {'Completion %':>15} " + f"{'runtime(s)':>12} {'last-runtime(s)':>17} {'Stall-status':>13} " + f"{'Kernel':<38} {'Crash-status':<10}\n" + ) for h in hosts: print_fstest_host_status(h, args.verbose, args.use_systemd_remote, args.use_ssh, basedir, config) - soak_duration_seconds = 0 - if "CONFIG_FSTESTS_SOAK_DURATION" in config: - soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"') - soak_duration_seconds = int(soak_duration_seconds) + soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"')) journal_method = "ssh" if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh: journal_method = "systemd-journal-remote" -- 2.47.2