From: Luis Chamberlain <mcgrof@kernel.org>
To: Chuck Lever <cel@kernel.org>, Daniel Gomez <da.gomez@kruces.com>,
kdevops@lists.linux.dev
Cc: Luis Chamberlain <mcgrof@kernel.org>
Subject: [PATCH 3/5] fstests_watchdog.py: use the new crash watchdog library
Date: Sat, 19 Apr 2025 22:48:19 -0700 [thread overview]
Message-ID: <20250420054822.533987-4-mcgrof@kernel.org> (raw)
In-Reply-To: <20250420054822.533987-1-mcgrof@kernel.org>
Make the fstests_watchdog.py use the new crash watchdog library.
Since filesystem CIs are already using this, they will immediately
benefit from gathering the crashes / corruptions / kernel warnings
into the crash/ directory, and *also* from resetting the hosts once hosed.
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
scripts/workflows/fstests/fstests_watchdog.py | 89 +++++++++----------
1 file changed, 44 insertions(+), 45 deletions(-)
diff --git a/scripts/workflows/fstests/fstests_watchdog.py b/scripts/workflows/fstests/fstests_watchdog.py
index f7408a659e56..3fef54843b9a 100755
--- a/scripts/workflows/fstests/fstests_watchdog.py
+++ b/scripts/workflows/fstests/fstests_watchdog.py
@@ -11,6 +11,7 @@ from datetime import datetime
from lib import kssh
from lib import fstests
from lib import systemd_remote
+from lib.crash import KernelCrashWatchdog
import sys, os, grp
import configparser
import argparse
@@ -22,23 +23,23 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config
if "CONFIG_WORKFLOW_LINUX_DISTRO" in config:
configured_kernel = "Distro-kernel"
elif "CONFIG_BOOTLINUX_TREE_REF" in config:
- configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('\"')
+ configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('"')
remote_path = "/var/log/journal/remote/"
kernel = systemd_remote.get_uname(remote_path, host, configured_kernel)
if kernel == configured_kernel:
- kernel += " (inferred)"
+ kernel += " (custom)"
if kernel is None:
sys.stderr.write("No kernel could be identified for host: %s\n" % host)
sys.exit(1)
else:
kernel = kssh.get_uname(host).rstrip()
+
section = fstests.get_section(host, config)
- (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config)
- checktime = fstests.get_checktime(host, basedir, kernel, section, last_test)
+ (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = \
+ fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config)
- percent_done = 0
- if checktime > 0:
- percent_done = delta_seconds * 100 / checktime
+ checktime = fstests.get_checktime(host, basedir, kernel, section, last_test)
+ percent_done = (delta_seconds * 100 / checktime) if checktime > 0 else 0
stall_str = "OK"
if stall_suspect:
@@ -47,44 +48,43 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config
else:
stall_str = "Hung-Stalled"
- if last_test is None:
- if verbose:
- sys.stdout.write("Host : %s\n" % (host))
- sys.stdout.write("Last test : None\n")
- else:
- percent_done_str = "%.0f%%" % (0)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, "None", percent_done_str, 0, 0, stall_str, kernel))
- return
+ crash_state = "OK"
+ watchdog = KernelCrashWatchdog(host_name=host,
+ decode_crash=True,
+ reset_host=True,
+ save_warnings=True)
+ crash_file, warning_file = watchdog.check_and_reset_host()
+ if crash_file:
+ crash_state = "CRASH"
+ elif warning_file:
+ crash_state = "WARNING"
if not verbose:
- soak_duration_seconds = 0
- if "CONFIG_FSTESTS_SOAK_DURATION" in config:
- soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"')
- soak_duration_seconds = int(soak_duration_seconds)
- uses_soak = fstests.fstests_test_uses_soak_duration(last_test)
+ soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"'))
+ uses_soak = fstests.fstests_test_uses_soak_duration(last_test or "")
is_soaking = uses_soak and soak_duration_seconds != 0
- soaking_str = ""
- if is_soaking:
- soaking_str = "(soak)"
+ soaking_str = "(soak)" if is_soaking else ""
percent_done_str = "%.0f%% %s" % (percent_done, soaking_str)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, last_test, percent_done_str, str(delta_seconds), str(checktime), stall_str, kernel))
+ if delta_seconds is None:
+ delta_seconds = 0
+ if checktime is None:
+ checktime = 0
+ sys.stdout.write(
+ f"{host:>25} {last_test or 'None':>15} {percent_done_str:>15} "
+ f"{delta_seconds:>12} {checktime:>17} {stall_str:>13} "
+ f"{kernel:<38} {crash_state:<10}\n"
+ )
return
sys.stdout.write("Host : %s\n" % (host))
sys.stdout.write("Last test : %s\n" % (last_test))
sys.stdout.write("Last test time: %s\n" % (last_test_time))
sys.stdout.write("Current system time: %s\n" % (current_time_str))
-
sys.stdout.write("Delta: %d total second\n" % (delta_seconds))
sys.stdout.write("\t%d minutes\n" % (delta_seconds / 60))
sys.stdout.write("\t%d seconds\n" % (delta_seconds % 60))
- sys.stdout.write("Timeout-status: ")
-
- if stall_suspect:
- sys.stdout.write("POSSIBLE-STALL")
- else:
- sys.stdout.write("OK")
- sys.stdout.write("\n")
+ sys.stdout.write("Timeout-status: %s\n" % ("POSSIBLE-STALL" if stall_suspect else "OK"))
+ sys.stdout.write("Crash-status : %s\n" % crash_state)
def _main():
parser = argparse.ArgumentParser(description='fstest-watchdog')
@@ -95,11 +95,11 @@ def _main():
default='baseline',
help='The name of the section to read hosts from')
parser.add_argument('--verbose', const=True, default=False, action="store_const",
- help='Be verbose on otput.')
+ help='Be verbose on output.')
parser.add_argument('--use-systemd-remote', const=True, default=True, action="store_const",
- help='Use use systemd-remote uploaded journals if available')
+ help='Use systemd-remote uploaded journals if available')
parser.add_argument('--use-ssh', const=True, default=False, action="store_const",
- help='Force to only use use ssh for journals.')
+ help='Force to only use ssh for journals.')
args = parser.parse_args()
if not os.path.isfile(args.hostfile):
@@ -114,33 +114,32 @@ def _main():
basedir = os.path.dirname(dotconfig)
remote_group = "systemd-journal-remote"
-
if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh:
group = grp.getgrnam(remote_group)
if group is not None:
remote_gid = group[2]
if remote_gid not in os.getgrouplist(os.getlogin(), os.getgid()):
- sys.stderr.write("Your username is not part of the group %s\n" %
- remote_group)
+ sys.stderr.write("Your username is not part of the group %s\n" % remote_group)
sys.stderr.write("Fix this and try again")
sys.exit(1)
else:
- sys.stderr.write("The group %s was not found, add Kconfig support for the systemd-remote-journal group used" % remote_group)
- sys.exit(1)
+ sys.stderr.write("The group %s was not found, add Kconfig support for the systemd-remote-journal group used" % remote_group)
+ sys.exit(1)
hosts = fstests.get_hosts(args.hostfile, args.hostsection)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % ("Hostname", "Test-name", "Completion %", "runtime(s)", "last-runtime(s)", "Stall-status", "Kernel"))
+ sys.stdout.write(
+ f"{'Hostname':>25} {'Test-name':>15} {'Completion %':>15} "
+ f"{'runtime(s)':>12} {'last-runtime(s)':>17} {'Stall-status':>13} "
+ f"{'Kernel':<38} {'Crash-status':<10}\n"
+ )
for h in hosts:
print_fstest_host_status(h, args.verbose,
args.use_systemd_remote,
args.use_ssh,
basedir,
config)
- soak_duration_seconds = 0
- if "CONFIG_FSTESTS_SOAK_DURATION" in config:
- soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"')
- soak_duration_seconds = int(soak_duration_seconds)
+ soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"'))
journal_method = "ssh"
if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh:
journal_method = "systemd-journal-remote"
--
2.47.2
next prev parent reply other threads:[~2025-04-20 5:48 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-20 5:48 [PATCH 0/5] crash: provide a crash watchdog Luis Chamberlain
2025-04-20 5:48 ` [PATCH 1/5] systemd-remote: use ip address for systemd-remote journal Luis Chamberlain
2025-04-20 5:48 ` [PATCH 2/5] crash: add kernel crash watchdog library Luis Chamberlain
2025-04-20 5:48 ` Luis Chamberlain [this message]
2025-04-20 5:48 ` [PATCH 4/5] crash_watchdog.py: add generic crash watchdog Luis Chamberlain
2025-04-20 5:48 ` [PATCH 5/5] crash_report.py: add a crash report Luis Chamberlain
2025-04-20 15:19 ` [PATCH 0/5] crash: provide a crash watchdog Chuck Lever
2025-04-21 23:16 ` Luis Chamberlain
2025-04-22 2:38 ` Luis Chamberlain
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250420054822.533987-4-mcgrof@kernel.org \
--to=mcgrof@kernel.org \
--cc=cel@kernel.org \
--cc=da.gomez@kruces.com \
--cc=kdevops@lists.linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox