From: Luis Chamberlain <mcgrof@kernel.org>
To: Chuck Lever <cel@kernel.org>, Daniel Gomez <da.gomez@kruces.com>,
kdevops@lists.linux.dev
Cc: Luis Chamberlain <mcgrof@kernel.org>
Subject: [PATCH 3/5] fstests_watchdog.py: use the new crash watchdog library
Date: Sat, 19 Apr 2025 22:48:19 -0700 [thread overview]
Message-ID: <20250420054822.533987-4-mcgrof@kernel.org> (raw)
In-Reply-To: <20250420054822.533987-1-mcgrof@kernel.org>
Make the fstests_watchdog.py use the new crash watchdog library.
Since filesystem CIs are already using this, they will immediately
benefit from gathering the crashes / corruptions / kernel warnings
into the crash/ directory, and *also* resetting the hosts once hosed.
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
scripts/workflows/fstests/fstests_watchdog.py | 89 +++++++++----------
1 file changed, 44 insertions(+), 45 deletions(-)
diff --git a/scripts/workflows/fstests/fstests_watchdog.py b/scripts/workflows/fstests/fstests_watchdog.py
index f7408a659e56..3fef54843b9a 100755
--- a/scripts/workflows/fstests/fstests_watchdog.py
+++ b/scripts/workflows/fstests/fstests_watchdog.py
@@ -11,6 +11,7 @@ from datetime import datetime
from lib import kssh
from lib import fstests
from lib import systemd_remote
+from lib.crash import KernelCrashWatchdog
import sys, os, grp
import configparser
import argparse
@@ -22,23 +23,23 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config
if "CONFIG_WORKFLOW_LINUX_DISTRO" in config:
configured_kernel = "Distro-kernel"
elif "CONFIG_BOOTLINUX_TREE_REF" in config:
- configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('\"')
+ configured_kernel = config["CONFIG_BOOTLINUX_TREE_REF"].strip('"')
remote_path = "/var/log/journal/remote/"
kernel = systemd_remote.get_uname(remote_path, host, configured_kernel)
if kernel == configured_kernel:
- kernel += " (inferred)"
+ kernel += " (custom)"
if kernel is None:
sys.stderr.write("No kernel could be identified for host: %s\n" % host)
sys.exit(1)
else:
kernel = kssh.get_uname(host).rstrip()
+
section = fstests.get_section(host, config)
- (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config)
- checktime = fstests.get_checktime(host, basedir, kernel, section, last_test)
+ (last_test, last_test_time, current_time_str, delta_seconds, stall_suspect) = \
+ fstests.get_fstest_host(use_remote, use_ssh, host, basedir, kernel, section, config)
- percent_done = 0
- if checktime > 0:
- percent_done = delta_seconds * 100 / checktime
+ checktime = fstests.get_checktime(host, basedir, kernel, section, last_test)
+ percent_done = (delta_seconds * 100 / checktime) if checktime > 0 else 0
stall_str = "OK"
if stall_suspect:
@@ -47,44 +48,43 @@ def print_fstest_host_status(host, verbose, use_remote, use_ssh, basedir, config
else:
stall_str = "Hung-Stalled"
- if last_test is None:
- if verbose:
- sys.stdout.write("Host : %s\n" % (host))
- sys.stdout.write("Last test : None\n")
- else:
- percent_done_str = "%.0f%%" % (0)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, "None", percent_done_str, 0, 0, stall_str, kernel))
- return
+ crash_state = "OK"
+ watchdog = KernelCrashWatchdog(host_name=host,
+ decode_crash=True,
+ reset_host=True,
+ save_warnings=True)
+ crash_file, warning_file = watchdog.check_and_reset_host()
+ if crash_file:
+ crash_state = "CRASH"
+ elif warning_file:
+ crash_state = "WARNING"
if not verbose:
- soak_duration_seconds = 0
- if "CONFIG_FSTESTS_SOAK_DURATION" in config:
- soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"')
- soak_duration_seconds = int(soak_duration_seconds)
- uses_soak = fstests.fstests_test_uses_soak_duration(last_test)
+ soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"'))
+ uses_soak = fstests.fstests_test_uses_soak_duration(last_test or "")
is_soaking = uses_soak and soak_duration_seconds != 0
- soaking_str = ""
- if is_soaking:
- soaking_str = "(soak)"
+ soaking_str = "(soak)" if is_soaking else ""
percent_done_str = "%.0f%% %s" % (percent_done, soaking_str)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % (host, last_test, percent_done_str, str(delta_seconds), str(checktime), stall_str, kernel))
+ if delta_seconds is None:
+ delta_seconds = 0
+ if checktime is None:
+ checktime = 0
+ sys.stdout.write(
+ f"{host:>25} {last_test or 'None':>15} {percent_done_str:>15} "
+ f"{delta_seconds:>12} {checktime:>17} {stall_str:>13} "
+ f"{kernel:<38} {crash_state:<10}\n"
+ )
return
sys.stdout.write("Host : %s\n" % (host))
sys.stdout.write("Last test : %s\n" % (last_test))
sys.stdout.write("Last test time: %s\n" % (last_test_time))
sys.stdout.write("Current system time: %s\n" % (current_time_str))
-
sys.stdout.write("Delta: %d total second\n" % (delta_seconds))
sys.stdout.write("\t%d minutes\n" % (delta_seconds / 60))
sys.stdout.write("\t%d seconds\n" % (delta_seconds % 60))
- sys.stdout.write("Timeout-status: ")
-
- if stall_suspect:
- sys.stdout.write("POSSIBLE-STALL")
- else:
- sys.stdout.write("OK")
- sys.stdout.write("\n")
+ sys.stdout.write("Timeout-status: %s\n" % ("POSSIBLE-STALL" if stall_suspect else "OK"))
+ sys.stdout.write("Crash-status : %s\n" % crash_state)
def _main():
parser = argparse.ArgumentParser(description='fstest-watchdog')
@@ -95,11 +95,11 @@ def _main():
default='baseline',
help='The name of the section to read hosts from')
parser.add_argument('--verbose', const=True, default=False, action="store_const",
- help='Be verbose on otput.')
+ help='Be verbose on output.')
parser.add_argument('--use-systemd-remote', const=True, default=True, action="store_const",
- help='Use use systemd-remote uploaded journals if available')
+ help='Use systemd-remote uploaded journals if available')
parser.add_argument('--use-ssh', const=True, default=False, action="store_const",
- help='Force to only use use ssh for journals.')
+ help='Force to only use ssh for journals.')
args = parser.parse_args()
if not os.path.isfile(args.hostfile):
@@ -114,33 +114,32 @@ def _main():
basedir = os.path.dirname(dotconfig)
remote_group = "systemd-journal-remote"
-
if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh:
group = grp.getgrnam(remote_group)
if group is not None:
remote_gid = group[2]
if remote_gid not in os.getgrouplist(os.getlogin(), os.getgid()):
- sys.stderr.write("Your username is not part of the group %s\n" %
- remote_group)
+ sys.stderr.write("Your username is not part of the group %s\n" % remote_group)
sys.stderr.write("Fix this and try again")
sys.exit(1)
else:
- sys.stderr.write("The group %s was not found, add Kconfig support for the systemd-remote-journal group used" % remote_group)
- sys.exit(1)
+ sys.stderr.write("The group %s was not found, add Kconfig support for the systemd-remote-journal group used" % remote_group)
+ sys.exit(1)
hosts = fstests.get_hosts(args.hostfile, args.hostsection)
- sys.stdout.write("%35s%20s%20s%20s%20s%15s%30s\n" % ("Hostname", "Test-name", "Completion %", "runtime(s)", "last-runtime(s)", "Stall-status", "Kernel"))
+ sys.stdout.write(
+ f"{'Hostname':>25} {'Test-name':>15} {'Completion %':>15} "
+ f"{'runtime(s)':>12} {'last-runtime(s)':>17} {'Stall-status':>13} "
+ f"{'Kernel':<38} {'Crash-status':<10}\n"
+ )
for h in hosts:
print_fstest_host_status(h, args.verbose,
args.use_systemd_remote,
args.use_ssh,
basedir,
config)
- soak_duration_seconds = 0
- if "CONFIG_FSTESTS_SOAK_DURATION" in config:
- soak_duration_seconds = config["CONFIG_FSTESTS_SOAK_DURATION"].strip('\"')
- soak_duration_seconds = int(soak_duration_seconds)
+ soak_duration_seconds = int(config.get("CONFIG_FSTESTS_SOAK_DURATION", '0').strip('"'))
journal_method = "ssh"
if "CONFIG_DEVCONFIG_ENABLE_SYSTEMD_JOURNAL_REMOTE" in config and not args.use_ssh:
journal_method = "systemd-journal-remote"
--
2.47.2
next prev parent reply other threads:[~2025-04-20 5:48 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-20 5:48 [PATCH 0/5] crash: provide a crash watchdog Luis Chamberlain
2025-04-20 5:48 ` [PATCH 1/5] systemd-remote: use ip address for systemd-remote journal Luis Chamberlain
2025-04-20 5:48 ` [PATCH 2/5] crash: add kernel crash watchdog library Luis Chamberlain
2025-04-20 5:48 ` Luis Chamberlain [this message]
2025-04-20 5:48 ` [PATCH 4/5] crash_watchdog.py: add generic crash watchdog Luis Chamberlain
2025-04-20 5:48 ` [PATCH 5/5] crash_report.py: add a crash report Luis Chamberlain
2025-04-20 15:19 ` [PATCH 0/5] crash: provide a crash watchdog Chuck Lever
2025-04-21 23:16 ` Luis Chamberlain
2025-04-22 2:38 ` Luis Chamberlain
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250420054822.533987-4-mcgrof@kernel.org \
--to=mcgrof@kernel.org \
--cc=cel@kernel.org \
--cc=da.gomez@kruces.com \
--cc=kdevops@lists.linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.