From mboxrd@z Thu Jan 1 00:00:00 1970 From: Lucas Meneghel Rodrigues Subject: Re: [PATCH] Virt: Adding softlockup subtest Date: Wed, 20 Jul 2011 22:38:03 -0300 Message-ID: <4E2782FB.3030602@redhat.com> References: <1311211809-5085-1-git-send-email-lmr@redhat.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: autotest@test.kernel.org, kvm@vger.kernel.org, pradeep , Pradeep Kumar Surisetty To: Lucas Meneghel Rodrigues Return-path: Received: from mx1.redhat.com ([209.132.183.28]:9625 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751686Ab1GUBiO (ORCPT ); Wed, 20 Jul 2011 21:38:14 -0400 In-Reply-To: <1311211809-5085-1-git-send-email-lmr@redhat.com> Sender: kvm-owner@vger.kernel.org List-ID: On 07/20/2011 10:30 PM, Lucas Meneghel Rodrigues wrote: > From: pradeep Ok Pradeep, I checked out the new version of the test and made corrections to it (see changelog). Now, what I don't quite like about this test is: * There are no PASS/FAIL criteria, that is, the test never fails. This is not good. * The method of determining the drift looks strange to me. The drift monitor produces a line of drifts every second, then your code checks only the last line of it. Not sure if this is correct. * Also, when trying out the test here I found problems. Did you actually run the test until the end? I kindly ask you to test with a reduced time length (say, 15 or 30 minutes). I've adapted the test so it can use fractions of an hour instead of full hours. So please, go through the new patch: http://patchwork.test.kernel.org/patch/3570/mbox/ And give me a failure criterion and justify the drift calculation being done the way you are doing it (or fix it). Thanks, Lucas > This patch introduces a soft lockup/drift test with stress. > > 1) Boot up a VM. > 2) Build stress on host and guest. > 3) run heartbeat monitor with the given options on server and host. 
> 3) Run for a relatively long time length, ex: 12, 18 or 24 hours. > 4) Output the test result and observe drift. > > Changes from v2: > * Fixed up commands being used on guest, lack of proper output > redirection was confusing aexpect > * Proper clean up previous instances of the monitor programs > lying around, as well as log files > * Resort to another method of determining host IP if the same > has no fully qualified hostname (stand alone laptops, for > example) > * Only use a single session on guest to execute all the commands. > previous version was opening unneeded connections. > * Fix stress execution in guest and host, now the stress instances > effectively start > * Actively open guest and host firewall rules so heartbeat monitor > communication can happen > > Signed-off-by: Lucas Meneghel Rodrigues > Signed-off-by: Pradeep Kumar Surisetty > --- > client/tests/kvm/deps/heartbeat_slu.py | 205 ++++++++++++++++++++++++++++++++ > client/tests/kvm/tests_base.cfg.sample | 18 +++ > client/virt/tests/softlockup.py | 147 +++++++++++++++++++++++ > 3 files changed, 370 insertions(+), 0 deletions(-) > create mode 100755 client/tests/kvm/deps/heartbeat_slu.py > create mode 100644 client/virt/tests/softlockup.py > > diff --git a/client/tests/kvm/deps/heartbeat_slu.py b/client/tests/kvm/deps/heartbeat_slu.py > new file mode 100755 > index 0000000..697bbbf > --- /dev/null > +++ b/client/tests/kvm/deps/heartbeat_slu.py > @@ -0,0 +1,205 @@ > +#!/usr/bin/env python > + > +""" > +Heartbeat server/client to detect soft lockups > +""" > + > +import socket, os, sys, time, getopt > + > +def daemonize(output_file): > + try: > + pid = os.fork() > + except OSError, e: > + raise Exception, "error %d: %s" % (e.strerror, e.errno) > + > + if pid: > + os._exit(0) > + > + os.umask(0) > + os.setsid() > + sys.stdout.flush() > + sys.stderr.flush() > + > + if file: > + output_handle = file(output_file, 'a+', 0) > + # autoflush stdout/stderr > + sys.stdout = os.fdopen(sys.stdout.fileno(), 
'w', 0) > + sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0) > + else: > + output_handle = file('/dev/null', 'a+') > + > + stdin_handle = open('/dev/null', 'r') > + os.dup2(output_handle.fileno(), sys.stdout.fileno()) > + os.dup2(output_handle.fileno(), sys.stderr.fileno()) > + os.dup2(stdin_handle.fileno(), sys.stdin.fileno()) > + > +def recv_all(sock): > + total_data = [] > + while True: > + data = sock.recv(1024) > + if not data: > + break > + total_data.append(data) > + return ''.join(total_data) > + > +def run_server(host, port, daemon, file, queue_size, threshold, drift): > + if daemon: > + daemonize(output_file=file) > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.bind((host, port)) > + sock.listen(queue_size) > + timeout_interval = threshold * 2 > + prev_check_timestamp = float(time.time()) > + while 1: > + c_sock, c_addr = sock.accept() > + heartbeat = recv_all(c_sock) > + local_timestamp = float(time.time()) > + drift = check_heartbeat(heartbeat, local_timestamp, threshold, check_drift) > + # NOTE: this doesn't work if the only client is the one that timed > + # out, but anything more complete would require another thread and > + # a lock for client_prev_timestamp. 
> + if local_timestamp - prev_check_timestamp> threshold * 2.0: > + check_for_timeouts(threshold, check_drift) > + prev_check_timestamp = local_timestamp > + if verbose: > + if check_drift: > + print "%.2f: %s (%s)" % (local_timestamp, heartbeat, drift) > + else: > + print "%.2f: %s" % (local_timestamp, heartbeat) > + > +def run_client(host, port, daemon, file, interval): > + if daemon: > + daemonize(output_file=file) > + seq = 1 > + while 1: > + try: > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.connect((host, port)) > + heartbeat = get_heartbeat(seq) > + sock.sendall(heartbeat) > + sock.close() > + if verbose: > + print heartbeat > + except socket.error, (value, message): > + print "%.2f: ERROR, %d - %s" % (float(time.time()), value, message) > + > + seq += 1 > + time.sleep(interval) > + > +def get_heartbeat(seq=1): > + return "%s %06d %.2f" % (hostname, seq, float(time.time())) > + > +def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift): > + hostname, seq, timestamp = heartbeat.rsplit() > + timestamp = float(timestamp) > + if client_prev_timestamp.has_key(hostname): > + delta = local_timestamp - client_prev_timestamp[hostname] > + if delta> threshold: > + print "%.2f: ALERT, SLU detected on host %s, delta %ds" \ > + % (float(time.time()), hostname, delta) > + > + client_prev_timestamp[hostname] = local_timestamp > + > + if check_drift: > + if not client_clock_offset.has_key(hostname): > + client_clock_offset[hostname] = timestamp - local_timestamp > + client_prev_drift[hostname] = 0 > + drift = timestamp - local_timestamp - client_clock_offset[hostname] > + drift_delta = drift - client_prev_drift[hostname] > + client_prev_drift[hostname] = drift > + return "drift %+4.2f (%+4.2f)" % (drift, drift_delta) > + > +def check_for_timeouts(threshold, check_drift): > + local_timestamp = float(time.time()) > + hostname_list = list(client_prev_timestamp) > + for hostname in hostname_list: > + timestamp = 
client_prev_timestamp[hostname] > + delta = local_timestamp - timestamp > + if delta> threshold * 2: > + print "%.2f: ALERT, SLU detected on host %s, no heartbeat for %ds" \ > + % (local_timestamp, hostname, delta) > + del client_prev_timestamp[hostname] > + if check_drift: > + del client_clock_offset[hostname] > + del client_prev_drift[hostname] > + > +def usage(): > + print """ > +Usage: > + > + heartbeat_slu.py --server --address --port > + [--file] [--no-daemon] [--verbose] > + [--threshold] > + > + heartbeat_slu.py --client --address -p > + [--file output_file] [--no-daemon] [--verbose] > + [--interval] > +""" > + > +# host information and global data > +hostname = socket.gethostname() > +client_prev_timestamp = {} > +client_clock_offset = {} > +client_prev_drift = {} > + > +# default param values > +host_port = 9001 > +host_address = '' > +interval = 1 # seconds between heartbeats > +threshold = 10 # seconds late till alert > +is_server = False > +is_daemon = True > +file_server = "/tmp/heartbeat_server.out" > +file_client = "/tmp/heartbeat_client.out" > +file_selected = None > +queue_size = 5 > +verbose = False > +check_drift = False > + > +# process cmdline opts > +try: > + opts, args = getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [ > + "server", "client", "no-daemon", "address=", "port=", > + "file=", "server", "interval=", "threshold=", "verbose", > + "check-drift", "help"]) > +except getopt.GetoptError, e: > + print "error: %s" % str(e) > + usage() > + exit(1) > + > +for param, value in opts: > + if param in ["-p", "--port"]: > + host_port = int(value) > + elif param in ["-a", "--address"]: > + host_address = value > + elif param in ["-s", "--server"]: > + is_server = True > + elif param in ["-c", "--client"]: > + is_server = False > + elif param in ["--no-daemon"]: > + is_daemon = False > + elif param in ["-f", "--file"]: > + file_selected = value > + elif param in ["-i", "--interval"]: > + interval = int(value) > + elif param in ["-t", "--threshold"]: 
> + threshold = int(value) > + elif param in ["-d", "--check-drift"]: > + check_drift = True > + elif param in ["-v", "--verbose"]: > + verbose = True > + elif param in ["-h", "--help"]: > + usage() > + exit(0) > + else: > + print "error: unrecognized option: %s" % value > + usage() > + exit(1) > + > +# run until we're terminated > +if is_server: > + file_server = file_selected or file_server > + run_server(host_address, host_port, is_daemon, file_server, queue_size, threshold, check_drift) > +else: > + file_client = file_selected or file_client > + run_client(host_address, host_port, is_daemon, file_client, interval) > diff --git a/client/tests/kvm/tests_base.cfg.sample b/client/tests/kvm/tests_base.cfg.sample > index 65880d8..e9e41f9 100644 > --- a/client/tests/kvm/tests_base.cfg.sample > +++ b/client/tests/kvm/tests_base.cfg.sample > @@ -420,6 +420,24 @@ variants: > type = smbios_table > start_vm = no > > + - softlockup: install setup unattended_install.cdrom > + only Linux > + type = softlockup > + softlockup_files = stress-1.0.4.tar.gz > + stress_setup_cmd = "cd %s&& tar xvf stress-1.0.4.tar.gz&& cd stress-1.0.4&& ./configure&& make&& cd src" > + server_setup_cmd = "%s/heartbeat_slu.py --server --threshold %s --file %s --port %s --verbose --check-drift" > + client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1" > + stress_cmd = "cd %s&& cd stress-1.0.4&& cd src&& nohup ./stress -c %s> /dev/null 2>&1&" > + kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk '{print$2}' | xargs kill -9> /dev/null 2>&1" > + kill_stress_cmd = "pkill -f stress> /dev/null 2>&1" > + drift_cmd = "tail -1 %s | awk '{print $7}'" > + monitor_log_file_server = /tmp/heartbeat_server.log > + monitor_log_file_client = /tmp/heartbeat_client.log > + monitor_port = 13330 > + stress_threshold = 10 > + # time_to_run (hours) = 12, 18, 24, 48 hours > + test_length = 0.10 > + > - stress_boot: install setup image_copy unattended_install.cdrom > type = 
stress_boot > max_vms = 5 > diff --git a/client/virt/tests/softlockup.py b/client/virt/tests/softlockup.py > new file mode 100644 > index 0000000..d946965 > --- /dev/null > +++ b/client/virt/tests/softlockup.py > @@ -0,0 +1,147 @@ > +import logging, os, socket, time > +from autotest_lib.client.bin import utils > + > + > +def run_softlockup(test, params, env): > + """ > + soft lockup/drift test with stress. > + > + 1) Boot up a VM. > + 2) Build stress on host and guest. > + 3) run heartbeat with the given options on server and host. > + 3) Run for a relatively long time length. ex: 12, 18 or 24 hours. > + 4) Output the test result and observe drift. > + > + @param test: KVM test object. > + @param params: Dictionary with the test parameters. > + @param env: Dictionary with test environment. > + """ > + stress_setup_cmd = params.get("stress_setup_cmd") > + stress_cmd = params.get("stress_cmd") > + server_setup_cmd = params.get("server_setup_cmd") > + drift_cmd = params.get("drift_cmd") > + kill_stress_cmd = params.get("kill_stress_cmd") > + kill_monitor_cmd = params.get("kill_monitor_cmd") > + > + threshold = int(params.get("stress_threshold")) > + monitor_log_file_server = params.get("monitor_log_file_server") > + monitor_log_file_client = params.get("monitor_log_file_client") > + test_length = int(3600 * float(params.get("test_length"))) > + monitor_port = int(params.get("monitor_port")) > + > + vm = env.get_vm(params["main_vm"]) > + login_timeout = int(params.get("login_timeout", 360)) > + stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress") > + monitor_dir = os.path.join(test.bindir, 'deps') > + > + > + def _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd): > + logging.info("Kill stress and monitor on guest") > + try: > + session.cmd(kill_stress_cmd) > + except: > + pass > + try: > + session.cmd(kill_monitor_cmd) > + except: > + pass > + > + > + def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd): > + logging.info("Kill stress 
and monitor on host") > + utils.run(kill_stress_cmd, ignore_status=True) > + utils.run(kill_monitor_cmd, ignore_status=True) > + > + > + def host(): > + logging.info("Setup monitor server on host") > + # Kill previous instances of the host load programs, if any > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + # Cleanup previous log instances > + if os.path.isfile(monitor_log_file_server): > + os.remove(monitor_log_file_server) > + # Opening firewall ports on host > + utils.run("iptables -F", ignore_status=True) > + > + # Run heartbeat on host > + utils.run(server_setup_cmd % (monitor_dir, threshold, > + monitor_log_file_server, monitor_port)) > + > + logging.info("Build stress on host") > + # Uncompress and build stress on host > + utils.run(stress_setup_cmd % stress_dir) > + > + logging.info("Run stress on host") > + # stress_threads = 2 * n_cpus > + threads_host = 2 * utils.count_cpus() > + # Run stress test on host > + utils.run(stress_cmd % (stress_dir, threads_host)) > + > + > + def guest(): > + try: > + host_ip = socket.gethostbyname(socket.gethostname()) > + except socket.error: > + try: > + # Hackish, but works well on stand alone (laptop) setups > + # with access to the internet. If this fails, well, then > + # not much else can be done... 
> + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) > + s.connect(("redhat.com", 80)) > + host_ip = s.getsockname()[0] > + except socket.error, (value, e): > + raise error.TestError("Could not determine host IP: %d %s" % > + (value, e)) > + > + # Now, starting the guest > + vm.verify_alive() > + session = vm.wait_for_login(timeout=login_timeout) > + > + # Kill previous instances of the load programs, if any > + _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd) > + # Clean up previous log instances > + session.cmd("rm -f %s" % monitor_log_file_client) > + > + # Opening firewall ports on guest > + try: > + session.cmd("iptables -F") > + except: > + pass > + > + # Get required files and copy them from host to guest > + monitor_path = os.path.join(test.bindir, 'deps', 'heartbeat_slu.py') > + stress_path = os.path.join(os.environ['AUTODIR'], "tests", "stress", > + "stress-1.0.4.tar.gz") > + vm.copy_files_to(monitor_path, "/tmp") > + vm.copy_files_to(stress_path, "/tmp") > + > + logging.info("Setup monitor client on guest") > + # Start heartbeat on guest > + session.cmd(params.get("client_setup_cmd") % > + ("/tmp", monitor_log_file_client, host_ip, monitor_port)) > + > + logging.info("Build stress on guest") > + # Uncompress and build stress on guest > + session.cmd(stress_setup_cmd % "/tmp", timeout=200) > + > + logging.info("Run stress on guest") > + # stress_threads = 2 * n_vcpus > + threads_guest = 2 * int(params.get("smp", 1)) > + # Run stress test on guest > + session.cmd(stress_cmd % ("/tmp", threads_guest)) > + > + # Wait and report > + logging.debug("Wait for %d s", test_length) > + time.sleep(test_length) > + > + # Kill instances of the load programs on both guest and host > + _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd) > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + > + # Collect drift > + drift = utils.system_output(drift_cmd % monitor_log_file_server) > + logging.info("Drift noticed: %s", drift) > + > + 
> + host() > + guest()