* [PATCH 1/2] qemurunner: Impove stdout logging handling
@ 2023-12-18 15:29 Richard Purdie
2023-12-18 15:29 ` [PATCH 2/2] qemurunner: Impove handling of serial port output blocking Richard Purdie
` (2 more replies)
0 siblings, 3 replies; 21+ messages in thread
From: Richard Purdie @ 2023-12-18 15:29 UTC (permalink / raw)
To: openembedded-core
We need to ensure we read from and log the output from qemu stdout
as otherwise the buffers can fill and block, leading qemu to hand.
Use our existing logging thread to do this.
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
---
meta/lib/oeqa/utils/qemurunner.py | 54 ++++++++++++++++++-------------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/meta/lib/oeqa/utils/qemurunner.py b/meta/lib/oeqa/utils/qemurunner.py
index ee367078000..36bdfb96ae6 100644
--- a/meta/lib/oeqa/utils/qemurunner.py
+++ b/meta/lib/oeqa/utils/qemurunner.py
@@ -29,6 +29,15 @@ control_chars = [chr(x) for x in control_range
if chr(x) not in string.printable]
re_control_char = re.compile('[%s]' % re.escape("".join(control_chars)))
+def getOutput(o):
+ import fcntl
+ fl = fcntl.fcntl(o, fcntl.F_GETFL)
+ fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
+ try:
+ return os.read(o.fileno(), 1000000).decode("utf-8")
+ except BlockingIOError:
+ return ""
+
class QemuRunner:
def __init__(self, machine, rootfs, display, tmpdir, deploy_dir_image, logfile, boottime, dump_dir, use_kvm, logger, use_slirp=False,
@@ -55,6 +64,7 @@ class QemuRunner:
self.boottime = boottime
self.logged = False
self.thread = None
+ self.threadsock = None
self.use_kvm = use_kvm
self.use_ovmf = use_ovmf
self.use_slirp = use_slirp
@@ -119,21 +129,11 @@ class QemuRunner:
f.write(msg)
self.msg += self.decode_qemulog(msg)
- def getOutput(self, o):
- import fcntl
- fl = fcntl.fcntl(o, fcntl.F_GETFL)
- fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
- try:
- return os.read(o.fileno(), 1000000).decode("utf-8")
- except BlockingIOError:
- return ""
-
-
def handleSIGCHLD(self, signum, frame):
if self.runqemu and self.runqemu.poll():
if self.runqemu.returncode:
self.logger.error('runqemu exited with code %d' % self.runqemu.returncode)
- self.logger.error('Output from runqemu:\n%s' % self.getOutput(self.runqemu.stdout))
+ self.logger.error('Output from runqemu:\n%s' % getOutput(self.runqemu.stdout))
self.stop()
def start(self, qemuparams = None, get_ip = True, extra_bootparams = None, runqemuparams='', launch_cmd=None, discard_writes=True):
@@ -282,7 +282,7 @@ class QemuRunner:
if self.runqemu.returncode:
# No point waiting any longer
self.logger.warning('runqemu exited with code %d' % self.runqemu.returncode)
- self.logger.warning("Output from runqemu:\n%s" % self.getOutput(output))
+ self.logger.warning("Output from runqemu:\n%s" % getOutput(output))
self.stop()
return False
time.sleep(0.5)
@@ -309,7 +309,7 @@ class QemuRunner:
ps = subprocess.Popen(['ps', 'axww', '-o', 'pid,ppid,pri,ni,command '], stdout=subprocess.PIPE).communicate()[0]
processes = ps.decode("utf-8")
self.logger.debug("Running processes:\n%s" % processes)
- op = self.getOutput(output)
+ op = getOutput(output)
self.stop()
if op:
self.logger.error("Output from runqemu:\n%s" % op)
@@ -387,7 +387,7 @@ class QemuRunner:
time.time() - connect_time))
# We are alive: qemu is running
- out = self.getOutput(output)
+ out = getOutput(output)
netconf = False # network configuration is not required by default
self.logger.debug("qemu started in %.2f seconds - qemu procces pid is %s (%s)" %
(time.time() - (endtime - self.runqemutime),
@@ -430,9 +430,10 @@ class QemuRunner:
self.logger.debug("Target IP: %s" % self.ip)
self.logger.debug("Server IP: %s" % self.server_ip)
+ self.thread = LoggingThread(self.log, self.threadsock, self.logger, self.runqemu.stdout)
+ self.thread.start()
+
if self.serial_ports >= 2:
- self.thread = LoggingThread(self.log, self.threadsock, self.logger)
- self.thread.start()
if not self.thread.connection_established.wait(self.boottime):
self.logger.error("Didn't receive a console connection from qemu. "
"Here is the qemu command line used:\n%s\nand "
@@ -444,7 +445,7 @@ class QemuRunner:
self.logger.debug("Waiting at most %d seconds for login banner (%s)" %
(self.boottime, time.strftime("%D %H:%M:%S")))
endtime = time.time() + self.boottime
- filelist = [self.server_socket, self.runqemu.stdout]
+ filelist = [self.server_socket]
reachedlogin = False
stopread = False
qemusock = None
@@ -564,7 +565,7 @@ class QemuRunner:
self.logger.debug("Sending SIGKILL to runqemu")
os.killpg(os.getpgid(self.runqemu.pid), signal.SIGKILL)
if not self.runqemu.stdout.closed:
- self.logger.info("Output from runqemu:\n%s" % self.getOutput(self.runqemu.stdout))
+ self.logger.info("Output from runqemu:\n%s" % getOutput(self.runqemu.stdout))
self.runqemu.stdin.close()
self.runqemu.stdout.close()
self.runqemu_exited = True
@@ -700,9 +701,11 @@ class QemuRunner:
# event loop. The mechanism for stopping the thread is a simple pipe which
# will wake up the poll and allow for tearing everything down.
class LoggingThread(threading.Thread):
- def __init__(self, logfunc, sock, logger):
+ def __init__(self, logfunc, sock, logger, qemuoutput):
self.connection_established = threading.Event()
+
self.serversock = sock
+ self.qemuoutput = qemuoutput
self.logfunc = logfunc
self.logger = logger
self.readsock = None
@@ -732,7 +735,8 @@ class LoggingThread(threading.Thread):
def teardown(self):
self.logger.debug("Tearing down logging thread")
- self.close_socket(self.serversock)
+ if self.serversock:
+ self.close_socket(self.serversock)
if self.readsock is not None:
self.close_socket(self.readsock)
@@ -747,7 +751,9 @@ class LoggingThread(threading.Thread):
def eventloop(self):
poll = select.poll()
event_read_mask = self.errorevents | self.readevents
- poll.register(self.serversock.fileno())
+ if self.serversock:
+ poll.register(self.serversock.fileno())
+ poll.register(self.qemuoutput.fileno())
poll.register(self.readpipe, event_read_mask)
breakout = False
@@ -767,7 +773,7 @@ class LoggingThread(threading.Thread):
break
# A connection request was received
- elif self.serversock.fileno() == event[0]:
+ elif self.serversock and self.serversock.fileno() == event[0]:
self.logger.debug("Connection request received")
self.readsock, _ = self.serversock.accept()
self.readsock.setblocking(0)
@@ -781,6 +787,10 @@ class LoggingThread(threading.Thread):
elif self.readsock.fileno() == event[0]:
data = self.recv(1024)
self.logfunc(data)
+ elif self.qemuoutput.fileno() == event[0]:
+ data = getOutput(self.qemuoutput)
+ self.logger.debug("Data received on qemu stdout %s" % data)
+ self.logfunc(data, ".stdout")
# Since the socket is non-blocking make sure to honor EAGAIN
# and EWOULDBLOCK.
--
2.39.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-18 15:29 [PATCH 1/2] qemurunner: Impove stdout logging handling Richard Purdie
@ 2023-12-18 15:29 ` Richard Purdie
2023-12-19 15:10 ` [OE-core] " Alex Bennée
2023-12-18 16:04 ` [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling Fabio Estevam
2023-12-18 17:45 ` Khem Raj
2 siblings, 1 reply; 21+ messages in thread
From: Richard Purdie @ 2023-12-18 15:29 UTC (permalink / raw)
To: openembedded-core
Similar to stdout in the previous commit, we need to ensure serial output
if written is read and put somewhere, else qemu might block on writes to
the serial port leading to hangs in the kernel. Use our existing logging
thread to log data when run_serial is not in use.
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
---
meta/lib/oeqa/utils/qemurunner.py | 97 ++++++++++++++++++++++---------
1 file changed, 68 insertions(+), 29 deletions(-)
diff --git a/meta/lib/oeqa/utils/qemurunner.py b/meta/lib/oeqa/utils/qemurunner.py
index 36bdfb96ae6..16aa1c08d3e 100644
--- a/meta/lib/oeqa/utils/qemurunner.py
+++ b/meta/lib/oeqa/utils/qemurunner.py
@@ -21,6 +21,7 @@ import threading
import codecs
import tempfile
from collections import defaultdict
+from contextlib import contextmanager
import importlib
# Get Unicode non printable control chars
@@ -517,8 +518,12 @@ class QemuRunner:
except Exception as e:
self.logger.warning('Extra log data exception %s' % repr(e))
data = None
+ self.thread.serial_lock.release()
return False
+ with self.thread.serial_lock:
+ self.thread.set_serialsock(self.server_socket)
+
# If we are not able to login the tests can continue
try:
(status, output) = self.run_serial(self.boot_patterns['send_login_user'], raw=True, timeout=120)
@@ -653,31 +658,32 @@ class QemuRunner:
data = ''
status = 0
- self.server_socket.sendall(command.encode('utf-8'))
- start = time.time()
- end = start + timeout
- while True:
- now = time.time()
- if now >= end:
- data += "<<< run_serial(): command timed out after %d seconds without output >>>\r\n\r\n" % timeout
- break
- try:
- sread, _, _ = select.select([self.server_socket],[],[], end - now)
- except InterruptedError:
- continue
- if sread:
- # try to avoid reading single character at a time
- time.sleep(0.1)
- answer = self.server_socket.recv(1024)
- if answer:
- data += answer.decode('utf-8')
- # Search the prompt to stop
- if re.search(self.boot_patterns['search_cmd_finished'], data):
- break
- else:
- if self.canexit:
- return (1, "")
- raise Exception("No data on serial console socket, connection closed?")
+ with self.thread.serial_lock:
+ self.server_socket.sendall(command.encode('utf-8'))
+ start = time.time()
+ end = start + timeout
+ while True:
+ now = time.time()
+ if now >= end:
+ data += "<<< run_serial(): command timed out after %d seconds without output >>>\r\n\r\n" % timeout
+ break
+ try:
+ sread, _, _ = select.select([self.server_socket],[],[], end - now)
+ except InterruptedError:
+ continue
+ if sread:
+ # try to avoid reading single character at a time
+ time.sleep(0.1)
+ answer = self.server_socket.recv(1024)
+ if answer:
+ data += answer.decode('utf-8')
+ # Search the prompt to stop
+ if re.search(self.boot_patterns['search_cmd_finished'], data):
+ break
+ else:
+ if self.canexit:
+ return (1, "")
+ raise Exception("No data on serial console socket, connection closed?")
if data:
if raw:
@@ -696,6 +702,15 @@ class QemuRunner:
status = 1
return (status, str(data))
+@contextmanager
+def nonblocking_lock(lock):
+ locked = lock.acquire(False)
+ try:
+ yield locked
+ finally:
+ if locked:
+ lock.release()
+
# This class is for reading data from a socket and passing it to logfunc
# to be processed. It's completely event driven and has a straightforward
# event loop. The mechanism for stopping the thread is a simple pipe which
@@ -703,8 +718,10 @@ class QemuRunner:
class LoggingThread(threading.Thread):
def __init__(self, logfunc, sock, logger, qemuoutput):
self.connection_established = threading.Event()
+ self.serial_lock = threading.Lock()
self.serversock = sock
+ self.serialsock = None
self.qemuoutput = qemuoutput
self.logfunc = logfunc
self.logger = logger
@@ -717,9 +734,14 @@ class LoggingThread(threading.Thread):
threading.Thread.__init__(self, target=self.threadtarget)
+ def set_serialsock(self, serialsock):
+ self.serialsock = serialsock
+
def threadtarget(self):
try:
self.eventloop()
+ except Exception as e:
+ self.logger.warning("Exception %s in logging thread" % repr(e))
finally:
self.teardown()
@@ -753,6 +775,7 @@ class LoggingThread(threading.Thread):
event_read_mask = self.errorevents | self.readevents
if self.serversock:
poll.register(self.serversock.fileno())
+ serial_registered = False
poll.register(self.qemuoutput.fileno())
poll.register(self.readpipe, event_read_mask)
@@ -760,7 +783,7 @@ class LoggingThread(threading.Thread):
self.running = True
self.logger.debug("Starting thread event loop")
while not breakout:
- events = poll.poll()
+ events = poll.poll(2)
for event in events:
# An error occurred, bail out
if event[1] & self.errorevents:
@@ -785,18 +808,34 @@ class LoggingThread(threading.Thread):
# Actual data to be logged
elif self.readsock.fileno() == event[0]:
- data = self.recv(1024)
+ data = self.recv(1024, self.readsock)
self.logfunc(data)
elif self.qemuoutput.fileno() == event[0]:
data = getOutput(self.qemuoutput)
self.logger.debug("Data received on qemu stdout %s" % data)
self.logfunc(data, ".stdout")
+ elif self.serialsock and self.serialsock.fileno() == event[0]:
+ if self.serial_lock.acquire(blocking=False):
+ data = self.recv(1024, self.serialsock)
+ self.logger.debug("Data received serial thread %s" % data.decode('utf-8', 'replace'))
+ self.logfunc(data, ".2")
+ self.serial_lock.release()
+ else:
+ serial_registered = False
+ poll.unregister(self.serialsock.fileno())
+
+ if not serial_registered and self.serialsock:
+ with nonblocking_lock(self.serial_lock) as l:
+ if l:
+ serial_registered = True
+ poll.register(self.serialsock.fileno(), event_read_mask)
+
# Since the socket is non-blocking make sure to honor EAGAIN
# and EWOULDBLOCK.
- def recv(self, count):
+ def recv(self, count, sock):
try:
- data = self.readsock.recv(count)
+ data = sock.recv(count)
except socket.error as e:
if e.errno == errno.EAGAIN or e.errno == errno.EWOULDBLOCK:
return b''
--
2.39.2
^ permalink raw reply related [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-18 15:29 ` [PATCH 2/2] qemurunner: Impove handling of serial port output blocking Richard Purdie
@ 2023-12-19 15:10 ` Alex Bennée
2023-12-19 15:15 ` Richard Purdie
0 siblings, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2023-12-19 15:10 UTC (permalink / raw)
To: Richard Purdie; +Cc: openembedded-core
"Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
> Similar to stdout in the previous commit, we need to ensure serial output
> if written is read and put somewhere, else qemu might block on writes to
> the serial port leading to hangs in the kernel. Use our existing logging
> thread to log data when run_serial is not in use.
>
> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
I'm still seeing hanging with my test case:
o regression_1010.5 Invalid memory access 5
regression_1010.5 OK
o regression_1010.6 Invalid memory access 6
Stdout:
Tried to dump info from target but serial console failed
Failed CMD: top -bn1
Tried to dump info from target but serial console failed
Failed CMD: ps
Tried to dump info from target but serial console failed
Failed CMD: free
NOTE: test_soafee (soafeetestsuite.SoafeeTestSuite.test_soafee)
DEBUG: Checking if at least one of soafee-test-suite is installed
DEBUG: Setting up a 400 second(s) timeout
DEBUG: [Running]$ ssh -l root -o ServerAliveCountMax=2 -o ServerAliveInterval=30 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 127.0.0.1 export PATH=/usr/sbin:/sbin:/usr/bin:/bin; sudo -i -u trs soafee-test-suite run -r -t -T ; rm -rf /tmp/soafee-test-suite
DEBUG: Waiting for process output: time: 1702998407.0291, endtime: 1702998707.0285847
DEBUG: Waiting for process output: time: 1702998412.0338314, endtime: 1702998707.0285847
DEBUG: Waiting for process output: time: 1702998417.0378022, endtime: 1702998707.0285847
DEBUG: Waiting for process output: time: 1702998422.0417945, endtime: 1702998707.0285847
<snip>
The regression tests are hanging because that has triggered output on
the serial port as the firmware attempted to service the test and the
port isn't being drained. Then when that times out we start spinning
waiting for more output.
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-19 15:10 ` [OE-core] " Alex Bennée
@ 2023-12-19 15:15 ` Richard Purdie
2023-12-19 16:44 ` Alex Bennée
0 siblings, 1 reply; 21+ messages in thread
From: Richard Purdie @ 2023-12-19 15:15 UTC (permalink / raw)
To: Alex Bennée; +Cc: openembedded-core
On Tue, 2023-12-19 at 15:10 +0000, Alex Bennée wrote:
> "Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
>
> > Similar to stdout in the previous commit, we need to ensure serial output
> > if written is read and put somewhere, else qemu might block on writes to
> > the serial port leading to hangs in the kernel. Use our existing logging
> > thread to log data when run_serial is not in use.
> >
> > Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
>
> I'm still seeing hanging with my test case:
>
> o regression_1010.5 Invalid memory access 5
> regression_1010.5 OK
> o regression_1010.6 Invalid memory access 6
>
> Stdout:
> Tried to dump info from target but serial console failed
> Failed CMD: top -bn1
> Tried to dump info from target but serial console failed
> Failed CMD: ps
> Tried to dump info from target but serial console failed
> Failed CMD: free
>
> NOTE: test_soafee (soafeetestsuite.SoafeeTestSuite.test_soafee)
> DEBUG: Checking if at least one of soafee-test-suite is installed
> DEBUG: Setting up a 400 second(s) timeout
> DEBUG: [Running]$ ssh -l root -o ServerAliveCountMax=2 -o ServerAliveInterval=30 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 127.0.0.1 export PATH=/usr/sbin:/sbin:/usr/bin:/bin; sudo -i -u trs soafee-test-suite run -r -t -T ; rm -rf /tmp/soafee-test-suite
> DEBUG: Waiting for process output: time: 1702998407.0291, endtime: 1702998707.0285847
> DEBUG: Waiting for process output: time: 1702998412.0338314, endtime: 1702998707.0285847
> DEBUG: Waiting for process output: time: 1702998417.0378022, endtime: 1702998707.0285847
> DEBUG: Waiting for process output: time: 1702998422.0417945, endtime: 1702998707.0285847
> <snip>
>
> The regression tests are hanging because that has triggered output on
> the serial port as the firmware attempted to service the test and the
> port isn't being drained. Then when that times out we start spinning
> waiting for more output.
I don't really understand where it might be hanging unfortunately as in
my local tests it is draining stdout and the serial ports. Does it
login to the serial console correctly? Are there any WARNING messages?
I did use this debug patch:
https://git.yoctoproject.org/poky-contrib/commit/?h=rpurdie/t222&id=0d9029bad610736c114e0770d2433b53133fa620
to prove that it can interleave serial commands with left over output
successfully. A similar patch may help work out where your setup may be
hanging?
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-19 15:15 ` Richard Purdie
@ 2023-12-19 16:44 ` Alex Bennée
2023-12-19 22:57 ` Richard Purdie
0 siblings, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2023-12-19 16:44 UTC (permalink / raw)
To: Richard Purdie; +Cc: openembedded-core
Richard Purdie <richard.purdie@linuxfoundation.org> writes:
> On Tue, 2023-12-19 at 15:10 +0000, Alex Bennée wrote:
>> "Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
>>
>> > Similar to stdout in the previous commit, we need to ensure serial output
>> > if written is read and put somewhere, else qemu might block on writes to
>> > the serial port leading to hangs in the kernel. Use our existing logging
>> > thread to log data when run_serial is not in use.
>> >
>> > Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
>>
>> I'm still seeing hanging with my test case:
>>
>> o regression_1010.5 Invalid memory access 5
>> regression_1010.5 OK
>> o regression_1010.6 Invalid memory access 6
>>
>> Stdout:
>> Tried to dump info from target but serial console failed
>> Failed CMD: top -bn1
>> Tried to dump info from target but serial console failed
>> Failed CMD: ps
>> Tried to dump info from target but serial console failed
>> Failed CMD: free
>>
>> NOTE: test_soafee (soafeetestsuite.SoafeeTestSuite.test_soafee)
>> DEBUG: Checking if at least one of soafee-test-suite is installed
>> DEBUG: Setting up a 400 second(s) timeout
>> DEBUG: [Running]$ ssh -l root -o ServerAliveCountMax=2 -o ServerAliveInterval=30 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 127.0.0.1 export PATH=/usr/sbin:/sbin:/usr/bin:/bin; sudo -i -u trs soafee-test-suite run -r -t -T ; rm -rf /tmp/soafee-test-suite
>> DEBUG: Waiting for process output: time: 1702998407.0291, endtime: 1702998707.0285847
>> DEBUG: Waiting for process output: time: 1702998412.0338314, endtime: 1702998707.0285847
>> DEBUG: Waiting for process output: time: 1702998417.0378022, endtime: 1702998707.0285847
>> DEBUG: Waiting for process output: time: 1702998422.0417945, endtime: 1702998707.0285847
>> <snip>
>>
>> The regression tests are hanging because that has triggered output on
>> the serial port as the firmware attempted to service the test and the
>> port isn't being drained. Then when that times out we start spinning
>> waiting for more output.
>
> I don't really understand where it might be hanging unfortunately as in
> my local tests it is draining stdout and the serial ports. Does it
> login to the serial console correctly? Are there any WARNING messages?
>
> I did use this debug patch:
>
> https://git.yoctoproject.org/poky-contrib/commit/?h=rpurdie/t222&id=0d9029bad610736c114e0770d2433b53133fa620
>
> to prove that it can interleave serial commands with left over output
> successfully. A similar patch may help work out where your setup may be
> hanging?
Yeah I'm only seeing A10/A11:
o regression_1010.5 Invalid memory access 5
regression_1010.5 OK
o regression_1010.6 Invalid memory access 6
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
NOTE: ... FAIL
DEBUG: Removed SIGALRM handler
Traceback (most recent call last):
File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/alex/lsrc/tests/trs/meta-trs/lib/oeqa/runtime/cases/opteetest.py", line 27, in test_opteetest_xtest
self.assertEqual(status, 0, msg='\n'.join([cmd, output]))
AssertionError: 255 != 0 : xtest
Run test suite with level=0
Those are the only two bits hit in the logging:
grep "WARNING: here" build/tmp_trs-qemuarm64/work/trs_qemuarm64-trs-linux/trs-image/1.0/temp/log.do_testimage
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
WARNING: here A10
WARNING: here A11
>
> Cheers,
>
> Richard
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-19 16:44 ` Alex Bennée
@ 2023-12-19 22:57 ` Richard Purdie
2023-12-20 9:53 ` Alex Bennée
0 siblings, 1 reply; 21+ messages in thread
From: Richard Purdie @ 2023-12-19 22:57 UTC (permalink / raw)
To: Alex Bennée; +Cc: openembedded-core
On Tue, 2023-12-19 at 16:44 +0000, Alex Bennée wrote:
> Richard Purdie <richard.purdie@linuxfoundation.org> writes:
>
> > On Tue, 2023-12-19 at 15:10 +0000, Alex Bennée wrote:
> > > "Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
> > >
> > > > Similar to stdout in the previous commit, we need to ensure serial output
> > > > if written is read and put somewhere, else qemu might block on writes to
> > > > the serial port leading to hangs in the kernel. Use our existing logging
> > > > thread to log data when run_serial is not in use.
> > > >
> > > > Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
> > >
> > > I'm still seeing hanging with my test case:
> > >
> > > o regression_1010.5 Invalid memory access 5
> > > regression_1010.5 OK
> > > o regression_1010.6 Invalid memory access 6
> > >
> > > Stdout:
> > > Tried to dump info from target but serial console failed
> > > Failed CMD: top -bn1
> > > Tried to dump info from target but serial console failed
> > > Failed CMD: ps
> > > Tried to dump info from target but serial console failed
> > > Failed CMD: free
> > >
> > > NOTE: test_soafee (soafeetestsuite.SoafeeTestSuite.test_soafee)
> > > DEBUG: Checking if at least one of soafee-test-suite is installed
> > > DEBUG: Setting up a 400 second(s) timeout
> > > DEBUG: [Running]$ ssh -l root -o ServerAliveCountMax=2 -o ServerAliveInterval=30 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 127.0.0.1 export PATH=/usr/sbin:/sbin:/usr/bin:/bin; sudo -i -u trs soafee-test-suite run -r -t -T ; rm -rf /tmp/soafee-test-suite
> > > DEBUG: Waiting for process output: time: 1702998407.0291, endtime: 1702998707.0285847
> > > DEBUG: Waiting for process output: time: 1702998412.0338314, endtime: 1702998707.0285847
> > > DEBUG: Waiting for process output: time: 1702998417.0378022, endtime: 1702998707.0285847
> > > DEBUG: Waiting for process output: time: 1702998422.0417945, endtime: 1702998707.0285847
> > > <snip>
> > >
> > > The regression tests are hanging because that has triggered output on
> > > the serial port as the firmware attempted to service the test and the
> > > port isn't being drained. Then when that times out we start spinning
> > > waiting for more output.
> >
> > I don't really understand where it might be hanging unfortunately as in
> > my local tests it is draining stdout and the serial ports. Does it
> > login to the serial console correctly? Are there any WARNING messages?
> >
> > I did use this debug patch:
> >
> > https://git.yoctoproject.org/poky-contrib/commit/?h=rpurdie/t222&id=0d9029bad610736c114e0770d2433b53133fa620
> >
> > to prove that it can interleave serial commands with left over output
> > successfully. A similar patch may help work out where your setup may be
> > hanging?
>
> Yeah I'm only seeing A10/A11:
>
> o regression_1010.5 Invalid memory access 5
> regression_1010.5 OK
> o regression_1010.6 Invalid memory access 6
>
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> WARNING: here A10
> WARNING: here A11
> NOTE: ... FAIL
> DEBUG: Removed SIGALRM handler
> Traceback (most recent call last):
> File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
> return func(*args, **kwargs)
> ^^^^^^^^^^^^^^^^^^^^^
> File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
> return func(*args, **kwargs)
> ^^^^^^^^^^^^^^^^^^^^^
> File "/home/alex/lsrc/tests/trs/meta-trs/lib/oeqa/runtime/cases/opteetest.py", line 27, in test_opteetest_xtest
> self.assertEqual(status, 0, msg='\n'.join([cmd, output]))
> AssertionError: 255 != 0 : xtest
> Run test suite with level=0
How long does it take for the test data to "back up" and timeout? The
logging thread should take a maximum of 2s to notice data not being
read/handled on the serial port.
What the code should be doing is noticing the run_serial function isn't
using the port and if there is data there, reading it in the logging
thread. You could put further warnings around the code there to see if
it ever detects data to read on that port. A warning when it exits the
run_serial command may help too to see if it is waiting for data there
or not.
I also ended up using:
https://git.yoctoproject.org/poky-contrib/commit/?h=rpurdie/t222&id=febb0d892e4c3648cd88b8ecb9c0260edc45b68e
to show what was going over the serial interface when I found it wasn't
what I expected.
I suspect/hope my patch is a step in the right direction but it looks
like there may be cases it doesn't catch yet :/.
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-19 22:57 ` Richard Purdie
@ 2023-12-20 9:53 ` Alex Bennée
2023-12-20 10:04 ` Richard Purdie
0 siblings, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2023-12-20 9:53 UTC (permalink / raw)
To: Richard Purdie; +Cc: openembedded-core
Richard Purdie <richard.purdie@linuxfoundation.org> writes:
> On Tue, 2023-12-19 at 16:44 +0000, Alex Bennée wrote:
>> Richard Purdie <richard.purdie@linuxfoundation.org> writes:
>>
>> > On Tue, 2023-12-19 at 15:10 +0000, Alex Bennée wrote:
>> > > "Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
>> > >
>> > > > Similar to stdout in the previous commit, we need to ensure serial output
>> > > > if written is read and put somewhere, else qemu might block on writes to
>> > > > the serial port leading to hangs in the kernel. Use our existing logging
>> > > > thread to log data when run_serial is not in use.
>> > > >
>> > > > Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
>> > >
>> > > I'm still seeing hanging with my test case:
>> > >
>> > > o regression_1010.5 Invalid memory access 5
>> > > regression_1010.5 OK
>> > > o regression_1010.6 Invalid memory access 6
>> > >
>> > > Stdout:
>> > > Tried to dump info from target but serial console failed
>> > > Failed CMD: top -bn1
>> > > Tried to dump info from target but serial console failed
>> > > Failed CMD: ps
>> > > Tried to dump info from target but serial console failed
>> > > Failed CMD: free
>> > >
>> > > NOTE: test_soafee (soafeetestsuite.SoafeeTestSuite.test_soafee)
>> > > DEBUG: Checking if at least one of soafee-test-suite is installed
>> > > DEBUG: Setting up a 400 second(s) timeout
>> > > DEBUG: [Running]$ ssh -l root -o ServerAliveCountMax=2 -o ServerAliveInterval=30 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR -p 2222 127.0.0.1 export PATH=/usr/sbin:/sbin:/usr/bin:/bin; sudo -i -u trs soafee-test-suite run -r -t -T ; rm -rf /tmp/soafee-test-suite
>> > > DEBUG: Waiting for process output: time: 1702998407.0291, endtime: 1702998707.0285847
>> > > DEBUG: Waiting for process output: time: 1702998412.0338314, endtime: 1702998707.0285847
>> > > DEBUG: Waiting for process output: time: 1702998417.0378022, endtime: 1702998707.0285847
>> > > DEBUG: Waiting for process output: time: 1702998422.0417945, endtime: 1702998707.0285847
>> > > <snip>
>> > >
>> > > The regression tests are hanging because that has triggered output on
>> > > the serial port as the firmware attempted to service the test and the
>> > > port isn't being drained. Then when that times out we start spinning
>> > > waiting for more output.
>> >
>> > I don't really understand where it might be hanging unfortunately as in
>> > my local tests it is draining stdout and the serial ports. Does it
>> > login to the serial console correctly? Are there any WARNING messages?
>> >
>> > I did use this debug patch:
>> >
>> > https://git.yoctoproject.org/poky-contrib/commit/?h=rpurdie/t222&id=0d9029bad610736c114e0770d2433b53133fa620
>> >
>> > to prove that it can interleave serial commands with left over output
>> > successfully. A similar patch may help work out where your setup may be
>> > hanging?
>>
>> Yeah I'm only seeing A10/A11:
>>
>> o regression_1010.5 Invalid memory access 5
>> regression_1010.5 OK
>> o regression_1010.6 Invalid memory access 6
>>
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> WARNING: here A10
>> WARNING: here A11
>> NOTE: ... FAIL
>> DEBUG: Removed SIGALRM handler
>> Traceback (most recent call last):
>> File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
>> return func(*args, **kwargs)
>> ^^^^^^^^^^^^^^^^^^^^^
>> File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
>> return func(*args, **kwargs)
>> ^^^^^^^^^^^^^^^^^^^^^
>> File "/home/alex/lsrc/tests/trs/meta-trs/lib/oeqa/runtime/cases/opteetest.py", line 27, in test_opteetest_xtest
>> self.assertEqual(status, 0, msg='\n'.join([cmd, output]))
>> AssertionError: 255 != 0 : xtest
>> Run test suite with level=0
>
> How long does it take for the test data to "back up" and timeout? The
> logging thread should take a maximum of 2s to notice data not being
> read/handled on the serial port.
I don't think it goes around the logging loop more than once. I only
see:
grep "got" build/tmp_trs-qemuarm64/work/trs_qemuarm64-trs-linux/trs-image/1.0/temp/log.do_testimage
DEBUG: got [(12, 1)]
from the poll.
>
> What the code should be doing is noticing the run_serial function isn't
> using the port and if there is data there, reading it in the logging
> thread. You could put further warnings around the code there to see if
> it ever detects data to read on that port. A warning when it exits the
> run_serial command may help too to see if it is waiting for data there
> or not.
>
> I also ended up using:
>
> https://git.yoctoproject.org/poky-contrib/commit/?h=rpurdie/t222&id=febb0d892e4c3648cd88b8ecb9c0260edc45b68e
>
> to show what was going over the serial interface when I found it wasn't
> what I expected.
I'll keep digging.
>
> I suspect/hope my patch is a step in the right direction but it looks
> like there may be cases it doesn't catch yet :/.
>
> Cheers,
>
> Richard
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-20 9:53 ` Alex Bennée
@ 2023-12-20 10:04 ` Richard Purdie
2023-12-20 16:28 ` Alex Bennée
0 siblings, 1 reply; 21+ messages in thread
From: Richard Purdie @ 2023-12-20 10:04 UTC (permalink / raw)
To: Alex Bennée; +Cc: openembedded-core
On Wed, 2023-12-20 at 09:53 +0000, Alex Bennée wrote:
> Richard Purdie <richard.purdie@linuxfoundation.org> writes:
>
> > On Tue, 2023-12-19 at 16:44 +0000, Alex Bennée wrote:
> > > Richard Purdie <richard.purdie@linuxfoundation.org> writes:
> > > Yeah I'm only seeing A10/A11:
> > >
> > > o regression_1010.5 Invalid memory access 5
> > > regression_1010.5 OK
> > > o regression_1010.6 Invalid memory access 6
> > >
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > WARNING: here A10
> > > WARNING: here A11
> > > NOTE: ... FAIL
> > > DEBUG: Removed SIGALRM handler
> > > Traceback (most recent call last):
> > > File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
> > > return func(*args, **kwargs)
> > > ^^^^^^^^^^^^^^^^^^^^^
> > > File "/home/alex/lsrc/tests/trs/build/../poky/meta/lib/oeqa/core/decorator/__init__.py", line 35, in wrapped_f
> > > return func(*args, **kwargs)
> > > ^^^^^^^^^^^^^^^^^^^^^
> > > File "/home/alex/lsrc/tests/trs/meta-trs/lib/oeqa/runtime/cases/opteetest.py", line 27, in test_opteetest_xtest
> > > self.assertEqual(status, 0, msg='\n'.join([cmd, output]))
> > > AssertionError: 255 != 0 : xtest
> > > Run test suite with level=0
> >
> > How long does it take for the test data to "back up" and timeout? The
> > logging thread should take a maximum of 2s to notice data not being
> > read/handled on the serial port.
>
> I don't think it goes around the logging loop more than once. I only
> see:
>
> grep "got" build/tmp_trs-qemuarm64/work/trs_qemuarm64-trs-linux/trs-image/1.0/temp/log.do_testimage
> DEBUG: got [(12, 1)]
>
> from the poll.
>
> >
> > What the code should be doing is noticing the run_serial function isn't
> > using the port and if there is data there, reading it in the logging
> > thread. You could put further warnings around the code there to see if
> > it ever detects data to read on that port. A warning when it exits the
> > run_serial command may help too to see if it is waiting for data there
> > or not.
> >
> > I also ended up using:
> >
> > https://git.yoctoproject.org/poky-contrib/commit/?h=rpurdie/t222&id=febb0d892e4c3648cd88b8ecb9c0260edc45b68e
> >
> > to show what was going over the serial interface when I found it wasn't
> > what I expected.
>
> I'll keep digging.
I've merged the patches into master since they are helping the stdout
issue and helped a number of issues others were seeing.
In the final version of the patch I did fix an issue where the
exception handling from the thread was not working so well, it does at
least print a, exception/traceback properly now (in threadtarget()).
One of the reasons I asked about the warnings was in case that was
showing an issue, previously it silently failed.
I can believe there are other issues hiding in here :/.
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-20 10:04 ` Richard Purdie
@ 2023-12-20 16:28 ` Alex Bennée
2023-12-20 16:52 ` Richard Purdie
0 siblings, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2023-12-20 16:28 UTC (permalink / raw)
To: Richard Purdie; +Cc: openembedded-core
Richard Purdie <richard.purdie@linuxfoundation.org> writes:
> On Wed, 2023-12-20 at 09:53 +0000, Alex Bennée wrote:
>> Richard Purdie <richard.purdie@linuxfoundation.org> writes:
>>
>> > On Tue, 2023-12-19 at 16:44 +0000, Alex Bennée wrote:
>> > > Richard Purdie <richard.purdie@linuxfoundation.org> writes:
>> > > Yeah I'm only seeing A10/A11:
>> > >
>> > > o regression_1010.5 Invalid memory access 5
>> > > regression_1010.5 OK
>> > > o regression_1010.6 Invalid memory access 6
>> > >
>> > > WARNING: here A10
>> > > WARNING: here A11
<snip>
>> >
>> > How long does it take for the test data to "back up" and timeout? The
>> > logging thread should take a maximum of 2s to notice data not being
>> > read/handled on the serial port.
>>
>> I don't think it goes around the logging loop more than once. I only
>> see:
>>
>> grep "got" build/tmp_trs-qemuarm64/work/trs_qemuarm64-trs-linux/trs-image/1.0/temp/log.do_testimage
>> DEBUG: got [(12, 1)]
>>
>> from the poll.
>>
<snip>
>> I'll keep digging.
>
> I've merged the patches into master since they are helping the stdout
> issue and helped a number of issues others were seeing.
>
> In the final version of the patch I did fix an issue where the
> exception handling from the thread was not working so well, it does at
> least print a, exception/traceback properly now (in threadtarget()).
>
> One of the reasons I asked about the warnings was in case that was
> showing an issue, previously it silently failed.
>
> I can believe there are other issues hiding in here :/.
So the failure was due to an exception being thrown at:
elif self.readsock.fileno() == event[1]:
when my config hadn't triggered its opening. A bit of defensive coding
and it works. See:
Subject: [PATCH] qemurunner: more cleanups for output blocking
Date: Wed, 20 Dec 2023 16:14:50 +0000
Message-Id: <20231220161450.1422484-1-alex.bennee@linaro.org>
>
> Cheers,
>
> Richard
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 2/2] qemurunner: Impove handling of serial port output blocking
2023-12-20 16:28 ` Alex Bennée
@ 2023-12-20 16:52 ` Richard Purdie
0 siblings, 0 replies; 21+ messages in thread
From: Richard Purdie @ 2023-12-20 16:52 UTC (permalink / raw)
To: Alex Bennée; +Cc: openembedded-core
On Wed, 2023-12-20 at 16:28 +0000, Alex Bennée wrote:
> Richard Purdie <richard.purdie@linuxfoundation.org> writes:
>
> > On Wed, 2023-12-20 at 09:53 +0000, Alex Bennée wrote:
> > > Richard Purdie <richard.purdie@linuxfoundation.org> writes:
> > >
> > > > On Tue, 2023-12-19 at 16:44 +0000, Alex Bennée wrote:
> > > > > Richard Purdie <richard.purdie@linuxfoundation.org> writes:
> > > > > Yeah I'm only seeing A10/A11:
> > > > >
> > > > > o regression_1010.5 Invalid memory access 5
> > > > > regression_1010.5 OK
> > > > > o regression_1010.6 Invalid memory access 6
> > > > >
> > > > > WARNING: here A10
> > > > > WARNING: here A11
> <snip>
> > > >
> > > > How long does it take for the test data to "back up" and timeout? The
> > > > logging thread should take a maximum of 2s to notice data not being
> > > > read/handled on the serial port.
> > >
> > > I don't think it goes around the logging loop more than once. I only
> > > see:
> > >
> > > grep "got" build/tmp_trs-qemuarm64/work/trs_qemuarm64-trs-linux/trs-image/1.0/temp/log.do_testimage
> > > DEBUG: got [(12, 1)]
> > >
> > > from the poll.
> > >
> <snip>
> > > I'll keep digging.
> >
> > I've merged the patches into master since they are helping the stdout
> > issue and helped a number of issues others were seeing.
> >
> > In the final version of the patch I did fix an issue where the
> > exception handling from the thread was not working so well, it does at
> > least print a, exception/traceback properly now (in threadtarget()).
> >
> > One of the reasons I asked about the warnings was in case that was
> > showing an issue, previously it silently failed.
> >
> > I can believe there are other issues hiding in here :/.
>
> So the failure was due to an exception being thrown at:
>
> elif self.readsock.fileno() == event[1]:
>
> when my config hadn't triggered its opening. A bit of defensive coding
> and it works. See:
>
> Subject: [PATCH] qemurunner: more cleanups for output blocking
> Date: Wed, 20 Dec 2023 16:14:50 +0000
> Message-Id: <20231220161450.1422484-1-alex.bennee@linaro.org>
Glad it was something relatively simple! The fix looks good to me.
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 15:29 [PATCH 1/2] qemurunner: Impove stdout logging handling Richard Purdie
2023-12-18 15:29 ` [PATCH 2/2] qemurunner: Impove handling of serial port output blocking Richard Purdie
@ 2023-12-18 16:04 ` Fabio Estevam
2023-12-18 17:30 ` Richard Purdie
2023-12-18 17:45 ` Khem Raj
2 siblings, 1 reply; 21+ messages in thread
From: Fabio Estevam @ 2023-12-18 16:04 UTC (permalink / raw)
To: Richard Purdie; +Cc: openembedded-core
On Mon, Dec 18, 2023 at 12:30 PM Richard Purdie
<richard.purdie@linuxfoundation.org> wrote:
>
> We need to ensure we read from and log the output from qemu stdout
> as otherwise the buffers can fill and block, leading qemu to hand.
> Use our existing logging thread to do this.
s/Impove/Improve on the Subject line of both patches.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 15:29 [PATCH 1/2] qemurunner: Impove stdout logging handling Richard Purdie
2023-12-18 15:29 ` [PATCH 2/2] qemurunner: Impove handling of serial port output blocking Richard Purdie
2023-12-18 16:04 ` [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling Fabio Estevam
@ 2023-12-18 17:45 ` Khem Raj
2023-12-18 17:58 ` Richard Purdie
2 siblings, 1 reply; 21+ messages in thread
From: Khem Raj @ 2023-12-18 17:45 UTC (permalink / raw)
To: Richard Purdie; +Cc: openembedded-core
I tried the two patches in this series. It did improve the situation
but I am still getting SSH timeouts. But this time its 13 tests
earlier it used to be 40+
btw. my images are using systemd. So it might be good to see if we see
this with poky-altcfg as well or not.
On Mon, Dec 18, 2023 at 7:29 AM Richard Purdie
<richard.purdie@linuxfoundation.org> wrote:
>
> We need to ensure we read from and log the output from qemu stdout
> as otherwise the buffers can fill and block, leading qemu to hand.
> Use our existing logging thread to do this.
>
> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
> ---
> meta/lib/oeqa/utils/qemurunner.py | 54 ++++++++++++++++++-------------
> 1 file changed, 32 insertions(+), 22 deletions(-)
>
> diff --git a/meta/lib/oeqa/utils/qemurunner.py b/meta/lib/oeqa/utils/qemurunner.py
> index ee367078000..36bdfb96ae6 100644
> --- a/meta/lib/oeqa/utils/qemurunner.py
> +++ b/meta/lib/oeqa/utils/qemurunner.py
> @@ -29,6 +29,15 @@ control_chars = [chr(x) for x in control_range
> if chr(x) not in string.printable]
> re_control_char = re.compile('[%s]' % re.escape("".join(control_chars)))
>
> +def getOutput(o):
> + import fcntl
> + fl = fcntl.fcntl(o, fcntl.F_GETFL)
> + fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
> + try:
> + return os.read(o.fileno(), 1000000).decode("utf-8")
> + except BlockingIOError:
> + return ""
> +
> class QemuRunner:
>
> def __init__(self, machine, rootfs, display, tmpdir, deploy_dir_image, logfile, boottime, dump_dir, use_kvm, logger, use_slirp=False,
> @@ -55,6 +64,7 @@ class QemuRunner:
> self.boottime = boottime
> self.logged = False
> self.thread = None
> + self.threadsock = None
> self.use_kvm = use_kvm
> self.use_ovmf = use_ovmf
> self.use_slirp = use_slirp
> @@ -119,21 +129,11 @@ class QemuRunner:
> f.write(msg)
> self.msg += self.decode_qemulog(msg)
>
> - def getOutput(self, o):
> - import fcntl
> - fl = fcntl.fcntl(o, fcntl.F_GETFL)
> - fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
> - try:
> - return os.read(o.fileno(), 1000000).decode("utf-8")
> - except BlockingIOError:
> - return ""
> -
> -
> def handleSIGCHLD(self, signum, frame):
> if self.runqemu and self.runqemu.poll():
> if self.runqemu.returncode:
> self.logger.error('runqemu exited with code %d' % self.runqemu.returncode)
> - self.logger.error('Output from runqemu:\n%s' % self.getOutput(self.runqemu.stdout))
> + self.logger.error('Output from runqemu:\n%s' % getOutput(self.runqemu.stdout))
> self.stop()
>
> def start(self, qemuparams = None, get_ip = True, extra_bootparams = None, runqemuparams='', launch_cmd=None, discard_writes=True):
> @@ -282,7 +282,7 @@ class QemuRunner:
> if self.runqemu.returncode:
> # No point waiting any longer
> self.logger.warning('runqemu exited with code %d' % self.runqemu.returncode)
> - self.logger.warning("Output from runqemu:\n%s" % self.getOutput(output))
> + self.logger.warning("Output from runqemu:\n%s" % getOutput(output))
> self.stop()
> return False
> time.sleep(0.5)
> @@ -309,7 +309,7 @@ class QemuRunner:
> ps = subprocess.Popen(['ps', 'axww', '-o', 'pid,ppid,pri,ni,command '], stdout=subprocess.PIPE).communicate()[0]
> processes = ps.decode("utf-8")
> self.logger.debug("Running processes:\n%s" % processes)
> - op = self.getOutput(output)
> + op = getOutput(output)
> self.stop()
> if op:
> self.logger.error("Output from runqemu:\n%s" % op)
> @@ -387,7 +387,7 @@ class QemuRunner:
> time.time() - connect_time))
>
> # We are alive: qemu is running
> - out = self.getOutput(output)
> + out = getOutput(output)
> netconf = False # network configuration is not required by default
> self.logger.debug("qemu started in %.2f seconds - qemu procces pid is %s (%s)" %
> (time.time() - (endtime - self.runqemutime),
> @@ -430,9 +430,10 @@ class QemuRunner:
> self.logger.debug("Target IP: %s" % self.ip)
> self.logger.debug("Server IP: %s" % self.server_ip)
>
> + self.thread = LoggingThread(self.log, self.threadsock, self.logger, self.runqemu.stdout)
> + self.thread.start()
> +
> if self.serial_ports >= 2:
> - self.thread = LoggingThread(self.log, self.threadsock, self.logger)
> - self.thread.start()
> if not self.thread.connection_established.wait(self.boottime):
> self.logger.error("Didn't receive a console connection from qemu. "
> "Here is the qemu command line used:\n%s\nand "
> @@ -444,7 +445,7 @@ class QemuRunner:
> self.logger.debug("Waiting at most %d seconds for login banner (%s)" %
> (self.boottime, time.strftime("%D %H:%M:%S")))
> endtime = time.time() + self.boottime
> - filelist = [self.server_socket, self.runqemu.stdout]
> + filelist = [self.server_socket]
> reachedlogin = False
> stopread = False
> qemusock = None
> @@ -564,7 +565,7 @@ class QemuRunner:
> self.logger.debug("Sending SIGKILL to runqemu")
> os.killpg(os.getpgid(self.runqemu.pid), signal.SIGKILL)
> if not self.runqemu.stdout.closed:
> - self.logger.info("Output from runqemu:\n%s" % self.getOutput(self.runqemu.stdout))
> + self.logger.info("Output from runqemu:\n%s" % getOutput(self.runqemu.stdout))
> self.runqemu.stdin.close()
> self.runqemu.stdout.close()
> self.runqemu_exited = True
> @@ -700,9 +701,11 @@ class QemuRunner:
> # event loop. The mechanism for stopping the thread is a simple pipe which
> # will wake up the poll and allow for tearing everything down.
> class LoggingThread(threading.Thread):
> - def __init__(self, logfunc, sock, logger):
> + def __init__(self, logfunc, sock, logger, qemuoutput):
> self.connection_established = threading.Event()
> +
> self.serversock = sock
> + self.qemuoutput = qemuoutput
> self.logfunc = logfunc
> self.logger = logger
> self.readsock = None
> @@ -732,7 +735,8 @@ class LoggingThread(threading.Thread):
>
> def teardown(self):
> self.logger.debug("Tearing down logging thread")
> - self.close_socket(self.serversock)
> + if self.serversock:
> + self.close_socket(self.serversock)
>
> if self.readsock is not None:
> self.close_socket(self.readsock)
> @@ -747,7 +751,9 @@ class LoggingThread(threading.Thread):
> def eventloop(self):
> poll = select.poll()
> event_read_mask = self.errorevents | self.readevents
> - poll.register(self.serversock.fileno())
> + if self.serversock:
> + poll.register(self.serversock.fileno())
> + poll.register(self.qemuoutput.fileno())
> poll.register(self.readpipe, event_read_mask)
>
> breakout = False
> @@ -767,7 +773,7 @@ class LoggingThread(threading.Thread):
> break
>
> # A connection request was received
> - elif self.serversock.fileno() == event[0]:
> + elif self.serversock and self.serversock.fileno() == event[0]:
> self.logger.debug("Connection request received")
> self.readsock, _ = self.serversock.accept()
> self.readsock.setblocking(0)
> @@ -781,6 +787,10 @@ class LoggingThread(threading.Thread):
> elif self.readsock.fileno() == event[0]:
> data = self.recv(1024)
> self.logfunc(data)
> + elif self.qemuoutput.fileno() == event[0]:
> + data = getOutput(self.qemuoutput)
> + self.logger.debug("Data received on qemu stdout %s" % data)
> + self.logfunc(data, ".stdout")
>
> # Since the socket is non-blocking make sure to honor EAGAIN
> # and EWOULDBLOCK.
> --
> 2.39.2
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#192655): https://lists.openembedded.org/g/openembedded-core/message/192655
> Mute This Topic: https://lists.openembedded.org/mt/103244414/1997914
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [raj.khem@gmail.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 17:45 ` Khem Raj
@ 2023-12-18 17:58 ` Richard Purdie
2023-12-18 18:07 ` Khem Raj
2023-12-18 22:27 ` Alex Bennée
0 siblings, 2 replies; 21+ messages in thread
From: Richard Purdie @ 2023-12-18 17:58 UTC (permalink / raw)
To: Khem Raj; +Cc: openembedded-core
On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> I tried the two patches in this series. It did improve the situation
> but I am still getting SSH timeouts. But this time its 13 tests
> earlier it used to be 40+
> btw. my images are using systemd. So it might be good to see if we see
> this with poky-altcfg as well or not.
Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
Did you still see rcu stalls in the logs?
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 17:58 ` Richard Purdie
@ 2023-12-18 18:07 ` Khem Raj
2023-12-18 23:01 ` Richard Purdie
[not found] ` <17A20F573E606BC3.25349@lists.openembedded.org>
2023-12-18 22:27 ` Alex Bennée
1 sibling, 2 replies; 21+ messages in thread
From: Khem Raj @ 2023-12-18 18:07 UTC (permalink / raw)
To: Richard Purdie; +Cc: openembedded-core
On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
<richard.purdie@linuxfoundation.org> wrote:
>
> On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > I tried the two patches in this series. It did improve the situation
> > but I am still getting SSH timeouts. But this time its 13 tests
> > earlier it used to be 40+
> > btw. my images are using systemd. So it might be good to see if we see
> > this with poky-altcfg as well or not.
>
> Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
yes, further I ran the failing tests in loop one after another still
one test gzip fails with ssh timeouts
https://busybox.net/~kraj/log.do_testimage.503
https://busybox.net/~kraj/testimage/
there are two runs in the testimages folder. In one you see the RCU
stall and in second you do not
but it fails with same ssh timeout issue.
>
> Did you still see rcu stalls in the logs?
>
> Cheers,
>
> Richard
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 18:07 ` Khem Raj
@ 2023-12-18 23:01 ` Richard Purdie
2023-12-19 15:25 ` Alex Bennée
[not found] ` <17A20F573E606BC3.25349@lists.openembedded.org>
1 sibling, 1 reply; 21+ messages in thread
From: Richard Purdie @ 2023-12-18 23:01 UTC (permalink / raw)
To: Khem Raj; +Cc: openembedded-core
On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
> On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
> <richard.purdie@linuxfoundation.org> wrote:
> >
> > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > I tried the two patches in this series. It did improve the situation
> > > but I am still getting SSH timeouts. But this time its 13 tests
> > > earlier it used to be 40+
> > > btw. my images are using systemd. So it might be good to see if we see
> > > this with poky-altcfg as well or not.
> >
> > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
>
> yes, further I ran the failing tests in loop one after another still
> one test gzip fails with ssh timeouts
>
> https://busybox.net/~kraj/log.do_testimage.503
> https://busybox.net/~kraj/testimage/
>
> there are two runs in the testimages folder. In one you see the RCU
> stall and in second you do not
> but it fails with same ssh timeout issue.
>
> >
> > Did you still see rcu stalls in the logs?
What is interesting is there is ~3MB of nulls in the .2 serial log. The
rcu stall is also:
[ 88.261687] serial8250_tx_chars+0xea/0x2b0
[ 88.261689] serial8250_handle_irq+0x1e9/0x330
[ 88.261691] serial8250_default_handle_irq+0x4a/0x90
[ 88.261693] serial8250_interrupt+0x66/0xc0
[ 88.261696] __handle_irq_event_percpu+0x54/0x1c0
[ 88.261701] handle_irq_event+0x3d/0x80
i.e. it is stalled in the serial TX path.
The big question is why is there so many nulls on the serial port. I
see a few on my local x86 test runs but only ~4kb, not megabytes of
them. I hadn't worked out where/what they are from yet.
I suspect something in the serial/kernel/qemu space isn't interacting
correctly.
Find the cause of the nulls and we might make progress on this.
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 23:01 ` Richard Purdie
@ 2023-12-19 15:25 ` Alex Bennée
0 siblings, 0 replies; 21+ messages in thread
From: Alex Bennée @ 2023-12-19 15:25 UTC (permalink / raw)
To: Richard Purdie; +Cc: Khem Raj, openembedded-core
"Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
> On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
>> On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
>> <richard.purdie@linuxfoundation.org> wrote:
>> >
>> > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
>> > > I tried the two patches in this series. It did improve the situation
>> > > but I am still getting SSH timeouts. But this time its 13 tests
>> > > earlier it used to be 40+
>> > > btw. my images are using systemd. So it might be good to see if we see
>> > > this with poky-altcfg as well or not.
>> >
>> > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
>>
>> yes, further I ran the failing tests in loop one after another still
>> one test gzip fails with ssh timeouts
>>
>> https://busybox.net/~kraj/log.do_testimage.503
>> https://busybox.net/~kraj/testimage/
>>
>> there are two runs in the testimages folder. In one you see the RCU
>> stall and in second you do not
>> but it fails with same ssh timeout issue.
>>
>> >
>> > Did you still see rcu stalls in the logs?
>
> What is interesting is there is ~3MB of nulls in the .2 serial log. The
> rcu stall is also:
>
> [ 88.261687] serial8250_tx_chars+0xea/0x2b0
> [ 88.261689] serial8250_handle_irq+0x1e9/0x330
> [ 88.261691] serial8250_default_handle_irq+0x4a/0x90
> [ 88.261693] serial8250_interrupt+0x66/0xc0
> [ 88.261696] __handle_irq_event_percpu+0x54/0x1c0
> [ 88.261701] handle_irq_event+0x3d/0x80
>
> i.e. it is stalled in the serial TX path.
>
> The big question is why is there so many nulls on the serial port. I
> see a few on my local x86 test runs but only ~4kb, not megabytes of
> them. I hadn't worked out where/what they are from yet.
>
> I suspect something in the serial/kernel/qemu space isn't interacting
> correctly.
We generally don't model FIFOs in QEMU and also we will block the main
thread if we can't write data out.
>
> Find the cause of the nulls and we might make progress on this.
>
> Cheers,
>
> Richard
>
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#192681): https://lists.openembedded.org/g/openembedded-core/message/192681
> Mute This Topic: https://lists.openembedded.org/mt/103244414/5085169
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [alex.bennee@linaro.org]
> -=-=-=-=-=-=-=-=-=-=-=-
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
^ permalink raw reply [flat|nested] 21+ messages in thread
[parent not found: <17A20F573E606BC3.25349@lists.openembedded.org>]
* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
[not found] ` <17A20F573E606BC3.25349@lists.openembedded.org>
@ 2023-12-19 12:03 ` Richard Purdie
2023-12-19 12:12 ` Mikko Rapeli
0 siblings, 1 reply; 21+ messages in thread
From: Richard Purdie @ 2023-12-19 12:03 UTC (permalink / raw)
To: Khem Raj; +Cc: openembedded-core
On Mon, 2023-12-18 at 23:01 +0000, Richard Purdie via
lists.openembedded.org wrote:
> On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
> > On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
> > <richard.purdie@linuxfoundation.org> wrote:
> > >
> > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > > I tried the two patches in this series. It did improve the situation
> > > > but I am still getting SSH timeouts. But this time its 13 tests
> > > > earlier it used to be 40+
> > > > btw. my images are using systemd. So it might be good to see if we see
> > > > this with poky-altcfg as well or not.
> > >
> > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
> >
> > yes, further I ran the failing tests in loop one after another still
> > one test gzip fails with ssh timeouts
> >
> > https://busybox.net/~kraj/log.do_testimage.503
> > https://busybox.net/~kraj/testimage/
> >
> > there are two runs in the testimages folder. In one you see the RCU
> > stall and in second you do not
> > but it fails with same ssh timeout issue.
> >
> > >
> > > Did you still see rcu stalls in the logs?
>
> What is interesting is there is ~3MB of nulls in the .2 serial log. The
> rcu stall is also:
>
> [ 88.261687] serial8250_tx_chars+0xea/0x2b0
> [ 88.261689] serial8250_handle_irq+0x1e9/0x330
> [ 88.261691] serial8250_default_handle_irq+0x4a/0x90
> [ 88.261693] serial8250_interrupt+0x66/0xc0
> [ 88.261696] __handle_irq_event_percpu+0x54/0x1c0
> [ 88.261701] handle_irq_event+0x3d/0x80
>
> i.e. it is stalled in the serial TX path.
>
> The big question is why is there so many nulls on the serial port. I
> see a few on my local x86 test runs but only ~4kb, not megabytes of
> them. I hadn't worked out where/what they are from yet.
>
> I suspect something in the serial/kernel/qemu space isn't interacting
> correctly.
>
> Find the cause of the nulls and we might make progress on this.
I went digging and it is a mess. That 3MB file is actually a wtmp file
from /var/log. The reason is a failure of the "target_dumper" commands
and that the simplistic serial command handling doesn't like binary
files.
I've a few patches which should improve the situation but we're clearly
multiple issues at play here where issues are stacking on top of other
issues. My rough list of issues:
a) stdout logging from qemu wasn't being read and could overflow
buffers locking qemu
b) serial data from commands wasn't being fully read and could overflow
buffers locking qemu
c) the dumper log reading command is reading binary files like wtmp
d) the dumper logs overwrite each other so are useless anyway
e) the dumper logs run on every failed command
f) qmp monitor dump command also run on every failed command
I think we'd be much better off if we drop the dumper/monitor stuff and
qemu runs might be quite a bit faster too.
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-19 12:03 ` Richard Purdie
@ 2023-12-19 12:12 ` Mikko Rapeli
0 siblings, 0 replies; 21+ messages in thread
From: Mikko Rapeli @ 2023-12-19 12:12 UTC (permalink / raw)
To: Richard Purdie; +Cc: Khem Raj, openembedded-core
Hi,
On Tue, Dec 19, 2023 at 12:03:07PM +0000, Richard Purdie wrote:
> On Mon, 2023-12-18 at 23:01 +0000, Richard Purdie via
> lists.openembedded.org wrote:
> > On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
> > > On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
> > > <richard.purdie@linuxfoundation.org> wrote:
> > > >
> > > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > > > I tried the two patches in this series. It did improve the situation
> > > > > but I am still getting SSH timeouts. But this time its 13 tests
> > > > > earlier it used to be 40+
> > > > > btw. my images are using systemd. So it might be good to see if we see
> > > > > this with poky-altcfg as well or not.
> > > >
> > > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
> > >
> > > yes, further I ran the failing tests in loop one after another still
> > > one test gzip fails with ssh timeouts
> > >
> > > https://busybox.net/~kraj/log.do_testimage.503
> > > https://busybox.net/~kraj/testimage/
> > >
> > > there are two runs in the testimages folder. In one you see the RCU
> > > stall and in second you do not
> > > but it fails with same ssh timeout issue.
> > >
> > > >
> > > > Did you still see rcu stalls in the logs?
> >
> > What is interesting is there is ~3MB of nulls in the .2 serial log. The
> > rcu stall is also:
> >
> > [ 88.261687] serial8250_tx_chars+0xea/0x2b0
> > [ 88.261689] serial8250_handle_irq+0x1e9/0x330
> > [ 88.261691] serial8250_default_handle_irq+0x4a/0x90
> > [ 88.261693] serial8250_interrupt+0x66/0xc0
> > [ 88.261696] __handle_irq_event_percpu+0x54/0x1c0
> > [ 88.261701] handle_irq_event+0x3d/0x80
> >
> > i.e. it is stalled in the serial TX path.
> >
> > The big question is why is there so many nulls on the serial port. I
> > see a few on my local x86 test runs but only ~4kb, not megabytes of
> > them. I hadn't worked out where/what they are from yet.
> >
> > I suspect something in the serial/kernel/qemu space isn't interacting
> > correctly.
> >
> > Find the cause of the nulls and we might make progress on this.
>
> I went digging and it is a mess. That 3MB file is actually a wtmp file
> from /var/log. The reason is a failure of the "target_dumper" commands
> and that the simplistic serial command handling doesn't like binary
> files.
>
> I've a few patches which should improve the situation but we're clearly
> multiple issues at play here where issues are stacking on top of other
> issues. My rough list of issues:
>
> a) stdout logging from qemu wasn't being read and could overflow
> buffers locking qemu
> b) serial data from commands wasn't being fully read and could overflow
> buffers locking qemu
> c) the dumper log reading command is reading binary files like wtmp
> d) the dumper logs overwrite each other so are useless anyway
> e) the dumper logs run on every failed command
> f) qmp monitor dump command also run on every failed command
>
> I think we'd be much better off if we drop the dumper/monitor stuff and
> qemu runs might be quite a bit faster too.
I agree to this. If target has serious issues like a full lockup, then dumper and
monitor commands don't help anyway. Capturing the full serial logs would be sufficient,
though I'd like to see a live capture of /var/log/message and systemd journal in case
services started by init have major issues.
Cheers,
-Mikko
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 17:58 ` Richard Purdie
2023-12-18 18:07 ` Khem Raj
@ 2023-12-18 22:27 ` Alex Bennée
2023-12-18 22:54 ` Richard Purdie
1 sibling, 1 reply; 21+ messages in thread
From: Alex Bennée @ 2023-12-18 22:27 UTC (permalink / raw)
To: Richard Purdie; +Cc: Khem Raj, openembedded-core
"Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
> On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
>> I tried the two patches in this series. It did improve the situation
>> but I am still getting SSH timeouts. But this time its 13 tests
>> earlier it used to be 40+
>> btw. my images are using systemd. So it might be good to see if we see
>> this with poky-altcfg as well or not.
>
> Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
>
> Did you still see rcu stalls in the logs?
rcu stalls in the kernel is just a sign of QEMU's relative inefficiency
compared to real HW. However multiple -smp will help alleviate busy
vCPUs if you can. If you want guest time to be a bit more realistic you
can use icount but actual wall clock time will be longer as you can only
have one thread.
>
> Cheers,
>
> Richard
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#192674): https://lists.openembedded.org/g/openembedded-core/message/192674
> Mute This Topic: https://lists.openembedded.org/mt/103244414/5085169
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [alex.bennee@linaro.org]
> -=-=-=-=-=-=-=-=-=-=-=-
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling
2023-12-18 22:27 ` Alex Bennée
@ 2023-12-18 22:54 ` Richard Purdie
0 siblings, 0 replies; 21+ messages in thread
From: Richard Purdie @ 2023-12-18 22:54 UTC (permalink / raw)
To: Alex Bennée; +Cc: Khem Raj, openembedded-core
On Mon, 2023-12-18 at 22:27 +0000, Alex Bennée wrote:
> "Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
>
> > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > I tried the two patches in this series. It did improve the situation
> > > but I am still getting SSH timeouts. But this time its 13 tests
> > > earlier it used to be 40+
> > > btw. my images are using systemd. So it might be good to see if we see
> > > this with poky-altcfg as well or not.
> >
> > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
> >
> > Did you still see rcu stalls in the logs?
>
> rcu stalls in the kernel is just a sign of QEMU's relative inefficiency
> compared to real HW. However multiple -smp will help alleviate busy
> vCPUs if you can. If you want guest time to be a bit more realistic you
> can use icount but actual wall clock time will be longer as you can only
> have one thread.
We have thought that but I realised rcu stalls in the 8250 serial TX
path could mean something quite different, related to 2/2 in this
series.
Cheers,
Richard
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2023-12-20 16:52 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-12-18 15:29 [PATCH 1/2] qemurunner: Impove stdout logging handling Richard Purdie
2023-12-18 15:29 ` [PATCH 2/2] qemurunner: Impove handling of serial port output blocking Richard Purdie
2023-12-19 15:10 ` [OE-core] " Alex Bennée
2023-12-19 15:15 ` Richard Purdie
2023-12-19 16:44 ` Alex Bennée
2023-12-19 22:57 ` Richard Purdie
2023-12-20 9:53 ` Alex Bennée
2023-12-20 10:04 ` Richard Purdie
2023-12-20 16:28 ` Alex Bennée
2023-12-20 16:52 ` Richard Purdie
2023-12-18 16:04 ` [OE-core] [PATCH 1/2] qemurunner: Impove stdout logging handling Fabio Estevam
2023-12-18 17:30 ` Richard Purdie
2023-12-18 17:45 ` Khem Raj
2023-12-18 17:58 ` Richard Purdie
2023-12-18 18:07 ` Khem Raj
2023-12-18 23:01 ` Richard Purdie
2023-12-19 15:25 ` Alex Bennée
[not found] ` <17A20F573E606BC3.25349@lists.openembedded.org>
2023-12-19 12:03 ` Richard Purdie
2023-12-19 12:12 ` Mikko Rapeli
2023-12-18 22:27 ` Alex Bennée
2023-12-18 22:54 ` Richard Purdie
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox