All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] detect and report qemu-dm failure
@ 2008-06-12 11:23 Ian Jackson
  2008-06-12 11:33 ` Daniel P. Berrange
  0 siblings, 1 reply; 7+ messages in thread
From: Ian Jackson @ 2008-06-12 11:23 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 1101 bytes --]

Currently, when qemu-dm fails, typically its exit status is lost and
xend doesn't notice.  In the patch below I use a fifo (named pipe) to
detect qemu-dm's termination and report the exit status to the
logfile, if possible.  (If xend has been restarted since the domain
was created, this isn't possible but we can still know that it failed
and report that fact.)

It would be better to have a failure of qemu crash the domain, by
calling the SCHEDOP_shutdown hypercall with SHUTDOWN_crash.  However
if you have on_crash=restart and the configuration is broken in some
way that causes qemu-dm to bomb out straight away this causes xend to
spin endlessly restarting the doomed domain.  This is a general
problem with on_*=restart and ought to be fixed separately.  When it
is fixed, we can safely arrange for domains whose dm has crashed to be
themselves forcibly crashed.

(As an aside: error reporting in xend is generally very poor and the
multithreaded programming model is IMO a recipe for pain and bugs.  We
should be using Twisted.)

Ian.

Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>


[-- Attachment #2: detect and log failures of qemu-dm --]
[-- Type: text/plain, Size: 15359 bytes --]

diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/util/oshelp.py tools/python/xen/util/oshelp.py
--- ../xen-unstable-3.hg/tools/python/xen/util/oshelp.py	1970-01-01 01:00:00.000000000 +0100
+++ tools/python/xen/util/oshelp.py	2008-06-11 16:46:07.000000000 +0100
@@ -0,0 +1,20 @@
+import fcntl
+import os
+
+def fcntl_setfd_cloexec(file, bool):
+	f = fcntl.fcntl(file, fcntl.F_GETFD)
+	if bool: f |= fcntl.FD_CLOEXEC
+	else: f &= ~fcntl.FD_CLOEXEC
+	fcntl.fcntl(file, fcntl.F_SETFD)
+
+def waitstatus_description(st):
+	if os.WIFEXITED(st):
+		es = os.WEXITSTATUS(st)
+		if es: return "exited with nonzero status %i" % es
+		else: return "exited"
+	elif os.WIFSIGNALED(st):
+		s = "died due to signal %i" % os.WTERMSIG(st)
+		if os.WCOREDUMP(st): s += " (core dumped)"
+		return s
+	else:
+		return "failed with unexpected wait status %i" % st
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/util/utils.py tools/python/xen/util/utils.py
--- ../xen-unstable-3.hg/tools/python/xen/util/utils.py	1970-01-01 01:00:00.000000000 +0100
+++ tools/python/xen/util/utils.py	2008-06-11 15:21:04.000000000 +0100
@@ -0,0 +1,6 @@
+import traceback
+import sys
+
+def exception_string(e):
+	(ty,v,tb) = sys.exc_info()
+	return traceback.format_exception_only(ty,v)
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomain.py
--- ../xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py	2008-06-12 12:13:04.000000000 +0100
+++ tools/python/xen/xend/XendDomain.py	2008-06-11 16:42:07.000000000 +0100
@@ -34,7 +34,7 @@
 
 from xen.xend import XendOptions, XendCheckpoint, XendDomainInfo
 from xen.xend.PrettyPrint import prettyprint
-from xen.xend import XendConfig
+from xen.xend import XendConfig, image
 from xen.xend.XendError import XendError, XendInvalidDomain, VmError
 from xen.xend.XendError import VMBadState
 from xen.xend.XendLogging import log
@@ -179,6 +179,8 @@
                         log.exception("Failed to create reference to running "
                                       "domain id: %d" % dom['domid'])
 
+	    image.cleanup_stale_sentinel_fifos()
+
             # add all managed domains as dormant domains.
             for dom in managed:
                 dom_uuid = dom.get('uuid')
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/XendDomainInfo.py
--- ../xen-unstable-3.hg/tools/python/xen/xend/XendDomainInfo.py	2008-06-12 12:13:04.000000000 +0100
+++ tools/python/xen/xend/XendDomainInfo.py	2008-06-11 15:02:43.000000000 +0100
@@ -30,6 +30,7 @@
 import re
 import copy
 import os
+import fcntl
 import traceback
 from types import StringTypes
 
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/xend/XendLogging.py tools/python/xen/xend/XendLogging.py
--- ../xen-unstable-3.hg/tools/python/xen/xend/XendLogging.py	2008-06-12 12:13:04.000000000 +0100
+++ tools/python/xen/xend/XendLogging.py	2008-06-12 12:14:14.000000000 +0100
@@ -25,10 +25,10 @@
 import types
 import logging
 import logging.handlers
-import fcntl
 
 from xen.util import mkdir
 from xen.xend.server import params
+from xen.util import oshelp
 
 
 __all__ = [ 'log', 'init', 'getLogFilename' ]
@@ -103,9 +103,7 @@
     # entire FileHandler, StreamHandler & RotatingFileHandler classes which
     # is even worse
     def setCloseOnExec(self):
-        flags = fcntl.fcntl(self.stream.fileno(), fcntl.F_GETFD)
-        flags |= fcntl.FD_CLOEXEC
-        fcntl.fcntl(self.stream.fileno(), fcntl.F_SETFD, flags)
+        oshelp.fcntl_setfd_cloexec(self.stream, True)
         
 
 def init(filename, level):
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/xend/image.py tools/python/xen/xend/image.py
--- ../xen-unstable-3.hg/tools/python/xen/xend/image.py	2008-06-12 12:13:04.000000000 +0100
+++ tools/python/xen/xend/image.py	2008-06-12 12:16:14.000000000 +0100
@@ -22,6 +22,12 @@
 import math
 import time
 import signal
+import thread
+import fcntl
+import sys
+import errno
+import glob
+import traceback
 
 import xen.lowlevel.xc
 from xen.xend.XendConstants import *
@@ -32,11 +38,23 @@
 from xen.xend.xenstore.xswatch import xswatch
 from xen.xend import arch
 from xen.xend import XendOptions
+from xen.util import oshelp
+from xen.util import utils
 
 xc = xen.lowlevel.xc.xc()
 
 MAX_GUEST_CMDLINE = 1024
 
+sentinel_path_prefix = '/var/run/xend/dm-'
+sentinel_fifos_inuse = { }
+
+def cleanup_stale_sentinel_fifos():
+    for path in glob.glob(sentinel_path_prefix + '*.fifo'):
+        if path in sentinel_fifos_inuse: continue
+        try: os.unlink(path)
+        except OSError, e:
+            log.warning('could not delete stale fifo %s: %s',
+                path, utils.exception_string(e))
 
 def create(vm, vmConfig):
     """Create an image handler for a vm.
@@ -318,6 +336,13 @@
         args = args + self.dmargs
         return args
 
+    def _openSentinel(self, sentinel_path_fifo):
+        self.sentinel_fifo = file(sentinel_path_fifo, 'r')
+        self.sentinel_lock = thread.allocate_lock()
+        oshelp.fcntl_setfd_cloexec(self.sentinel_fifo, True)
+        sentinel_fifos_inuse[sentinel_path_fifo] = 1
+        self.sentinel_path_fifo = sentinel_path_fifo
+
     def createDeviceModel(self, restore = False):
         if self.device_model is None:
             return
@@ -333,21 +358,29 @@
             env['XAUTHORITY'] = self.xauthority
         if self.vncconsole:
             args = args + ([ "-vncviewer" ])
+        unique_id = "%i-%i" % (self.vm.getDomid(), time.time())
+        sentinel_path = sentinel_path_prefix + unique_id
+        sentinel_path_fifo = sentinel_path + '.fifo'
+        os.mkfifo(sentinel_path_fifo, 0600)
+        sentinel_write = file(sentinel_path_fifo, 'r+')
+        self._openSentinel(sentinel_path_fifo)
+        self.vm.storeDom("image/device-model-fifo", sentinel_path_fifo)
         xstransact.Mkdir("/local/domain/0/device-model/%i" % self.vm.getDomid())
         xstransact.SetPermissions("/local/domain/0/device-model/%i" % self.vm.getDomid(),
                         { 'dom': self.vm.getDomid(), 'read': True, 'write': True })
         log.info("spawning device models: %s %s", self.device_model, args)
         # keep track of pid and spawned options to kill it later
 
-        logfile = "/var/log/xen/qemu-dm-%s.log" %  str(self.vm.info['name_label'])
-        if os.path.exists(logfile):
-            if os.path.exists(logfile + ".1"):
-                os.unlink(logfile + ".1")
-            os.rename(logfile, logfile + ".1")
+        self.logfile = "/var/log/xen/qemu-dm-%s.log" %  str(self.vm.info['name_label'])
+        if os.path.exists(self.logfile):
+            if os.path.exists(self.logfile + ".1"):
+                os.unlink(self.logfile + ".1")
+            os.rename(self.logfile, self.logfile + ".1")
 
         null = os.open("/dev/null", os.O_RDONLY)
-        logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC)
-        
+        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
+
+        sys.stderr.flush()
         pid = os.fork()
         if pid == 0: #child
             try:
@@ -356,18 +389,26 @@
                 os.dup2(logfd, 2)
                 os.close(null)
                 os.close(logfd)
+                self.sentinel_fifo.close()
                 try:
                     os.execve(self.device_model, args, env)
-                except:
-                    os._exit(127)
+                except Exception, e:
+                    print >>sys.stderr, (
+                        'failed to set up fds or execute dm %s: %s' %
+                        (self.device_model, utils.exception_string(e)))
+                    os._exit(126)
             except:
                 os._exit(127)
         else:
             self.pid = pid
             os.close(null)
             os.close(logfd)
+	sentinel_write.close()
         self.vm.storeDom("image/device-model-pid", self.pid)
         log.info("device model pid: %d", self.pid)
+        # we would very much prefer not to have a thread here and instead
+        #  have a callback but sadly we don't have Twisted in xend
+        self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,())
 
     def signalDeviceModel(self, cmd, ret, par = None):
         if self.device_model is None:
@@ -413,47 +454,119 @@
         xstransact.Store("/local/domain/0/device-model/%i"
                          % self.vm.getDomid(), ('command', 'continue'))
 
+    def _dmfailed(self, message):
+        log.warning("domain %s: %s", self.vm.getName(), message)
+        # ideally we would like to forcibly crash the domain with
+        # something like
+        #    xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH)
+        # but this can easily lead to very rapid restart loops against
+        # which we currently have no protection
+
     def recreate(self):
         if self.device_model is None:
             return
-        self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+        name = self.vm.getName()
+        sentinel_path_fifo = self.vm.readDom('image/device-model-fifo')
+        fifo_fd = -1
+        log.debug("rediscovering %s", sentinel_path_fifo)
+        if sentinel_path_fifo is None:
+            log.debug("%s device model no sentinel, cannot rediscover", name)
+        else:
+            try:
+                # We open it O_WRONLY because that fails ENXIO if no-one
+                # has it open for reading (see SuSv3).  The dm process got
+                # a read/write descriptor from our earlier invocation.
+                fifo_fd = os.open(sentinel_path_fifo, os.O_WRONLY|os.O_NONBLOCK)
+            except OSError, e:
+                if e.errno == errno.ENXIO:
+                    self._dmfailed("%s device model no longer running"%name)
+                elif e.errno == errno.ENOENT:
+                    log.debug("%s device model sentinel %s absent!",
+                            name, sentinel_path_fifo)
+                else:
+                    raise
+        if fifo_fd >= 0:
+            self._openSentinel(sentinel_path_fifo)
+            os.close(fifo_fd)
+            self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+            log.debug("%s device model rediscovered, pid %s sentinel fifo %s",
+                    name, self.pid, sentinel_path_fifo)
+            self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,())
+
+    def _sentinel_watch(self):
+        log.info("waiting for sentinel_fifo")
+        try: self.sentinel_fifo.read(1)
+        except OSError, e: pass
+        self.sentinel_lock.acquire()
+        try:
+            if self.pid:
+                (p,st) = os.waitpid(self.pid, os.WNOHANG)
+                if p == self.pid:
+                    message = oshelp.waitstatus_description(st)
+                else:
+                    # obviously it is malfunctioning, kill it now
+                    try:
+                        os.kill(self.pid, signal.SIGKILL)
+                        message = "malfunctioning (closed sentinel), killed"
+                    except:
+                        message = "malfunctioning or died ?"
+                message = "pid %d: %s" % (self.pid, message)
+            else:
+                message = "no longer running"
+        except Exception, e:
+            message = "waitpid failed: %s" % utils.exception_string(e)
+        message = "device model failure: %s" % message
+        try: message += "; see %s " % self.logfile
+        except: pass
+        self._dmfailed(message)
+        self.pid = None
+        self.sentinel_lock.release()
 
     def destroyDeviceModel(self):
         if self.device_model is None:
             return
+        self
         if self.pid:
+            self.sentinel_lock.acquire()
             try:
-                os.kill(self.pid, signal.SIGHUP)
-            except OSError, exn:
-                log.exception(exn)
-            try:
-                # Try to reap the child every 100ms for 10s. Then SIGKILL it.
-                for i in xrange(100):
-                    (p, rv) = os.waitpid(self.pid, os.WNOHANG)
-                    if p == self.pid:
-                        break
-                    time.sleep(0.1)
-                else:
-                    log.warning("DeviceModel %d took more than 10s "
-                                "to terminate: sending SIGKILL" % self.pid)
+                try:
+                    os.kill(self.pid, signal.SIGHUP)
+                except OSError, exn:
+                    log.exception(exn)
+                try:
+                    # Try to reap the child every 100ms for 10s. Then SIGKILL it.
+                    for i in xrange(100):
+                        (p, rv) = os.waitpid(self.pid, os.WNOHANG)
+                        if p == self.pid:
+                            break
+                        time.sleep(0.1)
+                    else:
+                        log.warning("DeviceModel %d took more than 10s "
+                                    "to terminate: sending SIGKILL" % self.pid)
+                        os.kill(self.pid, signal.SIGKILL)
+                        os.waitpid(self.pid, 0)
+                except OSError, exn:
+                    # This is expected if Xend has been restarted within the
+                    # life of this domain.  In this case, we can kill the process,
+                    # but we can't wait for it because it's not our child.
+                    # We just make really sure it's going away (SIGKILL) first.
                     os.kill(self.pid, signal.SIGKILL)
-                    os.waitpid(self.pid, 0)
-            except OSError, exn:
-                # This is expected if Xend has been restarted within the
-                # life of this domain.  In this case, we can kill the process,
-                # but we can't wait for it because it's not our child.
-                # We just make really sure it's going away (SIGKILL) first.
-                os.kill(self.pid, signal.SIGKILL)
-            self.pid = None
-            state = xstransact.Remove("/local/domain/0/device-model/%i"
-                                      % self.vm.getDomid())
+                state = xstransact.Remove("/local/domain/0/device-model/%i"
+                                          % self.vm.getDomid())
+            finally:
+                self.pid = None
+                self.sentinel_lock.release()
             
             try:
                 os.unlink('/var/run/tap/qemu-read-%d' % self.vm.getDomid())
                 os.unlink('/var/run/tap/qemu-write-%d' % self.vm.getDomid())
             except:
                 pass
-
+            try:
+                del sentinel_fifos_inuse[self.sentinel_path_fifo]
+                os.unlink(self.sentinel_path_fifo)
+            except:
+                pass
 
 class LinuxImageHandler(ImageHandler):
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] detect and report qemu-dm failure
  2008-06-12 11:23 [PATCH] detect and report qemu-dm failure Ian Jackson
@ 2008-06-12 11:33 ` Daniel P. Berrange
  2008-06-12 12:49   ` Ian Jackson
  0 siblings, 1 reply; 7+ messages in thread
From: Daniel P. Berrange @ 2008-06-12 11:33 UTC (permalink / raw)
  To: Ian Jackson; +Cc: xen-devel

It seems rather complicated, but I can't think of a much better way to deal
with it, so just a handful of minor nit-picks inline...

On Thu, Jun 12, 2008 at 12:23:06PM +0100, Ian Jackson wrote:
> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomain.py
> --- ../xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py	2008-06-12 12:13:04.000000000 +0100
> +++ tools/python/xen/xend/XendDomain.py	2008-06-11 16:42:07.000000000 +0100
> @@ -34,7 +34,7 @@
>  
>  from xen.xend import XendOptions, XendCheckpoint, XendDomainInfo
>  from xen.xend.PrettyPrint import prettyprint
> -from xen.xend import XendConfig
> +from xen.xend import XendConfig, image
>  from xen.xend.XendError import XendError, XendInvalidDomain, VmError
>  from xen.xend.XendError import VMBadState
>  from xen.xend.XendLogging import log
> @@ -179,6 +179,8 @@
>                          log.exception("Failed to create reference to running "
>                                        "domain id: %d" % dom['domid'])
>  
> +	    image.cleanup_stale_sentinel_fifos()
> +
>              # add all managed domains as dormant domains.
>              for dom in managed:


This indentation looks wrong to me.


> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/XendDomainInfo.py
> --- ../xen-unstable-3.hg/tools/python/xen/xend/XendDomainInfo.py	2008-06-12 12:13:04.000000000 +0100
> +++ tools/python/xen/xend/XendDomainInfo.py	2008-06-11 15:02:43.000000000 +0100
> @@ -30,6 +30,7 @@
>  import re
>  import copy
>  import os
> +import fcntl
>  import traceback
>  from types import StringTypes
>  


This this change intended ?  You're not adding any calls to fcntl in this patch.

> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN ../xen-unstable-3.hg/tools/python/xen/xend/image.py tools/python/xen/xend/image.py
> --- ../xen-unstable-3.hg/tools/python/xen/xend/image.py	2008-06-12 12:13:04.000000000 +0100
> +++ tools/python/xen/xend/image.py	2008-06-12 12:16:14.000000000 +0100
>  
>  
>          null = os.open("/dev/null", os.O_RDONLY)
> -        logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC)
> -        
> +        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
> +

Don't need to add O_APPEND, since we're truncating the file.

> @@ -356,18 +389,26 @@
>                  os.dup2(logfd, 2)
>                  os.close(null)
>                  os.close(logfd)
> +                self.sentinel_fifo.close()
>                  try:
>                      os.execve(self.device_model, args, env)
> -                except:
> -                    os._exit(127)
> +                except Exception, e:
> +                    print >>sys.stderr, (
> +                        'failed to set up fds or execute dm %s: %s' %
> +                        (self.device_model, utils.exception_string(e)))
> +                    os._exit(126)
>              except:
>                  os._exit(127)
>          else:
>              self.pid = pid
>              os.close(null)
>              os.close(logfd)
> +	sentinel_write.close()
>          self.vm.storeDom("image/device-model-pid", self.pid)
>          log.info("device model pid: %d", self.pid)

Broken indentation here. 

>      def destroyDeviceModel(self):
>          if self.device_model is None:
>              return
> +        self

Typo, I guess ?

Regards,
Daniel.
-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] detect and report qemu-dm failure
  2008-06-12 11:33 ` Daniel P. Berrange
@ 2008-06-12 12:49   ` Ian Jackson
  2008-06-12 12:51     ` Ian Jackson
  2008-06-12 13:30     ` Ian Jackson
  0 siblings, 2 replies; 7+ messages in thread
From: Ian Jackson @ 2008-06-12 12:49 UTC (permalink / raw)
  To: Daniel P. Berrange; +Cc: xen-devel

Daniel P. Berrange writes ("Re: [Xen-devel] [PATCH] detect and report qemu-dm failure"):
> It seems rather complicated, but I can't think of a much better way to deal
> with it, so just a handful of minor nit-picks inline...

Yes, it is rather complicated.  There are slightly better ways of
doing thing kind of an event-driven way but even so the fact that xend
isn't necessarily the parent causes trouble.

Thanks for the comments; I'll resubmit shortly (and give it a more
thorough reading and search for tabs first).

> On Thu, Jun 12, 2008 at 12:23:06PM +0100, Ian Jackson wrote:
> > +	    image.cleanup_stale_sentinel_fifos()
> > +
> 
> This indentation looks wrong to me.

tab/space problem.  I should untabify it.

> > +import fcntl
> 
> This this change intended ?  You're not adding any calls to fcntl in
> this patch.

Oh, that'll be from an earlier attempt.

> > +        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
> 
> Don't need to add O_APPEND, since we're truncating the file.

Not true.  We share the fd with our child, and want output to be
properly interleaved.  Without this output can sometimes be lost due
to overwriting.

> > +	sentinel_write.close()
> 
> Broken indentation here. 

tab/space damage again.

> >      def destroyDeviceModel(self):
> >          if self.device_model is None:
> >              return
> > +        self
> 
> Typo, I guess ?

That's weird.  I must have introduced that by mistake.  It has no
effect so my testing won't have spotted it.

Ian.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] detect and report qemu-dm failure
  2008-06-12 12:49   ` Ian Jackson
@ 2008-06-12 12:51     ` Ian Jackson
  2008-06-12 13:30     ` Ian Jackson
  1 sibling, 0 replies; 7+ messages in thread
From: Ian Jackson @ 2008-06-12 12:51 UTC (permalink / raw)
  To: xen-devel

iwj writes ("Re: [Xen-devel] [PATCH] detect and report qemu-dm failure"):
> tab/space problem.  I should untabify it.
> tab/space damage again.

What I mean of course is that my code had tabs in it but the
convention in xend is to use all-space indentation.  That makes the
indentation look wrong when you quote it even though it is actually
correct.  But it is right of you to complain because I should follow
the existing whitespace style.

Ian.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] detect and report qemu-dm failure
  2008-06-12 12:49   ` Ian Jackson
  2008-06-12 12:51     ` Ian Jackson
@ 2008-06-12 13:30     ` Ian Jackson
  2008-06-12 14:28       ` Keir Fraser
  1 sibling, 1 reply; 7+ messages in thread
From: Ian Jackson @ 2008-06-12 13:30 UTC (permalink / raw)
  To: Daniel P. Berrange, xen-devel

[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 1194 bytes --]

Thanks Daniel's your comments.  Here's a revised patch which takes
them into account.  I've made sure everything's detabified and removed
the orphaned word `self'.

Here's my commit log entry again for Keir's convenience:


detect and report qemu-dm failure

Currently, when qemu-dm fails, typically its exit status is lost and
xend doesn't notice.  In the patch below I use a fifo (named pipe) to
detect qemu-dm's termination and report the exit status to the
logfile, if possible.  (If xend has been restarted since the domain
was created, this isn't possible but we can still know that it failed
and report that fact.)

It would be better to have a failure of qemu crash the domain, by
calling the SCHEDOP_shutdown hypercall with SHUTDOWN_crash.  However
if you have on_crash=restart and the configuration is broken in some
way that causes qemu-dm to bomb out straight away this causes xend to
spin endlessly restarting the doomed domain.  This is a general
problem with on_*=restart and ought to be fixed separately.  When it
is fixed, we can safely arrange for domains whose dm has crashed to be
themselves forcibly crashed.

Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>


Ian.


[-- Attachment #2: detect and log failures of qemu-dm --]
[-- Type: text/plain, Size: 15134 bytes --]

diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/oshelp.py tools/python/xen/util/oshelp.py
--- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/oshelp.py	1970-01-01 01:00:00.000000000 +0100
+++ tools/python/xen/util/oshelp.py	2008-06-12 14:25:53.000000000 +0100
@@ -0,0 +1,20 @@
+import fcntl
+import os
+
+def fcntl_setfd_cloexec(file, bool):
+        f = fcntl.fcntl(file, fcntl.F_GETFD)
+        if bool: f |= fcntl.FD_CLOEXEC
+        else: f &= ~fcntl.FD_CLOEXEC
+        fcntl.fcntl(file, fcntl.F_SETFD)
+
+def waitstatus_description(st):
+        if os.WIFEXITED(st):
+                es = os.WEXITSTATUS(st)
+                if es: return "exited with nonzero status %i" % es
+                else: return "exited"
+        elif os.WIFSIGNALED(st):
+                s = "died due to signal %i" % os.WTERMSIG(st)
+                if os.WCOREDUMP(st): s += " (core dumped)"
+                return s
+        else:
+                return "failed with unexpected wait status %i" % st
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/utils.py tools/python/xen/util/utils.py
--- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/utils.py	1970-01-01 01:00:00.000000000 +0100
+++ tools/python/xen/util/utils.py	2008-06-12 14:25:53.000000000 +0100
@@ -0,0 +1,6 @@
+import traceback
+import sys
+
+def exception_string(e):
+        (ty,v,tb) = sys.exc_info()
+        return traceback.format_exception_only(ty,v)
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomain.py
--- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py	2008-06-12 14:24:24.000000000 +0100
+++ tools/python/xen/xend/XendDomain.py	2008-06-12 13:49:54.000000000 +0100
@@ -34,7 +34,7 @@
 
 from xen.xend import XendOptions, XendCheckpoint, XendDomainInfo
 from xen.xend.PrettyPrint import prettyprint
-from xen.xend import XendConfig
+from xen.xend import XendConfig, image
 from xen.xend.XendError import XendError, XendInvalidDomain, VmError
 from xen.xend.XendError import VMBadState
 from xen.xend.XendLogging import log
@@ -179,6 +179,8 @@
                         log.exception("Failed to create reference to running "
                                       "domain id: %d" % dom['domid'])
 
+            image.cleanup_stale_sentinel_fifos()
+
             # add all managed domains as dormant domains.
             for dom in managed:
                 dom_uuid = dom.get('uuid')
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendLogging.py tools/python/xen/xend/XendLogging.py
--- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendLogging.py	2008-06-12 14:24:24.000000000 +0100
+++ tools/python/xen/xend/XendLogging.py	2008-06-12 12:14:14.000000000 +0100
@@ -25,10 +25,10 @@
 import types
 import logging
 import logging.handlers
-import fcntl
 
 from xen.util import mkdir
 from xen.xend.server import params
+from xen.util import oshelp
 
 
 __all__ = [ 'log', 'init', 'getLogFilename' ]
@@ -103,9 +103,7 @@
     # entire FileHandler, StreamHandler & RotatingFileHandler classes which
     # is even worse
     def setCloseOnExec(self):
-        flags = fcntl.fcntl(self.stream.fileno(), fcntl.F_GETFD)
-        flags |= fcntl.FD_CLOEXEC
-        fcntl.fcntl(self.stream.fileno(), fcntl.F_SETFD, flags)
+        oshelp.fcntl_setfd_cloexec(self.stream, True)
         
 
 def init(filename, level):
diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig' --exclude='*~' -ruN /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/image.py tools/python/xen/xend/image.py
--- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/image.py	2008-06-12 14:24:24.000000000 +0100
+++ tools/python/xen/xend/image.py	2008-06-12 14:27:31.000000000 +0100
@@ -22,6 +22,12 @@
 import math
 import time
 import signal
+import thread
+import fcntl
+import sys
+import errno
+import glob
+import traceback
 
 import xen.lowlevel.xc
 from xen.xend.XendConstants import *
@@ -32,11 +38,23 @@
 from xen.xend.xenstore.xswatch import xswatch
 from xen.xend import arch
 from xen.xend import XendOptions
+from xen.util import oshelp
+from xen.util import utils
 
 xc = xen.lowlevel.xc.xc()
 
 MAX_GUEST_CMDLINE = 1024
 
+sentinel_path_prefix = '/var/run/xend/dm-'
+sentinel_fifos_inuse = { }
+
+def cleanup_stale_sentinel_fifos():
+    for path in glob.glob(sentinel_path_prefix + '*.fifo'):
+        if path in sentinel_fifos_inuse: continue
+        try: os.unlink(path)
+        except OSError, e:
+            log.warning('could not delete stale fifo %s: %s',
+                path, utils.exception_string(e))
 
 def create(vm, vmConfig):
     """Create an image handler for a vm.
@@ -318,6 +336,13 @@
         args = args + self.dmargs
         return args
 
+    def _openSentinel(self, sentinel_path_fifo):
+        self.sentinel_fifo = file(sentinel_path_fifo, 'r')
+        self.sentinel_lock = thread.allocate_lock()
+        oshelp.fcntl_setfd_cloexec(self.sentinel_fifo, True)
+        sentinel_fifos_inuse[sentinel_path_fifo] = 1
+        self.sentinel_path_fifo = sentinel_path_fifo
+
     def createDeviceModel(self, restore = False):
         if self.device_model is None:
             return
@@ -333,21 +358,29 @@
             env['XAUTHORITY'] = self.xauthority
         if self.vncconsole:
             args = args + ([ "-vncviewer" ])
+        unique_id = "%i-%i" % (self.vm.getDomid(), time.time())
+        sentinel_path = sentinel_path_prefix + unique_id
+        sentinel_path_fifo = sentinel_path + '.fifo'
+        os.mkfifo(sentinel_path_fifo, 0600)
+        sentinel_write = file(sentinel_path_fifo, 'r+')
+        self._openSentinel(sentinel_path_fifo)
+        self.vm.storeDom("image/device-model-fifo", sentinel_path_fifo)
         xstransact.Mkdir("/local/domain/0/device-model/%i" % self.vm.getDomid())
         xstransact.SetPermissions("/local/domain/0/device-model/%i" % self.vm.getDomid(),
                         { 'dom': self.vm.getDomid(), 'read': True, 'write': True })
         log.info("spawning device models: %s %s", self.device_model, args)
         # keep track of pid and spawned options to kill it later
 
-        logfile = "/var/log/xen/qemu-dm-%s.log" %  str(self.vm.info['name_label'])
-        if os.path.exists(logfile):
-            if os.path.exists(logfile + ".1"):
-                os.unlink(logfile + ".1")
-            os.rename(logfile, logfile + ".1")
+        self.logfile = "/var/log/xen/qemu-dm-%s.log" %  str(self.vm.info['name_label'])
+        if os.path.exists(self.logfile):
+            if os.path.exists(self.logfile + ".1"):
+                os.unlink(self.logfile + ".1")
+            os.rename(self.logfile, self.logfile + ".1")
 
         null = os.open("/dev/null", os.O_RDONLY)
-        logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC)
+        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
         
+        sys.stderr.flush()
         pid = os.fork()
         if pid == 0: #child
             try:
@@ -356,18 +389,26 @@
                 os.dup2(logfd, 2)
                 os.close(null)
                 os.close(logfd)
+                self.sentinel_fifo.close()
                 try:
                     os.execve(self.device_model, args, env)
-                except:
-                    os._exit(127)
+                except Exception, e:
+                    print >>sys.stderr, (
+                        'failed to set up fds or execute dm %s: %s' %
+                        (self.device_model, utils.exception_string(e)))
+                    os._exit(126)
             except:
                 os._exit(127)
         else:
             self.pid = pid
             os.close(null)
             os.close(logfd)
+        sentinel_write.close()
         self.vm.storeDom("image/device-model-pid", self.pid)
         log.info("device model pid: %d", self.pid)
+        # we would very much prefer not to have a thread here and instead
+        #  have a callback but sadly we don't have Twisted in xend
+        self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,())
 
     def signalDeviceModel(self, cmd, ret, par = None):
         if self.device_model is None:
@@ -413,47 +454,118 @@
         xstransact.Store("/local/domain/0/device-model/%i"
                          % self.vm.getDomid(), ('command', 'continue'))
 
+    def _dmfailed(self, message):
+        log.warning("domain %s: %s", self.vm.getName(), message)
+        # ideally we would like to forcibly crash the domain with
+        # something like
+        #    xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH)
+        # but this can easily lead to very rapid restart loops against
+        # which we currently have no protection
+
     def recreate(self):
         if self.device_model is None:
             return
-        self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+        name = self.vm.getName()
+        sentinel_path_fifo = self.vm.readDom('image/device-model-fifo')
+        fifo_fd = -1
+        log.debug("rediscovering %s", sentinel_path_fifo)
+        if sentinel_path_fifo is None:
+            log.debug("%s device model no sentinel, cannot rediscover", name)
+        else:
+            try:
+                # We open it O_WRONLY because that fails ENXIO if no-one
+                # has it open for reading (see SuSv3).  The dm process got
+                # a read/write descriptor from our earlier invocation.
+                fifo_fd = os.open(sentinel_path_fifo, os.O_WRONLY|os.O_NONBLOCK)
+            except OSError, e:
+                if e.errno == errno.ENXIO:
+                    self._dmfailed("%s device model no longer running"%name)
+                elif e.errno == errno.ENOENT:
+                    log.debug("%s device model sentinel %s absent!",
+                            name, sentinel_path_fifo)
+                else:
+                    raise
+        if fifo_fd >= 0:
+            self._openSentinel(sentinel_path_fifo)
+            os.close(fifo_fd)
+            self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+            log.debug("%s device model rediscovered, pid %s sentinel fifo %s",
+                    name, self.pid, sentinel_path_fifo)
+            self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,())
+
+    def _sentinel_watch(self):
+        log.info("waiting for sentinel_fifo")
+        try: self.sentinel_fifo.read(1)
+        except OSError, e: pass
+        self.sentinel_lock.acquire()
+        try:
+            if self.pid:
+                (p,st) = os.waitpid(self.pid, os.WNOHANG)
+                if p == self.pid:
+                    message = oshelp.waitstatus_description(st)
+                else:
+                    # obviously it is malfunctioning, kill it now
+                    try:
+                        os.kill(self.pid, signal.SIGKILL)
+                        message = "malfunctioning (closed sentinel), killed"
+                    except:
+                        message = "malfunctioning or died ?"
+                message = "pid %d: %s" % (self.pid, message)
+            else:
+                message = "no longer running"
+        except Exception, e:
+            message = "waitpid failed: %s" % utils.exception_string(e)
+        message = "device model failure: %s" % message
+        try: message += "; see %s " % self.logfile
+        except: pass
+        self._dmfailed(message)
+        self.pid = None
+        self.sentinel_lock.release()
 
     def destroyDeviceModel(self):
         if self.device_model is None:
             return
         if self.pid:
+            self.sentinel_lock.acquire()
             try:
-                os.kill(self.pid, signal.SIGHUP)
-            except OSError, exn:
-                log.exception(exn)
-            try:
-                # Try to reap the child every 100ms for 10s. Then SIGKILL it.
-                for i in xrange(100):
-                    (p, rv) = os.waitpid(self.pid, os.WNOHANG)
-                    if p == self.pid:
-                        break
-                    time.sleep(0.1)
-                else:
-                    log.warning("DeviceModel %d took more than 10s "
-                                "to terminate: sending SIGKILL" % self.pid)
+                try:
+                    os.kill(self.pid, signal.SIGHUP)
+                except OSError, exn:
+                    log.exception(exn)
+                try:
+                    # Try to reap the child every 100ms for 10s. Then SIGKILL it.
+                    for i in xrange(100):
+                        (p, rv) = os.waitpid(self.pid, os.WNOHANG)
+                        if p == self.pid:
+                            break
+                        time.sleep(0.1)
+                    else:
+                        log.warning("DeviceModel %d took more than 10s "
+                                    "to terminate: sending SIGKILL" % self.pid)
+                        os.kill(self.pid, signal.SIGKILL)
+                        os.waitpid(self.pid, 0)
+                except OSError, exn:
+                    # This is expected if Xend has been restarted within the
+                    # life of this domain.  In this case, we can kill the process,
+                    # but we can't wait for it because it's not our child.
+                    # We just make really sure it's going away (SIGKILL) first.
                     os.kill(self.pid, signal.SIGKILL)
-                    os.waitpid(self.pid, 0)
-            except OSError, exn:
-                # This is expected if Xend has been restarted within the
-                # life of this domain.  In this case, we can kill the process,
-                # but we can't wait for it because it's not our child.
-                # We just make really sure it's going away (SIGKILL) first.
-                os.kill(self.pid, signal.SIGKILL)
-            self.pid = None
-            state = xstransact.Remove("/local/domain/0/device-model/%i"
-                                      % self.vm.getDomid())
+                state = xstransact.Remove("/local/domain/0/device-model/%i"
+                                          % self.vm.getDomid())
+            finally:
+                self.pid = None
+                self.sentinel_lock.release()
             
             try:
                 os.unlink('/var/run/tap/qemu-read-%d' % self.vm.getDomid())
                 os.unlink('/var/run/tap/qemu-write-%d' % self.vm.getDomid())
             except:
                 pass
-
+            try:
+                del sentinel_fifos_inuse[self.sentinel_path_fifo]
+                os.unlink(self.sentinel_path_fifo)
+            except:
+                pass
 
 class LinuxImageHandler(ImageHandler):
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] detect and report qemu-dm failure
  2008-06-12 13:30     ` Ian Jackson
@ 2008-06-12 14:28       ` Keir Fraser
  2008-06-12 16:00         ` Ian Jackson
  0 siblings, 1 reply; 7+ messages in thread
From: Keir Fraser @ 2008-06-12 14:28 UTC (permalink / raw)
  To: Ian Jackson, Daniel P. Berrange, xen-devel

This patch doesn't apply to tip.

 -- Keir

On 12/6/08 14:30, "Ian Jackson" <Ian.Jackson@eu.citrix.com> wrote:

> Thanks Daniel's your comments.  Here's a revised patch which takes
> them into account.  I've made sure everything's detabified and removed
> the orphaned word `self'.
> 
> Here's my commit log entry again for Keir's convenience:
> 
> 
> detect and report qemu-dm failure
> 
> Currently, when qemu-dm fails, typically its exit status is lost and
> xend doesn't notice.  In the patch below I use a fifo (named pipe) to
> detect qemu-dm's termination and report the exit status to the
> logfile, if possible.  (If xend has been restarted since the domain
> was created, this isn't possible but we can still know that it failed
> and report that fact.)
> 
> It would be better to have a failure of qemu crash the domain, by
> calling the SCHEDOP_shutdown hypercall with SHUTDOWN_crash.  However
> if you have on_crash=restart and the configuration is broken in some
> way that causes qemu-dm to bomb out straight away this causes xend to
> spin endlessly restarting the doomed domain.  This is a general
> problem with on_*=restart and ought to be fixed separately.  When it
> is fixed, we can safely arrange for domains whose dm has crashed to be
> themselves forcibly crashed.
> 
> Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
> 
> 
> Ian.
> 
> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig'
> --exclude='*~' -ruN
> /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/oshelp.py
> tools/python/xen/util/oshelp.py
> --- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/oshelp.py 1970-01-01
> 01:00:00.000000000 +0100
> +++ tools/python/xen/util/oshelp.py 2008-06-12 14:25:53.000000000 +0100
> @@ -0,0 +1,20 @@
> +import fcntl
> +import os
> +
> +def fcntl_setfd_cloexec(file, bool):
> +        f = fcntl.fcntl(file, fcntl.F_GETFD)
> +        if bool: f |= fcntl.FD_CLOEXEC
> +        else: f &= ~fcntl.FD_CLOEXEC
> +        fcntl.fcntl(file, fcntl.F_SETFD)
> +
> +def waitstatus_description(st):
> +        if os.WIFEXITED(st):
> +                es = os.WEXITSTATUS(st)
> +                if es: return "exited with nonzero status %i" % es
> +                else: return "exited"
> +        elif os.WIFSIGNALED(st):
> +                s = "died due to signal %i" % os.WTERMSIG(st)
> +                if os.WCOREDUMP(st): s += " (core dumped)"
> +                return s
> +        else:
> +                return "failed with unexpected wait status %i" % st
> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig'
> --exclude='*~' -ruN
> /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/utils.py
> tools/python/xen/util/utils.py
> --- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/util/utils.py 1970-01-01
> 01:00:00.000000000 +0100
> +++ tools/python/xen/util/utils.py 2008-06-12 14:25:53.000000000 +0100
> @@ -0,0 +1,6 @@
> +import traceback
> +import sys
> +
> +def exception_string(e):
> +        (ty,v,tb) = sys.exc_info()
> +        return traceback.format_exception_only(ty,v)
> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig'
> --exclude='*~' -ruN
> /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py
> tools/python/xen/xend/XendDomain.py
> --- 
> /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendDomain.py 2008-06-12
> 14:24:24.000000000 +0100
> +++ tools/python/xen/xend/XendDomain.py 2008-06-12 13:49:54.000000000 +0100
> @@ -34,7 +34,7 @@
>  
>  from xen.xend import XendOptions, XendCheckpoint, XendDomainInfo
>  from xen.xend.PrettyPrint import prettyprint
> -from xen.xend import XendConfig
> +from xen.xend import XendConfig, image
>  from xen.xend.XendError import XendError, XendInvalidDomain, VmError
>  from xen.xend.XendError import VMBadState
>  from xen.xend.XendLogging import log
> @@ -179,6 +179,8 @@
>                          log.exception("Failed to create reference to running
> "
>                                        "domain id: %d" % dom['domid'])
>  
> +            image.cleanup_stale_sentinel_fifos()
> +
>              # add all managed domains as dormant domains.
>              for dom in managed:
>                  dom_uuid = dom.get('uuid')
> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig'
> --exclude='*~' -ruN
> /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendLogging.py
> tools/python/xen/xend/XendLogging.py
> --- 
> /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/XendLogging.py 2008-06-1
> 2 14:24:24.000000000 +0100
> +++ tools/python/xen/xend/XendLogging.py 2008-06-12 12:14:14.000000000 +0100
> @@ -25,10 +25,10 @@
>  import types
>  import logging
>  import logging.handlers
> -import fcntl
>  
>  from xen.util import mkdir
>  from xen.xend.server import params
> +from xen.util import oshelp
>  
>  
>  __all__ = [ 'log', 'init', 'getLogFilename' ]
> @@ -103,9 +103,7 @@
>      # entire FileHandler, StreamHandler & RotatingFileHandler classes which
>      # is even worse
>      def setCloseOnExec(self):
> -        flags = fcntl.fcntl(self.stream.fileno(), fcntl.F_GETFD)
> -        flags |= fcntl.FD_CLOEXEC
> -        fcntl.fcntl(self.stream.fileno(), fcntl.F_SETFD, flags)
> +        oshelp.fcntl_setfd_cloexec(self.stream, True)
>          
>  
>  def init(filename, level):
> diff --exclude=.hg --exclude=.config --exclude='*.old' --exclude='*.orig'
> --exclude='*~' -ruN
> /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/image.py
> tools/python/xen/xend/image.py
> --- /volatile/iwj/xen-unstable-3.hg/tools/python/xen/xend/image.py 2008-06-12
> 14:24:24.000000000 +0100
> +++ tools/python/xen/xend/image.py 2008-06-12 14:27:31.000000000 +0100
> @@ -22,6 +22,12 @@
>  import math
>  import time
>  import signal
> +import thread
> +import fcntl
> +import sys
> +import errno
> +import glob
> +import traceback
>  
>  import xen.lowlevel.xc
>  from xen.xend.XendConstants import *
> @@ -32,11 +38,23 @@
>  from xen.xend.xenstore.xswatch import xswatch
>  from xen.xend import arch
>  from xen.xend import XendOptions
> +from xen.util import oshelp
> +from xen.util import utils
>  
>  xc = xen.lowlevel.xc.xc()
>  
>  MAX_GUEST_CMDLINE = 1024
>  
> +sentinel_path_prefix = '/var/run/xend/dm-'
> +sentinel_fifos_inuse = { }
> +
> +def cleanup_stale_sentinel_fifos():
> +    for path in glob.glob(sentinel_path_prefix + '*.fifo'):
> +        if path in sentinel_fifos_inuse: continue
> +        try: os.unlink(path)
> +        except OSError, e:
> +            log.warning('could not delete stale fifo %s: %s',
> +                path, utils.exception_string(e))
>  
>  def create(vm, vmConfig):
>      """Create an image handler for a vm.
> @@ -318,6 +336,13 @@
>          args = args + self.dmargs
>          return args
>  
> +    def _openSentinel(self, sentinel_path_fifo):
> +        self.sentinel_fifo = file(sentinel_path_fifo, 'r')
> +        self.sentinel_lock = thread.allocate_lock()
> +        oshelp.fcntl_setfd_cloexec(self.sentinel_fifo, True)
> +        sentinel_fifos_inuse[sentinel_path_fifo] = 1
> +        self.sentinel_path_fifo = sentinel_path_fifo
> +
>      def createDeviceModel(self, restore = False):
>          if self.device_model is None:
>              return
> @@ -333,21 +358,29 @@
>              env['XAUTHORITY'] = self.xauthority
>          if self.vncconsole:
>              args = args + ([ "-vncviewer" ])
> +        unique_id = "%i-%i" % (self.vm.getDomid(), time.time())
> +        sentinel_path = sentinel_path_prefix + unique_id
> +        sentinel_path_fifo = sentinel_path + '.fifo'
> +        os.mkfifo(sentinel_path_fifo, 0600)
> +        sentinel_write = file(sentinel_path_fifo, 'r+')
> +        self._openSentinel(sentinel_path_fifo)
> +        self.vm.storeDom("image/device-model-fifo", sentinel_path_fifo)
>          xstransact.Mkdir("/local/domain/0/device-model/%i" %
> self.vm.getDomid())
>          xstransact.SetPermissions("/local/domain/0/device-model/%i" %
> self.vm.getDomid(),
>                          { 'dom': self.vm.getDomid(), 'read': True, 'write':
> True })
>          log.info("spawning device models: %s %s", self.device_model, args)
>          # keep track of pid and spawned options to kill it later
>  
> -        logfile = "/var/log/xen/qemu-dm-%s.log" %
> str(self.vm.info['name_label'])
> -        if os.path.exists(logfile):
> -            if os.path.exists(logfile + ".1"):
> -                os.unlink(logfile + ".1")
> -            os.rename(logfile, logfile + ".1")
> +        self.logfile = "/var/log/xen/qemu-dm-%s.log" %
> str(self.vm.info['name_label'])
> +        if os.path.exists(self.logfile):
> +            if os.path.exists(self.logfile + ".1"):
> +                os.unlink(self.logfile + ".1")
> +            os.rename(self.logfile, self.logfile + ".1")
>  
>          null = os.open("/dev/null", os.O_RDONLY)
> -        logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC)
> +        logfd = os.open(self.logfile,
> os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
>          
> +        sys.stderr.flush()
>          pid = os.fork()
>          if pid == 0: #child
>              try:
> @@ -356,18 +389,26 @@
>                  os.dup2(logfd, 2)
>                  os.close(null)
>                  os.close(logfd)
> +                self.sentinel_fifo.close()
>                  try:
>                      os.execve(self.device_model, args, env)
> -                except:
> -                    os._exit(127)
> +                except Exception, e:
> +                    print >>sys.stderr, (
> +                        'failed to set up fds or execute dm %s: %s' %
> +                        (self.device_model, utils.exception_string(e)))
> +                    os._exit(126)
>              except:
>                  os._exit(127)
>          else:
>              self.pid = pid
>              os.close(null)
>              os.close(logfd)
> +        sentinel_write.close()
>          self.vm.storeDom("image/device-model-pid", self.pid)
>          log.info("device model pid: %d", self.pid)
> +        # we would very much prefer not to have a thread here and instead
> +        #  have a callback but sadly we don't have Twisted in xend
> +        self.sentinel_thread =
> thread.start_new_thread(self._sentinel_watch,())
>  
>      def signalDeviceModel(self, cmd, ret, par = None):
>          if self.device_model is None:
> @@ -413,47 +454,118 @@
>          xstransact.Store("/local/domain/0/device-model/%i"
>                           % self.vm.getDomid(), ('command', 'continue'))
>  
> +    def _dmfailed(self, message):
> +        log.warning("domain %s: %s", self.vm.getName(), message)
> +        # ideally we would like to forcibly crash the domain with
> +        # something like
> +        #    xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH)
> +        # but this can easily lead to very rapid restart loops against
> +        # which we currently have no protection
> +
>      def recreate(self):
>          if self.device_model is None:
>              return
> -        self.pid = self.vm.gatherDom(('image/device-model-pid', int))
> +        name = self.vm.getName()
> +        sentinel_path_fifo = self.vm.readDom('image/device-model-fifo')
> +        fifo_fd = -1
> +        log.debug("rediscovering %s", sentinel_path_fifo)
> +        if sentinel_path_fifo is None:
> +            log.debug("%s device model no sentinel, cannot rediscover", name)
> +        else:
> +            try:
> +                # We open it O_WRONLY because that fails ENXIO if no-one
> +                # has it open for reading (see SuSv3).  The dm process got
> +                # a read/write descriptor from our earlier invocation.
> +                fifo_fd = os.open(sentinel_path_fifo,
> os.O_WRONLY|os.O_NONBLOCK)
> +            except OSError, e:
> +                if e.errno == errno.ENXIO:
> +                    self._dmfailed("%s device model no longer running"%name)
> +                elif e.errno == errno.ENOENT:
> +                    log.debug("%s device model sentinel %s absent!",
> +                            name, sentinel_path_fifo)
> +                else:
> +                    raise
> +        if fifo_fd >= 0:
> +            self._openSentinel(sentinel_path_fifo)
> +            os.close(fifo_fd)
> +            self.pid = self.vm.gatherDom(('image/device-model-pid', int))
> +            log.debug("%s device model rediscovered, pid %s sentinel fifo
> %s",
> +                    name, self.pid, sentinel_path_fifo)
> +            self.sentinel_thread =
> thread.start_new_thread(self._sentinel_watch,())
> +
> +    def _sentinel_watch(self):
> +        log.info("waiting for sentinel_fifo")
> +        try: self.sentinel_fifo.read(1)
> +        except OSError, e: pass
> +        self.sentinel_lock.acquire()
> +        try:
> +            if self.pid:
> +                (p,st) = os.waitpid(self.pid, os.WNOHANG)
> +                if p == self.pid:
> +                    message = oshelp.waitstatus_description(st)
> +                else:
> +                    # obviously it is malfunctioning, kill it now
> +                    try:
> +                        os.kill(self.pid, signal.SIGKILL)
> +                        message = "malfunctioning (closed sentinel), killed"
> +                    except:
> +                        message = "malfunctioning or died ?"
> +                message = "pid %d: %s" % (self.pid, message)
> +            else:
> +                message = "no longer running"
> +        except Exception, e:
> +            message = "waitpid failed: %s" % utils.exception_string(e)
> +        message = "device model failure: %s" % message
> +        try: message += "; see %s " % self.logfile
> +        except: pass
> +        self._dmfailed(message)
> +        self.pid = None
> +        self.sentinel_lock.release()
>  
>      def destroyDeviceModel(self):
>          if self.device_model is None:
>              return
>          if self.pid:
> +            self.sentinel_lock.acquire()
>              try:
> -                os.kill(self.pid, signal.SIGHUP)
> -            except OSError, exn:
> -                log.exception(exn)
> -            try:
> -                # Try to reap the child every 100ms for 10s. Then SIGKILL it.
> -                for i in xrange(100):
> -                    (p, rv) = os.waitpid(self.pid, os.WNOHANG)
> -                    if p == self.pid:
> -                        break
> -                    time.sleep(0.1)
> -                else:
> -                    log.warning("DeviceModel %d took more than 10s "
> -                                "to terminate: sending SIGKILL" % self.pid)
> +                try:
> +                    os.kill(self.pid, signal.SIGHUP)
> +                except OSError, exn:
> +                    log.exception(exn)
> +                try:
> +                    # Try to reap the child every 100ms for 10s. Then SIGKILL
> it.
> +                    for i in xrange(100):
> +                        (p, rv) = os.waitpid(self.pid, os.WNOHANG)
> +                        if p == self.pid:
> +                            break
> +                        time.sleep(0.1)
> +                    else:
> +                        log.warning("DeviceModel %d took more than 10s "
> +                                    "to terminate: sending SIGKILL" %
> self.pid)
> +                        os.kill(self.pid, signal.SIGKILL)
> +                        os.waitpid(self.pid, 0)
> +                except OSError, exn:
> +                    # This is expected if Xend has been restarted within the
> +                    # life of this domain.  In this case, we can kill the
> process,
> +                    # but we can't wait for it because it's not our child.
> +                    # We just make really sure it's going away (SIGKILL)
> first.
>                      os.kill(self.pid, signal.SIGKILL)
> -                    os.waitpid(self.pid, 0)
> -            except OSError, exn:
> -                # This is expected if Xend has been restarted within the
> -                # life of this domain.  In this case, we can kill the
> process,
> -                # but we can't wait for it because it's not our child.
> -                # We just make really sure it's going away (SIGKILL) first.
> -                os.kill(self.pid, signal.SIGKILL)
> -            self.pid = None
> -            state = xstransact.Remove("/local/domain/0/device-model/%i"
> -                                      % self.vm.getDomid())
> +                state = xstransact.Remove("/local/domain/0/device-model/%i"
> +                                          % self.vm.getDomid())
> +            finally:
> +                self.pid = None
> +                self.sentinel_lock.release()
>              
>              try:
>                  os.unlink('/var/run/tap/qemu-read-%d' % self.vm.getDomid())
>                  os.unlink('/var/run/tap/qemu-write-%d' % self.vm.getDomid())
>              except:
>                  pass
> -
> +            try:
> +                del sentinel_fifos_inuse[self.sentinel_path_fifo]
> +                os.unlink(self.sentinel_path_fifo)
> +            except:
> +                pass
>  
>  class LinuxImageHandler(ImageHandler):
>  
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] detect and report qemu-dm failure
  2008-06-12 14:28       ` Keir Fraser
@ 2008-06-12 16:00         ` Ian Jackson
  0 siblings, 0 replies; 7+ messages in thread
From: Ian Jackson @ 2008-06-12 16:00 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Daniel P. Berrange

[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 217 bytes --]

Keir Fraser writes ("Re: [Xen-devel] [PATCH] detect and report qemu-dm failure"):
> This patch doesn't apply to tip.

Updated (and given a quick retest).

Ian.

Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>


[-- Attachment #2: detect and log failures of qemu-dm --]
[-- Type: text/plain, Size: 14384 bytes --]

diff -r 4ac0898df538 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py	Thu Jun 12 16:18:31 2008 +0100
+++ b/tools/python/xen/xend/XendDomain.py	Thu Jun 12 16:46:56 2008 +0100
@@ -34,7 +34,7 @@ import xen.lowlevel.xc
 
 from xen.xend import XendOptions, XendCheckpoint, XendDomainInfo
 from xen.xend.PrettyPrint import prettyprint
-from xen.xend import XendConfig
+from xen.xend import XendConfig, image
 from xen.xend.XendError import XendError, XendInvalidDomain, VmError
 from xen.xend.XendError import VMBadState
 from xen.xend.XendLogging import log
@@ -178,6 +178,8 @@ class XendDomain:
                     except Exception:
                         log.exception("Failed to create reference to running "
                                       "domain id: %d" % dom['domid'])
+
+            image.cleanup_stale_sentinel_fifos()
 
             # add all managed domains as dormant domains.
             for dom in managed:
diff -r 4ac0898df538 tools/python/xen/xend/XendLogging.py
--- a/tools/python/xen/xend/XendLogging.py	Thu Jun 12 16:18:31 2008 +0100
+++ b/tools/python/xen/xend/XendLogging.py	Thu Jun 12 16:46:56 2008 +0100
@@ -25,10 +25,10 @@ import types
 import types
 import logging
 import logging.handlers
-import fcntl
 
 from xen.util import mkdir
 from xen.xend.server import params
+from xen.util import oshelp
 
 
 __all__ = [ 'log', 'init', 'getLogFilename' ]
@@ -103,9 +103,7 @@ class XendRotatingFileHandler(logging.ha
     # entire FileHandler, StreamHandler & RotatingFileHandler classes which
     # is even worse
     def setCloseOnExec(self):
-        flags = fcntl.fcntl(self.stream.fileno(), fcntl.F_GETFD)
-        flags |= fcntl.FD_CLOEXEC
-        fcntl.fcntl(self.stream.fileno(), fcntl.F_SETFD, flags)
+        oshelp.fcntl_setfd_cloexec(self.stream, True)
         
 
 def init(filename, level):
diff -r 4ac0898df538 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py	Thu Jun 12 16:18:31 2008 +0100
+++ b/tools/python/xen/xend/image.py	Thu Jun 12 16:47:49 2008 +0100
@@ -22,6 +22,12 @@ import math
 import math
 import time
 import signal
+import thread
+import fcntl
+import sys
+import errno
+import glob
+import traceback
 
 import xen.lowlevel.xc
 from xen.xend.XendConstants import *
@@ -32,11 +38,23 @@ from xen.xend.xenstore.xswatch import xs
 from xen.xend.xenstore.xswatch import xswatch
 from xen.xend import arch
 from xen.xend import XendOptions
+from xen.util import oshelp
+from xen.util import utils
 
 xc = xen.lowlevel.xc.xc()
 
 MAX_GUEST_CMDLINE = 1024
 
+sentinel_path_prefix = '/var/run/xend/dm-'
+sentinel_fifos_inuse = { }
+
+def cleanup_stale_sentinel_fifos():
+    for path in glob.glob(sentinel_path_prefix + '*.fifo'):
+        if path in sentinel_fifos_inuse: continue
+        try: os.unlink(path)
+        except OSError, e:
+            log.warning('could not delete stale fifo %s: %s',
+                path, utils.exception_string(e))
 
 def create(vm, vmConfig):
     """Create an image handler for a vm.
@@ -324,6 +342,13 @@ class ImageHandler:
         args = args + self.dmargs
         return args
 
+    def _openSentinel(self, sentinel_path_fifo):
+        self.sentinel_fifo = file(sentinel_path_fifo, 'r')
+        self.sentinel_lock = thread.allocate_lock()
+        oshelp.fcntl_setfd_cloexec(self.sentinel_fifo, True)
+        sentinel_fifos_inuse[sentinel_path_fifo] = 1
+        self.sentinel_path_fifo = sentinel_path_fifo
+
     def createDeviceModel(self, restore = False):
         if self.device_model is None:
             return
@@ -339,21 +364,29 @@ class ImageHandler:
             env['XAUTHORITY'] = self.xauthority
         if self.vncconsole:
             args = args + ([ "-vncviewer" ])
+        unique_id = "%i-%i" % (self.vm.getDomid(), time.time())
+        sentinel_path = sentinel_path_prefix + unique_id
+        sentinel_path_fifo = sentinel_path + '.fifo'
+        os.mkfifo(sentinel_path_fifo, 0600)
+        sentinel_write = file(sentinel_path_fifo, 'r+')
+        self._openSentinel(sentinel_path_fifo)
+        self.vm.storeDom("image/device-model-fifo", sentinel_path_fifo)
         xstransact.Mkdir("/local/domain/0/device-model/%i" % self.vm.getDomid())
         xstransact.SetPermissions("/local/domain/0/device-model/%i" % self.vm.getDomid(),
                         { 'dom': self.vm.getDomid(), 'read': True, 'write': True })
         log.info("spawning device models: %s %s", self.device_model, args)
         # keep track of pid and spawned options to kill it later
 
-        logfile = "/var/log/xen/qemu-dm-%s.log" %  str(self.vm.info['name_label'])
-        if os.path.exists(logfile):
-            if os.path.exists(logfile + ".1"):
-                os.unlink(logfile + ".1")
-            os.rename(logfile, logfile + ".1")
+        self.logfile = "/var/log/xen/qemu-dm-%s.log" %  str(self.vm.info['name_label'])
+        if os.path.exists(self.logfile):
+            if os.path.exists(self.logfile + ".1"):
+                os.unlink(self.logfile + ".1")
+            os.rename(self.logfile, self.logfile + ".1")
 
         null = os.open("/dev/null", os.O_RDONLY)
-        logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC)
+        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
         
+        sys.stderr.flush()
         pid = os.fork()
         if pid == 0: #child
             try:
@@ -362,18 +395,26 @@ class ImageHandler:
                 os.dup2(logfd, 2)
                 os.close(null)
                 os.close(logfd)
+                self.sentinel_fifo.close()
                 try:
                     os.execve(self.device_model, args, env)
-                except:
-                    os._exit(127)
+                except Exception, e:
+                    print >>sys.stderr, (
+                        'failed to set up fds or execute dm %s: %s' %
+                        (self.device_model, utils.exception_string(e)))
+                    os._exit(126)
             except:
                 os._exit(127)
         else:
             self.pid = pid
             os.close(null)
             os.close(logfd)
+        sentinel_write.close()
         self.vm.storeDom("image/device-model-pid", self.pid)
         log.info("device model pid: %d", self.pid)
+        # we would very much prefer not to have a thread here and instead
+        #  have a callback but sadly we don't have Twisted in xend
+        self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,())
 
     def signalDeviceModel(self, cmd, ret, par = None):
         if self.device_model is None:
@@ -419,44 +460,116 @@ class ImageHandler:
         xstransact.Store("/local/domain/0/device-model/%i"
                          % self.vm.getDomid(), ('command', 'continue'))
 
+    def _dmfailed(self, message):
+        log.warning("domain %s: %s", self.vm.getName(), message)
+        # ideally we would like to forcibly crash the domain with
+        # something like
+        #    xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH)
+        # but this can easily lead to very rapid restart loops against
+        # which we currently have no protection
+
     def recreate(self):
         if self.device_model is None:
             return
-        self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+        name = self.vm.getName()
+        sentinel_path_fifo = self.vm.readDom('image/device-model-fifo')
+        fifo_fd = -1
+        log.debug("rediscovering %s", sentinel_path_fifo)
+        if sentinel_path_fifo is None:
+            log.debug("%s device model no sentinel, cannot rediscover", name)
+        else:
+            try:
+                # We open it O_WRONLY because that fails ENXIO if no-one
+                # has it open for reading (see SuSv3).  The dm process got
+                # a read/write descriptor from our earlier invocation.
+                fifo_fd = os.open(sentinel_path_fifo, os.O_WRONLY|os.O_NONBLOCK)
+            except OSError, e:
+                if e.errno == errno.ENXIO:
+                    self._dmfailed("%s device model no longer running"%name)
+                elif e.errno == errno.ENOENT:
+                    log.debug("%s device model sentinel %s absent!",
+                            name, sentinel_path_fifo)
+                else:
+                    raise
+        if fifo_fd >= 0:
+            self._openSentinel(sentinel_path_fifo)
+            os.close(fifo_fd)
+            self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+            log.debug("%s device model rediscovered, pid %s sentinel fifo %s",
+                    name, self.pid, sentinel_path_fifo)
+            self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,())
+
+    def _sentinel_watch(self):
+        log.info("waiting for sentinel_fifo")
+        try: self.sentinel_fifo.read(1)
+        except OSError, e: pass
+        self.sentinel_lock.acquire()
+        try:
+            if self.pid:
+                (p,st) = os.waitpid(self.pid, os.WNOHANG)
+                if p == self.pid:
+                    message = oshelp.waitstatus_description(st)
+                else:
+                    # obviously it is malfunctioning, kill it now
+                    try:
+                        os.kill(self.pid, signal.SIGKILL)
+                        message = "malfunctioning (closed sentinel), killed"
+                    except:
+                        message = "malfunctioning or died ?"
+                message = "pid %d: %s" % (self.pid, message)
+            else:
+                message = "no longer running"
+        except Exception, e:
+            message = "waitpid failed: %s" % utils.exception_string(e)
+        message = "device model failure: %s" % message
+        try: message += "; see %s " % self.logfile
+        except: pass
+        self._dmfailed(message)
+        self.pid = None
+        self.sentinel_lock.release()
 
     def destroyDeviceModel(self):
         if self.device_model is None:
             return
         if self.pid:
+            self.sentinel_lock.acquire()
             try:
-                os.kill(self.pid, signal.SIGHUP)
-            except OSError, exn:
-                log.exception(exn)
-            try:
-                # Try to reap the child every 100ms for 10s. Then SIGKILL it.
-                for i in xrange(100):
-                    (p, rv) = os.waitpid(self.pid, os.WNOHANG)
-                    if p == self.pid:
-                        break
-                    time.sleep(0.1)
-                else:
-                    log.warning("DeviceModel %d took more than 10s "
-                                "to terminate: sending SIGKILL" % self.pid)
+                try:
+                    os.kill(self.pid, signal.SIGHUP)
+                except OSError, exn:
+                    log.exception(exn)
+                try:
+                    # Try to reap the child every 100ms for 10s. Then SIGKILL it.
+                    for i in xrange(100):
+                        (p, rv) = os.waitpid(self.pid, os.WNOHANG)
+                        if p == self.pid:
+                            break
+                        time.sleep(0.1)
+                    else:
+                        log.warning("DeviceModel %d took more than 10s "
+                                    "to terminate: sending SIGKILL" % self.pid)
+                        os.kill(self.pid, signal.SIGKILL)
+                        os.waitpid(self.pid, 0)
+                except OSError, exn:
+                    # This is expected if Xend has been restarted within the
+                    # life of this domain.  In this case, we can kill the process,
+                    # but we can't wait for it because it's not our child.
+                    # We just make really sure it's going away (SIGKILL) first.
                     os.kill(self.pid, signal.SIGKILL)
-                    os.waitpid(self.pid, 0)
-            except OSError, exn:
-                # This is expected if Xend has been restarted within the
-                # life of this domain.  In this case, we can kill the process,
-                # but we can't wait for it because it's not our child.
-                # We just make really sure it's going away (SIGKILL) first.
-                os.kill(self.pid, signal.SIGKILL)
-            self.pid = None
-            state = xstransact.Remove("/local/domain/0/device-model/%i"
-                                      % self.vm.getDomid())
+                state = xstransact.Remove("/local/domain/0/device-model/%i"
+                                          % self.vm.getDomid())
+            finally:
+                self.pid = None
+                self.sentinel_lock.release()
             
             try:
                 os.unlink('/var/run/tap/qemu-read-%d' % self.vm.getDomid())
                 os.unlink('/var/run/tap/qemu-write-%d' % self.vm.getDomid())
+            except:
+                pass
+            try:
+                del sentinel_fifos_inuse[self.sentinel_path_fifo]
+                os.unlink(self.sentinel_path_fifo)
             except:
                 pass
 
diff -r 4ac0898df538 tools/python/xen/util/oshelp.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/util/oshelp.py	Thu Jun 12 16:46:56 2008 +0100
@@ -0,0 +1,20 @@
+import fcntl
+import os
+
+def fcntl_setfd_cloexec(file, bool):
+        f = fcntl.fcntl(file, fcntl.F_GETFD)
+        if bool: f |= fcntl.FD_CLOEXEC
+        else: f &= ~fcntl.FD_CLOEXEC
+        fcntl.fcntl(file, fcntl.F_SETFD)
+
+def waitstatus_description(st):
+        if os.WIFEXITED(st):
+                es = os.WEXITSTATUS(st)
+                if es: return "exited with nonzero status %i" % es
+                else: return "exited"
+        elif os.WIFSIGNALED(st):
+                s = "died due to signal %i" % os.WTERMSIG(st)
+                if os.WCOREDUMP(st): s += " (core dumped)"
+                return s
+        else:
+                return "failed with unexpected wait status %i" % st
diff -r 4ac0898df538 tools/python/xen/util/utils.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/util/utils.py	Thu Jun 12 16:46:56 2008 +0100
@@ -0,0 +1,6 @@
+import traceback
+import sys
+
+def exception_string(e):
+        (ty,v,tb) = sys.exc_info()
+        return traceback.format_exception_only(ty,v)

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2008-06-12 16:00 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-06-12 11:23 [PATCH] detect and report qemu-dm failure Ian Jackson
2008-06-12 11:33 ` Daniel P. Berrange
2008-06-12 12:49   ` Ian Jackson
2008-06-12 12:51     ` Ian Jackson
2008-06-12 13:30     ` Ian Jackson
2008-06-12 14:28       ` Keir Fraser
2008-06-12 16:00         ` Ian Jackson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.