All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] [RFC] Add lock on domain start
@ 2008-08-06  6:06 Jim Fehlig
  2008-08-07  2:23 ` Zhigang Wang
  2008-08-11 11:14 ` Ian Jackson
  0 siblings, 2 replies; 11+ messages in thread
From: Jim Fehlig @ 2008-08-06  6:06 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 1160 bytes --]

This patch adds a simple lock mechanism when starting domains by placing 
a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
when domain is stopped.  The motivation for such a mechanism is to 
prevent starting the same domain from multiple hosts.

If xend-domains-path is set to shared mount point, a domain will fail to 
start on host B if it is already running on host A.  I've added an 
option to XendOptions to control the behavior with default of no lock.

The patch certainly needs some testing (and probably adjustment) to 
ensure the lock is handled properly on save, restore, migrate, domain 
crash, etc. but wanted to get folks' thought on this approach before 
continuing this endeavor.  Some simple improvements could include adding 
info (domain name/id, start time, vmm hostname) to the lock file, 
allowing such messages as "domain foo seems to be already running on 
host bar" and  a --force option to create/start to override the lock.  A 
per-domain config option could also be added to allow more fine-grained 
control.

Comments, suggestions, alternative approaches, ... are welcome and 
appreciated :-).

Regards,
Jim

[-- Attachment #2: xend-domain-lock.patch --]
[-- Type: text/x-patch, Size: 5055 bytes --]

diff -r f20fb83dac2c tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp	Tue Aug 05 13:55:14 2008 +0100
+++ b/tools/examples/xend-config.sxp	Tue Aug 05 23:28:55 2008 -0600
@@ -245,3 +245,9 @@
 
 # Rotation count of qemu-dm log file.
 #(qemu-dm-logrotate-count 10)
+
+# Create a lock file when domains are started.  Lock file is
+# placed in xend-domains-path on domain startup and removed
+# when domain is stopped.  By default, a lock file is not
+# created.  Set to yes to enable lock file creation.
+#(xend-domain-lock no)
diff -r f20fb83dac2c tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py	Tue Aug 05 13:55:14 2008 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py	Tue Aug 05 23:28:55 2008 -0600
@@ -34,7 +34,7 @@ from types import StringTypes
 from types import StringTypes
 
 import xen.lowlevel.xc
-from xen.util import asserts
+from xen.util import asserts, mkdir
 from xen.util.blkif import blkdev_uname_to_file, blkdev_uname_to_taptype
 import xen.util.xsm.xsm as security
 from xen.util import xsconstants
@@ -420,7 +420,10 @@ class XendDomainInfo:
         from xen.xend import XendDomain
 
         if self._stateGet() in (XEN_API_VM_POWER_STATE_HALTED, XEN_API_VM_POWER_STATE_SUSPENDED, XEN_API_VM_POWER_STATE_CRASHED):
+            if self._is_locked():
+                raise XendError('VM is locked.  Is it running on another node?')
+
             try:
                 XendTask.log_progress(0, 30, self._constructDomain)
                 XendTask.log_progress(31, 60, self._initDomain)
                 
@@ -441,6 +444,9 @@ class XendDomainInfo:
                     xendomains.domain_sched_credit_set(self.getDomid(),
                                                        self.getWeight(),
                                                        self.getCap())
+
+                self._create_lock()
+
             except:
                 log.exception('VM start failed')
                 self.destroy()
@@ -1104,6 +1110,53 @@ class XendDomainInfo:
     # internal functions ... TODO: re-categorised
     # 
 
+    def _is_locked(self):  # True when a "lock" file exists in this domain's managed path
+        if not xoptions.get_xend_domain_lock():
+            return False  # locking is disabled in the xend configuration
+
+        from xen.xend import XendDomain
+        path = XendDomain.instance()._managed_path(self.get_uuid())
+        try:
+            if not os.path.exists(path):
+                return False
+            path += "/lock"
+            return os.access(path, os.F_OK)
+
+        except:
+            log.exception("%s could not be accessed" % path)
+            return False
+
+    def _create_lock(self):
+        if not xoptions.get_xend_domain_lock():
+            return
+
+        from xen.xend import XendDomain
+        path = XendDomain.instance()._managed_path(self.get_uuid())
+        try:
+            if not os.path.exists(path):
+                mkdir.parents(path, stat.S_IRWXU)
+            path += "/lock"
+            oflags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
+            fd = os.open(path, oflags)
+            os.close(fd)
+
+        except:
+            log.exception("%s could not be created." % path)
+
+    def _remove_lock(self):  # delete this domain's lock file; failures are logged, not raised
+        if not xoptions.get_xend_domain_lock():
+            return  # locking is disabled in the xend configuration
+
+        from xen.xend import XendDomain
+        path = XendDomain.instance()._managed_path(self.get_uuid())
+        lock = path + "/lock"
+        try:
+            os.unlink(lock)
+            if not XendDomain.instance().is_domain_managed(self):
+                shutil.rmtree(path)
+        except:
+            log.exception("%s could not be removed." % lock)
+
     def _augmentInfo(self, priv):
         """Augment self.info, as given to us through L{recreate}, with
         values taken from the store.  This recovers those values known
@@ -2388,6 +2441,8 @@ class XendDomainInfo:
 
         log.debug("XendDomainInfo.destroy: domid=%s", str(self.domid))
 
+        self._remove_lock()
+
         paths = self._prepare_phantom_paths()
 
         self._cleanupVm()
diff -r f20fb83dac2c tools/python/xen/xend/XendOptions.py
--- a/tools/python/xen/xend/XendOptions.py	Tue Aug 05 13:55:14 2008 +0100
+++ b/tools/python/xen/xend/XendOptions.py	Tue Aug 05 23:28:55 2008 -0600
@@ -135,6 +135,11 @@ class XendOptions:
     """Default rotation count of qemu-dm log file."""
     qemu_dm_logrotate_count = 10
 
+    """Default for the flag indicating whether xend should create
+    a lock file for domains when they are started."""
+    xend_domain_lock = 'no'
+
+
     def __init__(self):
         self.configure()
 
@@ -357,6 +362,11 @@ class XendOptions:
     def get_qemu_dm_logrotate_count(self):
         return self.get_config_int("qemu-dm-logrotate-count",
                                    self.qemu_dm_logrotate_count)
+
+    def get_xend_domain_lock(self):
+        """Get the flag indicating whether xend should create a lock file
+        for domains when they are started."""
+        return self.get_config_bool("xend-domain-lock", self.xend_domain_lock)
 
 
 class XendOptionsFile(XendOptions):

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2008-08-06  6:06 [PATCH] [RFC] Add lock on domain start Jim Fehlig
@ 2008-08-07  2:23 ` Zhigang Wang
  2008-08-07  4:26   ` Zhigang Wang
  2008-08-11 11:14 ` Ian Jackson
  1 sibling, 1 reply; 11+ messages in thread
From: Zhigang Wang @ 2008-08-07  2:23 UTC (permalink / raw)
  To: Jim Fehlig; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 1717 bytes --]

Jim Fehlig wrote:
> This patch adds a simple lock mechanism when starting domains by placing 
> a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
> when domain is stopped.  The motivation for such a mechanism is to 
> prevent starting the same domain from multiple hosts.
> 
> If xend-domains-path is set to shared mount point, a domain will fail to 
> start on host B if it is already running on host A.  I've added an 
> option to XendOptions to control the behavior with default of no lock.
> 
> The patch certainly needs some testing (and probably adjustment) to 
> ensure the lock is handled properly on save, restore, migrate, domain 
> crash, etc. but wanted to get folks' thought on this approach before 
> continuing this endeavor.  Some simple improvements could include adding 
> info (domain name/id, start time, vmm hostname) to the lock file, 
> allowing such messages as "domain foo seems to be already running on 
> host bar" and  a --force option to create/start to override the lock.  A 
> per-domain config option could also be added to allow more fine-grained 
> control.
> 
> Comments, suggestions, alternative approaches, ... are welcome and 
> appreciated :-).
> 

The attached patch, xen-running-lock.patch, adds an external lock facility that
achieves the same result. file-lock.c is a simple implementation of such an
external lock utility.

the external locking facility can leverage the dlm if you are in a cluster
environment.

cheers,

zhigang

> Regards,
> Jim
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel


[-- Attachment #2: xen-running-lock.patch --]
[-- Type: text/x-patch, Size: 5418 bytes --]

diff -Nura xen-unstable.orig/tools/examples/xend-config.sxp xen-unstable/tools/examples/xend-config.sxp
--- xen-unstable.orig/tools/examples/xend-config.sxp	2008-08-06 17:26:37.000000000 +0800
+++ xen-unstable/tools/examples/xend-config.sxp	2008-08-06 17:28:45.000000000 +0800
@@ -63,6 +63,12 @@
 
 #(xend-unix-path /var/lib/xend/xend-socket)
 
+# External locking utility used to acquire/release the domain running lock.
+# By default no utility is specified, so no lock is taken while a VM is running.
+# The locking utility should accept:
+# <--lock | --unlock> --name <name> --uuid <uuid>
+# command line options, and returns zero on success, others on error.
+#(xend-domains-lock-path '')
 
 # Address and port xend should use for the legacy TCP XMLRPC interface, 
 # if xend-tcp-xmlrpc-server is set.
diff -Nura xen-unstable.orig/tools/python/xen/xend/XendDomainInfo.py xen-unstable/tools/python/xen/xend/XendDomainInfo.py
--- xen-unstable.orig/tools/python/xen/xend/XendDomainInfo.py	2008-08-06 17:26:39.000000000 +0800
+++ xen-unstable/tools/python/xen/xend/XendDomainInfo.py	2008-08-06 17:31:27.000000000 +0800
@@ -328,6 +328,8 @@
     @type state_updated: threading.Condition
     @ivar refresh_shutdown_lock: lock for polling shutdown state
     @type refresh_shutdown_lock: threading.Condition
+    @ivar running_lock: lock for running VM
+    @type running_lock: bool or None
     @ivar _deviceControllers: device controller cache for this domain
     @type _deviceControllers: dict 'string' to DevControllers
     """
@@ -395,6 +397,8 @@
         self.refresh_shutdown_lock = threading.Condition()
         self._stateSet(DOM_STATE_HALTED)
 
+        self.running_lock = None
+
         self._deviceControllers = {}
 
         for state in DOM_STATES_OLD:
@@ -421,6 +425,7 @@
 
         if self._stateGet() in (XEN_API_VM_POWER_STATE_HALTED, XEN_API_VM_POWER_STATE_SUSPENDED, XEN_API_VM_POWER_STATE_CRASHED):
             try:
+                self.acquire_running_lock();
                 XendTask.log_progress(0, 30, self._constructDomain)
                 XendTask.log_progress(31, 60, self._initDomain)
                 
@@ -453,6 +458,7 @@
         state = self._stateGet()
         if state in (DOM_STATE_SUSPENDED, DOM_STATE_HALTED):
             try:
+                self.acquire_running_lock();
                 self._constructDomain()
                 self._storeVmDetails()
                 self._createDevices()
@@ -2292,6 +2298,11 @@
 
             self._stateSet(DOM_STATE_HALTED)
             self.domid = None  # Do not push into _stateSet()!
+      
+            try:
+                self.release_running_lock()
+            except:
+                log.exception("Release running lock failed: %s" % status)
         finally:
             self.refresh_shutdown_lock.release()
 
@@ -3520,6 +3531,28 @@
     def has_device(self, dev_class, dev_uuid):
         return (dev_uuid in self.info['%s_refs' % dev_class.lower()])
 
+    def acquire_running_lock(self):
+        if not self.running_lock:
+            lock_path = xoptions.get_xend_domains_lock_path()
+            if lock_path:
+                status = os.system('%s --lock --name %s --uuid %s' % \
+                                   (lock_path, self.info['name_label'], self.info['uuid']))
+                if status == 0:
+                    self.running_lock = True
+                else:
+                    raise XendError('Acquire running lock failed: %s' % status)
+
+    def release_running_lock(self):
+        if self.running_lock:
+            lock_path = xoptions.get_xend_domains_lock_path()
+            if lock_path:
+                status = os.system('%s --unlock --name %s --uuid %s' % \
+                                   (lock_path, self.info['name_label'], self.info['uuid']))
+                if status == 0:
+                    self.running_lock = False
+                else:
+                    raise XendError('Release running lock failed: %s' % status)
+
     def __str__(self):
         return '<domain id=%s name=%s memory=%s state=%s>' % \
                (str(self.domid), self.info['name_label'],
diff -Nura xen-unstable.orig/tools/python/xen/xend/XendDomain.py xen-unstable/tools/python/xen/xend/XendDomain.py
--- xen-unstable.orig/tools/python/xen/xend/XendDomain.py	2008-08-06 17:26:39.000000000 +0800
+++ xen-unstable/tools/python/xen/xend/XendDomain.py	2008-08-06 17:30:23.000000000 +0800
@@ -1295,6 +1295,7 @@
                              POWER_STATE_NAMES[dominfo._stateGet()])
 
         """ The following call may raise a XendError exception """
+        dominfo.release_running_lock();
         dominfo.testMigrateDevices(True, dst)
 
         if live:
diff -Nura xen-unstable.orig/tools/python/xen/xend/XendOptions.py xen-unstable/tools/python/xen/xend/XendOptions.py
--- xen-unstable.orig/tools/python/xen/xend/XendOptions.py	2008-08-06 17:26:39.000000000 +0800
+++ xen-unstable/tools/python/xen/xend/XendOptions.py	2008-08-06 17:28:45.000000000 +0800
@@ -271,6 +271,11 @@
         """
         return self.get_config_string("xend-domains-path", self.xend_domains_path_default)
 
+    def get_xend_domains_lock_path(self):
+        """ Get the path of the lock utility for running domains.
+        """
+        return self.get_config_string("xend-domains-lock-path")
+
     def get_xend_state_path(self):
         """ Get the path for persistent domain configuration storage
         """

[-- Attachment #3: file-lock.c --]
[-- Type: text/x-csrc, Size: 4649 bytes --]

/*
 * file-lock.c
 *
 * Copyright (C) 2008 Oracle Inc.
 * Copyright (C) 2008 Zhigang Wang <zhigang.x.wang@oracle.com>
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <getopt.h>

/* Version string printed by -V/--version. */
const char version[] = "0.0.1";
/* Short option string for getopt_long(); a ':' after a letter means that
 * option takes an argument. */
static char short_opts[] = "lup:d:n:hvV";
/* Long options, each mapped to the short option character that main()
 * switches on (note that --uuid maps to 'd'). The all-zero entry terminates
 * the table. */
static struct option long_opts[] = {
	{ "lock",	no_argument,		NULL,	'l' },
	{ "unlock",	no_argument,		NULL,	'u' },
	{ "path",	required_argument,	NULL,	'p' },
	{ "name",	required_argument,	NULL,	'n' },
	{ "uuid",	required_argument,	NULL,	'd' },
	{ "help",	no_argument,		NULL,	'h' },
	{ "verbose",	no_argument,		NULL,	'v' },
	{ "version",	no_argument,		NULL,	'V' },
	{  NULL,	0,			NULL,	 0  }
};

/*
 * Print the command-line help text to fp and terminate the process.
 *
 * prog: program name shown in the usage line.
 * fp:   stream that receives the help text.
 * n:    exit status handed to exit(); this function never returns.
 */
static void usage(char *prog, FILE *fp, int n) {
	fprintf(fp, "usage: %s [options]\n", prog);
	fprintf(fp, "\n");
	fprintf(fp, "options:\n");
	fprintf(fp, " -l, --lock       Acquire the lock.\n");
	fprintf(fp, " -u, --unlock     Release the lock.\n");
	fprintf(fp, " -p, --path       Set the path for the locks.\n");
	fprintf(fp, " -n, --name       Set the name of the VM.\n");
	fprintf(fp, " -d, --uuid       Set the uuid of the VM.\n");
	fprintf(fp, " -v, --verbose    Show more information.\n");
	fprintf(fp, " -V, --version    Show version number and exit.\n");
	fprintf(fp, " -h, --help       Show this help information.\n");
	fprintf(fp, "\n");
	exit(n);
}

/*
 * Acquire the lock for a VM by exclusively creating the file
 * "<path>/<name>-<uuid>.lock".
 *
 * Returns 0 on success, -1 if the file name could not be built, or the errno
 * value set by open(2) on failure (EEXIST when the lock file already exists).
 */
static int do_lock(char *path, char *name, char *uuid)
{
	char *fn;
	int fd;
	int len;
	int err;

	/* Size the buffer with snprintf() instead of asprintf(): asprintf() is a
	 * GNU extension and is undeclared unless _GNU_SOURCE is defined. */
	len = snprintf(NULL, 0, "%s/%s-%s.lock", path, name, uuid);
	if (len < 0)
		return -1;

	fn = malloc((size_t)len + 1);
	if (fn == NULL)
		return -1;
	snprintf(fn, (size_t)len + 1, "%s/%s-%s.lock", path, name, uuid);

	/* O_EXCL makes creation fail if the file exists, which is what makes
	 * this a lock rather than a plain touch. */
	fd = open(fn, O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);

	/* Capture errno before free(), which is allowed to overwrite it. */
	err = (fd == -1) ? errno : 0;
	free(fn);

	if (fd == -1)
		return err;

	close(fd);
	return 0;
}

/*
 * Release the lock for a VM by removing the file
 * "<path>/<name>-<uuid>.lock".
 *
 * Returns 0 on success, -1 if the file name could not be built, or the errno
 * value set by unlink(2) on failure (ENOENT when no lock file exists).
 */
static int do_unlock(char *path, char *name, char *uuid)
{
	char *fn;
	int len;
	int err;

	/* Size the buffer with snprintf() instead of asprintf(): asprintf() is a
	 * GNU extension and is undeclared unless _GNU_SOURCE is defined. */
	len = snprintf(NULL, 0, "%s/%s-%s.lock", path, name, uuid);
	if (len < 0)
		return -1;

	fn = malloc((size_t)len + 1);
	if (fn == NULL)
		return -1;
	snprintf(fn, (size_t)len + 1, "%s/%s-%s.lock", path, name, uuid);

	/* Capture errno before free(), which is allowed to overwrite it. */
	err = (unlink(fn) == -1) ? errno : 0;
	free(fn);

	return err;
}

/*
 * Entry point: parse the command line, then acquire (--lock) or release
 * (--unlock) the lock file for the VM identified by --name and --uuid.
 *
 * Exit status is 0 on success. On failure it is the non-zero value returned
 * by do_lock()/do_unlock(), or 1 (via usage()) for command-line errors.
 */
int main(int argc, char *argv[])
{
	char *prog, *p;
	char *name = NULL;
	char *uuid = NULL;
	char *path = ".";	/* create lock file on current working directory by default*/
	int verbose = 0;	/* turn off verbose output by default */
	int status = 0;		/* returned value */
	int lock = 0, unlock = 0;
	int c;

	/* Use only the basename of argv[0] in messages. */
	prog = argv[0];
	p = strrchr(prog, '/');
	if (p)
		prog = p+1;

	while ((c = getopt_long(argc, argv, short_opts,
				 long_opts, NULL)) != -1) {
		switch (c) {
		case 'l':		/* acquire the lock */
			lock = 1;
			break;
		case 'u':		/* release the lock */
			unlock = 1;
			break;
		case 'p':		/* path for lock file */
			path = optarg;
			break;
		case 'n':		/* name of vm  */
			name = optarg;
			break;
		case 'd':		/* uuid of vm  */
			uuid = optarg;
			break;
		case 'h':		/* help */
			usage(prog, stdout, 0);
			break;
		case 'v':		/* be chatty */
			++verbose;
			break;
		case 'V':		/* version */
			fprintf(stdout, "%s: %s\n", prog, version);
			exit(0);
		case 0:
			break;
		case '?':
		default:
			usage(prog, stderr, 1);
		}
	}

	/* Stray non-option arguments are an error. */
	if (optind < argc)
		usage(prog, stderr, 1);

	if (name==NULL || uuid==NULL) {
		fprintf(stderr, "you should specify the name and uuid of vm.\n\n");
		usage(prog, stderr, 1);
	}

	if (lock && unlock) {
		fprintf(stderr, "cannot execute lock and unlock at the same time.\n\n");
		usage(prog, stderr, 1);
	}

	if (lock) {
		if (verbose)
			fprintf(stdout, "creating lock file %s/%s-%s.lock\n", path, name, uuid);

		status = do_lock(path, name, uuid);

		/* Braces make the if/else pairing explicit (no dangling else). */
		if (verbose) {
			if (status == 0)
				fprintf(stdout, "lock success.\n");
			else
				fprintf(stdout, "lock failed.\n");
		}
	} else if (unlock) {
		if (verbose)
			fprintf(stdout, "removing lock file %s/%s-%s.lock\n", path, name, uuid);

		status = do_unlock(path, name, uuid);

		if (verbose) {
			if (status == 0)
				fprintf(stdout, "unlock success.\n");
			else
				fprintf(stdout, "unlock failed.\n");
		}
	} else {
		fprintf(stderr, "you should specify lock or unlock.\n\n");
		usage(prog, stderr, 1);
	}

	return status;
}


[-- Attachment #4: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2008-08-07  2:23 ` Zhigang Wang
@ 2008-08-07  4:26   ` Zhigang Wang
  2008-08-08 17:07     ` Jim Fehlig
  0 siblings, 1 reply; 11+ messages in thread
From: Zhigang Wang @ 2008-08-07  4:26 UTC (permalink / raw)
  To: Jim Fehlig; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 2812 bytes --]

When we implement such a lock, we should pay more attention to the live
migration scenario: it should allow two vms started at the same time.

My patch still doesn't resolve it: it has a small time gap that allows other VMs
to start. (When migrating VM1 on Server1 to Server2 as VM2: VM1 releases the lock
--> some preparation work for migration + network delay --> VM2 acquires the lock.)

maybe your suggestion of add a --force option can be used to solve this issue.

My approach is more flexible (and may be especially useful in certain circumstances),
whereas your approach can be easily integrated into the current xend implementation.

let's first decide which approach is best in the long run together.

I once considered your approach; you can refer to the attached LockFile.py when
you polish your code.

thanks,

zhigang

Zhigang Wang wrote:
> Jim Fehlig wrote:
>> This patch adds a simple lock mechanism when starting domains by placing 
>> a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
>> when domain is stopped.  The motivation for such a mechanism is to 
>> prevent starting the same domain from multiple hosts.
>>
>> If xend-domains-path is set to shared mount point, a domain will fail to 
>> start on host B if it is already running on host A.  I've added an 
>> option to XendOptions to control the behavior with default of no lock.
>>
>> The patch certainly needs some testing (and probably adjustment) to 
>> ensure the lock is handled properly on save, restore, migrate, domain 
>> crash, etc. but wanted to get folks' thought on this approach before 
>> continuing this endeavor.  Some simple improvements could include adding 
>> info (domain name/id, start time, vmm hostname) to the lock file, 
>> allowing such messages as "domain foo seems to be already running on 
>> host bar" and  a --force option to create/start to override the lock.  A 
>> per-domain config option could also be added to allow more fine-grained 
>> control.
>>
>> Comments, suggestions, alternative approaches, ... are welcome and 
>> appreciated :-).
>>
> 
> this patch xen-running-lock.patch add a external lock facility to get the same
> result. file-lock.c is a simple implement of the external lock utility.
> 
> the external locking facility can leverage the dlm if you are in a cluster
> environment.
> 
> cheers,
> 
> zhigang
> 
>> Regards,
>> Jim
>>
>>
>> ------------------------------------------------------------------------
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

[-- Attachment #2: LockFile.py --]
[-- Type: text/x-python, Size: 2182 bytes --]

#==============================================================================
# This library is free software; you can redistribute it and/or modify it under
# the terms of version 2.1 of the GNU Lesser General Public License as
# published by the Free Software Foundation.
#
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this library; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#==============================================================================
# Copyright (C) 2008 Oracle, Inc.
# Copyright (C) 2008 Zhigang Wang <zhigang.x.wang@oracle.com>
#==============================================================================

import os
import sys

# Exceptions that can be raised by this module
class LockError(Exception):
    '''Raised when a lock file cannot be created or removed.'''

class LockFile(object):
    '''Represent a lock that is made on the file system, to prevent concurrent
    execution of this code.

    The lock is held while the file named by ``filename`` exists; it is
    created with O_EXCL so that only one creator can succeed.

    Linux's open(2) manual page says O_EXCL does not work with NFS, but it
    actually does work if both the NFS client (Linux v2.6.6+) and the server
    support it. Apparently it is commonly implemented nowadays, so it should
    be quite safe to use in new systems. Unfortunately there is no easy way to
    check if it is safe or not.
    '''
    def __init__(self, filename):
        '''Remember the lock file path; the lock is not acquired yet.'''
        self.filename = filename
        self.locked = False

    def acquire(self):
        '''Create the lock file exclusively.

        Raises LockError if the file already exists or cannot be created.
        '''
        try:
            fd = os.open(self.filename, os.O_CREAT | os.O_RDWR | os.O_EXCL)
        except OSError:
            # sys.exc_info() rather than "except OSError, e" / "as e" so that
            # the module parses on both Python 2 and Python 3.
            raise LockError('Could not create lock file: %s' % sys.exc_info()[1])
        # Only the existence of the file matters; close the descriptor so it
        # is not leaked for the lifetime of the process.
        os.close(fd)
        self.locked = True

    def release(self):
        '''Remove the lock file if this object holds the lock.

        Does nothing when the lock is not held. Raises LockError if the file
        cannot be removed.
        '''
        if not self.locked:
            return
        try:
            os.unlink(self.filename)
        except OSError:
            raise LockError('Could not remove lock file: %s' % sys.exc_info()[1])
        self.locked = False

    def __del__(self):
        # Best-effort cleanup: an exception cannot usefully propagate out of a
        # destructor, and attributes or modules may already be gone if
        # __init__ failed or the interpreter is shutting down.
        try:
            self.release()
        except Exception:
            pass


[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2008-08-07  4:26   ` Zhigang Wang
@ 2008-08-08 17:07     ` Jim Fehlig
  2008-08-11  2:22       ` Zhigang Wang
  0 siblings, 1 reply; 11+ messages in thread
From: Jim Fehlig @ 2008-08-08 17:07 UTC (permalink / raw)
  To: Zhigang Wang; +Cc: xen-devel

Hi Zhigang,

Sorry for the delay.

Zhigang Wang wrote:
> When we implement such a lock, we should pay more attention to the live
> migration scenario: it should allow two vms started at the same time.
>   

Yes, certainly.  Any such mechanism must accommodate live migration.  To 
be precise, it must not break any existing functionality - save, 
restore, live/non-live migration, reboot, domain crash, ...

> my patch still doesn't resolve it: it has a small time gap that allow other vms
> to start. (when migrating VM1 on Sever1 to Server2 as VM2, VM1 release the lock
> --> some prepare work for migration + network delay --> VM2 acquire the lock.)
>
> maybe your suggestion of add a --force option can be used to solve this issue.
>   

Hmm, perhaps but I was reserving that for cases where lock was not 
released due to 'unusual' circumstances.  E.g. HVM domain crashed and 
tools were not aware of it and thus domain not cleaned up.  But doing a 
destroy on the domain would invoke code path to release lock.  Anyhow, 
--force would give a knowledgeable admin a way to override the lock.

> My approach is most flexible (maybe specially useful in certain circumstance)
> as your approach can be easily integrated to the current xend implement.
>
> let's first decide which approach is best in the long run together.
>   

AFAICT, we're essentially doing the same thing - writing out a lock (in 
the form of a file) on domain start and removing the lock on domain 
shutdown.  Difference is in implementation.  Perhaps first we should 
agree on some requirements.

* Location of lock must be accessible to multiple hosts
   - Provide xend config option to specify the location.  xend-domains-path
     already exists and could be used for this purpose
* Lock feature should be globally optional, disabled by default
   - Provide xend config option to globally turn on/off domain lock
* Lock feature should be per-domain optional, disabled by default
   - Provide domU config option to turn on/off lock
* Lock should contain some useful info
   - Put domain name/id/uuid, host, start time in lock file
* Lock mechanism should be configurable
   - Provide xend config option to specify lock facility
* Lock must accommodate domain lifecycle operations (save, restore,
   migrate, reboot, etc.)
* Lock should be 'overridable', e.g. with a --lock-override option to 
start/create
* Lock mechanism must be acceptable to community :-)

How does this sound?  Am I missing anything?  If you agree I can spin a 
patch that satisfies these requirements.  With any luck it will satisfy 
the last one :-).

Regards,
Jim

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2008-08-08 17:07     ` Jim Fehlig
@ 2008-08-11  2:22       ` Zhigang Wang
  0 siblings, 0 replies; 11+ messages in thread
From: Zhigang Wang @ 2008-08-11  2:22 UTC (permalink / raw)
  To: Jim Fehlig; +Cc: xen-devel

Hi Jim,

some comments inline.

Jim Fehlig wrote:
> Hi Zhigang,
> 
> Sorry for the delay.
> 
> Zhigang Wang wrote:
>> When we implement such a lock, we should pay more attention to the live
>> migration scenario: it should allow two vms started at the same time.
>>   
> 
> Yes, certainly.  Any such mechanism must accommodate live migration.  To 
> be precise, it must not break any existing functionality - save, 
> restore, live/non-live migration, reboot, domain crash, ...
> 
>> my patch still doesn't resolve it: it has a small time gap that allow other vms
>> to start. (when migrating VM1 on Sever1 to Server2 as VM2, VM1 release the lock
>> --> some prepare work for migration + network delay --> VM2 acquire the lock.)
>>
>> maybe your suggestion of add a --force option can be used to solve this issue.
>>   
> 
> Hmm, perhaps but I was reserving that for cases where lock was not 
> released do to 'unusual' circumstances.  E.g. HVM domain crashed and 
> tools were not aware of it and thus domain not cleaned up.  But doing a 
> destroy on the domain would invoke code path to release lock.  Anyhow, 
> --force would give a knowledgeable admin a way to override the lock.
> 
>> My approach is most flexible (maybe specially useful in certain circumstance)
>> as your approach can be easily integrated to the current xend implement.
>>
>> let's first decide which approach is best in the long run together.
>>   
> 
> AFAICT, we're essentially doing the same thing - writing out a lock (in 
> the form of a file) on domain start and removing the lock on domain 
> shutdown.  Difference is in implementation.  Perhaps first we should 
> agree on some requirements.
> 
> * Location of lock must be accessible to multiple hosts
>    - Provide xend config option to specify the location.  xend-domains-path
>      already exists and could be used for this purpose
yes we can leverage it. it is easy for xend managed domains. but shall we
consider none-xend-managed domains?
> * Lock feature should be globally optional, disabled by default
>    - Provide xend config option to globally turn on/off domain lock
> * Lock feature should be per-domain optional, disabled by default
>    - Provide domU config option to turn on/off lock
when we implement --lock-override, this is easy.
> * Lock should contain some useful info
>    - Put domain name/id/uuid, host, start time in lock file
this is useful.
> * Lock mechanism should be configurable
>    - Provide xend config option to specify lock facility
do you want to implement a hook for external locking facilities to use?
> * Lock must accommodate domain lifecycle operations (save, restore,
>    migrate, reboot, etc.)
we should find the right place to acquire/release the lock, see my patch for
reference.
> * Lock should be 'overridable', e.g. with a --lock-override option to 
> start/create
> * Lock mechanism must be acceptable to community :-)
> 
> How does this sound?  Am I missing anything?  If you agree I can spin a 
> patch that satisfies these requirements.  With any luck it will satisfy 
> the last one :-).
> 
we should consider XenAPI support too. I think we can first implement in the
API level, then other features (--lock-override, etc) can be implemented later.
maybe there are better solutions when we considering XenAPI.

the patch I gave before is in use in our Oracle VM 2.1.2; the external lock
uses dlm, which is part of ocfs2 (we use ocfs2 as the cluster fs in Oracle VM).

If a patch can fulfill all the requirements you mentioned, it will definitely
satisfy our use.

please go ahead and wrap up a new patch, then we can try to make it accepted by
the community.

thanks,

zhigang

> Regards,
> Jim
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2008-08-06  6:06 [PATCH] [RFC] Add lock on domain start Jim Fehlig
  2008-08-07  2:23 ` Zhigang Wang
@ 2008-08-11 11:14 ` Ian Jackson
  2008-08-11 16:45   ` Jim Fehlig
  1 sibling, 1 reply; 11+ messages in thread
From: Ian Jackson @ 2008-08-11 11:14 UTC (permalink / raw)
  To: Jim Fehlig; +Cc: xen-devel

Jim Fehlig writes ("[Xen-devel] [PATCH] [RFC] Add lock on domain start"):
> This patch adds a simple lock mechanism when starting domains by placing 
> a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
> when domain is stopped.  The motivation for such a mechanism is to 
> prevent starting the same domain from multiple hosts.

I think this should be dealt with in your next-layer-up management
tools.

Lockfiles are bad because they can become stale.

Ian.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2008-08-11 11:14 ` Ian Jackson
@ 2008-08-11 16:45   ` Jim Fehlig
  2009-08-05  7:41     ` Pasi Kärkkäinen
  0 siblings, 1 reply; 11+ messages in thread
From: Jim Fehlig @ 2008-08-11 16:45 UTC (permalink / raw)
  To: Ian Jackson; +Cc: xen-devel, Zhigang Wang

Ian Jackson wrote:
> Jim Fehlig writes ("[Xen-devel] [PATCH] [RFC] Add lock on domain start"):
>   
>> This patch adds a simple lock mechanism when starting domains by placing 
>> a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
>> when domain is stopped.  The motivation for such a mechanism is to 
>> prevent starting the same domain from multiple hosts.
>>     
>
> I think this should be dealt with in your next-layer-up management
> tools.
>   

Perhaps.  I wanted to see if there was any interest in having such a
feature at the xend layer.  If not, I will no longer pursue this option.

> Lockfiles are bad because they can become stale.
>   

Yep.  Originally I considered a 'lockless-lock' approach where a bit is
set and counter is spun on a 'reserved' sector of vbd, e.g. first
sector.  Attempting to attach the vbd to another domain would fail if
lock bit is set and counter is incrementing.  If counter is not
incrementing assume lock is stale and proceed.  This approach is
certainly more complex.  We support various image formats (raw, qcow,
vmdk, ...) and such an approach may mean changing the format (e.g.
qcow3).  Wouldn't work for existing images.  Who is responsible for
spinning the counter?  Anyhow seemed like a lot of complexity as
compared to the suggested simple approach with override for stale lock.

Thanks,
Jim

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2008-08-11 16:45   ` Jim Fehlig
@ 2009-08-05  7:41     ` Pasi Kärkkäinen
  2009-08-05  8:39       ` Zhigang Wang
  0 siblings, 1 reply; 11+ messages in thread
From: Pasi Kärkkäinen @ 2009-08-05  7:41 UTC (permalink / raw)
  To: Jim Fehlig; +Cc: Zhigang Wang, xen-devel, Ian Jackson

On Mon, Aug 11, 2008 at 10:45:23AM -0600, Jim Fehlig wrote:
> Ian Jackson wrote:
> > Jim Fehlig writes ("[Xen-devel] [PATCH] [RFC] Add lock on domain start"):
> >   
> >> This patch adds a simple lock mechanism when starting domains by placing 
> >> a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
> >> when domain is stopped.  The motivation for such a mechanism is to 
> >> prevent starting the same domain from multiple hosts.
> >>     
> >
> > I think this should be dealt with in your next-layer-up management
> > tools.
> >   
> 
> Perhaps.  I wanted to see if there was any interest in having such a
> feature at the xend layer.  If not, I will no longer pursue this option.
> 

Replying a bit late to this.. I think there is demand for this feature! 

Many people (mostly in smaller environments) don't want to use
'next-layer-up' management tools..

> > Lockfiles are bad because they can become stale.
> >   
> 
> Yep.  Originally I considered a 'lockless-lock' approach where a bit it
> set and counter is spun on a 'reserved' sector of vbd, e.g. first
> sector.  Attempting to attach the vbd to another domain would fail if
> lock bit is set and counter is incrementing.  If counter is not
> incrementing assume lock is stale and proceed.  This approach is
> certainly more complex.  We support various image formats (raw, qcow,
> vmdk, ...) and such an approach may mean changing the format (e.g.
> qcow3).  Wouldn't work for existing images.  Who is responsible for
> spinning the counter?  Anyhow seemed like a lot of complexity as
> compared to the suggested simple approach with override for stale lock.
> 

I assume you guys have this patch included in OpenSuse/SLES Xen rpms.

Is the latest version available from somewhere? 

-- Pasi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2009-08-05  7:41     ` Pasi Kärkkäinen
@ 2009-08-05  8:39       ` Zhigang Wang
  2009-08-05  9:33         ` Pasi Kärkkäinen
  0 siblings, 1 reply; 11+ messages in thread
From: Zhigang Wang @ 2009-08-05  8:39 UTC (permalink / raw)
  To: Pasi Kärkkäinen; +Cc: Jim Fehlig, xen-devel, Ian Jackson

[-- Attachment #1: Type: text/plain, Size: 2619 bytes --]

Pasi Kärkkäinen wrote:
> On Mon, Aug 11, 2008 at 10:45:23AM -0600, Jim Fehlig wrote:
>> Ian Jackson wrote:
>>> Jim Fehlig writes ("[Xen-devel] [PATCH] [RFC] Add lock on domain start"):
>>>   
>>>> This patch adds a simple lock mechanism when starting domains by placing 
>>>> a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
>>>> when domain is stopped.  The motivation for such a mechanism is to 
>>>> prevent starting the same domain from multiple hosts.
>>>>     
>>> I think this should be dealt with in your next-layer-up management
>>> tools.
>>>   
>> Perhaps.  I wanted to see if there was any interest in having such a
>> feature at the xend layer.  If not, I will no longer pursue this option.
>>
> 
> Replying a bit late to this.. I think there is demand for this feature! 
> 
> Many people (mostly in a smaller environments) don't want to use
> 'next-layer-up' management tools..
> 
>>> Lockfiles are bad because they can become stale.
>>>   
>> Yep.  Originally I considered a 'lockless-lock' approach where a bit it
>> set and counter is spun on a 'reserved' sector of vbd, e.g. first
>> sector.  Attempting to attach the vbd to another domain would fail if
>> lock bit is set and counter is incrementing.  If counter is not
>> incrementing assume lock is stale and proceed.  This approach is
>> certainly more complex.  We support various image formats (raw, qcow,
>> vmdk, ...) and such an approach may mean changing the format (e.g.
>> qcow3).  Wouldn't work for existing images.  Who is responsible for
>> spinning the counter?  Anyhow seemed like a lot of complexity as
>> compared to the suggested simple approach with override for stale lock.
>>
> 
> I assume you guys have this patch included in OpenSuse/SLES Xen rpms.
> 
> Is the latest version available from somewhere? 
> 
> -- Pasi
I have seen such a patch in the SUSE xen rpm. maybe Jim can tell you the latest status.

In Oracle VM, we add hooks in xend and use an external locking utility.

currently, we use DLM (distributed lock manager) to manage the domain running lock to prevent the same
VM from being started on two servers simultaneously.

We have added hooks to VM start/shutdown/migration to acquire/release the lock.

Note that during migration, we release the lock before starting the migration
process, and a lock will be acquired on the destination side. There is still a
chance for a server other than the destination server to acquire the lock, thus
causing the migration to fail.

hope someone can give some advice.

here is the patch for your reference.

thanks,

zhigang

[-- Attachment #2: xen-unstable-locking-callout-hook.patch --]
[-- Type: text/x-patch, Size: 5514 bytes --]

diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/examples/xend-config.sxp xen-3.4.0/tools/examples/xend-config.sxp
--- xen-3.4.0.bak/tools/examples/xend-config.sxp	2009-08-05 16:17:42.000000000 +0800
+++ xen-3.4.0/tools/examples/xend-config.sxp	2009-08-04 10:23:17.000000000 +0800
@@ -69,6 +69,12 @@
 
 (xend-unix-path /var/lib/xend/xend-socket)
 
+# External locking utility for get/release domain running lock. By default,
+# no utility is specified. Thus there will be no lock as VM running.
+# The locking utility should accept:
+# <--lock | --unlock> --name <name> --uuid <uuid>
+# command line options, and returns zero on success, others on error.
+#(xend-domains-lock-path '')
 
 # Address and port xend should use for the legacy TCP XMLRPC interface, 
 # if xend-tcp-xmlrpc-server is set.
diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/python/xen/xend/XendDomainInfo.py xen-3.4.0/tools/python/xen/xend/XendDomainInfo.py
--- xen-3.4.0.bak/tools/python/xen/xend/XendDomainInfo.py	2009-08-05 16:17:42.000000000 +0800
+++ xen-3.4.0/tools/python/xen/xend/XendDomainInfo.py	2009-08-05 16:35:35.000000000 +0800
@@ -359,6 +359,8 @@ class XendDomainInfo:
     @type state_updated: threading.Condition
     @ivar refresh_shutdown_lock: lock for polling shutdown state
     @type refresh_shutdown_lock: threading.Condition
+    @ivar running_lock: lock for running VM
+    @type running_lock: bool or None
     @ivar _deviceControllers: device controller cache for this domain
     @type _deviceControllers: dict 'string' to DevControllers
     """
@@ -427,6 +429,8 @@ class XendDomainInfo:
         self.refresh_shutdown_lock = threading.Condition()
         self._stateSet(DOM_STATE_HALTED)
 
+        self.running_lock = None
+
         self._deviceControllers = {}
 
         for state in DOM_STATES_OLD:
@@ -453,6 +457,7 @@ class XendDomainInfo:
 
         if self._stateGet() in (XEN_API_VM_POWER_STATE_HALTED, XEN_API_VM_POWER_STATE_SUSPENDED, XEN_API_VM_POWER_STATE_CRASHED):
             try:
+                self.acquire_running_lock();
                 XendTask.log_progress(0, 30, self._constructDomain)
                 XendTask.log_progress(31, 60, self._initDomain)
                 
@@ -485,6 +490,7 @@ class XendDomainInfo:
         state = self._stateGet()
         if state in (DOM_STATE_SUSPENDED, DOM_STATE_HALTED):
             try:
+                self.acquire_running_lock();
                 self._constructDomain()
 
                 try:
@@ -2617,6 +2623,11 @@ class XendDomainInfo:
 
             self._stateSet(DOM_STATE_HALTED)
             self.domid = None  # Do not push into _stateSet()!
+      
+            try:
+                self.release_running_lock()
+            except:
+                log.exception("Release running lock failed: %s" % status)
         finally:
             self.refresh_shutdown_lock.release()
 
@@ -4073,6 +4084,28 @@ class XendDomainInfo:
                                    params.get('burst', '50K'))
         return 1
 
+    def acquire_running_lock(self):
+        if not self.running_lock:
+            lock_path = xoptions.get_xend_domains_lock_path()
+            if lock_path:
+                status = os.system('%s --lock --name %s --uuid %s' % \
+                                   (lock_path, self.info['name_label'], self.info['uuid']))
+                if status == 0:
+                    self.running_lock = True
+                else:
+                    raise XendError('Acquire running lock failed: %s' % status)
+
+    def release_running_lock(self):
+        if self.running_lock:
+            lock_path = xoptions.get_xend_domains_lock_path()
+            if lock_path:
+                status = os.system('%s --unlock --name %s --uuid %s' % \
+                                   (lock_path, self.info['name_label'], self.info['uuid']))
+                if status == 0:
+                    self.running_lock = False
+                else:
+                    raise XendError('Release running lock failed: %s' % status)
+
     def __str__(self):
         return '<domain id=%s name=%s memory=%s state=%s>' % \
                (str(self.domid), self.info['name_label'],
diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/python/xen/xend/XendDomain.py xen-3.4.0/tools/python/xen/xend/XendDomain.py
--- xen-3.4.0.bak/tools/python/xen/xend/XendDomain.py	2009-08-05 16:17:09.000000000 +0800
+++ xen-3.4.0/tools/python/xen/xend/XendDomain.py	2009-08-04 10:23:17.000000000 +0800
@@ -1317,6 +1317,7 @@ class XendDomain:
                              POWER_STATE_NAMES[dominfo._stateGet()])
 
         """ The following call may raise a XendError exception """
+        dominfo.release_running_lock();
         dominfo.testMigrateDevices(True, dst)
 
         if live:
diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/python/xen/xend/XendOptions.py xen-3.4.0/tools/python/xen/xend/XendOptions.py
--- xen-3.4.0.bak/tools/python/xen/xend/XendOptions.py	2009-08-05 16:17:42.000000000 +0800
+++ xen-3.4.0/tools/python/xen/xend/XendOptions.py	2009-08-04 10:23:17.000000000 +0800
@@ -281,6 +281,11 @@ class XendOptions:
         """
         return self.get_config_string("xend-domains-path", self.xend_domains_path_default)
 
+    def get_xend_domains_lock_path(self):
+        """ Get the path of the lock utility for running domains.
+        """
+        return self.get_config_string("xend-domains-lock-path")
+
     def get_xend_state_path(self):
         """ Get the path for persistent domain configuration storage
         """

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2009-08-05  8:39       ` Zhigang Wang
@ 2009-08-05  9:33         ` Pasi Kärkkäinen
  2009-08-05 16:30           ` Jia Ju Zhang
  0 siblings, 1 reply; 11+ messages in thread
From: Pasi Kärkkäinen @ 2009-08-05  9:33 UTC (permalink / raw)
  To: Zhigang Wang; +Cc: Jim Fehlig, xen-devel, Ian Jackson

On Wed, Aug 05, 2009 at 04:39:23PM +0800, Zhigang Wang wrote:
> Pasi Kärkkäinen wrote:
> > On Mon, Aug 11, 2008 at 10:45:23AM -0600, Jim Fehlig wrote:
> >> Ian Jackson wrote:
> >>> Jim Fehlig writes ("[Xen-devel] [PATCH] [RFC] Add lock on domain start"):
> >>>   
> >>>> This patch adds a simple lock mechanism when starting domains by placing 
> >>>> a lock file in xend-domains-path/<dom_uuid>.  The lock file is removed 
> >>>> when domain is stopped.  The motivation for such a mechanism is to 
> >>>> prevent starting the same domain from multiple hosts.
> >>>>     
> >>> I think this should be dealt with in your next-layer-up management
> >>> tools.
> >>>   
> >> Perhaps.  I wanted to see if there was any interest in having such a
> >> feature at the xend layer.  If not, I will no longer pursue this option.
> >>
> > 
> > Replying a bit late to this.. I think there is demand for this feature! 
> > 
> > Many people (mostly in a smaller environments) don't want to use
> > 'next-layer-up' management tools..
> > 
> >>> Lockfiles are bad because they can become stale.
> >>>   
> >> Yep.  Originally I considered a 'lockless-lock' approach where a bit it
> >> set and counter is spun on a 'reserved' sector of vbd, e.g. first
> >> sector.  Attempting to attach the vbd to another domain would fail if
> >> lock bit is set and counter is incrementing.  If counter is not
> >> incrementing assume lock is stale and proceed.  This approach is
> >> certainly more complex.  We support various image formats (raw, qcow,
> >> vmdk, ...) and such an approach may mean changing the format (e.g.
> >> qcow3).  Wouldn't work for existing images.  Who is responsible for
> >> spinning the counter?  Anyhow seemed like a lot of complexity as
> >> compared to the suggested simple approach with override for stale lock.
> >>
> > 
> > I assume you guys have this patch included in OpenSuse/SLES Xen rpms.
> > 
> > Is the latest version available from somewhere? 
> > 
> > -- Pasi
> I ever seen a patch in SUSE xen rpm. maybe Jim can tell you the latest status.
> 

http://serverfault.com/questions/21699/how-to-manage-xen-virtual-machines-on-shared-san-storage

In that discussion someone says xend-lock stuff can be found from SLES11 Xen.

> In Oracle VM, we add hooks in xend and use a external locking utility.
> 
> currently, we use DLM (distributed lock manager) to manage the domain running lock to prevent the same
> VM starts from two servers simultaneously.
> 
> We have add hooks to VM start/shutdown/migration for acquire/release the lock.
> 
> Note during migration, we release the lock before starting the migration process
> and a lock will be acquired in the destination side. There still a chance for
> other servers rather than the destination server to acquire the lock. thus cause
> the migration fail.
> 

Hmm.. I guess that also leaves a small time window for disk corruption? If
the domU was started on some other host at _exact_ correct (or bad) time
when the lock is not held anymore by the migration source host..

> hope someone can give some advice.
> 
> here is the patch for your reference.
> 

Thanks. Looks like a possible method as well.

-- Pasi

> thanks,
> 
> zhigang

> diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/examples/xend-config.sxp xen-3.4.0/tools/examples/xend-config.sxp
> --- xen-3.4.0.bak/tools/examples/xend-config.sxp	2009-08-05 16:17:42.000000000 +0800
> +++ xen-3.4.0/tools/examples/xend-config.sxp	2009-08-04 10:23:17.000000000 +0800
> @@ -69,6 +69,12 @@
>  
>  (xend-unix-path /var/lib/xend/xend-socket)
>  
> +# External locking utility for get/release domain running lock. By default,
> +# no utility is specified. Thus there will be no lock as VM running.
> +# The locking utility should accept:
> +# <--lock | --unlock> --name <name> --uuid <uuid>
> +# command line options, and returns zero on success, others on error.
> +#(xend-domains-lock-path '')
>  
>  # Address and port xend should use for the legacy TCP XMLRPC interface, 
>  # if xend-tcp-xmlrpc-server is set.
> diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/python/xen/xend/XendDomainInfo.py xen-3.4.0/tools/python/xen/xend/XendDomainInfo.py
> --- xen-3.4.0.bak/tools/python/xen/xend/XendDomainInfo.py	2009-08-05 16:17:42.000000000 +0800
> +++ xen-3.4.0/tools/python/xen/xend/XendDomainInfo.py	2009-08-05 16:35:35.000000000 +0800
> @@ -359,6 +359,8 @@ class XendDomainInfo:
>      @type state_updated: threading.Condition
>      @ivar refresh_shutdown_lock: lock for polling shutdown state
>      @type refresh_shutdown_lock: threading.Condition
> +    @ivar running_lock: lock for running VM
> +    @type running_lock: bool or None
>      @ivar _deviceControllers: device controller cache for this domain
>      @type _deviceControllers: dict 'string' to DevControllers
>      """
> @@ -427,6 +429,8 @@ class XendDomainInfo:
>          self.refresh_shutdown_lock = threading.Condition()
>          self._stateSet(DOM_STATE_HALTED)
>  
> +        self.running_lock = None
> +
>          self._deviceControllers = {}
>  
>          for state in DOM_STATES_OLD:
> @@ -453,6 +457,7 @@ class XendDomainInfo:
>  
>          if self._stateGet() in (XEN_API_VM_POWER_STATE_HALTED, XEN_API_VM_POWER_STATE_SUSPENDED, XEN_API_VM_POWER_STATE_CRASHED):
>              try:
> +                self.acquire_running_lock();
>                  XendTask.log_progress(0, 30, self._constructDomain)
>                  XendTask.log_progress(31, 60, self._initDomain)
>                  
> @@ -485,6 +490,7 @@ class XendDomainInfo:
>          state = self._stateGet()
>          if state in (DOM_STATE_SUSPENDED, DOM_STATE_HALTED):
>              try:
> +                self.acquire_running_lock();
>                  self._constructDomain()
>  
>                  try:
> @@ -2617,6 +2623,11 @@ class XendDomainInfo:
>  
>              self._stateSet(DOM_STATE_HALTED)
>              self.domid = None  # Do not push into _stateSet()!
> +      
> +            try:
> +                self.release_running_lock()
> +            except:
> +                log.exception("Release running lock failed: %s" % status)
>          finally:
>              self.refresh_shutdown_lock.release()
>  
> @@ -4073,6 +4084,28 @@ class XendDomainInfo:
>                                     params.get('burst', '50K'))
>          return 1
>  
> +    def acquire_running_lock(self):
> +        if not self.running_lock:
> +            lock_path = xoptions.get_xend_domains_lock_path()
> +            if lock_path:
> +                status = os.system('%s --lock --name %s --uuid %s' % \
> +                                   (lock_path, self.info['name_label'], self.info['uuid']))
> +                if status == 0:
> +                    self.running_lock = True
> +                else:
> +                    raise XendError('Acquire running lock failed: %s' % status)
> +
> +    def release_running_lock(self):
> +        if self.running_lock:
> +            lock_path = xoptions.get_xend_domains_lock_path()
> +            if lock_path:
> +                status = os.system('%s --unlock --name %s --uuid %s' % \
> +                                   (lock_path, self.info['name_label'], self.info['uuid']))
> +                if status == 0:
> +                    self.running_lock = False
> +                else:
> +                    raise XendError('Release running lock failed: %s' % status)
> +
>      def __str__(self):
>          return '<domain id=%s name=%s memory=%s state=%s>' % \
>                 (str(self.domid), self.info['name_label'],
> diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/python/xen/xend/XendDomain.py xen-3.4.0/tools/python/xen/xend/XendDomain.py
> --- xen-3.4.0.bak/tools/python/xen/xend/XendDomain.py	2009-08-05 16:17:09.000000000 +0800
> +++ xen-3.4.0/tools/python/xen/xend/XendDomain.py	2009-08-04 10:23:17.000000000 +0800
> @@ -1317,6 +1317,7 @@ class XendDomain:
>                               POWER_STATE_NAMES[dominfo._stateGet()])
>  
>          """ The following call may raise a XendError exception """
> +        dominfo.release_running_lock();
>          dominfo.testMigrateDevices(True, dst)
>  
>          if live:
> diff -Nurp --exclude '*.orig' xen-3.4.0.bak/tools/python/xen/xend/XendOptions.py xen-3.4.0/tools/python/xen/xend/XendOptions.py
> --- xen-3.4.0.bak/tools/python/xen/xend/XendOptions.py	2009-08-05 16:17:42.000000000 +0800
> +++ xen-3.4.0/tools/python/xen/xend/XendOptions.py	2009-08-04 10:23:17.000000000 +0800
> @@ -281,6 +281,11 @@ class XendOptions:
>          """
>          return self.get_config_string("xend-domains-path", self.xend_domains_path_default)
>  
> +    def get_xend_domains_lock_path(self):
> +        """ Get the path of the lock utility for running domains.
> +        """
> +        return self.get_config_string("xend-domains-lock-path")
> +
>      def get_xend_state_path(self):
>          """ Get the path for persistent domain configuration storage
>          """

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [RFC] Add lock on domain start
  2009-08-05  9:33         ` Pasi Kärkkäinen
@ 2009-08-05 16:30           ` Jia Ju Zhang
  0 siblings, 0 replies; 11+ messages in thread
From: Jia Ju Zhang @ 2009-08-05 16:30 UTC (permalink / raw)
  To: Pasi Kärkkäinen, Zhigang Wang
  Cc: Jim Fehlig, xen-devel, Ian Jackson

Another idea ( Just RFC):
Use SCSI reserve method to implement the lock.
The advantages are as follows:
1) It is a mandatory lock, not advisory lock.
2) It can resolve the stale lock issue.
3) It doesn't need a cluster filesystem.

A 3rd idea is using SFEX (a component in Linux-HA) to implement the
locking.
It is much like the above method but it is an advisory lock.

Attached is the patch using the SFEX method; the lock mechanism is still
based on Jim's patch.
The disadvantage of this patch is that it still has issues with live
migration.

Thanks,
Jiaju

diff -Nupr xen-3.3.1-testing.orig/tools/examples/domain-lock-via-sfex
xen-3.3.1-testing/tools/examples/domain-lock-via-sfex
---
xen-3.3.1-testing.orig/tools/examples/domain-lock-via-sfex	1970-01-01
08:00:00.000000000 +0800
+++
xen-3.3.1-testing/tools/examples/domain-lock-via-sfex	2009-06-23
16:33:38.000000000 +0800
@@ -0,0 +1,123 @@
+#!/bin/bash
+
+#error code:
+# 0: success
+# 1: generic error
+# 2: not running
+
+if [ `uname -m` = "x86_64" ]; then
+    SFEX_DAEMON=/usr/lib64/heartbeat/sfex_daemon
+else
+    SFEX_DAEMON=/usr/lib/heartbeat/sfex_daemon
+fi
+COLLISION_TIMEOUT=1
+LOCK_TIMEOUT=10
+MONITOR_INTERVAL=5
+PID_FILE=/var/run/sfex.pid
+
+usage() {
+    echo "usage: domain-lock-via-sfex [-l|-u|-s] -i <vm uuid> -x <sfex
device> index"
+    echo ""
+    echo "-l    lock"
+    echo "-u    unlock"
+    echo "-s    status (default)"
+    echo "-i    Virtual Machine Id or UUID"
+    echo "-x    SFEX device which used for sfex lock"
+    echo "path  A per-VM, unique location where external lock will be
managed"
+    exit 1
+}
+
+create_lock() {
+    local suffix=$1
+    local device=$2
+    local index=$3
+
+    get_status $suffix
+    if [ $? -eq 0 ]; then
+        return 0
+    fi
+
+    $SFEX_DAEMON -i $index -c $COLLISION_TIMEOUT -t $LOCK_TIMEOUT -m
$MONITOR_INTERVAL -r default -d $PID_FILE"_"$suffix $device
+
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        return 1
+    fi
+
+    sleep 10
+    get_status $suffix
+    if [ $? -eq 0 ]; then
+        return 0
+    fi
+    return 1
+}
+
+remove_lock(){
+    local suffix=$1
+
+    /sbin/killproc -p $PID_FILE"_"$suffix $SFEX_DAEMON
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        return $rc
+    fi
+
+    sleep 4
+    get_status $suffix
+    rc=$?
+    if [ $rc -ne 2 ]; then
+        return $rc
+    fi
+    return 0
+
+}
+
+get_status() {
+    local suffix=$1
+
+    /sbin/checkproc -k -p $PID_FILE"_"$suffix $SFEX_DAEMON
+    rc=$?
+    return $rc
+}
+
+mode="status"
+
+while getopts ":lusi:x:" opt; do
+    case $opt in
+        l )
+            mode="lock"
+            ;;
+	u )
+	    mode="unlock"
+	    ;;
+	s )
+            mode="status"
+            ;;
+	i )
+            vm_uuid=$OPTARG
+            ;;
+        x )
+            vm_sfex_device=$OPTARG
+            ;;
+	\? )
+	    usage
+            ;;
+    esac
+done
+
+shift $(($OPTIND - 1))
+vm_index=$1
+[ -z $vm_index ] && usage
+[ -z $vm_uuid ] && usage
+[ -z $vm_sfex_device ] && usage
+
+case $mode in
+    lock )
+        create_lock $vm_uuid $vm_sfex_device $vm_index
+        ;;
+    unlock )
+        remove_lock $vm_uuid
+        ;;
+    status )
+        get_status $vm_uuid
+	;;
+esac
diff -Nupr xen-3.3.1-testing.orig/tools/examples/Makefile
xen-3.3.1-testing/tools/examples/Makefile
--- xen-3.3.1-testing.orig/tools/examples/Makefile	2009-06-18
01:44:28.000000000 +0800
+++ xen-3.3.1-testing/tools/examples/Makefile	2009-06-18
03:43:39.000000000 +0800
@@ -36,6 +36,7 @@ XEN_SCRIPTS += xen-hotplug-cleanup
 XEN_SCRIPTS += external-device-migrate
 XEN_SCRIPTS += vscsi
 XEN_SCRIPTS += domain-lock
+XEN_SCRIPTS += domain-lock-via-sfex
 XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
 XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh
vif-common.sh
 XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh
vtpm-hotplug-common.sh
diff -Nupr xen-3.3.1-testing.orig/tools/examples/xend-config.sxp
xen-3.3.1-testing/tools/examples/xend-config.sxp
---
xen-3.3.1-testing.orig/tools/examples/xend-config.sxp	2009-06-18
01:44:28.000000000 +0800
+++ xen-3.3.1-testing/tools/examples/xend-config.sxp	2009-06-18
02:56:21.000000000 +0800
@@ -266,7 +266,7 @@
 # environment that protects shared resources, but may be useful in
 # some circumstances nonetheless.
 #
-#(xend-domain-lock no)
+(xend-domain-lock yes)
 
 # Path where domain lock is stored if xend-domain-lock is enabled.
 # Note:  This path must be accessible to all VM Servers participating
@@ -297,4 +297,9 @@
 # <xend-domain-lock-path>/<vm-uuid>/lock and write <vm-name>,
<vm-id>,
 # and <vm-host> (if supplied) to the lock file in that order.
 #
-#(xend-domain-lock-utility domain-lock)
+(xend-domain-lock-utility domain-lock-via-sfex)
+# This is the sfex device, when you enable the domain-lock-via-sfex,
+# you MUST enable this. it should be a partition in shared disk used 
+# for locking. Please note that there is no default value for this 
+# option, you MUST modify the following configuration!
+(xend-domain-lock-sfex-device /dev/sdb1)
diff -Nupr xen-3.3.1-testing.orig/tools/python/xen/xend/XendConfig.py
xen-3.3.1-testing/tools/python/xen/xend/XendConfig.py
---
xen-3.3.1-testing.orig/tools/python/xen/xend/XendConfig.py	2009-06-18
01:44:28.000000000 +0800
+++
xen-3.3.1-testing/tools/python/xen/xend/XendConfig.py	2009-06-22
18:01:14.000000000 +0800
@@ -188,6 +188,7 @@ XENAPI_CFG_TYPES = {
     'VCPUs_max': int,
     'VCPUs_at_startup': int,
     'VCPUs_live': int,
+    'sfex_index': int,
     'actions_after_shutdown': str,
     'actions_after_reboot': str,
     'actions_after_crash': str,
@@ -221,6 +222,7 @@ LEGACY_UNSUPPORTED_BY_XENAPI_CFG = [
     'vcpu_avail',
     'features',
     # read/write
+    'sfex_index',
     'on_xend_start',
     'on_xend_stop',
     # read-only
@@ -241,6 +243,7 @@ LEGACY_CFG_TYPES = {
     'memory':        int,
     'shadow_memory': int,
     'maxmem':        int,
+    'sfex_index':    int,
     'start_time':    float,
     'cpu_time':      float,
     'features':      str,
@@ -268,6 +271,7 @@ LEGACY_XENSTORE_VM_PARAMS = [
     'memory',
     'shadow_memory',
     'maxmem',
+    'sfex_index',
     'start_time',
     'name',
     'on_poweroff',
@@ -576,6 +580,10 @@ class XendConfig(dict):
             cfg["memory"] = int(sxp.child_value(sxp_cfg, "memory"))
         if sxp.child_value(sxp_cfg, "maxmem") != None:
             cfg["maxmem"] = int(sxp.child_value(sxp_cfg, "maxmem"))
+
+        # Parse sfex index
+        if sxp.child_value(sxp_cfg, "sfex_index") != None:
+            cfg["sfex_index"] = int(sxp.child_value(sxp_cfg,
"sfex_index"))
             
         # Convert scheduling parameters to vcpus_params
         if 'vcpus_params' not in cfg:
@@ -823,6 +831,9 @@ class XendConfig(dict):
 
         self._memory_sanity_check()
 
+        if "sfex_index" in cfg:
+            self["sfex_index"] = int(cfg["sfex_index"])
+
         def update_with(n, o):
             if not self.get(n):
                 self[n] = cfg.get(o, '')
diff -Nupr
xen-3.3.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py
xen-3.3.1-testing/tools/python/xen/xend/XendDomainInfo.py
---
xen-3.3.1-testing.orig/tools/python/xen/xend/XendDomainInfo.py	2009-06-18
01:44:28.000000000 +0800
+++
xen-3.3.1-testing/tools/python/xen/xend/XendDomainInfo.py	2009-06-23
16:53:07.000000000 +0800
@@ -3598,23 +3598,32 @@ class XendDomainInfo:
 
         path = xoptions.get_xend_domain_lock_path()
         path = os.path.join(path, self.get_uuid())
+	status = 0
 
-        if self.is_dom_locked(path):
-            raise XendError("The VM is locked and appears to be
running on host %s." % self.get_lock_host(path))
+	if xoptions.get_xend_domain_lock_utility() !=
"domain-lock-via-sfex":
+            if self.is_dom_locked(path):
+                raise XendError("The VM is locked and appears to be
running on host %s." % self.get_lock_host(path))
+
+            try:
+                if not os.path.exists(path):
+                    mkdir.parents(path, stat.S_IRWXU)
+            except:
+                log.exception("%s could not be created." % path)
+                raise XendError("%s could not be created." % path)
+
+            status = os.system('%s -l -p %s -n %s -i %s %s' % \
+                              
(xoptions.get_xend_domain_lock_utility(), \
+                                XendNode.instance().get_name(), \
+                                self.info['name_label'], \
+                                self.info['uuid'], \
+                                path))
+	else:
+	    status = os.system('/etc/xen/scripts/%s -l -i %s -x %s %s' %
\
+                              
(xoptions.get_xend_domain_lock_utility(), \
+                                self.info['uuid'], \
+		               
xoptions.get_xend_domain_lock_sfex_device(), \
+                                self.info['sfex_index']))
 
-        try:
-            if not os.path.exists(path):
-                mkdir.parents(path, stat.S_IRWXU)
-        except:
-            log.exception("%s could not be created." % path)
-            raise XendError("%s could not be created." % path)
-
-        status = os.system('%s -l -p %s -n %s -i %s %s' % \
-                           (xoptions.get_xend_domain_lock_utility(),
\
-                            XendNode.instance().get_name(), \
-                            self.info['name_label'], \
-                            self.info['uuid'], \
-                            path))
         if status != 0:
             raise XendError('Acquire running lock failed: %s' %
status)
 
@@ -3625,16 +3634,27 @@ class XendDomainInfo:
 
         path = xoptions.get_xend_domain_lock_path()
         path = os.path.join(path, self.get_uuid())
-        status = os.system('%s -u %s' % \
-                           (xoptions.get_xend_domain_lock_utility(),
\
-                            path))
+        status = 0
+
+	if xoptions.get_xend_domain_lock_utility() !=
"domain-lock-via-sfex":
+            status = os.system('%s -u %s' % \
+                              
(xoptions.get_xend_domain_lock_utility(), \
+                                path))
+            if status != 0:
+                log.exception("Release running lock failed: %s" %
status)
+            try:
+                if len(os.listdir(path)) == 0:
+                    shutil.rmtree(path)
+            except:
+                log.exception("Failed to remove unmanaged directory
%s." % path)
+	else:
+	    status = os.system('/etc/xen/scripts/%s -u -i %s -x %s %s' %
\
+                              
(xoptions.get_xend_domain_lock_utility(), \
+                                self.info['uuid'], \
+		      xoptions.get_xend_domain_lock_sfex_device(), \
+                                self.info['sfex_index']))
         if status != 0:
             log.exception("Release running lock failed: %s" % status)
-        try:
-            if len(os.listdir(path)) == 0:
-                shutil.rmtree(path)
-        except:
-            log.exception("Failed to remove unmanaged directory %s." %
path)
 
 
     def __str__(self):
diff -Nupr xen-3.3.1-testing.orig/tools/python/xen/xend/XendOptions.py
xen-3.3.1-testing/tools/python/xen/xend/XendOptions.py
---
xen-3.3.1-testing.orig/tools/python/xen/xend/XendOptions.py	2009-06-18
01:44:28.000000000 +0800
+++
xen-3.3.1-testing/tools/python/xen/xend/XendOptions.py	2009-06-19
10:42:57.000000000 +0800
@@ -145,6 +145,9 @@ class XendOptions:
     """Default script to acquire/release domain lock"""
     xend_domain_lock_utility = osdep.scripts_dir + "/domain-lock"
 
+    """Default sfex device used by domain lock """
+    xend_domain_lock_sfex_device = ''
+
 
     def __init__(self):
         self.configure()
@@ -382,6 +385,8 @@ class XendOptions:
     def get_xend_domain_lock_utility(self):
         return self.get_config_string('xend-domain-lock-utility',
self.xend_domain_lock_utility)
 
+    def get_xend_domain_lock_sfex_device(self):
+        return self.get_config_string('xend-domain-lock-sfex-device',
self.xend_domain_lock_sfex_device)
 
 class XendOptionsFile(XendOptions):
 
diff -Nupr xen-3.3.1-testing.orig/tools/python/xen/xm/create.py
xen-3.3.1-testing/tools/python/xen/xm/create.py
---
xen-3.3.1-testing.orig/tools/python/xen/xm/create.py	2009-06-18
01:44:28.000000000 +0800
+++ xen-3.3.1-testing/tools/python/xen/xm/create.py	2009-06-22
14:23:55.000000000 +0800
@@ -189,6 +189,10 @@ gopts.var('maxmem', val='MEMORY',
           fn=set_int, default=None,
           use="Maximum domain memory in MB.")
 
+gopts.var('sfex_index', val='SFEX',
+          fn=set_int, default=None,
+          use="Sfex index.")
+
 gopts.var('shadow_memory', val='MEMORY',
           fn=set_int, default=0,
           use="Domain shadow memory in MB.")
@@ -884,7 +888,7 @@ def make_config(vals):
             if v:
                 config.append([n, v])
 
-    map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
+    map(add_conf, ['name', 'memory', 'maxmem', 'sfex_index',
'shadow_memory',
                    'restart', 'on_poweroff',
                    'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail',
'features',
                    'on_xend_start', 'on_xend_stop', 'target', 'cpuid',


>>> On 8/5/2009 at  5:33 PM, in message
<20090805093333.GH24960@edu.joroinen.fi>,
Pasi Kärkkäinen<pasik@iki.fi> wrote: 
> On Wed, Aug 05, 2009 at 04:39:23PM +0800, Zhigang Wang wrote:
>> Pasi Kärkkäinen wrote:
>> > On Mon, Aug 11, 2008 at 10:45:23AM -0600, Jim Fehlig wrote:
>> >> Ian Jackson wrote:
>> >>> Jim Fehlig writes ("[Xen-devel] [PATCH] [RFC] Add lock on domain
start"):
>> >>>   
>> >>>> This patch adds a simple lock mechanism when starting domains
by placing 
>> >>>> a lock file in xend-domains-path/<dom_uuid>.  The lock file is
removed 
>> >>>> when domain is stopped.  The motivation for such a mechanism is
to 
>> >>>> prevent starting the same domain from multiple hosts.
>> >>>>     
>> >>> I think this should be dealt with in your next-layer-up
management
>> >>> tools.
>> >>>   
>> >> Perhaps.  I wanted to see if there was any interest in having
such a
>> >> feature at the xend layer.  If not, I will no longer pursue this
option.
>> >>
>> > 
>> > Replying a bit late to this.. I think there is demand for this
feature! 
>> > 
>> > Many people (mostly in smaller environments) don't want to use
>> > 'next-layer-up' management tools..
>> > 
>> >>> Lockfiles are bad because they can become stale.
>> >>>   
>> >> Yep.  Originally I considered a 'lockless-lock' approach where a
bit is
>> >> set and counter is spun on a 'reserved' sector of vbd, e.g.
first
>> >> sector.  Attempting to attach the vbd to another domain would
fail if
>> >> lock bit is set and counter is incrementing.  If counter is not
>> >> incrementing assume lock is stale and proceed.  This approach is
>> >> certainly more complex.  We support various image formats (raw,
qcow,
>> >> vmdk, ...) and such an approach may mean changing the format
(e.g.
>> >> qcow3).  Wouldn't work for existing images.  Who is responsible
for
>> >> spinning the counter?  Anyhow seemed like a lot of complexity as
>> >> compared to the suggested simple approach with override for stale
lock.
>> >>
>> > 
>> > I assume you guys have this patch included in OpenSuse/SLES Xen
rpms.
>> > 
>> > Is the latest version available from somewhere? 
>> > 
>> > -- Pasi
>> I have seen a patch in the SUSE Xen rpm. Maybe Jim can tell you the
latest 
> status.
>> 
> 
>
http://serverfault.com/questions/21699/how-to-manage-xen-virtual-machines-on-shared-
> san-storage
> 
> In that discussion someone says xend-lock stuff can be found from
SLES11 Xen.
> 
>> In Oracle VM, we add hooks in xend and use an external locking
utility.
>> 
>> Currently, we use DLM (distributed lock manager) to manage the
domain 
> running lock to prevent the same
>> VM from starting on two servers simultaneously.
>> 
>> We have added hooks to VM start/shutdown/migration to acquire/release
the 
> lock.
>> 
>> Note that during migration, we release the lock before starting the
migration 
> process
>> and a lock will be acquired on the destination side. There is still a
chance 
> for
>> servers other than the destination server to acquire the
lock, thus 
> causing
>> the migration to fail.
>> 
> 
> Hmm.. I guess that also leaves a small time window for disk
corruption? If
> the domU was started on some other host at _exact_ correct (or bad)
time
> when the lock is not held anymore by the migration source host..
> 
>> hope someone can give some advice.
>> 
>> here is the patch for your reference.
>> 
> 
> Thanks. Looks like a possible method as well.
> 
> -- Pasi
> 
>> thanks,
>> 
>> zhigang
> 
>> diff -Nurp --exclude '*.orig'
xen-3.4.0.bak/tools/examples/xend-config.sxp 
> xen-3.4.0/tools/examples/xend-config.sxp
>> --- xen-3.4.0.bak/tools/examples/xend-config.sxp	2009-08-05
16:17:42.000000000 +0800
>> +++ xen-3.4.0/tools/examples/xend-config.sxp	2009-08-04
10:23:17.000000000 +0800
>> @@ -69,6 +69,12 @@
>>  
>>  (xend-unix-path /var/lib/xend/xend-socket)
>>  
>> +# External locking utility for get/release domain running lock. By
default,
>> +# no utility is specified. Thus there will be no lock as VM
running.
>> +# The locking utility should accept:
>> +# <--lock | --unlock> --name <name> --uuid <uuid>
>> +# command line options, and returns zero on success, others on
error.
>> +#(xend-domains-lock-path '')
>>  
>>  # Address and port xend should use for the legacy TCP XMLRPC
interface, 
>>  # if xend-tcp-xmlrpc-server is set.
>> diff -Nurp --exclude '*.orig' 
> xen-3.4.0.bak/tools/python/xen/xend/XendDomainInfo.py 
> xen-3.4.0/tools/python/xen/xend/XendDomainInfo.py
>> ---
xen-3.4.0.bak/tools/python/xen/xend/XendDomainInfo.py	2009-08-05

> 16:17:42.000000000 +0800
>> +++
xen-3.4.0/tools/python/xen/xend/XendDomainInfo.py	2009-08-05 
> 16:35:35.000000000 +0800
>> @@ -359,6 +359,8 @@ class XendDomainInfo:
>>      @type state_updated: threading.Condition
>>      @ivar refresh_shutdown_lock: lock for polling shutdown state
>>      @type refresh_shutdown_lock: threading.Condition
>> +    @ivar running_lock: lock for running VM
>> +    @type running_lock: bool or None
>>      @ivar _deviceControllers: device controller cache for this
domain
>>      @type _deviceControllers: dict 'string' to DevControllers
>>      """
>> @@ -427,6 +429,8 @@ class XendDomainInfo:
>>          self.refresh_shutdown_lock = threading.Condition()
>>          self._stateSet(DOM_STATE_HALTED)
>>  
>> +        self.running_lock = None
>> +
>>          self._deviceControllers = {}
>>  
>>          for state in DOM_STATES_OLD:
>> @@ -453,6 +457,7 @@ class XendDomainInfo:
>>  
>>          if self._stateGet() in (XEN_API_VM_POWER_STATE_HALTED, 
> XEN_API_VM_POWER_STATE_SUSPENDED, XEN_API_VM_POWER_STATE_CRASHED):
>>              try:
>> +                self.acquire_running_lock();
>>                  XendTask.log_progress(0, 30,
self._constructDomain)
>>                  XendTask.log_progress(31, 60, self._initDomain)
>>                  
>> @@ -485,6 +490,7 @@ class XendDomainInfo:
>>          state = self._stateGet()
>>          if state in (DOM_STATE_SUSPENDED, DOM_STATE_HALTED):
>>              try:
>> +                self.acquire_running_lock();
>>                  self._constructDomain()
>>  
>>                  try:
>> @@ -2617,6 +2623,11 @@ class XendDomainInfo:
>>  
>>              self._stateSet(DOM_STATE_HALTED)
>>              self.domid = None  # Do not push into _stateSet()!
>> +      
>> +            try:
>> +                self.release_running_lock()
>> +            except:
>> +                log.exception("Release running lock failed: %s" %
status)
>>          finally:
>>              self.refresh_shutdown_lock.release()
>>  
>> @@ -4073,6 +4084,28 @@ class XendDomainInfo:
>>                                     params.get('burst', '50K'))
>>          return 1
>>  
>> +    def acquire_running_lock(self):
>> +        if not self.running_lock:
>> +            lock_path = xoptions.get_xend_domains_lock_path()
>> +            if lock_path:
>> +                status = os.system('%s --lock --name %s --uuid %s'
% \
>> +                                   (lock_path,
self.info['name_label'], 
> self.info['uuid']))
>> +                if status == 0:
>> +                    self.running_lock = True
>> +                else:
>> +                    raise XendError('Acquire running lock failed:
%s' % 
> status)
>> +
>> +    def release_running_lock(self):
>> +        if self.running_lock:
>> +            lock_path = xoptions.get_xend_domains_lock_path()
>> +            if lock_path:
>> +                status = os.system('%s --unlock --name %s --uuid
%s' % \
>> +                                   (lock_path,
self.info['name_label'], 
> self.info['uuid']))
>> +                if status == 0:
>> +                    self.running_lock = False
>> +                else:
>> +                    raise XendError('Release running lock failed:
%s' % 
> status)
>> +
>>      def __str__(self):
>>          return '<domain id=%s name=%s memory=%s state=%s>' % \
>>                 (str(self.domid), self.info['name_label'],
>> diff -Nurp --exclude '*.orig'
xen-3.4.0.bak/tools/python/xen/xend/XendDomain.py 
> xen-3.4.0/tools/python/xen/xend/XendDomain.py
>> ---
xen-3.4.0.bak/tools/python/xen/xend/XendDomain.py	2009-08-05
16:17:09.000000000 
> +0800
>> +++ xen-3.4.0/tools/python/xen/xend/XendDomain.py	2009-08-04
10:23:17.000000000 
> +0800
>> @@ -1317,6 +1317,7 @@ class XendDomain:
>>                              
POWER_STATE_NAMES[dominfo._stateGet()])
>>  
>>          """ The following call may raise a XendError exception """
>> +        dominfo.release_running_lock();
>>          dominfo.testMigrateDevices(True, dst)
>>  
>>          if live:
>> diff -Nurp --exclude '*.orig'
xen-3.4.0.bak/tools/python/xen/xend/XendOptions.py 
> xen-3.4.0/tools/python/xen/xend/XendOptions.py
>> ---
xen-3.4.0.bak/tools/python/xen/xend/XendOptions.py	2009-08-05 
> 16:17:42.000000000 +0800
>> +++ xen-3.4.0/tools/python/xen/xend/XendOptions.py	2009-08-04
10:23:17.000000000 
> +0800
>> @@ -281,6 +281,11 @@ class XendOptions:
>>          """
>>          return self.get_config_string("xend-domains-path", 
> self.xend_domains_path_default)
>>  
>> +    def get_xend_domains_lock_path(self):
>> +        """ Get the path of the lock utility for running domains.
>> +        """
>> +        return self.get_config_string("xend-domains-lock-path")
>> +
>>      def get_xend_state_path(self):
>>          """ Get the path for persistent domain configuration
storage
>>          """
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
advisory

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-08-05 16:30 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-08-06  6:06 [PATCH] [RFC] Add lock on domain start Jim Fehlig
2008-08-07  2:23 ` Zhigang Wang
2008-08-07  4:26   ` Zhigang Wang
2008-08-08 17:07     ` Jim Fehlig
2008-08-11  2:22       ` Zhigang Wang
2008-08-11 11:14 ` Ian Jackson
2008-08-11 16:45   ` Jim Fehlig
2009-08-05  7:41     ` Pasi Kärkkäinen
2009-08-05  8:39       ` Zhigang Wang
2009-08-05  9:33         ` Pasi Kärkkäinen
2009-08-05 16:30           ` Jia Ju Zhang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.