From mboxrd@z Thu Jan 1 00:00:00 1970 From: Konrad Rzeszutek Wilk Subject: Re: [PATCH] xend: do not polling vcpus info if guest state is not RUNNING or PAUSED Date: Tue, 19 Nov 2013 09:06:51 -0500 Message-ID: <20131119140651.GC5332@phenom.dumpdata.com> References: <528B017D.5020202@oracle.com> <528B1B4F.2010102@citrix.com> <528B4061.1000305@oracle.com> Mime-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Return-path: Content-Disposition: inline In-Reply-To: <528B4061.1000305@oracle.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Joe Jin , msw@amazon.com Cc: Keir Fraser , xen-devel , ian.jackson@eu.citrix.com, Ian Campbell , Roger Pau =?iso-8859-1?Q?Monn=E9?= List-Id: xen-devel@lists.xenproject.org On Tue, Nov 19, 2013 at 06:41:37PM +0800, Joe Jin wrote: > On 11/19/13 16:03, Roger Pau Monné wrote: > > On 19/11/13 07:13, Joe Jin wrote: > >> When created new guest on NUMA server, xend tried to get the best node by > >> calculated all vcpus info, the race is if other guest is rebooting, the > >> guest in the list when entered find_relaxed_node(), but when call > >> getVCPUInfo() the guest be terminated, then getVCPUInfo() will fail with > >> below error: > >> > >> [2013-09-04 20:01:26 6254] ERROR (XendDomainInfo:496) VM start failed > >> Traceback (most recent call last): > >> File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 482, in start > >> XendTask.log_progress(31, 60, self._initDomain) > >> File "/usr/lib64/python2.4/site-packages/xen/xend/XendTask.py", line 209, in log_progress > >> retval = func(*args, **kwds) > >> File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 2918, in _initDomain > >> node = self._setCPUAffinity() > >> File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 2835, in _setCPUAffinity > >> best_node 
= find_relaxed_node(candidate_node_list)[0] > >> File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 2803, in find_relaxed_node > >> cpuinfo = dom.getVCPUInfo() > >> File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 1600, in getVCPUInfo > >> raise XendError(str(exn)) > >> XendError: (3, 'No such process') > >> > >> This patch will let find_relaxed_node() only polling the RUNNING or PAUSED > >> guest vcpus info to avoid the race. > >> > >> Signed-off-by: Joe Jin > >> --- > >> tools/python/xen/xend/XendDomainInfo.py | 2 ++ > >> 1 files changed, 2 insertions(+), 0 deletions(-) > >> > >> diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py > >> index e9d3e7e..66e4b9f 100644 > >> --- a/tools/python/xen/xend/XendDomainInfo.py > >> +++ b/tools/python/xen/xend/XendDomainInfo.py > >> @@ -2734,6 +2734,8 @@ class XendDomainInfo: > >> from xen.xend import XendDomain > >> doms = XendDomain.instance().list('all') > >> for dom in filter (lambda d: d.domid != self.domid, doms): > >> + if dom._stateGet() not in (DOM_STATE_RUNNING,DOM_STATE_PAUSED): > >> + continue > > > > Isn't it possible that the domain has rebooted and is no longer there > > between these two calls? > > > > IMHO it's very unlikely, but there's still a window where getVCPUInfo > > could fail. > > > > Yes, you're right, this patch just reduces the window. > I created a new patch for this, please comment! 
> > [PATCH] xend: getVCPUInfo should handle died domain > > When created new guest on NUMA server, xend tried to get the best node by > calculated all vcpus info, the race is if other guest is rebooting, the > guest in the list when entered find_relaxed_node(), but when call > getVCPUInfo() the guest already be terminated, then getVCPUInfo() will > fail with below error: > > [2013-09-04 20:01:26 6254] ERROR (XendDomainInfo:496) VM start failed > Traceback (most recent call last): > File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 482, in start > XendTask.log_progress(31, 60, self._initDomain) > File "/usr/lib64/python2.4/site-packages/xen/xend/XendTask.py", line 209, in log_progress > retval = func(*args, **kwds) > File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 2918, in _initDomain > node = self._setCPUAffinity() > File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 2835, in _setCPUAffinity > best_node = find_relaxed_node(candidate_node_list)[0] > File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 2803, in find_relaxed_node > cpuinfo = dom.getVCPUInfo() > File "/usr/lib64/python2.4/site-packages/xen/xend/XendDomainInfo.py", line 1600, in getVCPUInfo > raise XendError(str(exn)) > XendError: (3, 'No such process') > > This patch will handle the situation. 
> > Signed-off-by: Joe Jin > --- > tools/python/xen/xend/XendDomainInfo.py | 4 ++++ > 1 files changed, 4 insertions(+), 0 deletions(-) > > diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py > index e9d3e7e..c6414ed 100644 > --- a/tools/python/xen/xend/XendDomainInfo.py > +++ b/tools/python/xen/xend/XendDomainInfo.py > @@ -34,6 +34,7 @@ import os > import stat > import shutil > import traceback > +import errno > from types import StringTypes > > import xen.lowlevel.xc > @@ -1541,6 +1542,9 @@ class XendDomainInfo: > return sxpr > > except RuntimeError, exn: > + # Domain already died. > + if exn.args[0] == errno.ESRCH: > + return sxpr > raise XendError(str(exn)) > > Adding Matt as he has stepped up to be the bug-fix maintainer of Xend (I think? Is that correct - should that be reflected in the MAINTAINERS file?) > -- > 1.7.1 > >