From: rmccabe@sourceware.org
Date: 18 Oct 2006 23:12:32 -0000
Subject: [Cluster-devel] conga/luci/site/luci/Extensions cluster_adapte ...
Message-ID: <20061018231232.3501.qmail@sourceware.org>
List-Id:
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	conga
Changes by:	rmccabe at sourceware.org	2006-10-18 23:12:31

Modified files:
	luci/site/luci/Extensions: cluster_adapters.py homebase_adapters.py

Log message:
	better error handling
	log important (or those useful for debugging) errors to syslog

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/conga/luci/site/luci/Extensions/cluster_adapters.py.diff?cvsroot=cluster&r1=1.121&r2=1.122
http://sourceware.org/cgi-bin/cvsweb.cgi/conga/luci/site/luci/Extensions/homebase_adapters.py.diff?cvsroot=cluster&r1=1.34&r2=1.35

--- conga/luci/site/luci/Extensions/cluster_adapters.py	2006/10/18 19:16:17	1.121
+++ conga/luci/site/luci/Extensions/cluster_adapters.py	2006/10/18 23:12:31	1.122
@@ -22,7 +22,8 @@
 from clusterOS import resolveOSType
 from GeneralError import GeneralError
 from UnknownClusterError import UnknownClusterError
-from homebase_adapters import nodeUnauth, nodeAuth, manageCluster, createClusterSystems, havePermCreateCluster, setNodeFlag, delNodeFlag, userAuthenticated
+from homebase_adapters import nodeUnauth, nodeAuth, manageCluster, createClusterSystems, havePermCreateCluster, setNodeFlag, delNodeFlag, userAuthenticated, getStorageNode, getClusterNode
+from LuciSyslog import LuciSyslogError, LuciSyslog
 
 #Policy for showing the cluster chooser menu:
 #1) If there are no clusters in the ManagedClusterSystems
@@ -34,6 +35,11 @@
 
 CLUSTER_FOLDER_PATH = '/luci/systems/cluster/'
 
+try:
+	luci_log = LuciSyslog()
+except LuciSyslogError, e:
+	pass
+
 def validateClusterNodes(request, sessionData, clusterName, numStorage):
 	nodeList = list()
 	nodeHash = {}
@@ -205,11 +211,24 @@
 	batch_id_map = {}
 	rc = None
 	for i in nodeList:
+		success = True
 		try:
 			rc = RicciCommunicator(i['ricci_host'])
-			resultNode = rc.process_batch(batchNode, async=True)
-			batch_id_map[i['ricci_host']] = resultNode.getAttribute('batch_id')
+		except RicciError, e:
+			luci_log.debug('Unable to connect to the ricci agent on %s: %s'\
+				% (i['ricci_host'], str(e)))
+			success = False
 		except:
+			success = False
+
+		if success == True:
+			try:
+				resultNode = rc.process_batch(batchNode, async=True)
+				batch_id_map[i['ricci_host']] = resultNode.getAttribute('batch_id')
+			except:
+				success = False
+
+		if not success:
 			nodeUnauth(nodeList)
 			cluster_properties['isComplete'] = False
 			errors.append('An error occurred while attempting to add cluster node \"' + i['ricci_host'] + '\"')
@@ -294,6 +313,7 @@
 		clusterObj = self.restrictedTraverse(PLONE_ROOT + '/systems/cluster/' + clusterName)
 		cluster_os = clusterObj.manage_getProperty('cluster_os')
 		if not cluster_os:
+			luci_log.debug('The cluster OS property is missing for cluster ' + clusterName)
 			raise Exception, 'no cluster OS was found.'
 		try:
 			if len(filter(lambda x: x['os'] != cluster_os, nodeList)) > 0:
@@ -342,17 +362,28 @@
 	batch_id_map = {}
 	for i in nodeList:
 		clunode = nodeList[i]
+		success = True
 		try:
 			rc = RicciCommunicator(clunode['ricci_host'])
-			resultNode = rc.process_batch(batchNode, async=True)
-			batch_id_map[clunode['ricci_host']] = resultNode.getAttribute('batch_id')
-			messages.append('Cluster join initiated for host \"' + clunode['ricci_host'] + '\"')
 		except:
+			luci_log.info('Unable to connect to the ricci daemon on host ' + clunode['ricci_host'])
+			success = False
+
+		if success:
+			try:
+				resultNode = rc.process_batch(batchNode, async=True)
+				batch_id_map[clunode['ricci_host']] = resultNode.getAttribute('batch_id')
+			except:
+				success = False
+
+		if not success:
 			nodeUnauth(nodeList)
 			cluster_properties['isComplete'] = False
 			errors.append('An error occurred while attempting to add cluster node \"' + clunode['ricci_host'] + '\"')
 			return (False, {'errors': errors, 'requestResults': cluster_properties})
+		messages.append('Cluster join initiated for host \"' + clunode['ricci_host'] + '\"')
+
 	buildClusterCreateFlags(self, batch_id_map, clusterName)
 	return (True, {'errors': errors, 'messages': messages})
@@ -412,6 +443,7 @@
 	try:
 		resObj = resourceAddHandler[res_type](self, dummy_form)
 	except:
+		luci_log
 		resObj = None
 
 	if resObj is None:
@@ -1304,9 +1336,12 @@
 	try:
 		clusterfolder = self.restrictedTraverse(path)
 		if not clusterfolder:
+			luci_log.debug('cluster folder %s for %s is missing.' \
+				% (path, clustername))
 			raise
 		nodes = clusterfolder.objectItems('Folder')
 		if len(nodes) < 1:
+			luci_log.debug('no cluster nodes for %s found.' % clustername)
 			return None
 	except:
 		return None
@@ -1324,15 +1359,15 @@
 
 		try:
 			rc = RicciCommunicator(hostname)
-			if not rc:
-				raise
-		except:
-			#raise Exception, ('unable to communicate with the ricci agent on %s', hostname)
+		except RicciError, e:
+			luci_log.debug('ricci error: %s' % str(e))
 			continue
 
 		try:
 			clu_info = rc.cluster_info()
 			if cluname != lower(clu_info[0]) and cluname != lower(clu_info[1]):
+				luci_log.debug('%s reports it\'s in cluster %s:%s; we expect %s' \
+					% (hostname, clu_info[0], clu_info[1], cluname))
 				# node reports it's in a different cluster
 				raise
 		except:
@@ -1340,7 +1375,9 @@
 
 		if rc.authed():
 			return rc
-		setNodeFlag(self, node[1], CLUSTER_NODE_NEED_AUTH)
+		setNodeFlag(node[1], CLUSTER_NODE_NEED_AUTH)
+
+	luci_log.debug('no ricci agent could be found for cluster %s' % cluname)
 	return None
 
 def getRicciAgentForCluster(self, req):
@@ -1352,11 +1389,13 @@
 		if not clustername:
 			raise
 	except:
+		luci_log.debug('no cluster name was specified in getRicciAgentForCluster')
 		return None
 	return getRicciAgent(self, clustername)
 
 def getClusterStatus(self, rc):
 	clustatus_batch =''
+
 	try:
 		clustatuscmd_xml = minidom.parseString(clustatus_batch).firstChild
 	except:
@@ -1364,6 +1403,8 @@
 
 	try:
 		ricci_xml = rc.process_batch(clustatuscmd_xml, async=False)
+	except RicciError, e:
+		luci_log.debug('ricci error: %s', str(e))
 	except:
 		return {}
@@ -1998,16 +2039,44 @@
 	# to be performed.
 	try:
 		rc = RicciCommunicator(nodename_resolved)
-		# XXX - check the cluster
-		if not rc.authed():
-			# set the flag
-			rc = None
-
-		if not rc:
-			raise
+	except RicciError, e:
+		luci_log.debug('ricci error from %s: %s' \
+			% (nodename_resolved, str(e)))
+		return None
 	except:
 		return None
 
+	cluinfo = rc.cluster_info()
+	if not cluinfo[0] and not cluinfo[1]:
+		luci_log.debug('host %s not in a cluster (expected %s)' \
+			% (nodename_resolved, clustername))
+		return None
+
+	cname = lower(clustername)
+	if cname != lower(cluinfo[0]) and cname != lower(cluinfo[1]):
+		luci_log.debug('host %s in unknown cluster %s:%s (expected %s)' \
+			% (nodename_resolved, cluinfo[0], cluinfo[1], clustername))
+		return None
+
+	if not rc.authed():
+		rc = None
+		try:
+			snode = getStorageNode(self, nodename)
+			setNodeFlag(snode, CLUSTER_NODE_NEED_AUTH)
+		except:
+			# we'll hit it again, and try again then
+			pass
+
+		try:
+			cnode = getClusterNode(self, nodename, clustername)
+			setNodeFlag(cnode, CLUSTER_NODE_NEED_AUTH)
+		except:
+			# we'll hit it again, and try again then
+			pass
+
+	if rc is None:
+		return None
+
 	if task == NODE_LEAVE_CLUSTER:
 		batch_number, result = nodeLeaveCluster(rc)
@@ -2056,40 +2125,64 @@
 		#Now we need to annotate the new DB object
 		objpath = path + "/" + objname
 		flag = self.restrictedTraverse(objpath)
-		flag.manage_addProperty(BATCH_ID,batch_id, "string")
-		flag.manage_addProperty(TASKTYPE,NODE_REBOOT, "string")
-		flag.manage_addProperty(FLAG_DESC,"Node \'" + nodename + "\' is being rebooted", "string")
+		flag.manage_addProperty(BATCH_ID, batch_id, "string")
+		flag.manage_addProperty(TASKTYPE, NODE_REBOOT, "string")
+		flag.manage_addProperty(FLAG_DESC, "Node \'" + nodename + "\' is being rebooted", "string")
 
 		response = request.RESPONSE
 		#Once again, is this correct? Should we re-direct to the cluster page?
 		response.redirect(request['URL'] + "?pagetype=" + CLUSTER_CONFIG + "&clustername=" + clustername)
 	elif task == NODE_FENCE:
 		#here, we DON'T want to open connection to node to be fenced.
-		path = CLUSTER_FOLDER_PATH + clustername
+		path = str(CLUSTER_FOLDER_PATH + clustername)
 		try:
 			clusterfolder = self.restrictedTraverse(path)
 			if not clusterfolder:
 				raise
 		except:
+			luci_log.debug('The cluster folder for %s could not be found.' \
+				% clustername)
+			return None
+
+		try:
+			nodes = clusterfolder.objectItems('Folder')
+		except:
+			luci_log.debug('No cluster nodes for %s were found' % clustername)
 			return None
-		nodes = clusterfolder.objectItems('Folder')
 		found_one = False
 		for node in nodes:
-			if node[1].getID().find(nodename) != (-1):
+			if node[1].getId().find(nodename) != (-1):
 				continue
 			try:
 				rc = RicciCommunicator(node[1].getId())
-				if not rc.authed():
-					# set the node flag
-					rc = None
 				if not rc:
-					raise
-				found_one = True
-				break
+					continue
+			except RicciError, e:
+				luci_log.debug('ricci error for host %s: %s' \
+					% (node[0], str(e)))
+				continue
 			except:
 				continue
+
+			if not rc.authed():
+				rc = None
+				try:
+					snode = getStorageNode(self, node[1].getId())
+					setNodeFlag(snode, CLUSTER_NODE_NEED_AUTH)
+				except:
+					pass
+
+				try:
+					setNodeFlag(node[1], CLUSTER_NODE_NEED_AUTH)
+				except:
+					pass
+
+				continue
+			found_one = True
+			break
+
 		if not found_one:
 			return None
@@ -3430,14 +3523,23 @@
 			raise
 
 def noNodeFlagsPresent(self, nodefolder, flagname, hostname):
-	items = nodefolder.objectItems('ManagedSystem')
+	try:
+		items = nodefolder.objectItems('ManagedSystem')
+	except:
+		luci_log.debug('An error occurred while trying to list flags for cluster ' + nodefolder[0])
+		return False
 
 	for item in items:
 		if item[0] != flagname:
			continue
 
 		#a flag already exists... try to delete it
-		rc = RicciCommunicator(hostname)
+		try:
+			rc = RicciCommunicator(hostname)
+		except:
+			luci_log.info('Unable to connect to the ricci daemon on host ' + hostname)
+			return False
+
 		finished = checkBatch(rc, item[1].getProperty(BATCH_ID))
 		if finished == True:
 			try:
--- conga/luci/site/luci/Extensions/homebase_adapters.py	2006/10/16 20:46:46	1.34
+++ conga/luci/site/luci/Extensions/homebase_adapters.py	2006/10/18 23:12:31	1.35
@@ -1367,7 +1367,7 @@
 		pass
 	return False
 
-def setNodeFlag(self, node, flag_mask):
+def setNodeFlag(node, flag_mask):
 	try:
 		flags = node.getProperty('flags')
 		node.manage_changeProperties({ 'flags': flags | flag_mask })
@@ -1377,7 +1377,7 @@
 	except:
 		pass
 
-def delNodeFlag(self, node, flag_mask):
+def delNodeFlag(node, flag_mask):
 	try:
 		flags = node.getProperty('flags')
 		if flags & flag_mask != 0:
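
For anyone reading the patch without the surrounding tree: the shape repeated
throughout cluster_adapters.py is connect first, then dispatch the batch,
logging each failure mode to syslog separately. Below is a minimal,
self-contained sketch of that control flow. LuciSyslogStub, AgentStub, and
dispatch_batches are hypothetical stand-ins for LuciSyslog, RicciCommunicator,
and the various node add/join loops; only the two-phase error handling mirrors
the committed code.

import syslog

class LuciSyslogStub:
	# Hypothetical stand-in for LuciSyslog: send debug messages to the
	# daemon syslog facility, and never let logging itself raise.
	def __init__(self):
		syslog.openlog('luci', syslog.LOG_PID, syslog.LOG_DAEMON)

	def debug(self, msg):
		try:
			syslog.syslog(syslog.LOG_DEBUG, msg)
		except:
			pass

# Guarded construction, as in the patch: a failure to set up logging is
# tolerated at module load time rather than treated as fatal.
try:
	luci_log = LuciSyslogStub()
except Exception:
	pass

class AgentStub:
	# Hypothetical stand-in for RicciCommunicator; 'fail' selects which
	# phase breaks so each error path below can be exercised.
	def __init__(self, host, fail=None):
		self.host = host
		self.fail = fail
		if fail == 'connect':
			raise Exception('connection refused')

	def authed(self):
		return self.fail != 'auth'

	def process_batch(self, batch):
		if self.fail == 'batch':
			raise Exception('batch dispatch failed')
		return 'batch-1'

def dispatch_batches(hosts, batch):
	# Two-phase structure used throughout the patch: connect first, then
	# dispatch, logging each failure separately before giving up.
	batch_id_map = {}
	for host, fail in hosts:
		success = True
		try:
			agent = AgentStub(host, fail)
		except Exception, e:
			luci_log.debug('Unable to connect to the ricci agent on %s: %s' \
				% (host, str(e)))
			success = False

		if success and not agent.authed():
			# The real code would set CLUSTER_NODE_NEED_AUTH on the
			# node here so it can be re-authenticated later.
			luci_log.debug('%s needs to be re-authenticated' % host)
			success = False

		if success:
			try:
				batch_id_map[host] = agent.process_batch(batch)
			except Exception, e:
				luci_log.debug('error dispatching batch to %s: %s' \
					% (host, str(e)))
				success = False

		if not success:
			return None
	return batch_id_map

if __name__ == '__main__':
	# Both hosts succeed here; swap in 'connect', 'auth' or 'batch' as the
	# second tuple element to see the corresponding failure logged.
	print dispatch_batches([('node0', None), ('node1', None)], '<batch/>')

Keeping the connect and process_batch failures in separate try blocks is what
makes the distinction possible: an unreachable or unauthenticated agent can be
logged (and, in the real code, flagged for re-authentication) without being
mistaken for a batch-dispatch error.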