From mboxrd@z Thu Jan 1 00:00:00 1970 From: rmccabe@sourceware.org Date: 31 Oct 2006 00:16:19 -0000 Subject: [Cluster-devel] conga/luci/site/luci/Extensions cluster_adapte ... Message-ID: <20061031001619.15181.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: conga Changes by: rmccabe at sourceware.org 2006-10-31 00:16:15 Modified files: luci/site/luci/Extensions: cluster_adapters.py Log message: more logging and exception robustness Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/conga/luci/site/luci/Extensions/cluster_adapters.py.diff?cvsroot=cluster&r1=1.133&r2=1.134 --- conga/luci/site/luci/Extensions/cluster_adapters.py 2006/10/30 22:52:00 1.133 +++ conga/luci/site/luci/Extensions/cluster_adapters.py 2006/10/31 00:16:14 1.134 @@ -1637,6 +1637,7 @@ try: svcname = req.form['servicename'] except: + luci_log.debug_verbose('serviceStart error: no service name') return None try: @@ -1645,22 +1646,28 @@ try: nodename = req.form['nodename'] except: - return None + nodename = None + cluname = None try: cluname = req['clustername'] except KeyError, e: try: - cluname = req.form['clusterName'] + cluname = req.form['clustername'] except: - return None + pass + + if cluname is None: + luci_log.debug_verbose('serviceStart error: %s no service name' \ + % svcname) + return None ricci_agent = rc.hostname() batch_number, result = startService(rc, svcname, nodename) - if not batch_number or not result: - luci_log.debug_verbose('startService %s @ %s call failed' \ - % (svcname, nodename)) + if batch_number is None or result is None: + luci_log.debug_verbose('startService %s call failed' \ + % svcname) return None #Now we need to create a DB flag for this system. @@ -1704,17 +1711,14 @@ try: cluname = req.form['clustername'] except: - try: - cluname = rc.cluster_info()[0] - except: - pass + pass if cluname is None: luci_log.debug_verbose('unable to determine cluser name for serviceRestart %s' % svcname) return None batch_number, result = restartService(rc, svcname) - if not batch_number or not result: + if batch_number is None or result is None: luci_log.debug_verbose('restartService for %s failed' % svcname) return None @@ -1762,17 +1766,14 @@ try: cluname = req.form['clustername'] except: - try: - cluname = rc.cluster_info()[0] - except: - pass + pass if cluname is None: luci_log.debug_verbose('unable to determine cluser name for serviceStop %s' % svcname) return None batch_number, result = stopService(rc, svcname) - if not batch_number or not result: + if batch_number is None or result is None: luci_log.debug_verbose('stopService for %s failed' % svcname) return None @@ -2097,7 +2098,7 @@ clustername = request['clustername'] except KeyError, e: try: - clustername = request.form['clusterName'] + clustername = request.form['clustername'] except: luci_log.debug('missing cluster name for NTP') return None @@ -2194,16 +2195,20 @@ return None batch_number, result = nodeLeaveCluster(rc) - batch_id = str(batch_number) + if batch_number is None or result is None: + luci_log.debug_verbose('nodeLeaveCluster error: batch_number and/or result is None') + return None + batch_id = str(batch_number) objpath = str(path + "/" + objname) + try: nodefolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) #Now we need to annotate the new DB object flag = self.restrictedTraverse(objpath) flag.manage_addProperty(BATCH_ID, batch_id, "string") - flag.manage_addProperty(TASKTYPE,NODE_LEAVE_CLUSTER, "string") - flag.manage_addProperty(FLAG_DESC,"Node \'" + nodename + "\' leaving cluster", "string") + flag.manage_addProperty(TASKTYPE, NODE_LEAVE_CLUSTER, "string") + flag.manage_addProperty(FLAG_DESC, "Node \'" + nodename + "\' leaving cluster", "string") except: luci_log.debug('An error occurred while setting flag %s' % objpath) @@ -2212,34 +2217,52 @@ response.redirect(request['URL'] + "?pagetype=" + CLUSTER_CONFIG + "&clustername=" + clustername) elif task == NODE_JOIN_CLUSTER: batch_number, result = nodeJoinCluster(rc) - path = CLUSTER_FOLDER_PATH + clustername + "/" + nodename_resolved - nodefolder = self.restrictedTraverse(path) + if batch_number is None or result is None: + luci_log.debug_verbose('nodeJoin error: batch_number and/or result is None') + return None + + path = str(CLUSTER_FOLDER_PATH + clustername + "/" + nodename_resolved) batch_id = str(batch_number) - objname = nodename_resolved + "____flag" - nodefolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) - #Now we need to annotate the new DB object - objpath = path + "/" + objname - flag = self.restrictedTraverse(objpath) - flag.manage_addProperty(BATCH_ID,batch_id, "string") - flag.manage_addProperty(TASKTYPE,NODE_JOIN_CLUSTER, "string") - flag.manage_addProperty(FLAG_DESC,"Node \'" + nodename + "\' joining cluster", "string") + objname = str(nodename_resolved + "____flag") + objpath = str(path + "/" + objname) + + try: + nodefolder = self.restrictedTraverse(path) + nodefolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) + #Now we need to annotate the new DB object + flag = self.restrictedTraverse(objpath) + flag.manage_addProperty(BATCH_ID, batch_id, "string") + flag.manage_addProperty(TASKTYPE, NODE_JOIN_CLUSTER, "string") + flag.manage_addProperty(FLAG_DESC, "Node \'" + nodename + "\' joining cluster", "string") + except Exception, e: + luci_log.debug_verbose('nodeJoin error: creating flags at %s: %s' \ + % (path, str(e))) response = request.RESPONSE #Once again, is this correct? Should we re-direct to the cluster page? response.redirect(request['URL'] + "?pagetype=" + CLUSTER_CONFIG + "&clustername=" + clustername) elif task == NODE_REBOOT: batch_number, result = nodeReboot(rc) - path = CLUSTER_FOLDER_PATH + clustername + "/" + nodename_resolved - nodefolder = self.restrictedTraverse(path) + if batch_number is None or result is None: + luci_log.debug_verbose('nodeReboot: batch_number and/or result is None') + return None + + path = str(CLUSTER_FOLDER_PATH + clustername + "/" + nodename_resolved) batch_id = str(batch_number) - objname = nodename_resolved + "____flag" - nodefolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) - #Now we need to annotate the new DB object - objpath = path + "/" + objname - flag = self.restrictedTraverse(objpath) - flag.manage_addProperty(BATCH_ID, batch_id, "string") - flag.manage_addProperty(TASKTYPE, NODE_REBOOT, "string") - flag.manage_addProperty(FLAG_DESC, "Node \'" + nodename + "\' is being rebooted", "string") + objname = str(nodename_resolved + "____flag") + objpath = str(path + "/" + objname) + + try: + nodefolder = self.restrictedTraverse(path) + nodefolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) + #Now we need to annotate the new DB object + flag = self.restrictedTraverse(objpath) + flag.manage_addProperty(BATCH_ID, batch_id, "string") + flag.manage_addProperty(TASKTYPE, NODE_REBOOT, "string") + flag.manage_addProperty(FLAG_DESC, "Node \'" + nodename + "\' is being rebooted", "string") + except Exception, e: + luci_log.debug_verbose('nodeReboot err: creating flags at %s: %s' \ + % (path, str(e))) response = request.RESPONSE #Once again, is this correct? Should we re-direct to the cluster page? @@ -2250,16 +2273,19 @@ try: clusterfolder = self.restrictedTraverse(path) if not clusterfolder: - raise - except: - luci_log.debug('The cluster folder for %s could not be found.' \ - % clustername) + raise Exception, 'no cluster folder at %s' % path + except Exception, e: + luci_log.debug('The cluster folder for %s could not be found: %s' \ + % (clustername, str(e))) return None try: nodes = clusterfolder.objectItems('Folder') - except: - luci_log.debug('No cluster nodes for %s were found' % clustername) + if not nodes or len(nodes) < 1: + raise Exception, 'no cluster nodes' + except Exception, e: + luci_log.debug('No cluster nodes for %s were found: %s' \ + % (clustername, str(e))) return None found_one = False @@ -2299,17 +2325,26 @@ return None batch_number, result = nodeFence(rc, nodename) - path = path + "/" + nodename_resolved - nodefolder = self.restrictedTraverse(path) + if batch_number is None or result is None: + luci_log.debug_verbose('nodeFence: batch_number and/or result is None') + return None + + path = str(path + "/" + nodename_resolved) batch_id = str(batch_number) - objname = nodename_resolved + "____flag" - nodefolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) - #Now we need to annotate the new DB object - objpath = path + "/" + objname - flag = self.restrictedTraverse(objpath) - flag.manage_addProperty(BATCH_ID,batch_id, "string") - flag.manage_addProperty(TASKTYPE,NODE_FENCE, "string") - flag.manage_addProperty(FLAG_DESC,"Node \'" + nodename + "\' is being fenced", "string") + objname = str(nodename_resolved + "____flag") + objpath = str(path + "/" + objname) + + try: + nodefolder = self.restrictedTraverse(path) + nodefolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) + #Now we need to annotate the new DB object + flag = self.restrictedTraverse(objpath) + flag.manage_addProperty(BATCH_ID, batch_id, "string") + flag.manage_addProperty(TASKTYPE, NODE_FENCE, "string") + flag.manage_addProperty(FLAG_DESC, "Node \'" + nodename + "\' is being fenced", "string") + except Exception, e: + luci_log.debug_verbose('nodeFence err: creating flags at %s: %s' \ + % (path, str(e))) response = request.RESPONSE #Once again, is this correct? Should we re-direct to the cluster page? @@ -2320,17 +2355,25 @@ #and propogate it. We will need two ricci agents for this task. # Make sure we can find a second node before we hose anything. - path = CLUSTER_FOLDER_PATH + clustername + path = str(CLUSTER_FOLDER_PATH + clustername) try: clusterfolder = self.restrictedTraverse(path) if not clusterfolder: - raise - except: + raise Exception, 'no cluster folder at %s' % path + except Exception, e: + luci_log.debug_verbose('node delete error for cluster %s: %s' \ + % (clustername, str(e))) return None - nodes = clusterfolder.objectItems('Folder') - found_one = False + try: + nodes = clusterfolder.objectItems('Folder') + if not nodes or len(nodes) < 1: + raise Exception, 'no cluster nodes in DB' + except Exception, e: + luci_log.debug_verbose('node delete error for cluster %s: %s' \ + % (clustername, str(e))) + found_one = False for node in nodes: if node[1].getId().find(nodename) != (-1): continue @@ -2339,38 +2382,59 @@ # in the cluster we believe it is. try: rc2 = RicciCommunicator(node[1].getId()) - if not rc2.authed(): - # set the flag - rc2 = None - if not rc2: - raise - found_one = True - break + except Exception, e: + luci_log.info('ricci %s error: %s' % (node[0], str(e))) + continue except: continue + if not rc2.authed(): + try: + setNodeFlag(node[1], CLUSTER_NODE_NEED_AUTH) + except: + pass + + try: + snode = getStorageNode(self, node[0]) + setNodeFlag(snode, CLUSTER_NODE_NEED_AUTH) + except: + pass + + luci_log.debug_verbose('%s is not authed' % node[0]) + rc2 = None + continue + else: + found_one = True + break + if not found_one: + luci_log.debug_verbose('unable to find ricci node to delete %s from %s' % (nodename, clustername)) return None #First, delete cluster.conf from node to be deleted. #next, have node leave cluster. batch_number, result = nodeLeaveCluster(rc, purge=True) + if batch_number is None or result is None: + luci_log.debug_verbose('nodeDelete: batch_number and/or result is None') + return None #It is not worth flagging this node in DB, as we are going #to delete it anyway. Now, we need to delete node from model #and send out new cluster.conf delete_target = None - try: - nodelist = model.getNodes() - find_node = lower(nodename) - for n in nodelist: + nodelist = model.getNodes() + find_node = lower(nodename) + for n in nodelist: + try: if lower(n.getName()) == find_node: delete_target = n break - except: - pass + except: + continue if delete_target is None: + luci_log.debug_verbose('unable to find delete target for %s in %s' \ + % (nodename, clustername)) return None model.deleteNode(delete_target) @@ -2386,6 +2450,7 @@ # propagate the new cluster.conf via the second node batch_number, result = setClusterConf(rc2, str(str_buf)) if batch_number is None: + luci_log.debug_verbose('batch number is None after del node in NTP') return None #Now we need to delete the node from the DB @@ -2396,19 +2461,24 @@ delnode = self.restrictedTraverse(del_path) clusterfolder = self.restrictedTraverse(path) clusterfolder.manage_delObjects(delnode[0]) - except: - # XXX - we need to handle this - pass + except Exception, e: + luci_log.debug_verbose('error deleting %s: %s' % (del_path, str(e))) batch_id = str(batch_number) objname = str(nodename_resolved + "____flag") - clusterfolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) - #Now we need to annotate the new DB object objpath = str(path + "/" + objname) - flag = self.restrictedTraverse(objpath) - flag.manage_addProperty(BATCH_ID,batch_id, "string") - flag.manage_addProperty(TASKTYPE,NODE_DELETE, "string") - flag.manage_addProperty(FLAG_DESC,"Deleting node \'" + nodename + "\'", "string") + + try: + clusterfolder.manage_addProduct['ManagedSystem'].addManagedSystem(objname) + #Now we need to annotate the new DB object + flag = self.restrictedTraverse(objpath) + flag.manage_addProperty(BATCH_ID, batch_id, "string") + flag.manage_addProperty(TASKTYPE, NODE_DELETE, "string") + flag.manage_addProperty(FLAG_DESC, "Deleting node \'" + nodename + "\'", "string") + except Exception, e: + luci_log.debug_verbose('nodeDelete %s err setting flag@%s: %s' \ + % (nodename, objpath, str(e))) + response = request.RESPONSE response.redirect(request['HTTP_REFERER'] + "&busyfirst=true")