From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Price Date: Thu, 01 Nov 2012 16:45:51 +0000 Subject: [Cluster-devel] [PATCH] Adding gfs2_lockcapture In-Reply-To: <1351783613-18537-1-git-send-email-sbradley@redhat.com> References: <1351783613-18537-1-git-send-email-sbradley@redhat.com> Message-ID: <5092A73F.1050508@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi Shane, A couple of comments: On 01/11/12 15:26, Shane Bradley wrote: > --- We generally like to keep complete, descriptive commit logs so it would be good to have a description of the script in the commit log and a brief note about the state it's in. See the previous logs for examples. Also include a "signed-off-by" line, which git commit -s will add for you. Also if you could prefix the patch subject with "gfs2-utils:" (and in future "gfs2_lockcapture:") it will help to distinguish it from the other projects which use cluster-devel@ for patches. > gfs2/lockgather/gfs2_lockcapture | 723 ++++++++++++++++++++++++++++++++++++++ > 1 files changed, 723 insertions(+), 0 deletions(-) > create mode 100644 gfs2/lockgather/gfs2_lockcapture Could you also edit gfs2/lockgather/Makefile.am to plug this into the build system. That way it will get installed with 'make install'. If you just replace the entry for gfs2_lockgather we'll remove that script in a separate commit. I've made a few minor comments on the code inline below. Thanks, Andy > diff --git a/gfs2/lockgather/gfs2_lockcapture b/gfs2/lockgather/gfs2_lockcapture > new file mode 100644 > index 0000000..d040738 > --- /dev/null > +++ b/gfs2/lockgather/gfs2_lockcapture > @@ -0,0 +1,723 @@ > +#!/usr/bin/env python > +""" > +This script will gather gfs2 and dlm lock information for a single cluster node > +for all the mounted GFS2 filesystems. > + > +TODO: > +* Should there be option to disable sysrq events in case it could trigger panic. > +* Add option to write log to file > + > +* Add a better description. > +* Add examples for all options. > +* Add better description of options and has steve to review those and tweak my > + option descriptions. > + > + at author : Shane Bradley > + at contact : sbradley at redhat.com > + at version : 0.9 > + at copyright : GPLv2 > +""" > +import sys > +import os > +import os.path > +import logging > +from optparse import OptionParser, Option > +import time > +import platform > +import shutil > +import subprocess > +import tarfile > + > +VERSION_NUMBER = "0.9-1" > +# ##################################################################### > +# Global vars: > +# ##################################################################### > +# Name of the logger > +MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0])) > +# Format of the logger > +MAIN_LOGGER_FORMAT = "%(levelname)s %(message)s" > +# Path to debug root > +PATH_TO_DEBUG_DIR="/sys/kernel/debug" > +# Path to the pid file that will be used for locking. > +PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0])) > + > + > +# ##################################################################### > +# Class to define what a clusternode is. > +# ##################################################################### > +class ClusterNode: > + def __init__(self, clusternodeName, clusterName, listOfGFS2Names): > + self.__clusternodeName = clusternodeName > + self.__clusterName = clusterName > + > + # List of the mounted filesystem from the mount -l command. > + self.__listOfMountedGFS2Filesystems = self.__getMountedGFS2Filesystems() > + > + # List of mounted GFS2 labels for this cluster from mount -l command. > + listOfGFS2MountedFilesystemLabels = self.__getMountedFilesystemLabel(self.__listOfMountedGFS2Filesystems) > + self.__listOfGFS2MountedFilesystemLabels = [] > + if (not len(listOfGFS2Names) > 0): > + # If no items in listOfGFS2Names then add them all. > + self.__listOfGFS2MountedFilesystemLabels = listOfGFS2MountedFilesystemLabels > + else: > + for label in listOfGFS2MountedFilesystemLabels: > + for name in listOfGFS2Names: > + if ((name == label) or ("%s:%s"%(self.__clusterName, name) == label)): > + self.__listOfGFS2MountedFilesystemLabels.append(label) > + > + def __str__(self): > + rString = "" > + rString += "%s:%s" %(self.getClusterName(), self.getClusterNodeName()) > + for fsName in self.getMountedGFS2FilesystemNames(): > + rString += "\n\t%s" %(fsName) > + for mountedFS in self.__listOfMountedGFS2Filesystems: > + if (mountedFS.find(fsName) >= 0): > + rString += " --> %s" %(mountedFS) > + break > + return rString.rstrip() > + > + def __getMountedFilesystemLabel(self, listOfMountedFilesystems): > + listOfMountedFilesystemsLabels = [] > + for mountedFilesystem in listOfMountedFilesystems: > + splitMountedFilesystem = mountedFilesystem.split() > + fsLabel = splitMountedFilesystem[-1].strip().strip("[").rstrip("]") > + if (len(fsLabel) > 0): > + # Verify it starts with name of the cluster. > + if (fsLabel.startswith("%s:" %(self.getClusterName()))): > + listOfMountedFilesystemsLabels.append(fsLabel) > + return listOfMountedFilesystemsLabels > + > + def __getMountedGFS2Filesystems(self): > + listOfMountedFilesystems = [] > + commandList= ["mount", "-l"] > + stdout = "" > + try: > + task = subprocess.Popen(commandList, stdout=subprocess.PIPE, stderr=subprocess.PIPE) > + task.wait() > + (stdout, stderr) = task.communicate() > + except OSError: > + commandOptionString = "" > + for option in commandList: > + commandOptionString += "%s " %(option) > + message = "An error occurred running the command: $ %s" %(commandOptionString) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return listOfMountedFilesystems > + stdoutSplit = stdout.split("\n") > + for line in stdoutSplit: > + splitLine = line.split() > + if (len(splitLine) >= 5): > + if (splitLine[4] == "gfs2"): > + listOfMountedFilesystems.append(line) > + return listOfMountedFilesystems > + > + def getClusterNodeName(self): > + return self.__clusternodeName > + > + def getClusterName(self): > + return self.__clusterName > + > + def getMountedGFS2FilesystemNames(self, includeClusterName=True): > + # If true will prepend the cluster name to gfs2 fs name > + if (includeClusterName): > + return self.__listOfGFS2MountedFilesystemLabels > + else: > + listOfGFS2MountedFilesystemLabels = [] > + for fsLabel in self.__listOfGFS2MountedFilesystemLabels: > + fsLabelSplit = fsLabel.split(":", 1) > + if (len(fsLabelSplit) == 2): > + listOfGFS2MountedFilesystemLabels.append(fsLabelSplit[1]) > + return listOfGFS2MountedFilesystemLabels > + > +# ##################################################################### > +# Helper functions. > +# ##################################################################### > +def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, standardError=subprocess.PIPE, debug=False): > + stdout = "" > + stderr = "" > + try: > + commandList = [command] > + commandList += listOfCommandOptions > + task = subprocess.Popen(commandList, stdout=standardOut, stderr=standardError) > + task.wait() > + (stdout, stderr) = task.communicate() > + return (task.returncode == 0) > + except OSError: > + commandOptionString = "" > + for option in listOfCommandOptions: > + commandOptionString += "%s " %(option) > + message = "An error occurred running the command: $ %s %s" %(command, commandOptionString) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + if (debug): > + if (len(stdout) > 0): > + print stdout > + if (len(stderr) > 0): > + print stderr > + return False > + > +def writeToFile(pathToFilename, data, appendToFile=True, createFile=False): > + [parentDir, filename] = os.path.split(pathToFilename) > + if (os.path.isfile(pathToFilename) or (os.path.isdir(parentDir) and createFile)): > + try: > + filemode = "w" > + if (appendToFile): > + filemode = "a" > + fout = open(pathToFilename, filemode) > + fout.write(data + "\n") > + fout.close() > + return True > + except UnicodeEncodeError, e: > + message = "There was a unicode encode error writing to the file: %s." %(pathToFilename) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + except IOError: > + message = "There was an error writing to the file: %s." %(pathToFilename) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + return False > + > +def mkdirs(pathToDSTDir): > + if (os.path.isdir(pathToDSTDir)): > + return True > + elif ((not os.access(pathToDSTDir, os.F_OK)) and (len(pathToDSTDir) > 0)): > + try: > + os.makedirs(pathToDSTDir) > + except (OSError, os.error): > + message = "Could not create the directory: %s." %(pathToDSTDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + except (IOError, os.error): > + message = "Could not create the directory with the path: %s." %(pathToDSTDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + return os.path.isdir(pathToDSTDir) > + > +def removePIDFile(): > + message = "Removing the pid file: %s" %(PATH_TO_PID_FILENAME) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + if (os.path.exists(PATH_TO_PID_FILENAME)): > + try: > + os.remove(PATH_TO_PID_FILENAME) > + except IOError: > + message = "There was an error removing the file: %s." %(PATH_TO_PID_FILENAME) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + > +def exitScript(removePidFile=True, errorCode=0): > + if (removePidFile): > + removePIDFile() > + message = "The script will exit." > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + sys.exit(errorCode) > + > +# ##################################################################### > +# Helper functions for gathering the lockdumps. > +# ##################################################################### > +def getClusterNode(listOfGFS2Names): > + # Return a ClusterNode object if the clusternode and cluster name are found > + # in the output, else return None. > + commandList= ["cman_tool", "status"] Since cman is no longer around we should update this script to work with a Fedora cluster before we ship this script in the Fedora package. > + stdout = "" > + try: > + task = subprocess.Popen(commandList, stdout=subprocess.PIPE, stderr=subprocess.PIPE) > + task.wait() > + (stdout, stderr) = task.communicate() > + except OSError: > + commandOptionString = "" > + for option in commandList: > + commandOptionString += "%s " %(option) > + message = "An error occurred running the command: $ %s" %(commandOptionString) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return None > + stdoutSplit = stdout.split("\n") > + clusterName = "" > + clusternodeName = "" > + for line in stdoutSplit: > + if (line.startswith("Cluster Name:")): > + clusterName = line.split("Cluster Name:")[1].strip().rstrip() > + if (line.startswith("Node name: ")): > + clusternodeName = line.split("Node name:")[1].strip().rstrip() > + if ((len(clusterName) > 0) and (len(clusternodeName) > 0)): > + return ClusterNode(clusternodeName, clusterName, listOfGFS2Names) > + return None > + > +def mountFilesystemDebug(enableMounting=True): > + if (os.path.ismount(PATH_TO_DEBUG_DIR)): > + message = "The debug filesystem %s is mounted." %(PATH_TO_DEBUG_DIR) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + return True > + else: > + message = "The debug filesystem %s is not mounted." %(PATH_TO_DEBUG_DIR) > + logging.getLogger(MAIN_LOGGER_NAME).warning(message) > + if (cmdLineOpts.enableMountDebugFS): > + if(mountFilesystem("/bin/mount", "none", PATH_TO_DEBUG_DIR, "debugfs")): > + message = "The debug filesystem was mounted: %s." %(PATH_TO_DEBUG_DIR) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + return True > + return False > + > +def mountFilesystem(pathToMountCommand, pathToDevice, pathToMountPoint, filesystemType): > + if (os.path.ismount(PATH_TO_DEBUG_DIR)): > + return True > + listOfCommandOptions = ["-t", filesystemType, pathToDevice, pathToMountPoint] > + if (not runCommand(pathToMountCommand, listOfCommandOptions)): > + message = "There was an error mounting the filesystem type %s for the device %s to the mount point %s." %(filesystemType, pathToDevice, pathToMountPoint) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + message = "The standard error is below: \n\t %s" %(stderr) I'm not sure where stderr comes from here ^ > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + return os.path.ismount(PATH_TO_DEBUG_DIR) > + > +def gatherGeneralInformation(pathToDSTDir): > + # Maybe add cluster node name, uname -a, etc > + systemString = "HOSTNAME: %s\nDATE: %s" %(platform.node(), time.strftime("%Y-%m-%d_%H:%M:%S")) > + writeToFile(os.path.join(pathToDSTDir, "system.txt"), systemString, createFile=True) > + # Get "cman_tool node -F id,type,name" data. > + command = "cman_tool" > + pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool-nodes.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + runCommand(command, ["nodes", "-F", "id,type,name"], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Get "cman_tool services" data. > + command = "cman_tool" > + pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool-services.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + runCommand(command, ["services"], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Get "clustat" data. > + command = "clustat" > + pathToCommandOutput = os.path.join(pathToDSTDir, "clustat.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + runCommand(command, [], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Get "mount -l" filesystem data. > + command = "cat" > + pathToCommandOutput = os.path.join(pathToDSTDir, "cat-proc_mounts.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + runCommand(command, ["/proc/mounts"], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data. > + command = "ps" > + pathToCommandOutput = os.path.join(pathToDSTDir, "ps.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + #runCommand(command, ["-eo", "user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan"], standardOut=fout) > + runCommand(command, ["h", "-AL", "-o", "tid,s,cmd"], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Get "group_tool ls" data. > + command = "group_tool" > + pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-ls.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + runCommand(command, ["ls"], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Get "group_tool dump fence" data. > + command = "group_tool" > + pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-dump_fence.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + runCommand(command, ["dump", "fence"], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Get "group_tool dump gfs2" data. > + command = "group_tool" > + pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-dump_gfs2.txt") > + try: > + fout = open(pathToCommandOutput, "w") > + runCommand(command, ["dump", "gfs2"], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + > +def triggerSysRQEvents(): > + command = "echo" > + pathToSysrqTriggerFile = "/proc/sysrq-trigger" > + # m - dump information about memory allocation > + # t - dump thread state information > + triggers = ["m", "t"] > + for trigger in triggers: > + try: > + fout = open(pathToSysrqTriggerFile, "w") > + runCommand(command, [trigger], standardOut=fout) > + fout.close() > + except IOError: > + message = "There was an error the command output for %s to the file %s." %(command, pathToSysrqTriggerFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + > +def gatherLogs(pathToDSTDir): > + if (mkdirs(pathToDSTDir)): > + # Copy messages logs that contain the sysrq data. > + pathToLogFile = "/var/log/messages" > + pathToDSTLogFile = os.path.join(pathToDSTDir, os.path.basename(pathToLogFile)) > + try: > + shutil.copyfile(pathToLogFile, pathToDSTLogFile) > + except shutil.Error: > + message = "There was an error copying the file: %s to %s." %(pathToLogFile, pathToDSTLogFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + > + pathToLogDir = "/var/log/cluster" > + pathToDSTLogDir = os.path.join(pathToDSTDir, os.path.basename(pathToLogDir)) > + if (os.path.isdir(pathToLogDir)): > + try: > + shutil.copytree(pathToLogDir, pathToDSTLogDir) > + except shutil.Error: > + message = "There was an error copying the directory: %s to %s." %(pathToLogDir, pathToDSTLogDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + > +def gatherDLMLockDumps(pathToDSTDir, listOfGFS2Filesystems): > + lockDumpType = "dlm" > + pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType) > + pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType) > + message = "Copying the %s lockdump data from the directory for the %s." %(lockDumpType, pathToSrcDir) > + logging.getLogger(MAIN_LOGGER_NAME).status(message) pylint tells me that the logger doesn't have a .status() method > + for filename in os.listdir(pathToSrcDir): > + for name in listOfGFS2Filesystems: > + if (filename.startswith(name)): > + pathToCurrentFilename = os.path.join(pathToSrcDir, filename) > + pathToDSTDir = os.path.join(pathToOutputDir, name) > + mkdirs(pathToDSTDir) > + pathToDSTFilename = os.path.join(pathToDSTDir, filename) > + try: > + shutil.copy(pathToCurrentFilename, pathToDSTFilename) > + except shutil.Error: > + message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + except OSError: > + message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + > +def gatherGFS2LockDumps(pathToDSTDir, listOfGFS2Filesystems): > + lockDumpType = "gfs2" > + pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType) > + pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType) > + for dirName in os.listdir(pathToSrcDir): > + pathToCurrentDir = os.path.join(pathToSrcDir, dirName) > + if ((os.path.isdir(pathToCurrentDir)) and (dirName in listOfGFS2Filesystems)): > + mkdirs(pathToOutputDir) > + pathToDSTDir = os.path.join(pathToOutputDir, dirName) > + try: > + message = "Copying the lockdump data for the %s filesystem: %s" %(lockDumpType, dirName) > + logging.getLogger(MAIN_LOGGER_NAME).status(message) > + shutil.copytree(pathToCurrentDir, pathToDSTDir) > + except shutil.Error: > + message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + except OSError: > + message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + > +def archiveData(pathToSrcDir): > + # Compress the file so that it will have a smaller file name. > + if (os.path.exists(pathToSrcDir)): > + pathToTarFilename = "%s.tar.bz2" %(pathToSrcDir) > + message = "Creating a compressed archvied file: %s" %(pathToTarFilename) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + try: > + tar = tarfile.open(pathToTarFilename, "w:bz2") > + tar.add(pathToSrcDir, arcname=os.path.basename(pathToSrcDir)) > + tar.close() > + except tarfile.TarError: > + message = "There was an error creating the tarfile: %s." %(pathToTarFilename) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return "" > + if (os.path.exists(pathToTarFilename)): > + return pathToTarFilename > + return "" > + > +# ############################################################################## > +# Get user selected options > +# ############################################################################## > +def __getOptions(version) : > + cmdParser = OptionParserExtended(version) > + cmdParser.add_option("-d", "--debug", > + action="store_true", > + dest="enableDebugLogging", > + help="Enables debug logging.", > + default=False) > + cmdParser.add_option("-q", "--quiet", > + action="store_true", > + dest="disableLoggingToConsole", > + help="Disables logging to console.", > + default=False) > + cmdParser.add_option("-i", "--info", > + action="store_true", > + dest="enablePrintInfo", > + help="Prints to console some basic information about the GFS2 filesystems mounted on the cluster node.", > + default=False) > + cmdParser.add_option("-M", "--mount_debug_fs", > + action="store_true", > + dest="enableMountDebugFS", > + help="Enables the mounting of the debug filesystem if it is not mounted. Default is disabled.", > + default=False) > + cmdParser.add_option("-o", "--path_to_output_dir", > + action="store", > + dest="pathToOutputDir", > + help="The path to the output directory where all the collect data will be stored. Default is /tmp/--%s" %(os.path.basename(sys.argv[0])), > + type="string", > + default="") > + cmdParser.add_option("-r", "--num_of_runs", > + action="store", > + dest="numberOfRuns", > + help="The number of lockdumps runs to do. Default is 2.", > + type="int", > + default=2) > + cmdParser.add_option("-s", "--seconds_sleep", > + action="store", > + dest="secondsToSleep", > + help="The number of seconds sleep between runs. Default is 120 seconds.", > + type="int", > + default=120) > + cmdParser.add_option("-t", "--archive", > + action="store_true", > + dest="enableArchiveOutputDir", > + help="Enables archiving and compressing of the output directory with tar and bzip2. Default is disabled.", > + default=False) > + cmdParser.add_option("-n", "--fs_name", > + action="extend", > + dest="listOfGFS2Names", > + help="List of GFS2 filesystems that will have their lockdump data gathered.", > + type="string", > + default=[]) # Get the options and return the result. > + (cmdLineOpts, cmdLineArgs) = cmdParser.parse_args() > + return (cmdLineOpts, cmdLineArgs) > + > +# ############################################################################## > +# OptParse classes for commandline options > +# ############################################################################## > +class OptionParserExtended(OptionParser): > + """ > + This is the class that gets the command line options the end user > + selects. > + """ > + def __init__(self, version) : > + self.__commandName = os.path.basename(sys.argv[0]) > + versionMessage = "%s %s\n" %(self.__commandName, version) > + > + commandDescription ="%s will capture information about lockdata data for GFS2 and DLM required to analyze a GFS2 filesystem.\n"%(self.__commandName) > + > + OptionParser.__init__(self, option_class=ExtendOption, > + version=versionMessage, > + description=commandDescription) > + > + def print_help(self): > + self.print_version() > + examplesMessage = "\n" > + examplesMessage = "\nPrints information about the available GFS2 filesystems that can have lockdump data captured." > + examplesMessage += "\n$ %s -i\n" %(self.__commandName) > + examplesMessage += "\nThis command will mount the debug directory if it is not mounted. It will do 3 runs of\n" > + examplesMessage += "gathering the lockdump information in 10 second intervals for only the GFS2 filesystems\n" > + examplesMessage += "with the names myGFS2vol2,myGFS2vol1. Then it will archive and compress the data collected." > + examplesMessage += "\n$ %s -M -r 3 -s 10 -t -n myGFS2vol2,myGFS2vol1\n" %(self.__commandName) > + OptionParser.print_help(self) > + print examplesMessage > + > + > +class ExtendOption (Option): > + """ > + Allow to specify comma delimited list of entries for arrays > + and dictionaries. > + """ > + ACTIONS = Option.ACTIONS + ("extend",) > + STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",) > + TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",) > + > + def take_action(self, action, dest, opt, value, values, parser): > + if (action == "extend") : > + valueList=[] > + try: > + for v in value.split(","): > + # Need to add code for dealing with paths if there is option for paths. > + valueList.append(v) > + except: > + pass > + else: > + values.ensure_value(dest, []).extend(valueList) > + else: > + Option.take_action(self, action, dest, opt, value, values, parser) > + > +# ############################################################################### > +# Main Function > +# ############################################################################### > +if __name__ == "__main__": > + try: > + # ####################################################################### > + # Get the options from the commandline. > + # ####################################################################### > + (cmdLineOpts, cmdLineArgs) = __getOptions(VERSION_NUMBER) > + > + # ####################################################################### > + # Setup the logger and create config directory > + # ####################################################################### > + # Create the logger > + logLevel = logging.INFO > + logger = logging.getLogger(MAIN_LOGGER_NAME) > + logger.setLevel(logLevel) > + # Create a new status function and level. > + logging.STATUS = logging.INFO + 2 > + logging.addLevelName(logging.STATUS, "STATUS") > + # Create a function for the STATUS_LEVEL since not defined by python. This > + # means you can call it like the other predefined message > + # functions. Example: logging.getLogger("loggerName").status(message) > + setattr(logger, "status", lambda *args: logger.log(logging.STATUS, *args)) > + ch = logging.StreamHandler() > + ch.setLevel(logLevel) > + ch.setFormatter(logging.Formatter(MAIN_LOGGER_FORMAT)) > + logger.addHandler(ch) > + > + # ####################################################################### > + # Set the logging levels. > + # ####################################################################### > + if ((cmdLineOpts.enableDebugLogging) and (not cmdLineOpts.disableLoggingToConsole)): > + logging.getLogger(MAIN_LOGGER_NAME).setLevel(logging.DEBUG) > + ch.setLevel(logging.DEBUG) > + message = "Debugging has been enabled." > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + if (cmdLineOpts.disableLoggingToConsole): > + logging.disable(logging.CRITICAL) > + > + # ####################################################################### > + # Check to see if pid file exists and error if it does. > + # ####################################################################### > + if (os.path.exists(PATH_TO_PID_FILENAME)): > + message = "The PID file %s already exists and this script cannot run till it does not exist." %(PATH_TO_PID_FILENAME) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + message = "Verify that there are no other existing processes running. If there are running processes those need to be stopped first and the file removed." > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + exitScript(removePidFile=False, errorCode=1) > + else: > + message = "Creating the pid file: %s" %(PATH_TO_PID_FILENAME) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + # Creata the pid file so we dont have more than 1 process of this > + # script running. > + writeToFile(PATH_TO_PID_FILENAME, str(os.getpid()), createFile=True) > + > + # Get the clusternode name. > + clusternode = getClusterNode(cmdLineOpts.listOfGFS2Names) > + if (clusternode == None): > + message = "The cluster or cluster node name could not be found from \"cman_tool status\"." > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + exitScript(removePidFile=False, errorCode=1) > + if (cmdLineOpts.enablePrintInfo): > + logging.disable(logging.CRITICAL) > + print "List of all the mounted GFS2 filesystems that can have their lockdump data captured:" > + print clusternode > + exitScript() > + # ####################################################################### > + # Create the output directory to verify it can be created before > + # proceeding unless it is already created from a previous run data needs > + # to be analyzed. Probably could add more debugging on if file or dir. > + # ####################################################################### > + message = "The gathering of the lockdumps will be performed on the clusternode \"%s\" which is part of the cluster \"%s\"." %(clusternode.getClusterNodeName(), clusternode.getClusterName()) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + pathToOutputDir = cmdLineOpts.pathToOutputDir > + if (not len(pathToOutputDir) > 0): > + pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), clusternode.getClusterNodeName(), os.path.basename(sys.argv[0])))) > + if (os.path.exists(pathToOutputDir)): > + message = "The directory already exists and could contain previous lockdump data: %s" %(pathToOutputDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + exitScript(errorCode=1) > + else: > + message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + if (not mkdirs(pathToOutputDir)): > + exitScript(errorCode=1) > + > + # ####################################################################### > + # Check to see if the debug directory is mounted. If not then > + # log an error. > + # ####################################################################### > + result = mountFilesystemDebug(cmdLineOpts.enableMountDebugFS) > + if (not result): > + message = "Please mount the debug filesystem before running this script. For example: $ mount none -t debugfs %s" %(PATH_TO_DEBUG_DIR) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + exitScript(errorCode=1) > + > + # ####################################################################### > + # Gather data and the lockdumps. > + # ####################################################################### > + message = "The process of gathering all the required files will begin before capturing the lockdumps." > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + for i in range(0,cmdLineOpts.numberOfRuns): > + # Add clusternode name under each run dir to make combining multple > + # clusternode gfs2_lockgather data together and all data in each run directory. > + pathToOutputRunDir = os.path.join(pathToOutputDir, "run%d/%s" %(i, clusternode.getClusterNodeName())) > + if (not mkdirs(pathToOutputRunDir)): > + exitOnError() exitOnError doesn't seem to be defined? > + # Gather various bits of data from the clusternode. > + message = "Gathering some general information about the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i) > + logging.getLogger(MAIN_LOGGER_NAME).status(message) > + gatherGeneralInformation(pathToOutputRunDir) > + # Trigger sysrq events to capture memory and thread information > + message = "Triggering the sysrq events for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i) > + logging.getLogger(MAIN_LOGGER_NAME).status(message) > + triggerSysRQEvents() > + # Gather the dlm locks. > + lockDumpType = "dlm" > + message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i) > + logging.getLogger(MAIN_LOGGER_NAME).status(message) > + gatherDLMLockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames(includeClusterName=False)) > + # Gather the glock locks from gfs2. > + lockDumpType = "gfs2" > + message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i) > + logging.getLogger(MAIN_LOGGER_NAME).status(message) > + gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames()) > + # Gather log files > + message = "Gathering the log files for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i) > + logging.getLogger(MAIN_LOGGER_NAME).status(message) > + gatherLogs(os.path.join(pathToOutputRunDir, "logs")) > + if (cmdLineOpts.secondsToSleep > 0): > + message = "The script will sleep for %d seconds between each run of capturing the lockdumps." %(cmdLineOpts.secondsToSleep) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + time.sleep(cmdLineOpts.secondsToSleep) > + # ####################################################################### > + # Archive the file if enabled and print the location of the output > + # directory. > + # ####################################################################### > + # After it is done the we should print out where the files that were > + # generated are located and what to do. > + message = "All the files have been gathered and this directory contains all the captured data: %s" %(pathToOutputDir) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + > + # ####################################################################### > + # Archive the directory that contains all the data and archive it. > + # ####################################################################### > + if (cmdLineOpts.enableArchiveOutputDir): > + message = "The lockdump data will now be archived. This could some time depending on the size of the data collected." > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + pathToTarFilename = archiveData(pathToOutputDir) > + if (os.path.exists(pathToTarFilename)): > + message = "The compressed archvied file was created: %s" %(pathToTarFilename) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + > + # ####################################################################### > + except KeyboardInterrupt: > + print "" > + message = "This script will exit since control-c was executed by end user." > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + exitScript(errorCode=1) > + # ####################################################################### > + # Exit the application with zero exit code since we cleanly exited. > + # ####################################################################### > + exitScript() >