From mboxrd@z Thu Jan 1 00:00:00 1970 From: Shane Bradley Date: Thu, 1 Nov 2012 11:26:53 -0400 Subject: [Cluster-devel] [PATCH] Adding gfs2_lockcapture Message-ID: <1351783613-18537-1-git-send-email-sbradley@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit --- gfs2/lockgather/gfs2_lockcapture | 723 ++++++++++++++++++++++++++++++++++++++ 1 files changed, 723 insertions(+), 0 deletions(-) create mode 100644 gfs2/lockgather/gfs2_lockcapture diff --git a/gfs2/lockgather/gfs2_lockcapture b/gfs2/lockgather/gfs2_lockcapture new file mode 100644 index 0000000..d040738 --- /dev/null +++ b/gfs2/lockgather/gfs2_lockcapture @@ -0,0 +1,723 @@ +#!/usr/bin/env python +""" +This script will gather gfs2 and dlm lock information for a single cluster node +for all the mounted GFS2 filesystems. + +TODO: +* Should there be option to disable sysrq events in case it could trigger panic. +* Add option to write log to file + +* Add a better description. +* Add examples for all options. +* Add better description of options and has steve to review those and tweak my + option descriptions. + + at author : Shane Bradley + at contact : sbradley at redhat.com + at version : 0.9 + at copyright : GPLv2 +""" +import sys +import os +import os.path +import logging +from optparse import OptionParser, Option +import time +import platform +import shutil +import subprocess +import tarfile + +VERSION_NUMBER = "0.9-1" +# ##################################################################### +# Global vars: +# ##################################################################### +# Name of the logger +MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0])) +# Format of the logger +MAIN_LOGGER_FORMAT = "%(levelname)s %(message)s" +# Path to debug root +PATH_TO_DEBUG_DIR="/sys/kernel/debug" +# Path to the pid file that will be used for locking. 
PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0]))


# #####################################################################
# Class to define what a clusternode is.
# #####################################################################
class ClusterNode:
    """
    Describes the local cluster node: the node name, the name of the cluster
    it belongs to, and the labels ("clustername:fsname") of the mounted GFS2
    filesystems that were selected for lockdump capturing.
    """
    def __init__(self, clusternodeName, clusterName, listOfGFS2Names):
        """
        @param clusternodeName: The name of this cluster node.
        @param clusterName: The name of the cluster the node is a member of.
        @param listOfGFS2Names: The GFS2 filesystem names to select. A name
        can be given with or without the "clustername:" prefix. An empty
        list selects every mounted GFS2 filesystem of this cluster.
        """
        self.__clusternodeName = clusternodeName
        self.__clusterName = clusterName

        # The "mount -l" output lines that describe mounted GFS2 filesystems.
        self.__listOfMountedGFS2Filesystems = self.__getMountedGFS2Filesystems()

        # The labels of the mounted GFS2 filesystems that belong to this cluster.
        listOfLabels = self.__getMountedFilesystemLabel(self.__listOfMountedGFS2Filesystems)
        if (not listOfGFS2Names):
            # Nothing was requested explicitly, so select them all.
            self.__listOfGFS2MountedFilesystemLabels = listOfLabels
        else:
            self.__listOfGFS2MountedFilesystemLabels = []
            for label in listOfLabels:
                for name in listOfGFS2Names:
                    if ((name == label) or ("%s:%s" %(self.__clusterName, name) == label)):
                        self.__listOfGFS2MountedFilesystemLabels.append(label)
                        # Stop at the first match so that a label is never
                        # selected twice when the user gives both "name" and
                        # "clustername:name" (or repeats a name).
                        break

    def __str__(self):
        """
        Returns "clustername:nodename" followed by one line per selected
        filesystem label and, when known, the mount line of that filesystem.
        """
        rString = "%s:%s" %(self.getClusterName(), self.getClusterNodeName())
        for fsName in self.getMountedGFS2FilesystemNames():
            rString += "\n\t%s" %(fsName)
            for mountedFS in self.__listOfMountedGFS2Filesystems:
                # Compare the parsed label exactly. A substring search would
                # pair the label "c:fs1" with the mount line of "c:fs10".
                if (self.__parseFilesystemLabel(mountedFS) == fsName):
                    rString += " --> %s" %(mountedFS)
                    break
        return rString.rstrip()

    def __parseFilesystemLabel(self, mountedFilesystem):
        """
        Returns the last whitespace separated field of a "mount -l" line
        without the surrounding brackets, or an empty string for a blank line.
        """
        splitMountedFilesystem = mountedFilesystem.split()
        if (not splitMountedFilesystem):
            return ""
        return splitMountedFilesystem[-1].strip().strip("[").rstrip("]")

    def __getMountedFilesystemLabel(self, listOfMountedFilesystems):
        """
        Returns the labels found in the given "mount -l" lines that start
        with "<cluster name>:". Each label is returned only once and the
        order of first appearance is kept.
        """
        listOfMountedFilesystemsLabels = []
        clusterPrefix = "%s:" %(self.getClusterName())
        for mountedFilesystem in listOfMountedFilesystems:
            fsLabel = self.__parseFilesystemLabel(mountedFilesystem)
            # Verify it starts with name of the cluster.
            if ((len(fsLabel) > 0) and fsLabel.startswith(clusterPrefix)):
                if (not fsLabel in listOfMountedFilesystemsLabels):
                    listOfMountedFilesystemsLabels.append(fsLabel)
        return listOfMountedFilesystemsLabels

    def __getMountedGFS2Filesystems(self):
        """
        Runs "mount -l" and returns the output lines whose filesystem type
        (the fifth field) is "gfs2". An empty list is returned if the command
        could not be executed.
        """
        listOfMountedFilesystems = []
        commandList = ["mount", "-l"]
        try:
            # communicate() reads both pipes until the command exits. Calling
            # wait() before it can block forever once a pipe buffer is full.
            task = subprocess.Popen(commandList, stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE, universal_newlines=True)
            (stdout, stderr) = task.communicate()
        except OSError:
            message = "An error occurred running the command: $ %s" %(" ".join(commandList))
            logging.getLogger(MAIN_LOGGER_NAME).error(message)
            return listOfMountedFilesystems
        for line in stdout.split("\n"):
            splitLine = line.split()
            if ((len(splitLine) >= 5) and (splitLine[4] == "gfs2")):
                listOfMountedFilesystems.append(line)
        return listOfMountedFilesystems

    def getClusterNodeName(self):
        """Returns the name of the cluster node."""
        return self.__clusternodeName

    def getClusterName(self):
        """Returns the name of the cluster."""
        return self.__clusterName

    def getMountedGFS2FilesystemNames(self, includeClusterName=True):
        """
        Returns the selected GFS2 filesystem labels.

        @param includeClusterName: If True the labels are returned as
        "clustername:fsname". If False only the "fsname" part is returned and
        labels that do not contain a ":" are left out.
        """
        if (includeClusterName):
            return self.__listOfGFS2MountedFilesystemLabels
        listOfGFS2MountedFilesystemNames = []
        for fsLabel in self.__listOfGFS2MountedFilesystemLabels:
            fsLabelSplit = fsLabel.split(":", 1)
            if (len(fsLabelSplit) == 2):
                listOfGFS2MountedFilesystemNames.append(fsLabelSplit[1])
        return listOfGFS2MountedFilesystemNames

# #####################################################################
# Helper functions.
# #####################################################################
def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, standardError=subprocess.PIPE, debug=False):
    """
    Runs a command and waits for it to finish.

    @param command: The command to execute.
    @param listOfCommandOptions: The list of arguments for the command.
    @param standardOut: Where stdout of the command is sent (a pipe or an
    open file object).
    @param standardError: Where stderr of the command is sent.
    @param debug: If True then the captured stdout and stderr are printed.
    @return: True if the command exited with the exit code 0. False if it
    exited with any other code or could not be executed.
    """
    commandList = [command] + list(listOfCommandOptions)
    try:
        # communicate() drains the pipes until the command exits. Calling
        # wait() first can block forever once a pipe buffer is full.
        task = subprocess.Popen(commandList, stdout=standardOut,
                                stderr=standardError, universal_newlines=True)
        (stdout, stderr) = task.communicate()
    except OSError:
        message = "An error occurred running the command: $ %s" %(" ".join([str(item) for item in commandList]))
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
        return False
    # A stream that was redirected to a file is returned as None.
    if (debug):
        if (stdout):
            print(stdout)
        if (stderr):
            print(stderr)
    if ((task.returncode != 0) and stderr):
        message = "The standard error is below: \n\t %s" %(stderr)
        logging.getLogger(MAIN_LOGGER_NAME).debug(message)
    return (task.returncode == 0)

def writeToFile(pathToFilename, data, appendToFile=True, createFile=False):
    """
    Writes the data followed by a newline to a file.

    @param pathToFilename: The path to the file.
    @param data: The string that is written.
    @param appendToFile: If True the data is appended, else the file is
    overwritten.
    @param createFile: If True a missing file is created when its parent
    directory exists. If False only an existing file is written to.
    @return: True if the data was written, else False.
    """
    # A bare filename has an empty parent which means the current directory.
    parentDir = os.path.dirname(pathToFilename) or os.curdir
    if (not (os.path.isfile(pathToFilename) or (createFile and os.path.isdir(parentDir)))):
        return False
    filemode = "w"
    if (appendToFile):
        filemode = "a"
    try:
        fout = open(pathToFilename, filemode)
        try:
            fout.write(data + "\n")
        finally:
            # Always release the file handle, even if the write failed.
            fout.close()
        return True
    except UnicodeEncodeError:
        message = "There was a unicode encode error writing to the file: %s." %(pathToFilename)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
    except IOError:
        message = "There was an error writing to the file: %s." %(pathToFilename)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
    return False

def mkdirs(pathToDSTDir):
    """
    Creates a directory and all the missing parent directories.

    @return: True if the directory exists when the function returns.
    """
    if (os.path.isdir(pathToDSTDir)):
        return True
    if ((len(pathToDSTDir) > 0) and (not os.access(pathToDSTDir, os.F_OK))):
        try:
            os.makedirs(pathToDSTDir)
        except (OSError, IOError):
            # Not an error if something else created the directory meanwhile.
            if (not os.path.isdir(pathToDSTDir)):
                message = "Could not create the directory: %s." %(pathToDSTDir)
                logging.getLogger(MAIN_LOGGER_NAME).error(message)
                return False
    return os.path.isdir(pathToDSTDir)

def removePIDFile():
    """
    Removes the pid file if it exists.
    """
    message = "Removing the pid file: %s" %(PATH_TO_PID_FILENAME)
    logging.getLogger(MAIN_LOGGER_NAME).debug(message)
    if (os.path.exists(PATH_TO_PID_FILENAME)):
        try:
            os.remove(PATH_TO_PID_FILENAME)
        except (OSError, IOError):
            # os.remove() reports failures with OSError.
            message = "There was an error removing the file: %s." %(PATH_TO_PID_FILENAME)
            logging.getLogger(MAIN_LOGGER_NAME).error(message)

def exitScript(removePidFile=True, errorCode=0):
    """
    Exits the script with the given exit code.

    @param removePidFile: If True the pid file is removed before exiting.
    @param errorCode: The exit code of the script.
    """
    if (removePidFile):
        removePIDFile()
    message = "The script will exit."
    logging.getLogger(MAIN_LOGGER_NAME).info(message)
    sys.exit(errorCode)

# #####################################################################
# Helper functions for gathering the lockdumps.
# #####################################################################
def getClusterNode(listOfGFS2Names):
    """
    Returns a ClusterNode object if the cluster name and the cluster node
    name are found in the output of "cman_tool status", else returns None.

    @param listOfGFS2Names: The GFS2 filesystem names that were selected by
    the user. The list is handed to the ClusterNode object.
    """
    commandList = ["cman_tool", "status"]
    try:
        # No wait() before communicate(), see runCommand().
        task = subprocess.Popen(commandList, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, universal_newlines=True)
        (stdout, stderr) = task.communicate()
    except OSError:
        message = "An error occurred running the command: $ %s" %(" ".join(commandList))
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
        return None
    clusterName = ""
    clusternodeName = ""
    for line in stdout.split("\n"):
        if (line.startswith("Cluster Name:")):
            clusterName = line.split(":", 1)[1].strip()
        elif (line.startswith("Node name:")):
            clusternodeName = line.split(":", 1)[1].strip()
    if ((len(clusterName) > 0) and (len(clusternodeName) > 0)):
        return ClusterNode(clusternodeName, clusterName, listOfGFS2Names)
    return None

def mountFilesystemDebug(enableMounting=True):
    """
    Verifies that the debug filesystem is mounted.

    @param enableMounting: If True an attempt is made to mount the debug
    filesystem when it is not mounted.
    @return: True if the debug filesystem is mounted when the function
    returns, else False.
    """
    if (os.path.ismount(PATH_TO_DEBUG_DIR)):
        message = "The debug filesystem %s is mounted." %(PATH_TO_DEBUG_DIR)
        logging.getLogger(MAIN_LOGGER_NAME).info(message)
        return True
    message = "The debug filesystem %s is not mounted." %(PATH_TO_DEBUG_DIR)
    logging.getLogger(MAIN_LOGGER_NAME).warning(message)
    # The decision comes from the argument and not from a global variable so
    # that the function also works when it is not called from the main block.
    if (enableMounting):
        if (mountFilesystem("/bin/mount", "none", PATH_TO_DEBUG_DIR, "debugfs")):
            message = "The debug filesystem was mounted: %s." %(PATH_TO_DEBUG_DIR)
            logging.getLogger(MAIN_LOGGER_NAME).info(message)
            return True
    return False

def mountFilesystem(pathToMountCommand, pathToDevice, pathToMountPoint, filesystemType):
    """
    Mounts a device on a mount point unless the mount point is already a
    mounted filesystem.

    @return: True if the mount point is a mounted filesystem when the
    function returns, else False.
    """
    if (os.path.ismount(pathToMountPoint)):
        return True
    listOfCommandOptions = ["-t", filesystemType, pathToDevice, pathToMountPoint]
    if (not runCommand(pathToMountCommand, listOfCommandOptions)):
        # runCommand() writes the standard error of the command to the debug log.
        message = "There was an error mounting the filesystem type %s for the device %s to the mount point %s." %(filesystemType, pathToDevice, pathToMountPoint)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
    return os.path.ismount(pathToMountPoint)

def _captureCommandOutput(command, listOfCommandOptions, pathToCommandOutput):
    """
    Runs a command and writes its standard output to the file given.
    """
    try:
        fout = open(pathToCommandOutput, "w")
        try:
            runCommand(command, listOfCommandOptions, standardOut=fout)
        finally:
            fout.close()
    except IOError:
        message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)

def gatherGeneralInformation(pathToDSTDir):
    """
    Writes general information about the cluster node into the directory
    given: the hostname and the date, the cluster membership and services,
    the mounted filesystems, the threads and the group_tool data.

    @param pathToDSTDir: The existing directory the files are written to.
    """
    # Maybe add cluster node name, uname -a, etc
    systemString = "HOSTNAME: %s\nDATE: %s" %(platform.node(), time.strftime("%Y-%m-%d_%H:%M:%S"))
    writeToFile(os.path.join(pathToDSTDir, "system.txt"), systemString, createFile=True)
    # Each entry is: the command, its options, the name of the output file.
    listOfCommands = [("cman_tool", ["nodes", "-F", "id,type,name"], "cman_tool-nodes.txt"),
                      ("cman_tool", ["services"], "cman_tool-services.txt"),
                      ("clustat", [], "clustat.txt"),
                      ("cat", ["/proc/mounts"], "cat-proc_mounts.txt"),
                      # One line for each thread: thread id, state, command.
                      ("ps", ["h", "-AL", "-o", "tid,s,cmd"], "ps.txt"),
                      ("group_tool", ["ls"], "group_tool-ls.txt"),
                      ("group_tool", ["dump", "fence"], "group_tool-dump_fence.txt"),
                      ("group_tool", ["dump", "gfs2"], "group_tool-dump_gfs2.txt")]
    for (command, listOfCommandOptions, filename) in listOfCommands:
        _captureCommandOutput(command, listOfCommandOptions, os.path.join(pathToDSTDir, filename))

def triggerSysRQEvents():
    """
    Triggers the sysrq events that dump the memory allocation and the thread
    state information to the kernel log.
    """
    pathToSysrqTriggerFile = "/proc/sysrq-trigger"
    # m - dump information about memory allocation
    # t - dump thread state information
    triggers = ["m", "t"]
    for trigger in triggers:
        try:
            fout = open(pathToSysrqTriggerFile, "w")
            try:
                fout.write("%s\n" %(trigger))
            finally:
                fout.close()
        except (IOError, OSError):
            message = "There was an error writing the sysrq trigger \"%s\" to the file %s." %(trigger, pathToSysrqTriggerFile)
            logging.getLogger(MAIN_LOGGER_NAME).error(message)

def gatherLogs(pathToDSTDir):
    """
    Copies /var/log/messages and the directory /var/log/cluster into the
    directory given. The directory is created if it does not exist.
    """
    if (not mkdirs(pathToDSTDir)):
        return
    # Copy messages logs that contain the sysrq data.
    pathToLogFile = "/var/log/messages"
    pathToDSTLogFile = os.path.join(pathToDSTDir, os.path.basename(pathToLogFile))
    try:
        shutil.copyfile(pathToLogFile, pathToDSTLogFile)
    except (shutil.Error, IOError, OSError):
        # A missing or unreadable file is reported with IOError or OSError.
        message = "There was an error copying the file: %s to %s." %(pathToLogFile, pathToDSTLogFile)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)

    pathToLogDir = "/var/log/cluster"
    pathToDSTLogDir = os.path.join(pathToDSTDir, os.path.basename(pathToLogDir))
    if (os.path.isdir(pathToLogDir)):
        try:
            shutil.copytree(pathToLogDir, pathToDSTLogDir)
        except (shutil.Error, IOError, OSError):
            message = "There was an error copying the directory: %s to %s." %(pathToLogDir, pathToDSTLogDir)
            logging.getLogger(MAIN_LOGGER_NAME).error(message)

def gatherDLMLockDumps(pathToDSTDir, listOfGFS2Filesystems):
    """
    Copies the dlm debug files of the filesystems given into the directory
    <pathToDSTDir>/dlm/<filesystem name>.

    @param pathToDSTDir: The directory of the current run.
    @param listOfGFS2Filesystems: The GFS2 filesystem names without the
    cluster name. A debug file belongs to a filesystem if the filename
    starts with the name of the filesystem.
    """
    lockDumpType = "dlm"
    pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType)
    pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType)
    message = "Copying the %s lockdump data from the directory: %s" %(lockDumpType, pathToSrcDir)
    logging.getLogger(MAIN_LOGGER_NAME).status(message)
    try:
        listOfFilenames = os.listdir(pathToSrcDir)
    except OSError:
        message = "The directory that contains the %s lockdump data could not be read: %s" %(lockDumpType, pathToSrcDir)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
        return
    for filename in listOfFilenames:
        # Choose the longest filesystem name that matches so that a file of
        # "fs10" is not also copied into the directory of "fs1".
        name = ""
        for currentName in listOfGFS2Filesystems:
            if (filename.startswith(currentName) and (len(currentName) > len(name))):
                name = currentName
        if (not len(name) > 0):
            continue
        pathToCurrentFilename = os.path.join(pathToSrcDir, filename)
        pathToDSTFilesystemDir = os.path.join(pathToOutputDir, name)
        pathToDSTFilename = os.path.join(pathToDSTFilesystemDir, filename)
        if (not mkdirs(pathToDSTFilesystemDir)):
            continue
        try:
            shutil.copy(pathToCurrentFilename, pathToDSTFilename)
        except (shutil.Error, IOError, OSError):
            message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename)
            logging.getLogger(MAIN_LOGGER_NAME).error(message)

def gatherGFS2LockDumps(pathToDSTDir, listOfGFS2Filesystems):
    """
    Copies the gfs2 debug directories of the filesystems given into the
    directory <pathToDSTDir>/gfs2/<directory name>.

    @param pathToDSTDir: The directory of the current run.
    @param listOfGFS2Filesystems: The names of the debug directories that
    are copied. The main block passes the labels "clustername:fsname".
    """
    lockDumpType = "gfs2"
    pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType)
    pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType)
    try:
        listOfDirNames = os.listdir(pathToSrcDir)
    except OSError:
        message = "The directory that contains the %s lockdump data could not be read: %s" %(lockDumpType, pathToSrcDir)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
        return
    for dirName in listOfDirNames:
        pathToCurrentDir = os.path.join(pathToSrcDir, dirName)
        if (not ((dirName in listOfGFS2Filesystems) and os.path.isdir(pathToCurrentDir))):
            continue
        mkdirs(pathToOutputDir)
        pathToDSTFilesystemDir = os.path.join(pathToOutputDir, dirName)
        message = "Copying the lockdump data for the %s filesystem: %s" %(lockDumpType, dirName)
        logging.getLogger(MAIN_LOGGER_NAME).status(message)
        try:
            shutil.copytree(pathToCurrentDir, pathToDSTFilesystemDir)
        except (shutil.Error, IOError, OSError):
            message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTFilesystemDir)
            logging.getLogger(MAIN_LOGGER_NAME).error(message)

def archiveData(pathToSrcDir):
    """
    Creates the bzip2 compressed tar file <pathToSrcDir>.tar.bz2 that
    contains the directory given.

    @return: The path to the tar file, or an empty string if the directory
    does not exist or the tar file could not be created.
    """
    if (not os.path.exists(pathToSrcDir)):
        return ""
    # Remove a trailing slash. With it the archive would be created inside of
    # the directory that is archived and the name in the archive would be empty.
    pathToSrcDir = os.path.normpath(pathToSrcDir)
    pathToTarFilename = "%s.tar.bz2" %(pathToSrcDir)
    message = "Creating a compressed archvied file: %s" %(pathToTarFilename)
    logging.getLogger(MAIN_LOGGER_NAME).info(message)
    try:
        tar = tarfile.open(pathToTarFilename, "w:bz2")
        try:
            tar.add(pathToSrcDir, arcname=os.path.basename(pathToSrcDir))
        finally:
            tar.close()
    except (tarfile.TarError, IOError, OSError):
        message = "There was an error creating the tarfile: %s." %(pathToTarFilename)
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
        return ""
    if (os.path.exists(pathToTarFilename)):
        return pathToTarFilename
    return ""

# ##############################################################################
# Get user selected options
# ##############################################################################
def __getOptions(version) :
    """
    Parses the command line.

    @param version: The version string that is printed for --version.
    @return: The tuple (options, arguments) returned by the option parser.
    """
    cmdParser = OptionParserExtended(version)
    cmdParser.add_option("-d", "--debug",
                         action="store_true",
                         dest="enableDebugLogging",
                         help="Enables debug logging.",
                         default=False)
    cmdParser.add_option("-q", "--quiet",
                         action="store_true",
                         dest="disableLoggingToConsole",
                         help="Disables logging to console.",
                         default=False)
    cmdParser.add_option("-i", "--info",
                         action="store_true",
                         dest="enablePrintInfo",
                         help="Prints to console some basic information about the GFS2 filesystems mounted on the cluster node.",
                         default=False)
    cmdParser.add_option("-M", "--mount_debug_fs",
                         action="store_true",
                         dest="enableMountDebugFS",
                         help="Enables the mounting of the debug filesystem if it is not mounted. Default is disabled.",
                         default=False)
    cmdParser.add_option("-o", "--path_to_output_dir",
                         action="store",
                         dest="pathToOutputDir",
                         help="The path to the output directory where all the collect data will be stored. Default is /tmp/<date>-<clusternode name>-%s" %(os.path.basename(sys.argv[0])),
                         type="string",
                         default="")
    cmdParser.add_option("-r", "--num_of_runs",
                         action="store",
                         dest="numberOfRuns",
                         help="The number of lockdumps runs to do. Default is 2.",
                         type="int",
                         default=2)
    cmdParser.add_option("-s", "--seconds_sleep",
                         action="store",
                         dest="secondsToSleep",
                         help="The number of seconds sleep between runs. Default is 120 seconds.",
                         type="int",
                         default=120)
    cmdParser.add_option("-t", "--archive",
                         action="store_true",
                         dest="enableArchiveOutputDir",
                         help="Enables archiving and compressing of the output directory with tar and bzip2. Default is disabled.",
                         default=False)
    cmdParser.add_option("-n", "--fs_name",
                         action="extend",
                         dest="listOfGFS2Names",
                         help="List of GFS2 filesystems that will have their lockdump data gathered.",
                         type="string",
                         default=[])
    # Get the options and return the result.
    (cmdLineOpts, cmdLineArgs) = cmdParser.parse_args()
    return (cmdLineOpts, cmdLineArgs)

# ##############################################################################
# OptParse classes for commandline options
# ##############################################################################
class OptionParserExtended(OptionParser):
    """
    This is the class that gets the command line options the end user
    selects.
    """
    def __init__(self, version) :
        """
        @param version: The version string that is printed for --version.
        """
        self.__commandName = os.path.basename(sys.argv[0])
        versionMessage = "%s %s\n" %(self.__commandName, version)

        commandDescription ="%s will capture information about lockdata data for GFS2 and DLM required to analyze a GFS2 filesystem.\n"%(self.__commandName)

        OptionParser.__init__(self, option_class=ExtendOption,
                              version=versionMessage,
                              description=commandDescription)

    def print_help(self, file=None):
        """
        Prints the version, the help of the options and usage examples.

        @param file: The file object the help is written to. The default is
        the standard output, the same as for OptionParser.print_help().
        """
        if (file is None):
            file = sys.stdout
        self.print_version(file)
        examplesMessage = "\nPrints information about the available GFS2 filesystems that can have lockdump data captured."
        examplesMessage += "\n$ %s -i\n" %(self.__commandName)
        examplesMessage += "\nThis command will mount the debug directory if it is not mounted. It will do 3 runs of\n"
        examplesMessage += "gathering the lockdump information in 10 second intervals for only the GFS2 filesystems\n"
        examplesMessage += "with the names myGFS2vol2,myGFS2vol1. Then it will archive and compress the data collected."
        examplesMessage += "\n$ %s -M -r 3 -s 10 -t -n myGFS2vol2,myGFS2vol1\n" %(self.__commandName)
        OptionParser.print_help(self, file)
        file.write("%s\n" %(examplesMessage))


class ExtendOption (Option):
    """
    Allow to specify comma delimited list of entries for arrays
    and dictionaries.
    """
    ACTIONS = Option.ACTIONS + ("extend",)
    STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",)
    TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",)
    ALWAYS_TYPED_ACTIONS = Option.ALWAYS_TYPED_ACTIONS + ("extend",)

    def take_action(self, action, dest, opt, value, values, parser):
        """
        Handles the action "extend": the value is split on commas and the
        entries are added to the list that is stored in the destination.
        All the other actions are handled by the Option class.
        """
        if (action == "extend") :
            valueList = []
            if (value is not None):
                # Need to add code for dealing with paths if there is option for paths.
                valueList = value.split(",")
            # Store a new list and do not extend the current one in place.
            # The current list can be the default of the option which is
            # shared by every parse_args() call of the parser.
            extendedList = list(values.ensure_value(dest, []))
            extendedList.extend(valueList)
            setattr(values, dest, extendedList)
        else:
            return Option.take_action(self, action, dest, opt, value, values, parser)

# ###############################################################################
# Main Function
# ###############################################################################
if __name__ == "__main__":
    try:
        # #######################################################################
        # Get the options from the commandline.
        # #######################################################################
        (cmdLineOpts, cmdLineArgs) = __getOptions(VERSION_NUMBER)

        # #######################################################################
        # Setup the logger and create config directory
        # #######################################################################
        # Create the logger
        logLevel = logging.INFO
        logger = logging.getLogger(MAIN_LOGGER_NAME)
        logger.setLevel(logLevel)
        # Create a new status function and level.
        logging.STATUS = logging.INFO + 2
        logging.addLevelName(logging.STATUS, "STATUS")
        # Create a function for the STATUS_LEVEL since not defined by python. This
        # means you can call it like the other predefined message
        # functions. Example: logging.getLogger("loggerName").status(message)
        setattr(logger, "status", lambda *args: logger.log(logging.STATUS, *args))
        ch = logging.StreamHandler()
        ch.setLevel(logLevel)
        ch.setFormatter(logging.Formatter(MAIN_LOGGER_FORMAT))
        logger.addHandler(ch)

        # #######################################################################
        # Set the logging levels.
        # #######################################################################
        if ((cmdLineOpts.enableDebugLogging) and (not cmdLineOpts.disableLoggingToConsole)):
            logging.getLogger(MAIN_LOGGER_NAME).setLevel(logging.DEBUG)
            ch.setLevel(logging.DEBUG)
            message = "Debugging has been enabled."
            logging.getLogger(MAIN_LOGGER_NAME).debug(message)
        if (cmdLineOpts.disableLoggingToConsole):
            logging.disable(logging.CRITICAL)

        # #######################################################################
        # Check to see if pid file exists and error if it does.
        # #######################################################################
        if (os.path.exists(PATH_TO_PID_FILENAME)):
            message = "The PID file %s already exists and this script cannot run till it does not exist." %(PATH_TO_PID_FILENAME)
            logging.getLogger(MAIN_LOGGER_NAME).error(message)
            message = "Verify that there are no other existing processes running. If there are running processes those need to be stopped first and the file removed."
            logging.getLogger(MAIN_LOGGER_NAME).info(message)
            # The pid file belongs to another process, so it is not removed.
            exitScript(removePidFile=False, errorCode=1)
        else:
            message = "Creating the pid file: %s" %(PATH_TO_PID_FILENAME)
            logging.getLogger(MAIN_LOGGER_NAME).debug(message)
            # Create the pid file so we dont have more than 1 process of this
            # script running.
            if (not writeToFile(PATH_TO_PID_FILENAME, str(os.getpid()), createFile=True)):
                message = "The pid file could not be created: %s" %(PATH_TO_PID_FILENAME)
                logging.getLogger(MAIN_LOGGER_NAME).warning(message)

        # Get the clusternode name.
        clusternode = getClusterNode(cmdLineOpts.listOfGFS2Names)
        if (clusternode is None):
            message = "The cluster or cluster node name could not be found from \"cman_tool status\"."
            logging.getLogger(MAIN_LOGGER_NAME).error(message)
            # The pid file was created by this process above, so remove it or
            # the next run of the script will refuse to start.
            exitScript(errorCode=1)
        if (cmdLineOpts.enablePrintInfo):
            logging.disable(logging.CRITICAL)
            print("List of all the mounted GFS2 filesystems that can have their lockdump data captured:")
            print(clusternode)
            exitScript()
        # #######################################################################
        # Create the output directory to verify it can be created before
        # proceeding unless it is already created from a previous run data needs
        # to be analyzed. Probably could add more debugging on if file or dir.
        # #######################################################################
        message = "The gathering of the lockdumps will be performed on the clusternode \"%s\" which is part of the cluster \"%s\"." %(clusternode.getClusterNodeName(), clusternode.getClusterName())
        logging.getLogger(MAIN_LOGGER_NAME).info(message)
        pathToOutputDir = cmdLineOpts.pathToOutputDir
        if (not len(pathToOutputDir) > 0):
            pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), clusternode.getClusterNodeName(), os.path.basename(sys.argv[0]))))
        if (os.path.exists(pathToOutputDir)):
            message = "The directory already exists and could contain previous lockdump data: %s" %(pathToOutputDir)
            logging.getLogger(MAIN_LOGGER_NAME).error(message)
            exitScript(errorCode=1)
        else:
            message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir)
            logging.getLogger(MAIN_LOGGER_NAME).info(message)
            if (not mkdirs(pathToOutputDir)):
                exitScript(errorCode=1)

        # #######################################################################
        # Check to see if the debug directory is mounted. If not then
        # log an error.
        # #######################################################################
        result = mountFilesystemDebug(cmdLineOpts.enableMountDebugFS)
        if (not result):
            message = "Please mount the debug filesystem before running this script. For example: $ mount none -t debugfs %s" %(PATH_TO_DEBUG_DIR)
            logging.getLogger(MAIN_LOGGER_NAME).info(message)
            exitScript(errorCode=1)

        # #######################################################################
        # Gather data and the lockdumps.
        # #######################################################################
        message = "The process of gathering all the required files will begin before capturing the lockdumps."
        logging.getLogger(MAIN_LOGGER_NAME).info(message)
        for i in range(0,cmdLineOpts.numberOfRuns):
            # Add clusternode name under each run dir to make combining multiple
            # clusternode gfs2_lockgather data together and all data in each run directory.
            pathToOutputRunDir = os.path.join(pathToOutputDir, "run%d/%s" %(i, clusternode.getClusterNodeName()))
            if (not mkdirs(pathToOutputRunDir)):
                exitScript(errorCode=1)
            # Gather various bits of data from the clusternode.
            message = "Gathering some general information about the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
            logging.getLogger(MAIN_LOGGER_NAME).status(message)
            gatherGeneralInformation(pathToOutputRunDir)
            # Trigger sysrq events to capture memory and thread information
            message = "Triggering the sysrq events for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
            logging.getLogger(MAIN_LOGGER_NAME).status(message)
            triggerSysRQEvents()
            # Gather the dlm locks.
            lockDumpType = "dlm"
            message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i)
            logging.getLogger(MAIN_LOGGER_NAME).status(message)
            gatherDLMLockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames(includeClusterName=False))
            # Gather the glock locks from gfs2.
            lockDumpType = "gfs2"
            message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i)
            logging.getLogger(MAIN_LOGGER_NAME).status(message)
            gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames())
            # Gather log files
            message = "Gathering the log files for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
            logging.getLogger(MAIN_LOGGER_NAME).status(message)
            gatherLogs(os.path.join(pathToOutputRunDir, "logs"))
            # Sleep between the runs only. There is nothing to wait for after
            # the last run.
            if ((cmdLineOpts.secondsToSleep > 0) and (i < (cmdLineOpts.numberOfRuns - 1))):
                message = "The script will sleep for %d seconds between each run of capturing the lockdumps." %(cmdLineOpts.secondsToSleep)
                logging.getLogger(MAIN_LOGGER_NAME).info(message)
                time.sleep(cmdLineOpts.secondsToSleep)
        # #######################################################################
        # Archive the file if enabled and print the location of the output
        # directory.
        # #######################################################################
        # After it is done then we should print out where the files that were
        # generated are located and what to do.
        message = "All the files have been gathered and this directory contains all the captured data: %s" %(pathToOutputDir)
        logging.getLogger(MAIN_LOGGER_NAME).info(message)

        # #######################################################################
        # Archive the directory that contains all the data.
        # #######################################################################
        if (cmdLineOpts.enableArchiveOutputDir):
            message = "The lockdump data will now be archived. This could some time depending on the size of the data collected."
            logging.getLogger(MAIN_LOGGER_NAME).info(message)
            pathToTarFilename = archiveData(pathToOutputDir)
            if (os.path.exists(pathToTarFilename)):
                message = "The compressed archvied file was created: %s" %(pathToTarFilename)
                logging.getLogger(MAIN_LOGGER_NAME).info(message)

        # #######################################################################
    except KeyboardInterrupt:
        print("")
        message = "This script will exit since control-c was executed by end user."
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
        exitScript(errorCode=1)
    # #######################################################################
    # Exit the application with zero exit code since we cleanly exited.
    # #######################################################################
    exitScript()