* [Cluster-devel] [PATCH] Adding gfs2_lockcapture
@ 2012-11-01 15:26 Shane Bradley
2012-11-01 16:45 ` Andrew Price
0 siblings, 1 reply; 2+ messages in thread
From: Shane Bradley @ 2012-11-01 15:26 UTC (permalink / raw)
To: cluster-devel.redhat.com
---
gfs2/lockgather/gfs2_lockcapture | 723 ++++++++++++++++++++++++++++++++++++++
1 files changed, 723 insertions(+), 0 deletions(-)
create mode 100644 gfs2/lockgather/gfs2_lockcapture
diff --git a/gfs2/lockgather/gfs2_lockcapture b/gfs2/lockgather/gfs2_lockcapture
new file mode 100644
index 0000000..d040738
--- /dev/null
+++ b/gfs2/lockgather/gfs2_lockcapture
@@ -0,0 +1,723 @@
+#!/usr/bin/env python
+"""
+This script gathers GFS2 and DLM lock information for all of the mounted GFS2
+filesystems on a single cluster node.
+
+TODO:
+* Should there be an option to disable sysrq events in case they could trigger a panic?
+* Add an option to write the log to a file.
+
+* Add a better description.
+* Add examples for all options.
+* Add a better description of the options and ask Steve to review and tweak my
+ option descriptions.
+
+@author    : Shane Bradley
+@contact   : sbradley@redhat.com
+@version   : 0.9
+@copyright : GPLv2
+"""
+import sys
+import os
+import os.path
+import logging
+from optparse import OptionParser, Option
+import time
+import platform
+import shutil
+import subprocess
+import tarfile
+
+VERSION_NUMBER = "0.9-1"
+# #####################################################################
+# Global vars:
+# #####################################################################
+# Name of the logger
+MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0]))
+# Format of the logger
+MAIN_LOGGER_FORMAT = "%(levelname)s %(message)s"
+# Path to debug root
+PATH_TO_DEBUG_DIR="/sys/kernel/debug"
+# Path to the pid file that will be used for locking.
+PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0]))
+
+
+# #####################################################################
+# Class to define what a clusternode is.
+# #####################################################################
+class ClusterNode:
+ def __init__(self, clusternodeName, clusterName, listOfGFS2Names):
+ self.__clusternodeName = clusternodeName
+ self.__clusterName = clusterName
+
+ # List of the mounted filesystems from the mount -l command.
+ self.__listOfMountedGFS2Filesystems = self.__getMountedGFS2Filesystems()
+
+ # List of mounted GFS2 labels for this cluster from mount -l command.
+ listOfGFS2MountedFilesystemLabels = self.__getMountedFilesystemLabel(self.__listOfMountedGFS2Filesystems)
+ self.__listOfGFS2MountedFilesystemLabels = []
+ if (not len(listOfGFS2Names) > 0):
+ # If no items in listOfGFS2Names then add them all.
+ self.__listOfGFS2MountedFilesystemLabels = listOfGFS2MountedFilesystemLabels
+ else:
+ for label in listOfGFS2MountedFilesystemLabels:
+ for name in listOfGFS2Names:
+ if ((name == label) or ("%s:%s"%(self.__clusterName, name) == label)):
+ self.__listOfGFS2MountedFilesystemLabels.append(label)
+
+ def __str__(self):
+ rString = ""
+ rString += "%s:%s" %(self.getClusterName(), self.getClusterNodeName())
+ for fsName in self.getMountedGFS2FilesystemNames():
+ rString += "\n\t%s" %(fsName)
+ for mountedFS in self.__listOfMountedGFS2Filesystems:
+ if (mountedFS.find(fsName) >= 0):
+ rString += " --> %s" %(mountedFS)
+ break
+ return rString.rstrip()
+
+ def __getMountedFilesystemLabel(self, listOfMountedFilesystems):
+ listOfMountedFilesystemsLabels = []
+ for mountedFilesystem in listOfMountedFilesystems:
+ splitMountedFilesystem = mountedFilesystem.split()
+ fsLabel = splitMountedFilesystem[-1].strip().strip("[").rstrip("]")
+ if (len(fsLabel) > 0):
+ # Verify it starts with the name of the cluster.
+ if (fsLabel.startswith("%s:" %(self.getClusterName()))):
+ listOfMountedFilesystemsLabels.append(fsLabel)
+ return listOfMountedFilesystemsLabels
+
+ def __getMountedGFS2Filesystems(self):
+ listOfMountedFilesystems = []
+ commandList= ["mount", "-l"]
+ stdout = ""
+ try:
+ task = subprocess.Popen(commandList, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ # communicate() also waits for the process to finish.
+ (stdout, stderr) = task.communicate()
+ except OSError:
+ commandOptionString = ""
+ for option in commandList:
+ commandOptionString += "%s " %(option)
+ message = "An error occurred running the command: $ %s" %(commandOptionString)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ return listOfMountedFilesystems
+ stdoutSplit = stdout.split("\n")
+ for line in stdoutSplit:
+ splitLine = line.split()
+ if (len(splitLine) >= 5):
+ if (splitLine[4] == "gfs2"):
+ listOfMountedFilesystems.append(line)
+ return listOfMountedFilesystems
+
+ def getClusterNodeName(self):
+ return self.__clusternodeName
+
+ def getClusterName(self):
+ return self.__clusterName
+
+ def getMountedGFS2FilesystemNames(self, includeClusterName=True):
+ # If True, the cluster name is prepended to the GFS2 filesystem name.
+ if (includeClusterName):
+ return self.__listOfGFS2MountedFilesystemLabels
+ else:
+ listOfGFS2MountedFilesystemLabels = []
+ for fsLabel in self.__listOfGFS2MountedFilesystemLabels:
+ fsLabelSplit = fsLabel.split(":", 1)
+ if (len(fsLabelSplit) == 2):
+ listOfGFS2MountedFilesystemLabels.append(fsLabelSplit[1])
+ return listOfGFS2MountedFilesystemLabels
+
+# #####################################################################
+# Helper functions.
+# #####################################################################
+def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, standardError=subprocess.PIPE, debug=False):
+ stdout = ""
+ stderr = ""
+ try:
+ commandList = [command]
+ commandList += listOfCommandOptions
+ task = subprocess.Popen(commandList, stdout=standardOut, stderr=standardError)
+ # communicate() also waits for the process to finish.
+ (stdout, stderr) = task.communicate()
+ return (task.returncode == 0)
+ except OSError:
+ commandOptionString = ""
+ for option in listOfCommandOptions:
+ commandOptionString += "%s " %(option)
+ message = "An error occurred running the command: $ %s %s" %(command, commandOptionString)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ if (debug):
+ if (len(stdout) > 0):
+ print stdout
+ if (len(stderr) > 0):
+ print stderr
+ return False
+
+def writeToFile(pathToFilename, data, appendToFile=True, createFile=False):
+ [parentDir, filename] = os.path.split(pathToFilename)
+ if (os.path.isfile(pathToFilename) or (os.path.isdir(parentDir) and createFile)):
+ try:
+ filemode = "w"
+ if (appendToFile):
+ filemode = "a"
+ fout = open(pathToFilename, filemode)
+ fout.write(data + "\n")
+ fout.close()
+ return True
+ except UnicodeEncodeError, e:
+ message = "There was a unicode encode error writing to the file: %s." %(pathToFilename)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ return False
+ except IOError:
+ message = "There was an error writing to the file: %s." %(pathToFilename)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ return False
+ return False
+
+def mkdirs(pathToDSTDir):
+ if (os.path.isdir(pathToDSTDir)):
+ return True
+ elif ((not os.access(pathToDSTDir, os.F_OK)) and (len(pathToDSTDir) > 0)):
+ try:
+ os.makedirs(pathToDSTDir)
+ except (OSError, os.error):
+ message = "Could not create the directory: %s." %(pathToDSTDir)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ return False
+ except (IOError, os.error):
+ message = "Could not create the directory with the path: %s." %(pathToDSTDir)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ return False
+ return os.path.isdir(pathToDSTDir)
+
+def removePIDFile():
+ message = "Removing the pid file: %s" %(PATH_TO_PID_FILENAME)
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ if (os.path.exists(PATH_TO_PID_FILENAME)):
+ try:
+ os.remove(PATH_TO_PID_FILENAME)
+ except OSError:
+ message = "There was an error removing the file: %s." %(PATH_TO_PID_FILENAME)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
+def exitScript(removePidFile=True, errorCode=0):
+ if (removePidFile):
+ removePIDFile()
+ message = "The script will exit."
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ sys.exit(errorCode)
+
+# #####################################################################
+# Helper functions for gathering the lockdumps.
+# #####################################################################
+def getClusterNode(listOfGFS2Names):
+ # Return a ClusterNode object if the clusternode and cluster name are found
+ # in the output, else return None.
+ commandList= ["cman_tool", "status"]
+ stdout = ""
+ try:
+ task = subprocess.Popen(commandList, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ # communicate() also waits for the process to finish.
+ (stdout, stderr) = task.communicate()
+ except OSError:
+ commandOptionString = ""
+ for option in commandList:
+ commandOptionString += "%s " %(option)
+ message = "An error occurred running the command: $ %s" %(commandOptionString)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ return None
+ stdoutSplit = stdout.split("\n")
+ clusterName = ""
+ clusternodeName = ""
+ for line in stdoutSplit:
+ if (line.startswith("Cluster Name:")):
+ clusterName = line.split("Cluster Name:")[1].strip().rstrip()
+ if (line.startswith("Node name: ")):
+ clusternodeName = line.split("Node name:")[1].strip().rstrip()
+ if ((len(clusterName) > 0) and (len(clusternodeName) > 0)):
+ return ClusterNode(clusternodeName, clusterName, listOfGFS2Names)
+ return None
+
+def mountFilesystemDebug(enableMounting=True):
+ if (os.path.ismount(PATH_TO_DEBUG_DIR)):
+ message = "The debug filesystem %s is mounted." %(PATH_TO_DEBUG_DIR)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ return True
+ else:
+ message = "The debug filesystem %s is not mounted." %(PATH_TO_DEBUG_DIR)
+ logging.getLogger(MAIN_LOGGER_NAME).warning(message)
+ if (enableMounting):
+ if(mountFilesystem("/bin/mount", "none", PATH_TO_DEBUG_DIR, "debugfs")):
+ message = "The debug filesystem was mounted: %s." %(PATH_TO_DEBUG_DIR)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ return True
+ return False
+
+def mountFilesystem(pathToMountCommand, pathToDevice, pathToMountPoint, filesystemType):
+ if (os.path.ismount(PATH_TO_DEBUG_DIR)):
+ return True
+ listOfCommandOptions = ["-t", filesystemType, pathToDevice, pathToMountPoint]
+ if (not runCommand(pathToMountCommand, listOfCommandOptions)):
+ message = "There was an error mounting the filesystem type %s for the device %s to the mount point %s." %(filesystemType, pathToDevice, pathToMountPoint)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ message = "The standard error is below: \n\t %s" %(stderr)
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ return os.path.ismount(PATH_TO_DEBUG_DIR)
+
+def gatherGeneralInformation(pathToDSTDir):
+ # Maybe add cluster node name, uname -a, etc
+ systemString = "HOSTNAME: %s\nDATE: %s" %(platform.node(), time.strftime("%Y-%m-%d_%H:%M:%S"))
+ writeToFile(os.path.join(pathToDSTDir, "system.txt"), systemString, createFile=True)
+ # Get "cman_tool node -F id,type,name" data.
+ command = "cman_tool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool-nodes.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["nodes", "-F", "id,type,name"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Get "cman_tool services" data.
+ command = "cman_tool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool-services.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["services"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Get "clustat" data.
+ command = "clustat"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "clustat.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, [], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Get "mount -l" filesystem data.
+ command = "cat"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "cat-proc_mounts.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["/proc/mounts"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data.
+ command = "ps"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "ps.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ #runCommand(command, ["-eo", "user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan"], standardOut=fout)
+ runCommand(command, ["h", "-AL", "-o", "tid,s,cmd"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Get "group_tool ls" data.
+ command = "group_tool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-ls.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["ls"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Get "group_tool dump fence" data.
+ command = "group_tool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-dump_fence.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["dump", "fence"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Get "group_tool dump gfs2" data.
+ command = "group_tool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-dump_gfs2.txt")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["dump", "gfs2"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
+def triggerSysRQEvents():
+ command = "echo"
+ pathToSysrqTriggerFile = "/proc/sysrq-trigger"
+ # m - dump information about memory allocation
+ # t - dump thread state information
+ triggers = ["m", "t"]
+ for trigger in triggers:
+ try:
+ fout = open(pathToSysrqTriggerFile, "w")
+ runCommand(command, [trigger], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToSysrqTriggerFile)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
+def gatherLogs(pathToDSTDir):
+ if (mkdirs(pathToDSTDir)):
+ # Copy messages logs that contain the sysrq data.
+ pathToLogFile = "/var/log/messages"
+ pathToDSTLogFile = os.path.join(pathToDSTDir, os.path.basename(pathToLogFile))
+ try:
+ shutil.copyfile(pathToLogFile, pathToDSTLogFile)
+ except shutil.Error:
+ message = "There was an error copying the file: %s to %s." %(pathToLogFile, pathToDSTLogFile)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
+ pathToLogDir = "/var/log/cluster"
+ pathToDSTLogDir = os.path.join(pathToDSTDir, os.path.basename(pathToLogDir))
+ if (os.path.isdir(pathToLogDir)):
+ try:
+ shutil.copytree(pathToLogDir, pathToDSTLogDir)
+ except shutil.Error:
+ message = "There was an error copying the directory: %s to %s." %(pathToLogDir, pathToDSTLogDir)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
+def gatherDLMLockDumps(pathToDSTDir, listOfGFS2Filesystems):
+ lockDumpType = "dlm"
+ pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType)
+ pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType)
+ message = "Copying the %s lockdump data from the directory for the %s." %(lockDumpType, pathToSrcDir)
+ logging.getLogger(MAIN_LOGGER_NAME).status(message)
+ for filename in os.listdir(pathToSrcDir):
+ for name in listOfGFS2Filesystems:
+ if (filename.startswith(name)):
+ pathToCurrentFilename = os.path.join(pathToSrcDir, filename)
+ pathToDSTDir = os.path.join(pathToOutputDir, name)
+ mkdirs(pathToDSTDir)
+ pathToDSTFilename = os.path.join(pathToDSTDir, filename)
+ try:
+ shutil.copy(pathToCurrentFilename, pathToDSTFilename)
+ except shutil.Error:
+ message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ except OSError:
+ message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
+def gatherGFS2LockDumps(pathToDSTDir, listOfGFS2Filesystems):
+ lockDumpType = "gfs2"
+ pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType)
+ pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType)
+ for dirName in os.listdir(pathToSrcDir):
+ pathToCurrentDir = os.path.join(pathToSrcDir, dirName)
+ if ((os.path.isdir(pathToCurrentDir)) and (dirName in listOfGFS2Filesystems)):
+ mkdirs(pathToOutputDir)
+ pathToDSTDir = os.path.join(pathToOutputDir, dirName)
+ try:
+ message = "Copying the lockdump data for the %s filesystem: %s" %(lockDumpType, dirName)
+ logging.getLogger(MAIN_LOGGER_NAME).status(message)
+ shutil.copytree(pathToCurrentDir, pathToDSTDir)
+ except shutil.Error:
+ message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ except OSError:
+ message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
+def archiveData(pathToSrcDir):
+ # Archive and compress the directory so the collected data takes up less space.
+ if (os.path.exists(pathToSrcDir)):
+ pathToTarFilename = "%s.tar.bz2" %(pathToSrcDir)
+ message = "Creating a compressed archvied file: %s" %(pathToTarFilename)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ try:
+ tar = tarfile.open(pathToTarFilename, "w:bz2")
+ tar.add(pathToSrcDir, arcname=os.path.basename(pathToSrcDir))
+ tar.close()
+ except tarfile.TarError:
+ message = "There was an error creating the tarfile: %s." %(pathToTarFilename)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ return ""
+ if (os.path.exists(pathToTarFilename)):
+ return pathToTarFilename
+ return ""
+
+# ##############################################################################
+# Get user selected options
+# ##############################################################################
+def __getOptions(version) :
+ cmdParser = OptionParserExtended(version)
+ cmdParser.add_option("-d", "--debug",
+ action="store_true",
+ dest="enableDebugLogging",
+ help="Enables debug logging.",
+ default=False)
+ cmdParser.add_option("-q", "--quiet",
+ action="store_true",
+ dest="disableLoggingToConsole",
+ help="Disables logging to console.",
+ default=False)
+ cmdParser.add_option("-i", "--info",
+ action="store_true",
+ dest="enablePrintInfo",
+ help="Prints to console some basic information about the GFS2 filesystems mounted on the cluster node.",
+ default=False)
+ cmdParser.add_option("-M", "--mount_debug_fs",
+ action="store_true",
+ dest="enableMountDebugFS",
+ help="Enables the mounting of the debug filesystem if it is not mounted. Default is disabled.",
+ default=False)
+ cmdParser.add_option("-o", "--path_to_output_dir",
+ action="store",
+ dest="pathToOutputDir",
+ help="The path to the output directory where all the collect data will be stored. Default is /tmp/<date>-<hostname>-%s" %(os.path.basename(sys.argv[0])),
+ type="string",
+ default="")
+ cmdParser.add_option("-r", "--num_of_runs",
+ action="store",
+ dest="numberOfRuns",
+ help="The number of lockdumps runs to do. Default is 2.",
+ type="int",
+ default=2)
+ cmdParser.add_option("-s", "--seconds_sleep",
+ action="store",
+ dest="secondsToSleep",
+ help="The number of seconds sleep between runs. Default is 120 seconds.",
+ type="int",
+ default=120)
+ cmdParser.add_option("-t", "--archive",
+ action="store_true",
+ dest="enableArchiveOutputDir",
+ help="Enables archiving and compressing of the output directory with tar and bzip2. Default is disabled.",
+ default=False)
+ cmdParser.add_option("-n", "--fs_name",
+ action="extend",
+ dest="listOfGFS2Names",
+ help="List of GFS2 filesystems that will have their lockdump data gathered.",
+ type="string",
+ default=[]) # Get the options and return the result.
+ (cmdLineOpts, cmdLineArgs) = cmdParser.parse_args()
+ return (cmdLineOpts, cmdLineArgs)
+
+# ##############################################################################
+# OptParse classes for commandline options
+# ##############################################################################
+class OptionParserExtended(OptionParser):
+ """
+ This is the class that gets the command line options the end user
+ selects.
+ """
+ def __init__(self, version) :
+ self.__commandName = os.path.basename(sys.argv[0])
+ versionMessage = "%s %s\n" %(self.__commandName, version)
+
+ commandDescription = "%s will capture the GFS2 and DLM lock data required to analyze a GFS2 filesystem.\n" %(self.__commandName)
+
+ OptionParser.__init__(self, option_class=ExtendOption,
+ version=versionMessage,
+ description=commandDescription)
+
+ def print_help(self):
+ self.print_version()
+ examplesMessage = "\nPrints information about the available GFS2 filesystems that can have lockdump data captured."
+ examplesMessage += "\n$ %s -i\n" %(self.__commandName)
+ examplesMessage += "\nThis command will mount the debug directory if it is not mounted. It will do 3 runs of\n"
+ examplesMessage += "gathering the lockdump information with a 10 second sleep between runs, but only for the GFS2\n"
+ examplesMessage += "filesystems named myGFS2vol2 and myGFS2vol1. Then it will archive and compress the collected data."
+ examplesMessage += "\n$ %s -M -r 3 -s 10 -t -n myGFS2vol2,myGFS2vol1\n" %(self.__commandName)
+ OptionParser.print_help(self)
+ print examplesMessage
+
+
+class ExtendOption (Option):
+ """
+ Allows a comma-delimited list of entries to be specified for arrays
+ and dictionaries.
+ """
+ ACTIONS = Option.ACTIONS + ("extend",)
+ STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",)
+ TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",)
+
+ def take_action(self, action, dest, opt, value, values, parser):
+ if (action == "extend") :
+ valueList=[]
+ try:
+ for v in value.split(","):
+ # Need to add code for dealing with paths if there is option for paths.
+ valueList.append(v)
+ except:
+ pass
+ else:
+ values.ensure_value(dest, []).extend(valueList)
+ else:
+ Option.take_action(self, action, dest, opt, value, values, parser)
+
+# ###############################################################################
+# Main Function
+# ###############################################################################
+if __name__ == "__main__":
+ try:
+ # #######################################################################
+ # Get the options from the commandline.
+ # #######################################################################
+ (cmdLineOpts, cmdLineArgs) = __getOptions(VERSION_NUMBER)
+
+ # #######################################################################
+ # Setup the logger and create config directory
+ # #######################################################################
+ # Create the logger
+ logLevel = logging.INFO
+ logger = logging.getLogger(MAIN_LOGGER_NAME)
+ logger.setLevel(logLevel)
+ # Create a new status function and level.
+ logging.STATUS = logging.INFO + 2
+ logging.addLevelName(logging.STATUS, "STATUS")
+ # Create a function for the STATUS level since it is not defined by Python. This
+ # means you can call it like the other predefined message
+ # functions. Example: logging.getLogger("loggerName").status(message)
+ setattr(logger, "status", lambda *args: logger.log(logging.STATUS, *args))
+ ch = logging.StreamHandler()
+ ch.setLevel(logLevel)
+ ch.setFormatter(logging.Formatter(MAIN_LOGGER_FORMAT))
+ logger.addHandler(ch)
+
+ # #######################################################################
+ # Set the logging levels.
+ # #######################################################################
+ if ((cmdLineOpts.enableDebugLogging) and (not cmdLineOpts.disableLoggingToConsole)):
+ logging.getLogger(MAIN_LOGGER_NAME).setLevel(logging.DEBUG)
+ ch.setLevel(logging.DEBUG)
+ message = "Debugging has been enabled."
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ if (cmdLineOpts.disableLoggingToConsole):
+ logging.disable(logging.CRITICAL)
+
+ # #######################################################################
+ # Check to see if pid file exists and error if it does.
+ # #######################################################################
+ if (os.path.exists(PATH_TO_PID_FILENAME)):
+ message = "The PID file %s already exists and this script cannot run till it does not exist." %(PATH_TO_PID_FILENAME)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ message = "Verify that there are no other existing processes running. If there are running processes those need to be stopped first and the file removed."
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ exitScript(removePidFile=False, errorCode=1)
+ else:
+ message = "Creating the pid file: %s" %(PATH_TO_PID_FILENAME)
+ logging.getLogger(MAIN_LOGGER_NAME).debug(message)
+ # Create the pid file so that more than one instance of this script
+ # cannot run at the same time.
+ writeToFile(PATH_TO_PID_FILENAME, str(os.getpid()), createFile=True)
+
+ # Get the clusternode name.
+ clusternode = getClusterNode(cmdLineOpts.listOfGFS2Names)
+ if (clusternode == None):
+ message = "The cluster or cluster node name could not be found from \"cman_tool status\"."
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ exitScript(removePidFile=False, errorCode=1)
+ if (cmdLineOpts.enablePrintInfo):
+ logging.disable(logging.CRITICAL)
+ print "List of all the mounted GFS2 filesystems that can have their lockdump data captured:"
+ print clusternode
+ exitScript()
+ # #######################################################################
+ # Create the output directory to verify it can be created before
+ # proceeding. If it already exists it may contain data from a previous
+ # run, so error out rather than overwrite it.
+ # #######################################################################
+ message = "The gathering of the lockdumps will be performed on the clusternode \"%s\" which is part of the cluster \"%s\"." %(clusternode.getClusterNodeName(), clusternode.getClusterName())
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ pathToOutputDir = cmdLineOpts.pathToOutputDir
+ if (not len(pathToOutputDir) > 0):
+ pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), clusternode.getClusterNodeName(), os.path.basename(sys.argv[0]))))
+ if (os.path.exists(pathToOutputDir)):
+ message = "The directory already exists and could contain previous lockdump data: %s" %(pathToOutputDir)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ exitScript(errorCode=1)
+ else:
+ message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ if (not mkdirs(pathToOutputDir)):
+ exitScript(errorCode=1)
+
+ # #######################################################################
+ # Check to see if the debug directory is mounted. If not then
+ # log an error.
+ # #######################################################################
+ result = mountFilesystemDebug(cmdLineOpts.enableMountDebugFS)
+ if (not result):
+ message = "Please mount the debug filesystem before running this script. For example: $ mount none -t debugfs %s" %(PATH_TO_DEBUG_DIR)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ exitScript(errorCode=1)
+
+ # #######################################################################
+ # Gather data and the lockdumps.
+ # #######################################################################
+ message = "The process of gathering all the required files will begin before capturing the lockdumps."
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ for i in range(0,cmdLineOpts.numberOfRuns):
+ # Add the clusternode name under each run directory so that data from
+ # multiple clusternodes can be combined, with each run's data kept together.
+ pathToOutputRunDir = os.path.join(pathToOutputDir, "run%d/%s" %(i, clusternode.getClusterNodeName()))
+ if (not mkdirs(pathToOutputRunDir)):
+ exitScript(errorCode=1)
+ # Gather various bits of data from the clusternode.
+ message = "Gathering some general information about the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
+ logging.getLogger(MAIN_LOGGER_NAME).status(message)
+ gatherGeneralInformation(pathToOutputRunDir)
+ # Trigger sysrq events to capture memory and thread information
+ message = "Triggering the sysrq events for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
+ logging.getLogger(MAIN_LOGGER_NAME).status(message)
+ triggerSysRQEvents()
+ # Gather the dlm locks.
+ lockDumpType = "dlm"
+ message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i)
+ logging.getLogger(MAIN_LOGGER_NAME).status(message)
+ gatherDLMLockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames(includeClusterName=False))
+ # Gather the glock locks from gfs2.
+ lockDumpType = "gfs2"
+ message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i)
+ logging.getLogger(MAIN_LOGGER_NAME).status(message)
+ gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames())
+ # Gather log files
+ message = "Gathering the log files for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
+ logging.getLogger(MAIN_LOGGER_NAME).status(message)
+ gatherLogs(os.path.join(pathToOutputRunDir, "logs"))
+ if (cmdLineOpts.secondsToSleep > 0):
+ message = "The script will sleep for %d seconds between each run of capturing the lockdumps." %(cmdLineOpts.secondsToSleep)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ time.sleep(cmdLineOpts.secondsToSleep)
+ # #######################################################################
+ # Archive the file if enabled and print the location of the output
+ # directory.
+ # #######################################################################
+ # After it is done, print out where the generated files are located.
+ message = "All the files have been gathered and this directory contains all the captured data: %s" %(pathToOutputDir)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+
+ # #######################################################################
+ # Archive the directory that contains all the data and archive it.
+ # #######################################################################
+ if (cmdLineOpts.enableArchiveOutputDir):
+ message = "The lockdump data will now be archived. This could some time depending on the size of the data collected."
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+ pathToTarFilename = archiveData(pathToOutputDir)
+ if (os.path.exists(pathToTarFilename)):
+ message = "The compressed archvied file was created: %s" %(pathToTarFilename)
+ logging.getLogger(MAIN_LOGGER_NAME).info(message)
+
+ # #######################################################################
+ except KeyboardInterrupt:
+ print ""
+ message = "This script will exit since control-c was executed by end user."
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ exitScript(errorCode=1)
+ # #######################################################################
+ # Exit the application with zero exit code since we cleanly exited.
+ # #######################################################################
+ exitScript()
--
1.7.1
* [Cluster-devel] [PATCH] Adding gfs2_lockcapture
2012-11-01 15:26 [Cluster-devel] [PATCH] Adding gfs2_lockcapture Shane Bradley
@ 2012-11-01 16:45 ` Andrew Price
0 siblings, 0 replies; 2+ messages in thread
From: Andrew Price @ 2012-11-01 16:45 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi Shane,
A couple of comments:
On 01/11/12 15:26, Shane Bradley wrote:
> ---
We generally like to keep complete, descriptive commit logs so it would
be good to have a description of the script in the commit log and a
brief note about the state it's in. See the previous logs for examples.
Also include a "signed-off-by" line, which git commit -s will add for you.
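For example, the trailer at the end of the commit message would look like this
(using your From: address):

    Signed-off-by: Shane Bradley <sbradley@redhat.com>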
Also if you could prefix the patch subject with "gfs2-utils:" (and in
future "gfs2_lockcapture:") it will help to distinguish it from the
other projects which use cluster-devel@ for patches.
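For example: "[PATCH] gfs2-utils: Adding gfs2_lockcapture".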
> gfs2/lockgather/gfs2_lockcapture | 723 ++++++++++++++++++++++++++++++++++++++
> 1 files changed, 723 insertions(+), 0 deletions(-)
> create mode 100644 gfs2/lockgather/gfs2_lockcapture
Could you also edit gfs2/lockgather/Makefile.am to plug this into the
build system. That way it will get installed with 'make install'. If you
just replace the entry for gfs2_lockgather we'll remove that script in a
separate commit.
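As a rough sketch (the existing Makefile.am isn't quoted here, so the exact
variable name and install location are assumptions), the entry might look
something like:

    # gfs2/lockgather/Makefile.am: install the script and ship it in the dist tarball
    dist_sbin_SCRIPTS = gfs2_lockcapture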
I've made a few minor comments on the code inline below.
Thanks,
Andy
> diff --git a/gfs2/lockgather/gfs2_lockcapture b/gfs2/lockgather/gfs2_lockcapture
> new file mode 100644
> index 0000000..d040738
> --- /dev/null
> +++ b/gfs2/lockgather/gfs2_lockcapture
> @@ -0,0 +1,723 @@
> +#!/usr/bin/env python
> +"""
> +This script will gather gfs2 and dlm lock information for a single cluster node
> +for all the mounted GFS2 filesystems.
> +
> +TODO:
> +* Should there be option to disable sysrq events in case it could trigger panic.
> +* Add option to write log to file
> +
> +* Add a better description.
> +* Add examples for all options.
> +* Add better description of options and has steve to review those and tweak my
> + option descriptions.
> +
> + at author : Shane Bradley
> + at contact : sbradley at redhat.com
> + at version : 0.9
> + at copyright : GPLv2
> +"""
> +import sys
> +import os
> +import os.path
> +import logging
> +from optparse import OptionParser, Option
> +import time
> +import platform
> +import shutil
> +import subprocess
> +import tarfile
> +
> +VERSION_NUMBER = "0.9-1"
> +# #####################################################################
> +# Global vars:
> +# #####################################################################
> +# Name of the logger
> +MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0]))
> +# Format of the logger
> +MAIN_LOGGER_FORMAT = "%(levelname)s %(message)s"
> +# Path to debug root
> +PATH_TO_DEBUG_DIR="/sys/kernel/debug"
> +# Path to the pid file that will be used for locking.
> +PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0]))
> +
> +
> +# #####################################################################
> +# Class to define what a clusternode is.
> +# #####################################################################
> +class ClusterNode:
> + def __init__(self, clusternodeName, clusterName, listOfGFS2Names):
> + self.__clusternodeName = clusternodeName
> + self.__clusterName = clusterName
> +
> + # List of the mounted filesystem from the mount -l command.
> + self.__listOfMountedGFS2Filesystems = self.__getMountedGFS2Filesystems()
> +
> + # List of mounted GFS2 labels for this cluster from mount -l command.
> + listOfGFS2MountedFilesystemLabels = self.__getMountedFilesystemLabel(self.__listOfMountedGFS2Filesystems)
> + self.__listOfGFS2MountedFilesystemLabels = []
> + if (not len(listOfGFS2Names) > 0):
> + # If no items in listOfGFS2Names then add them all.
> + self.__listOfGFS2MountedFilesystemLabels = listOfGFS2MountedFilesystemLabels
> + else:
> + for label in listOfGFS2MountedFilesystemLabels:
> + for name in listOfGFS2Names:
> + if ((name == label) or ("%s:%s"%(self.__clusterName, name) == label)):
> + self.__listOfGFS2MountedFilesystemLabels.append(label)
> +
> + def __str__(self):
> + rString = ""
> + rString += "%s:%s" %(self.getClusterName(), self.getClusterNodeName())
> + for fsName in self.getMountedGFS2FilesystemNames():
> + rString += "\n\t%s" %(fsName)
> + for mountedFS in self.__listOfMountedGFS2Filesystems:
> + if (mountedFS.find(fsName) >= 0):
> + rString += " --> %s" %(mountedFS)
> + break
> + return rString.rstrip()
> +
> + def __getMountedFilesystemLabel(self, listOfMountedFilesystems):
> + listOfMountedFilesystemsLabels = []
> + for mountedFilesystem in listOfMountedFilesystems:
> + splitMountedFilesystem = mountedFilesystem.split()
> + fsLabel = splitMountedFilesystem[-1].strip().strip("[").rstrip("]")
> + if (len(fsLabel) > 0):
> + # Verify it starts with name of the cluster.
> + if (fsLabel.startswith("%s:" %(self.getClusterName()))):
> + listOfMountedFilesystemsLabels.append(fsLabel)
> + return listOfMountedFilesystemsLabels
> +
> + def __getMountedGFS2Filesystems(self):
> + listOfMountedFilesystems = []
> + commandList= ["mount", "-l"]
> + stdout = ""
> + try:
> + task = subprocess.Popen(commandList, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
> + task.wait()
> + (stdout, stderr) = task.communicate()
> + except OSError:
> + commandOptionString = ""
> + for option in commandList:
> + commandOptionString += "%s " %(option)
> + message = "An error occurred running the command: $ %s" %(commandOptionString)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + return listOfMountedFilesystems
> + stdoutSplit = stdout.split("\n")
> + for line in stdoutSplit:
> + splitLine = line.split()
> + if (len(splitLine) >= 5):
> + if (splitLine[4] == "gfs2"):
> + listOfMountedFilesystems.append(line)
> + return listOfMountedFilesystems
> +
> + def getClusterNodeName(self):
> + return self.__clusternodeName
> +
> + def getClusterName(self):
> + return self.__clusterName
> +
> + def getMountedGFS2FilesystemNames(self, includeClusterName=True):
> + # If true will prepend the cluster name to gfs2 fs name
> + if (includeClusterName):
> + return self.__listOfGFS2MountedFilesystemLabels
> + else:
> + listOfGFS2MountedFilesystemLabels = []
> + for fsLabel in self.__listOfGFS2MountedFilesystemLabels:
> + fsLabelSplit = fsLabel.split(":", 1)
> + if (len(fsLabelSplit) == 2):
> + listOfGFS2MountedFilesystemLabels.append(fsLabelSplit[1])
> + return listOfGFS2MountedFilesystemLabels
> +
> +# #####################################################################
> +# Helper functions.
> +# #####################################################################
> +def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, standardError=subprocess.PIPE, debug=False):
> + stdout = ""
> + stderr = ""
> + try:
> + commandList = [command]
> + commandList += listOfCommandOptions
> + task = subprocess.Popen(commandList, stdout=standardOut, stderr=standardError)
> + task.wait()
> + (stdout, stderr) = task.communicate()
> + return (task.returncode == 0)
> + except OSError:
> + commandOptionString = ""
> + for option in listOfCommandOptions:
> + commandOptionString += "%s " %(option)
> + message = "An error occurred running the command: $ %s %s" %(command, commandOptionString)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + if (debug):
> + if (len(stdout) > 0):
> + print stdout
> + if (len(stderr) > 0):
> + print stderr
> + return False
> +
> +def writeToFile(pathToFilename, data, appendToFile=True, createFile=False):
> + [parentDir, filename] = os.path.split(pathToFilename)
> + if (os.path.isfile(pathToFilename) or (os.path.isdir(parentDir) and createFile)):
> + try:
> + filemode = "w"
> + if (appendToFile):
> + filemode = "a"
> + fout = open(pathToFilename, filemode)
> + fout.write(data + "\n")
> + fout.close()
> + return True
> + except UnicodeEncodeError, e:
> + message = "There was a unicode encode error writing to the file: %s." %(pathToFilename)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + return False
> + except IOError:
> + message = "There was an error writing to the file: %s." %(pathToFilename)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + return False
> + return False
> +
> +def mkdirs(pathToDSTDir):
> + if (os.path.isdir(pathToDSTDir)):
> + return True
> + elif ((not os.access(pathToDSTDir, os.F_OK)) and (len(pathToDSTDir) > 0)):
> + try:
> + os.makedirs(pathToDSTDir)
> + except (OSError, os.error):
> + message = "Could not create the directory: %s." %(pathToDSTDir)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + return False
> + except (IOError, os.error):
> + message = "Could not create the directory with the path: %s." %(pathToDSTDir)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + return False
> + return os.path.isdir(pathToDSTDir)
> +
> +def removePIDFile():
> + message = "Removing the pid file: %s" %(PATH_TO_PID_FILENAME)
> + logging.getLogger(MAIN_LOGGER_NAME).debug(message)
> + if (os.path.exists(PATH_TO_PID_FILENAME)):
> + try:
> + os.remove(PATH_TO_PID_FILENAME)
> + except IOError:
> + message = "There was an error removing the file: %s." %(PATH_TO_PID_FILENAME)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> +
> +def exitScript(removePidFile=True, errorCode=0):
> + if (removePidFile):
> + removePIDFile()
> + message = "The script will exit."
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + sys.exit(errorCode)
> +
> +# #####################################################################
> +# Helper functions for gathering the lockdumps.
> +# #####################################################################
> +def getClusterNode(listOfGFS2Names):
> + # Return a ClusterNode object if the clusternode and cluster name are found
> + # in the output, else return None.
> + commandList= ["cman_tool", "status"]
Since cman is no longer around we should update this script to work with
a Fedora cluster before we ship this script in the Fedora package.
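Purely as a sketch of what that could look like (the tool names and output
format below are assumptions that would need checking against the target
stack), getClusterNode() might be adapted along these lines:

    # Sketch only: obtain the cluster and node names without cman. Assumes a
    # corosync/pacemaker stack where "corosync-cmapctl -g totem.cluster_name"
    # and "crm_node -n" are available.
    def getClusterNodeNoCman(listOfGFS2Names):
        def captureOutput(commandList):
            try:
                task = subprocess.Popen(commandList, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                (stdout, stderr) = task.communicate()
                return stdout.strip()
            except OSError:
                return ""
        # Expected output is "totem.cluster_name (str) = <name>"; keep the value.
        cmapOutput = captureOutput(["corosync-cmapctl", "-g", "totem.cluster_name"])
        clusterName = ""
        if ("=" in cmapOutput):
            clusterName = cmapOutput.split("=", 1)[1].strip()
        clusternodeName = captureOutput(["crm_node", "-n"])
        if ((len(clusterName) > 0) and (len(clusternodeName) > 0)):
            return ClusterNode(clusternodeName, clusterName, listOfGFS2Names)
        return None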
> + stdout = ""
> + try:
> + task = subprocess.Popen(commandList, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
> + task.wait()
> + (stdout, stderr) = task.communicate()
> + except OSError:
> + commandOptionString = ""
> + for option in commandList:
> + commandOptionString += "%s " %(option)
> + message = "An error occurred running the command: $ %s" %(commandOptionString)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + return None
> + stdoutSplit = stdout.split("\n")
> + clusterName = ""
> + clusternodeName = ""
> + for line in stdoutSplit:
> + if (line.startswith("Cluster Name:")):
> + clusterName = line.split("Cluster Name:")[1].strip().rstrip()
> + if (line.startswith("Node name: ")):
> + clusternodeName = line.split("Node name:")[1].strip().rstrip()
> + if ((len(clusterName) > 0) and (len(clusternodeName) > 0)):
> + return ClusterNode(clusternodeName, clusterName, listOfGFS2Names)
> + return None
> +
> +def mountFilesystemDebug(enableMounting=True):
> + if (os.path.ismount(PATH_TO_DEBUG_DIR)):
> + message = "The debug filesystem %s is mounted." %(PATH_TO_DEBUG_DIR)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + return True
> + else:
> + message = "The debug filesystem %s is not mounted." %(PATH_TO_DEBUG_DIR)
> + logging.getLogger(MAIN_LOGGER_NAME).warning(message)
> + if (cmdLineOpts.enableMountDebugFS):
> + if(mountFilesystem("/bin/mount", "none", PATH_TO_DEBUG_DIR, "debugfs")):
> + message = "The debug filesystem was mounted: %s." %(PATH_TO_DEBUG_DIR)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + return True
> + return False
> +
> +def mountFilesystem(pathToMountCommand, pathToDevice, pathToMountPoint, filesystemType):
> + if (os.path.ismount(PATH_TO_DEBUG_DIR)):
> + return True
> + listOfCommandOptions = ["-t", filesystemType, pathToDevice, pathToMountPoint]
> + if (not runCommand(pathToMountCommand, listOfCommandOptions)):
> + message = "There was an error mounting the filesystem type %s for the device %s to the mount point %s." %(filesystemType, pathToDevice, pathToMountPoint)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + message = "The standard error is below: \n\t %s" %(stderr)
I'm not sure where stderr comes from here ^
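One way to make that available (sketch only) would be a variant of
runCommand() that hands the captured output back to the caller:

    # Sketch only: like runCommand() but returns the captured output so the
    # caller can log the standard error itself.
    def runCommandOutput(command, listOfCommandOptions):
        stdout = ""
        stderr = ""
        try:
            task = subprocess.Popen([command] + listOfCommandOptions,
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (stdout, stderr) = task.communicate()
        except OSError:
            return (False, stdout, stderr)
        return ((task.returncode == 0), stdout, stderr)

mountFilesystem() could then log the stderr it actually captured instead of
referring to a name that doesn't exist in its scope.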
> + logging.getLogger(MAIN_LOGGER_NAME).debug(message)
> + return os.path.ismount(PATH_TO_DEBUG_DIR)
> +
> +def gatherGeneralInformation(pathToDSTDir):
> + # Maybe add cluster node name, uname -a, etc
> + systemString = "HOSTNAME: %s\nDATE: %s" %(platform.node(), time.strftime("%Y-%m-%d_%H:%M:%S"))
> + writeToFile(os.path.join(pathToDSTDir, "system.txt"), systemString, createFile=True)
> + # Get "cman_tool node -F id,type,name" data.
> + command = "cman_tool"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool-nodes.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + runCommand(command, ["nodes", "-F", "id,type,name"], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + # Get "cman_tool services" data.
> + command = "cman_tool"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool-services.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + runCommand(command, ["services"], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + # Get "clustat" data.
> + command = "clustat"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "clustat.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + runCommand(command, [], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + # Get "mount -l" filesystem data.
> + command = "cat"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "cat-proc_mounts.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + runCommand(command, ["/proc/mounts"], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + # Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data.
> + command = "ps"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "ps.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + #runCommand(command, ["-eo", "user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan"], standardOut=fout)
> + runCommand(command, ["h", "-AL", "-o", "tid,s,cmd"], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + # Get "group_tool ls" data.
> + command = "group_tool"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-ls.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + runCommand(command, ["ls"], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + # Get "group_tool dump fence" data.
> + command = "group_tool"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-dump_fence.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + runCommand(command, ["dump", "fence"], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + # Get "group_tool dump gfs2" data.
> + command = "group_tool"
> + pathToCommandOutput = os.path.join(pathToDSTDir, "group_tool-dump_gfs2.txt")
> + try:
> + fout = open(pathToCommandOutput, "w")
> + runCommand(command, ["dump", "gfs2"], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> +
> +def triggerSysRQEvents():
> + command = "echo"
> + pathToSysrqTriggerFile = "/proc/sysrq-trigger"
> + # m - dump information about memory allocation
> + # t - dump thread state information
> + triggers = ["m", "t"]
> + for trigger in triggers:
> + try:
> + fout = open(pathToSysrqTriggerFile, "w")
> + runCommand(command, [trigger], standardOut=fout)
> + fout.close()
> + except IOError:
> + message = "There was an error the command output for %s to the file %s." %(command, pathToSysrqTriggerFile)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> +
> +def gatherLogs(pathToDSTDir):
> + if (mkdirs(pathToDSTDir)):
> + # Copy messages logs that contain the sysrq data.
> + pathToLogFile = "/var/log/messages"
> + pathToDSTLogFile = os.path.join(pathToDSTDir, os.path.basename(pathToLogFile))
> + try:
> + shutil.copyfile(pathToLogFile, pathToDSTLogFile)
> + except shutil.Error:
> + message = "There was an error copying the file: %s to %s." %(pathToLogFile, pathToDSTLogFile)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> +
> + pathToLogDir = "/var/log/cluster"
> + pathToDSTLogDir = os.path.join(pathToDSTDir, os.path.basename(pathToLogDir))
> + if (os.path.isdir(pathToLogDir)):
> + try:
> + shutil.copytree(pathToLogDir, pathToDSTLogDir)
> + except shutil.Error:
> + message = "There was an error copying the directory: %s to %s." %(pathToLogDir, pathToDSTLogDir)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> +
> +def gatherDLMLockDumps(pathToDSTDir, listOfGFS2Filesystems):
> + lockDumpType = "dlm"
> + pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType)
> + pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType)
> + message = "Copying the %s lockdump data from the directory for the %s." %(lockDumpType, pathToSrcDir)
> + logging.getLogger(MAIN_LOGGER_NAME).status(message)
pylint tells me that the logger doesn't have a .status() method
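One way to keep the checkers happy (sketch only) would be to register the
custom level and a small helper at module scope instead of using setattr()
on the logger in main():

    # Sketch only: a module-level STATUS level and helper that static
    # analysis tools can see.
    STATUS_LEVEL = logging.INFO + 2
    logging.addLevelName(STATUS_LEVEL, "STATUS")

    def logStatus(message):
        logging.getLogger(MAIN_LOGGER_NAME).log(STATUS_LEVEL, message)

and then call logStatus(message) at the call sites.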
> + for filename in os.listdir(pathToSrcDir):
> + for name in listOfGFS2Filesystems:
> + if (filename.startswith(name)):
> + pathToCurrentFilename = os.path.join(pathToSrcDir, filename)
> + pathToDSTDir = os.path.join(pathToOutputDir, name)
> + mkdirs(pathToDSTDir)
> + pathToDSTFilename = os.path.join(pathToDSTDir, filename)
> + try:
> + shutil.copy(pathToCurrentFilename, pathToDSTFilename)
> + except shutil.Error:
> + message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + except OSError:
> + message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> +
> +def gatherGFS2LockDumps(pathToDSTDir, listOfGFS2Filesystems):
> + lockDumpType = "gfs2"
> + pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType)
> + pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType)
> + for dirName in os.listdir(pathToSrcDir):
> + pathToCurrentDir = os.path.join(pathToSrcDir, dirName)
> + if ((os.path.isdir(pathToCurrentDir)) and (dirName in listOfGFS2Filesystems)):
> + mkdirs(pathToOutputDir)
> + pathToDSTDir = os.path.join(pathToOutputDir, dirName)
> + try:
> + message = "Copying the lockdump data for the %s filesystem: %s" %(lockDumpType, dirName)
> + logging.getLogger(MAIN_LOGGER_NAME).status(message)
> + shutil.copytree(pathToCurrentDir, pathToDSTDir)
> + except shutil.Error:
> + message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + except OSError:
> + message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> +
> +def archiveData(pathToSrcDir):
> + # Compress the file so that it will have a smaller file name.
> + if (os.path.exists(pathToSrcDir)):
> + pathToTarFilename = "%s.tar.bz2" %(pathToSrcDir)
> + message = "Creating a compressed archvied file: %s" %(pathToTarFilename)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + try:
> + tar = tarfile.open(pathToTarFilename, "w:bz2")
> + tar.add(pathToSrcDir, arcname=os.path.basename(pathToSrcDir))
> + tar.close()
> + except tarfile.TarError:
> + message = "There was an error creating the tarfile: %s." %(pathToTarFilename)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + return ""
> + if (os.path.exists(pathToTarFilename)):
> + return pathToTarFilename
> + return ""
> +
> +# ##############################################################################
> +# Get user selected options
> +# ##############################################################################
> +def __getOptions(version) :
> + cmdParser = OptionParserExtended(version)
> + cmdParser.add_option("-d", "--debug",
> + action="store_true",
> + dest="enableDebugLogging",
> + help="Enables debug logging.",
> + default=False)
> + cmdParser.add_option("-q", "--quiet",
> + action="store_true",
> + dest="disableLoggingToConsole",
> + help="Disables logging to console.",
> + default=False)
> + cmdParser.add_option("-i", "--info",
> + action="store_true",
> + dest="enablePrintInfo",
> + help="Prints to console some basic information about the GFS2 filesystems mounted on the cluster node.",
> + default=False)
> + cmdParser.add_option("-M", "--mount_debug_fs",
> + action="store_true",
> + dest="enableMountDebugFS",
> + help="Enables the mounting of the debug filesystem if it is not mounted. Default is disabled.",
> + default=False)
> + cmdParser.add_option("-o", "--path_to_output_dir",
> + action="store",
> + dest="pathToOutputDir",
> + help="The path to the output directory where all the collect data will be stored. Default is /tmp/<date>-<hostname>-%s" %(os.path.basename(sys.argv[0])),
> + type="string",
> + default="")
> + cmdParser.add_option("-r", "--num_of_runs",
> + action="store",
> + dest="numberOfRuns",
> + help="The number of lockdumps runs to do. Default is 2.",
> + type="int",
> + default=2)
> + cmdParser.add_option("-s", "--seconds_sleep",
> + action="store",
> + dest="secondsToSleep",
> + help="The number of seconds sleep between runs. Default is 120 seconds.",
> + type="int",
> + default=120)
> + cmdParser.add_option("-t", "--archive",
> + action="store_true",
> + dest="enableArchiveOutputDir",
> + help="Enables archiving and compressing of the output directory with tar and bzip2. Default is disabled.",
> + default=False)
> + cmdParser.add_option("-n", "--fs_name",
> + action="extend",
> + dest="listOfGFS2Names",
> + help="List of GFS2 filesystems that will have their lockdump data gathered.",
> + type="string",
> + default=[]) # Get the options and return the result.
> + (cmdLineOpts, cmdLineArgs) = cmdParser.parse_args()
> + return (cmdLineOpts, cmdLineArgs)
> +
> +# ##############################################################################
> +# OptParse classes for commandline options
> +# ##############################################################################
> +class OptionParserExtended(OptionParser):
> + """
> + This is the class that gets the command line options the end user
> + selects.
> + """
> + def __init__(self, version) :
> + self.__commandName = os.path.basename(sys.argv[0])
> + versionMessage = "%s %s\n" %(self.__commandName, version)
> +
> + commandDescription ="%s will capture information about lockdata data for GFS2 and DLM required to analyze a GFS2 filesystem.\n"%(self.__commandName)
> +
> + OptionParser.__init__(self, option_class=ExtendOption,
> + version=versionMessage,
> + description=commandDescription)
> +
> + def print_help(self):
> + self.print_version()
> + examplesMessage = "\n"
> + examplesMessage = "\nPrints information about the available GFS2 filesystems that can have lockdump data captured."
> + examplesMessage += "\n$ %s -i\n" %(self.__commandName)
> + examplesMessage += "\nThis command will mount the debug directory if it is not mounted. It will do 3 runs of\n"
> + examplesMessage += "gathering the lockdump information in 10 second intervals for only the GFS2 filesystems\n"
> + examplesMessage += "with the names myGFS2vol2,myGFS2vol1. Then it will archive and compress the data collected."
> + examplesMessage += "\n$ %s -M -r 3 -s 10 -t -n myGFS2vol2,myGFS2vol1\n" %(self.__commandName)
> + OptionParser.print_help(self)
> + print examplesMessage
> +
> +
> +class ExtendOption (Option):
> + """
> + Allows a comma-delimited list of entries to be specified for arrays
> + and dictionaries.
> + """
> + ACTIONS = Option.ACTIONS + ("extend",)
> + STORE_ACTIONS = Option.STORE_ACTIONS + ("extend",)
> + TYPED_ACTIONS = Option.TYPED_ACTIONS + ("extend",)
> +
> + def take_action(self, action, dest, opt, value, values, parser):
> + if (action == "extend") :
> + valueList=[]
> + try:
> + for v in value.split(","):
> + # Need to add code for dealing with paths if there is an option for paths.
> + valueList.append(v)
> + except:
> + pass
> + else:
> + values.ensure_value(dest, []).extend(valueList)
> + else:
> + Option.take_action(self, action, dest, opt, value, values, parser)
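Not a problem, just to illustrate for other readers how the "extend" action ends
up behaving; a small standalone snippet (the volume names are made up, and it
assumes the ExtendOption class above is defined/importable):

    from optparse import OptionParser

    parser = OptionParser(option_class=ExtendOption)
    parser.add_option("-n", "--fs_name", action="extend",
                      dest="listOfGFS2Names", type="string", default=[])
    (opts, args) = parser.parse_args(["-n", "myGFS2vol1,myGFS2vol2",
                                      "-n", "myGFS2vol3"])
    # opts.listOfGFS2Names is now ["myGFS2vol1", "myGFS2vol2", "myGFS2vol3"];
    # repeating -n keeps extending the same list.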
> +
> +# ###############################################################################
> +# Main Function
> +# ###############################################################################
> +if __name__ == "__main__":
> + try:
> + # #######################################################################
> + # Get the options from the commandline.
> + # #######################################################################
> + (cmdLineOpts, cmdLineArgs) = __getOptions(VERSION_NUMBER)
> +
> + # #######################################################################
> + # Set up the logger.
> + # #######################################################################
> + # Create the logger
> + logLevel = logging.INFO
> + logger = logging.getLogger(MAIN_LOGGER_NAME)
> + logger.setLevel(logLevel)
> + # Create a new status function and level.
> + logging.STATUS = logging.INFO + 2
> + logging.addLevelName(logging.STATUS, "STATUS")
> + # Create a function for the STATUS level since it is not defined by
> + # python. This means it can be called like the other predefined message
> + # functions. Example: logging.getLogger("loggerName").status(message)
> + setattr(logger, "status", lambda *args: logger.log(logging.STATUS, *args))
> + ch = logging.StreamHandler()
> + ch.setLevel(logLevel)
> + ch.setFormatter(logging.Formatter(MAIN_LOGGER_FORMAT))
> + logger.addHandler(ch)
> +
> + # #######################################################################
> + # Set the logging levels.
> + # #######################################################################
> + if ((cmdLineOpts.enableDebugLogging) and (not cmdLineOpts.disableLoggingToConsole)):
> + logging.getLogger(MAIN_LOGGER_NAME).setLevel(logging.DEBUG)
> + ch.setLevel(logging.DEBUG)
> + message = "Debugging has been enabled."
> + logging.getLogger(MAIN_LOGGER_NAME).debug(message)
> + if (cmdLineOpts.disableLoggingToConsole):
> + logging.disable(logging.CRITICAL)
> +
> + # #######################################################################
> + # Check to see if pid file exists and error if it does.
> + # #######################################################################
> + if (os.path.exists(PATH_TO_PID_FILENAME)):
> + message = "The PID file %s already exists and this script cannot run till it does not exist." %(PATH_TO_PID_FILENAME)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + message = "Verify that there are no other existing processes running. If there are running processes those need to be stopped first and the file removed."
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + exitScript(removePidFile=False, errorCode=1)
> + else:
> + message = "Creating the pid file: %s" %(PATH_TO_PID_FILENAME)
> + logging.getLogger(MAIN_LOGGER_NAME).debug(message)
> + # Create the pid file so we do not have more than one process of this
> + # script running.
> + writeToFile(PATH_TO_PID_FILENAME, str(os.getpid()), createFile=True)
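One possible refinement here, take it or leave it: the PID file could be treated
as stale when the process it records is no longer running, rather than always
refusing to start. Untested sketch, assuming the file only ever contains the PID
written above:

    try:
        oldPid = int(open(PATH_TO_PID_FILENAME).read().strip())
        # Signal 0 does not touch the process; it just raises OSError if no
        # process with that PID exists (or we lack permission to signal it).
        os.kill(oldPid, 0)
    except (ValueError, OSError):
        # Stale or unreadable PID file; safe to remove and carry on.
        os.remove(PATH_TO_PID_FILENAME)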
> +
> + # Get the clusternode name.
> + clusternode = getClusterNode(cmdLineOpts.listOfGFS2Names)
> + if (clusternode == None):
> + message = "The cluster or cluster node name could not be found from \"cman_tool status\"."
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + exitScript(removePidFile=False, errorCode=1)
> + if (cmdLineOpts.enablePrintInfo):
> + logging.disable(logging.CRITICAL)
> + print "List of all the mounted GFS2 filesystems that can have their lockdump data captured:"
> + print clusternode
> + exitScript()
> + # #######################################################################
> + # Create the output directory to verify it can be created before
> + # proceeding, unless it already exists because data from a previous run
> + # still needs to be analyzed. Could probably add more debugging on
> + # whether it is a file or a directory.
> + # #######################################################################
> + message = "The gathering of the lockdumps will be performed on the clusternode \"%s\" which is part of the cluster \"%s\"." %(clusternode.getClusterNodeName(), clusternode.getClusterName())
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + pathToOutputDir = cmdLineOpts.pathToOutputDir
> + if (not len(pathToOutputDir) > 0):
> + pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), clusternode.getClusterNodeName(), os.path.basename(sys.argv[0]))))
> + if (os.path.exists(pathToOutputDir)):
> + message = "The directory already exists and could contain previous lockdump data: %s" %(pathToOutputDir)
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + exitScript(errorCode=1)
> + else:
> + message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + if (not mkdirs(pathToOutputDir)):
> + exitScript(errorCode=1)
> +
> + # #######################################################################
> + # Check to see if the debug directory is mounted. If not then
> + # log an error.
> + # #######################################################################
> + result = mountFilesystemDebug(cmdLineOpts.enableMountDebugFS)
> + if (not result):
> + message = "Please mount the debug filesystem before running this script. For example: $ mount none -t debugfs %s" %(PATH_TO_DEBUG_DIR)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + exitScript(errorCode=1)
> +
> + # #######################################################################
> + # Gather data and the lockdumps.
> + # #######################################################################
> + message = "The process of gathering all the required files will begin before capturing the lockdumps."
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + for i in range(0,cmdLineOpts.numberOfRuns):
> + # Add the clusternode name under each run dir so that gfs2_lockgather
> + # data from multiple clusternodes can be combined, with all the data for
> + # each run kept together in that run's directory.
> + pathToOutputRunDir = os.path.join(pathToOutputDir, "run%d/%s" %(i, clusternode.getClusterNodeName()))
> + if (not mkdirs(pathToOutputRunDir)):
> + exitOnError()
exitOnError doesn't seem to be defined?
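If it is indeed missing, the call could simply become exitScript(errorCode=1)
like the rest of the script, or a small helper could be added. A rough sketch of
what might be intended (the name, default message and use of exitScript() are
guesses at the intent, not tested):

    def exitOnError(errorCode=1):
        # Hypothetical helper: log the failure and exit through the existing
        # exitScript() cleanup path used elsewhere in the script.
        message = "An error occurred and the script will exit."
        logging.getLogger(MAIN_LOGGER_NAME).error(message)
        exitScript(errorCode=errorCode)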
> + # Gather various bits of data from the clusternode.
> + message = "Gathering some general information about the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
> + logging.getLogger(MAIN_LOGGER_NAME).status(message)
> + gatherGeneralInformation(pathToOutputRunDir)
> + # Trigger sysrq events to capture memory and thread information
> + message = "Triggering the sysrq events for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
> + logging.getLogger(MAIN_LOGGER_NAME).status(message)
> + triggerSysRQEvents()
> + # Gather the dlm locks.
> + lockDumpType = "dlm"
> + message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i)
> + logging.getLogger(MAIN_LOGGER_NAME).status(message)
> + gatherDLMLockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames(includeClusterName=False))
> + # Gather the glock locks from gfs2.
> + lockDumpType = "gfs2"
> + message = "Gathering the %s lock dumps for clusternode %s for run %d." %(lockDumpType, clusternode.getClusterNodeName(), i)
> + logging.getLogger(MAIN_LOGGER_NAME).status(message)
> + gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames())
> + # Gather log files
> + message = "Gathering the log files for the clusternode %s for run %d." %(clusternode.getClusterNodeName(), i)
> + logging.getLogger(MAIN_LOGGER_NAME).status(message)
> + gatherLogs(os.path.join(pathToOutputRunDir, "logs"))
> + if (cmdLineOpts.secondsToSleep > 0):
> + message = "The script will sleep for %d seconds between each run of capturing the lockdumps." %(cmdLineOpts.secondsToSleep)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + time.sleep(cmdLineOpts.secondsToSleep)
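Minor observation on the loop above: the sleep also happens after the final run,
when there is nothing left to capture. A possible tweak (sketch only, reusing
the same variables as the patch):

    if ((cmdLineOpts.secondsToSleep > 0) and (i < (cmdLineOpts.numberOfRuns - 1))):
        message = "The script will sleep for %d seconds before the next run of capturing the lockdumps." %(cmdLineOpts.secondsToSleep)
        logging.getLogger(MAIN_LOGGER_NAME).info(message)
        time.sleep(cmdLineOpts.secondsToSleep)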
> + # #######################################################################
> + # Archive the file if enabled and print the location of the output
> + # directory.
> + # #######################################################################
> + # After it is done we should print out where the generated files are
> + # located and what to do next.
> + message = "All the files have been gathered and this directory contains all the captured data: %s" %(pathToOutputDir)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> +
> + # #######################################################################
> + # Archive and compress the directory that contains all the data.
> + # #######################################################################
> + if (cmdLineOpts.enableArchiveOutputDir):
> + message = "The lockdump data will now be archived. This could some time depending on the size of the data collected."
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> + pathToTarFilename = archiveData(pathToOutputDir)
> + if (os.path.exists(pathToTarFilename)):
> + message = "The compressed archvied file was created: %s" %(pathToTarFilename)
> + logging.getLogger(MAIN_LOGGER_NAME).info(message)
> +
> + # #######################################################################
> + except KeyboardInterrupt:
> + print ""
> + message = "This script will exit since control-c was executed by end user."
> + logging.getLogger(MAIN_LOGGER_NAME).error(message)
> + exitScript(errorCode=1)
> + # #######################################################################
> + # Exit the application with zero exit code since we cleanly exited.
> + # #######################################################################
> + exitScript()
>