From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Price Date: Fri, 14 Dec 2012 13:37:47 +0000 Subject: [Cluster-devel] [PATCH] gfs2-lockcapture: Modified some of the data gathered In-Reply-To: <1355411652-6150-1-git-send-email-sbradley@redhat.com> References: <1355411652-6150-1-git-send-email-sbradley@redhat.com> Message-ID: <50CB2BAB.4040707@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi Shane, On 13/12/12 15:14, sbradley at redhat.com wrote: > From: sbradley Looks good to me. Thanks for adding a manpage. Please make sure the authorship info is corrected before pushing. Andy > Changed some var names in host data collected, added /proc// to files > collected, and added man page. > > Signed-off-by: shane bradley > --- > gfs2/lockcapture/gfs2_lockcapture | 465 +++++++++++++++++++++++++------------- > gfs2/man/Makefile.am | 3 +- > gfs2/man/gfs2_lockcapture.8 | 53 +++++ > 3 files changed, 364 insertions(+), 157 deletions(-) > create mode 100644 gfs2/man/gfs2_lockcapture.8 > > diff --git a/gfs2/lockcapture/gfs2_lockcapture b/gfs2/lockcapture/gfs2_lockcapture > index a930a2f..1a64188 100644 > --- a/gfs2/lockcapture/gfs2_lockcapture > +++ b/gfs2/lockcapture/gfs2_lockcapture > @@ -1,9 +1,7 @@ > #!/usr/bin/env python > """ > -This script will gather GFS2 glocks and dlm lock dump information for a cluster > -node. The script can get all the mounted GFS2 filesystem data or set of selected > -GFS2 filesystems. The script will also gather some general information about the > -system. > +The script gfs2_lockcapture will capture locking information from GFS2 file > +systems and DLM. > > @author : Shane Bradley > @contact : sbradley at redhat.com > @@ -35,7 +33,7 @@ import tarfile > sure only 1 instance of this script is running at any time. > @type PATH_TO_PID_FILENAME: String > """ > -VERSION_NUMBER = "0.9-1" > +VERSION_NUMBER = "0.9-2" > MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0])) > PATH_TO_DEBUG_DIR="/sys/kernel/debug" > PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0])) > @@ -313,7 +311,7 @@ def archiveData(pathToSrcDir): > @type pathToSrcDir: String > """ > if (os.path.exists(pathToSrcDir)): > - pathToTarFilename = "%s.tar.bz2" %(pathToSrcDir) > + pathToTarFilename = "%s-%s.tar.bz2" %(pathToSrcDir, platform.node()) > if (os.path.exists(pathToTarFilename)): > message = "A compressed archvied file already exists and will be removed: %s" %(pathToTarFilename) > logging.getLogger(MAIN_LOGGER_NAME).status(message) > @@ -337,6 +335,127 @@ def archiveData(pathToSrcDir): > return pathToTarFilename > return "" > > +def getDataFromFile(pathToSrcFile) : > + """ > + This function will return the data in an array. Where each newline in file > + is a seperate item in the array. This should really just be used on > + relatively small files. > + > + None is returned if no file is found. > + > + @return: Returns an array of Strings, where each newline in file is an item > + in the array. > + @rtype: Array > + > + @param pathToSrcFile: The path to the file which will be read. > + @type pathToSrcFile: String > + """ > + if (len(pathToSrcFile) > 0) : > + try: > + fin = open(pathToSrcFile, "r") > + data = fin.readlines() > + fin.close() > + return data > + except (IOError, os.error): > + message = "An error occured reading the file: %s." 
%(pathToSrcFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return None > + > +def copyFile(pathToSrcFile, pathToDstFile): > + """ > + This function will copy a src file to dst file. > + > + @return: Returns True if the file was copied successfully. > + @rtype: Boolean > + > + @param pathToSrcFile: The path to the source file that will be copied. > + @type pathToSrcFile: String > + @param pathToDstFile: The path to the destination of the file. > + @type pathToDstFile: String > + """ > + if(not os.path.exists(pathToSrcFile)): > + message = "The file does not exist with the path: %s." %(pathToSrcFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + elif (not os.path.isfile(pathToSrcFile)): > + message = "The path to the source file is not a regular file: %s." %(pathToSrcFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + elif (pathToSrcFile == pathToDstFile): > + message = "The path to the source file and path to destination file cannot be the same: %s." %(pathToDstFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + else: > + # Create the directory structure if it does not exist. > + (head, tail) = os.path.split(pathToDstFile) > + if (not mkdirs(head)) : > + # The path to the directory was not created so file > + # could not be copied. > + return False > + # Copy the file to the dst path. > + try: > + shutil.copy(pathToSrcFile, pathToDstFile) > + except shutil.Error: > + message = "Cannot copy the file %s to %s." %(pathToSrcFile, pathToDstFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + except OSError: > + message = "Cannot copy the file %s to %s." %(pathToSrcFile, pathToDstFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + except IOError: > + message = "Cannot copy the file %s to %s." %(pathToSrcFile, pathToDstFile) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + return (os.path.exists(pathToDstFile)) > + > +def copyDirectory(pathToSrcDir, pathToDstDir): > + """ > + This function will copy a src dir to dst dir. > + > + @return: Returns True if the dir was copied successfully. > + @rtype: Boolean > + > + @param pathToSrcDir: The path to the source dir that will be copied. > + @type pathToSrcDir: String > + @param pathToDstDir: The path to the destination of the dir. > + @type pathToDstDir: String > + """ > + if(not os.path.exists(pathToSrcDir)): > + message = "The directory does not exist with the path: %s." %(pathToSrcDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + elif (not os.path.isdir(pathToSrcDir)): > + message = "The path to the source directory is not a directory: %s." %(pathToSrcDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + elif (pathToSrcDir == pathToDstDir): > + message = "The path to the source directory and path to destination directory cannot be the same: %s." %(pathToDstDir) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + else: > + if (not mkdirs(pathToDstDir)) : > + # The path to the directory was not created so file > + # could not be copied. > + return False > + # Copy the file to the dst path. > + dst = os.path.join(pathToDstDir, os.path.basename(pathToSrcDir)) > + try: > + shutil.copytree(pathToSrcDir, dst) > + except shutil.Error: > + message = "Cannot copy the directory %s to %s." 
%(pathToSrcDir, dst) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + except OSError: > + message = "Cannot copy the directory %s to %s." %(pathToSrcDir, dst) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + except IOError: > + message = "Cannot copy the directory %s to %s." %(pathToSrcDir, dst) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + return False > + return (os.path.exists(dst)) > + > def backupOutputDirectory(pathToOutputDir): > """ > This function will return True if the pathToOutputDir does not exist or the > @@ -464,8 +583,8 @@ def getClusterNode(listOfGFS2Names): > if (len(listOfGFS2Names) > 0): > for label in mapOfMountedFilesystemLabels.keys(): > foundMatch = False > - for name in listOfGFS2Names: > - if ((name == label) or ("%s:%s"%(clusterName, name) == label)): > + for gfs2FSName in listOfGFS2Names: > + if ((gfs2FSName == label) or ("%s:%s"%(clusterName, gfs2FSName) == label)): > foundMatch = True > break > if ((not foundMatch) and (mapOfMountedFilesystemLabels.has_key(label))): > @@ -518,33 +637,6 @@ def getLabelMapForMountedFilesystems(clusterName, listOfMountedFilesystems): > mapOfMountedFilesystemLabels[fsLabel] = mountedFilesystem > return mapOfMountedFilesystemLabels > > -def verifyDebugFilesystemMounted(enableMounting=True): > - """ > - This function verifies that the debug filesystem is mounted. If the debug > - filesystem is mounted then True is returned, otherwise False is returned. > - > - @return: If the debug filesystem is mounted then True is returned, otherwise > - False is returned. > - @rtype: Boolean > - > - @param enableMounting: If True then the debug filesystem will be mounted if > - it is currently not mounted. > - @type enableMounting: Boolean > - """ > - if (os.path.ismount(PATH_TO_DEBUG_DIR)): > - message = "The debug filesystem %s is mounted." %(PATH_TO_DEBUG_DIR) > - logging.getLogger(MAIN_LOGGER_NAME).info(message) > - return True > - else: > - message = "The debug filesystem %s is not mounted." %(PATH_TO_DEBUG_DIR) > - logging.getLogger(MAIN_LOGGER_NAME).warning(message) > - if (cmdLineOpts.enableMountDebugFS): > - if(mountFilesystem("debugfs", "none", PATH_TO_DEBUG_DIR)): > - message = "The debug filesystem was mounted: %s." %(PATH_TO_DEBUG_DIR) > - logging.getLogger(MAIN_LOGGER_NAME).info(message) > - return True > - return False > - > def mountFilesystem(filesystemType, pathToDevice, pathToMountPoint): > """ > This function will attempt to mount a filesystem. If the filesystem is > @@ -583,29 +675,24 @@ def gatherGeneralInformation(pathToDSTDir): > @type pathToDSTDir: String > """ > # Gather some general information and write to system.txt. 
> - systemString = "HOSTNAME: %s\nDATE: %s\n" %(platform.node(), time.strftime("%Y-%m-%d_%H:%M:%S")) > - stdout = runCommandOutput("uname", ["-a"]) > + systemString = "HOSTNAME=%s\nTIMESTAMP=%s\n" %(platform.node(), time.strftime("%Y-%m-%d %H:%M:%S")) > + stdout = runCommandOutput("uname", ["-a"]).strip().rstrip() > if (not stdout == None): > - systemString += "UNAME-A: %s\n" %(stdout) > - stdout = runCommandOutput("uptime", []) > + systemString += "UNAMEA=%s\n" %(stdout) > + stdout = runCommandOutput("uptime", []).strip().rstrip() > if (not stdout == None): > - systemString += "UPTIME: %s\n" %(stdout) > - writeToFile(os.path.join(pathToDSTDir, "system.txt"), systemString, createFile=True) > + systemString += "UPTIME=%s" %(stdout) > + writeToFile(os.path.join(pathToDSTDir, "hostinformation.txt"), systemString, createFile=True) > > - # Get "mount -l" filesystem data. > - command = "cat" > - pathToCommandOutput = os.path.join(pathToDSTDir, "cat-proc_mounts.txt") > - try: > - fout = open(pathToCommandOutput, "w") > - runCommand(command, ["/proc/mounts"], standardOut=fout) > - fout.close() > - except IOError: > - message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Copy misc files > + pathToSrcFile = "/proc/mounts" > + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > + pathToSrcFile = "/proc/slabinfo" > + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > > # Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data. > command = "ps" > - pathToCommandOutput = os.path.join(pathToDSTDir, "ps.txt") > + pathToCommandOutput = os.path.join(pathToDSTDir, "ps_hALo-tid.s.cmd") > try: > fout = open(pathToCommandOutput, "w") > #runCommand(command, ["-eo", "user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan"], standardOut=fout) > @@ -615,6 +702,48 @@ def gatherGeneralInformation(pathToDSTDir): > message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput) > logging.getLogger(MAIN_LOGGER_NAME).error(message) > > + > +def isProcPidStackEnabled(pathToPidData): > + """ > + Returns true if the init process has the file "stack" in its pid data > + directory which contains the task functions for that process. > + > + @return: Returns true if the init process has the file "stack" in its pid > + data directory which contains the task functions for that process. > + @rtype: Boolean > + > + @param pathToPidData: The path to the directory where all the pid data > + directories are located. > + @type pathToPidData: String > + """ > + return os.path.exists(os.path.join(pathToPidData, "1/stack")) > + > +def gatherPidData(pathToPidData, pathToDSTDir): > + """ > + This command will gather all the directories which contain data about all the pids. > + > + @return: Returns a list of paths to the directory that contains the > + information about the pid. > + @rtype: Array > + > + @param pathToPidData: The path to the directory where all the pid data > + directories are located. 
> + @type pathToPidData: String > + """ > + # Status has: command name, pid, ppid, state, possibly registers > + listOfFilesToCopy = ["cmdline", "stack", "status"] > + listOfPathToPidsData = [] > + if (os.path.exists(pathToPidData)): > + for srcFilename in os.listdir(pathToPidData): > + pathToPidDirDST = os.path.join(pathToDSTDir, srcFilename) > + if (srcFilename.isdigit()): > + pathToSrcDir = os.path.join(pathToPidData, srcFilename) > + for filenameToCopy in listOfFilesToCopy: > + copyFile(os.path.join(pathToSrcDir, filenameToCopy), os.path.join(pathToPidDirDST, filenameToCopy)) > + if (os.path.exists(pathToPidDirDST)): > + listOfPathToPidsData.append(pathToPidDirDST) > + return listOfPathToPidsData > + > def triggerSysRQEvents(): > """ > This command will trigger sysrq events which will write the output to > @@ -626,14 +755,15 @@ def triggerSysRQEvents(): > pathToSysrqTriggerFile = "/proc/sysrq-trigger" > # m - dump information about memory allocation > # t - dump thread state information > - triggers = ["m", "t"] > + # triggers = ["m", "t"] > + triggers = ["t"] > for trigger in triggers: > try: > fout = open(pathToSysrqTriggerFile, "w") > runCommand(command, [trigger], standardOut=fout) > fout.close() > except IOError: > - message = "There was an error the command output for %s to the file %s." %(command, pathToSysrqTriggerFile) > + message = "There was an error writing the command output for %s to the file %s." %(command, pathToSysrqTriggerFile) > logging.getLogger(MAIN_LOGGER_NAME).error(message) > > def gatherLogs(pathToDSTDir): > @@ -645,24 +775,14 @@ def gatherLogs(pathToDSTDir): > copied to. > @type pathToDSTDir: String > """ > - if (mkdirs(pathToDSTDir)): > - # Copy messages logs that contain the sysrq data. > - pathToLogFile = "/var/log/messages" > - pathToDSTLogFile = os.path.join(pathToDSTDir, os.path.basename(pathToLogFile)) > - try: > - shutil.copyfile(pathToLogFile, pathToDSTLogFile) > - except shutil.Error: > - message = "There was an error copying the file: %s to %s." %(pathToLogFile, pathToDSTLogFile) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > + pathToLogFile = "/var/log/messages" > + pathToDSTLogFile = os.path.join(pathToDSTDir, os.path.basename(pathToLogFile)) > + copyFile(pathToLogFile, pathToDSTLogFile) > > - pathToLogDir = "/var/log/cluster" > + pathToLogDir = "/var/log/cluster" > + if (os.path.exists(pathToLogDir)): > pathToDSTLogDir = os.path.join(pathToDSTDir, os.path.basename(pathToLogDir)) > - if (os.path.isdir(pathToLogDir)): > - try: > - shutil.copytree(pathToLogDir, pathToDSTLogDir) > - except shutil.Error: > - message = "There was an error copying the directory: %s to %s." %(pathToLogDir, pathToDSTLogDir) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > + copyDirectory(pathToLogDir, pathToDSTDir) > > def gatherDLMLockDumps(pathToDSTDir, listOfGFS2Filesystems): > """ > @@ -680,23 +800,13 @@ def gatherDLMLockDumps(pathToDSTDir, listOfGFS2Filesystems): > lockDumpType = "dlm" > pathToSrcDir = os.path.join(PATH_TO_DEBUG_DIR, lockDumpType) > pathToOutputDir = os.path.join(pathToDSTDir, lockDumpType) > - message = "Copying the files in the %s lockdump data directory %s for the selected GFS2 filesystem with dlm debug files." %(lockDumpType.upper(), pathToSrcDir) > - logging.getLogger(MAIN_LOGGER_NAME).status(message) > + message = "Copying the files in the %s lockdump data directory %s." 
%(lockDumpType.upper(), pathToSrcDir) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > for filename in os.listdir(pathToSrcDir): > for name in listOfGFS2Filesystems: > if (filename.startswith(name)): > - pathToCurrentFilename = os.path.join(pathToSrcDir, filename) > - pathToDSTDir = os.path.join(pathToOutputDir, name) > - mkdirs(pathToDSTDir) > - pathToDSTFilename = os.path.join(pathToDSTDir, filename) > - try: > - shutil.copy(pathToCurrentFilename, pathToDSTFilename) > - except shutil.Error: > - message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > - except OSError: > - message = "There was an error copying the file: %s to %s." %(pathToCurrentFilename, pathToDSTFilename) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > + copyFile(os.path.join(pathToSrcDir, filename), > + os.path.join(os.path.join(pathToOutputDir, name), filename)) > > def gatherGFS2LockDumps(pathToDSTDir, listOfGFS2Filesystems): > """ > @@ -718,18 +828,9 @@ def gatherGFS2LockDumps(pathToDSTDir, listOfGFS2Filesystems): > for dirName in os.listdir(pathToSrcDir): > pathToCurrentDir = os.path.join(pathToSrcDir, dirName) > if ((os.path.isdir(pathToCurrentDir)) and (dirName in listOfGFS2Filesystems)): > - mkdirs(pathToOutputDir) > - pathToDSTDir = os.path.join(pathToOutputDir, dirName) > - try: > - message = "Copying the lockdump data for the %s filesystem: %s" %(lockDumpType.upper(), dirName) > - logging.getLogger(MAIN_LOGGER_NAME).status(message) > - shutil.copytree(pathToCurrentDir, pathToDSTDir) > - except shutil.Error: > - message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > - except OSError: > - message = "There was an error copying the directory: %s to %s." %(pathToCurrentDir, pathToDSTDir) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > + message = "Copying the lockdump data for the %s filesystem: %s" %(lockDumpType.upper(), dirName) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + copyDirectory(pathToCurrentDir, pathToOutputDir) > > # ############################################################################## > # Get user selected options > @@ -752,52 +853,57 @@ def __getOptions(version) : > cmdParser.add_option("-d", "--debug", > action="store_true", > dest="enableDebugLogging", > - help="Enables debug logging.", > + help="enables debug logging", > default=False) > cmdParser.add_option("-q", "--quiet", > action="store_true", > dest="disableLoggingToConsole", > - help="Disables logging to console.", > + help="disables logging to console", > + default=False) > + cmdParser.add_option("-y", "--no_ask", > + action="store_true", > + dest="disableQuestions", > + help="disables all questions and assumes yes", > default=False) > cmdParser.add_option("-i", "--info", > action="store_true", > dest="enablePrintInfo", > - help="Prints to console some basic information about the GFS2 filesystems mounted on the cluster node.", > + help="prints information about the mounted GFS2 file systems", > default=False) > - cmdParser.add_option("-M", "--mount_debug_fs", > + cmdParser.add_option("-t", "--archive", > action="store_true", > - dest="enableMountDebugFS", > - help="Enables the mounting of the debug filesystem if it is not mounted. 
Default is disabled.", > + dest="enableArchiveOutputDir", > + help="the output directory will be archived(tar) and compressed(.bz2)", > default=False) > cmdParser.add_option("-o", "--path_to_output_dir", > action="store", > dest="pathToOutputDir", > - help="The path to the output directory where all the collect data will be stored. Default is /tmp/--%s" %(os.path.basename(sys.argv[0])), > + help="the directory where all the collect data will be stored", > type="string", > + metavar="", > default="") > cmdParser.add_option("-r", "--num_of_runs", > action="store", > dest="numberOfRuns", > - help="The number of lockdumps runs to do. Default is 2.", > + help="number of runs capturing the lockdump data", > type="int", > + metavar="", > default=2) > cmdParser.add_option("-s", "--seconds_sleep", > action="store", > dest="secondsToSleep", > - help="The number of seconds sleep between runs. Default is 120 seconds.", > + help="number of seconds to sleep between runs of capturing the lockdump data", > type="int", > + metavar="", > default=120) > - cmdParser.add_option("-t", "--archive", > - action="store_true", > - dest="enableArchiveOutputDir", > - help="Enables archiving and compressing of the output directory with tar and bzip2. Default is disabled.", > - default=False) > cmdParser.add_option("-n", "--fs_name", > action="extend", > dest="listOfGFS2Names", > - help="List of GFS2 filesystems that will have their lockdump data gathered.", > + help="name of the GFS2 filesystem(s) that will have their lockdump data captured", > type="string", > - default=[]) # Get the options and return the result. > + metavar="", > + default=[]) > + # Get the options and return the result. > (cmdLineOpts, cmdLineArgs) = cmdParser.parse_args() > return (cmdLineOpts, cmdLineArgs) > > @@ -817,7 +923,7 @@ class OptionParserExtended(OptionParser): > self.__commandName = os.path.basename(sys.argv[0]) > versionMessage = "%s %s\n" %(self.__commandName, version) > > - commandDescription ="%s will capture information about lockdata data for GFS2 and DLM required to analyze a GFS2 filesystem.\n"%(self.__commandName) > + commandDescription ="%s gfs2_lockcapture will capture locking information from GFS2 file systems and DLM.\n"%(self.__commandName) > > OptionParser.__init__(self, option_class=ExtendOption, > version=versionMessage, > @@ -831,10 +937,17 @@ class OptionParserExtended(OptionParser): > examplesMessage = "\n" > examplesMessage = "\nPrints information about the available GFS2 filesystems that can have lockdump data captured." > examplesMessage += "\n$ %s -i\n" %(self.__commandName) > - examplesMessage += "\nThis command will mount the debug directory if it is not mounted. It will do 3 runs of\n" > - examplesMessage += "gathering the lockdump information in 10 second intervals for only the GFS2 filesystems\n" > - examplesMessage += "with the names myGFS2vol2,myGFS2vol1. Then it will archive and compress the data collected." > - examplesMessage += "\n$ %s -M -r 3 -s 10 -t -n myGFS2vol2,myGFS2vol1\n" %(self.__commandName) > + > + examplesMessage += "\nIt will do 3 runs of gathering the lockdump information in 10 second intervals for only the" > + examplesMessage += "\nGFS2 filesystems with the names myGFS2vol2,myGFS2vol1. Then it will archive and compress" > + examplesMessage += "\nthe data collected. 
All of the lockdump data will be written to the directory: " > + examplesMessage += "\n/tmp/2012-11-12_095556-gfs2_lockcapture and all the questions will be answered with yes.\n" > + examplesMessage += "\n$ %s -r 3 -s 10 -t -n myGFS2vol2,myGFS2vol1 -o /tmp/2012-11-12_095556-gfs2_lockcapture -y\n" %(self.__commandName) > + > + examplesMessage += "\nIt will do 2 runs of gathering the lockdump information in 25 second intervals for all the" > + examplesMessage += "\nmounted GFS2 filesystems. Then it will archive and compress the data collected. All of the" > + examplesMessage += "\nlockdump data will be written to the directory: /tmp/2012-11-12_095556-gfs2_lockcapture.\n" > + examplesMessage += "\n$ %s -r 2 -s 25 -t -o /tmp/2012-11-12_095556-gfs2_lockcapture\n" %(self.__commandName) > OptionParser.print_help(self) > print examplesMessage > > @@ -869,11 +982,13 @@ class ExtendOption (Option): > @type parser: OptionParser > """ > if (action == "extend") : > - valueList=[] > + valueList = [] > try: > for v in value.split(","): > # Need to add code for dealing with paths if there is option for paths. > - valueList.append(v) > + newValue = value.strip().rstrip() > + if (len(newValue) > 0): > + valueList.append(newValue) > except: > pass > else: > @@ -912,17 +1027,10 @@ if __name__ == "__main__": > streamHandler.setFormatter(logging.Formatter("%(levelname)s %(message)s")) > logger.addHandler(streamHandler) > > - # Set the handler for writing to log file. > - pathToLogFile = "/tmp/%s.log" %(MAIN_LOGGER_NAME) > - if (((os.access(pathToLogFile, os.W_OK) and os.access("/tmp", os.R_OK))) or (not os.path.exists(pathToLogFile))): > - fileHandler = logging.FileHandler(pathToLogFile) > - fileHandler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s", "%Y-%m-%d %H:%M:%S")) > - logger.addHandler(fileHandler) > - message = "A log file will be created or appened to: %s" %(pathToLogFile) > - logging.getLogger(MAIN_LOGGER_NAME).info(message) > - else: > - message = "There was permission problem accessing the write attributes for the log file: %s." %(pathToLogFile) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > + # Please note there will not be a global log file created. If a log file > + # is needed then redirect the output. There will be a log file created > + # for each run in the corresponding directory. > + > # ####################################################################### > # Set the logging levels. > # ####################################################################### > @@ -949,6 +1057,26 @@ if __name__ == "__main__": > # script running. > writeToFile(PATH_TO_PID_FILENAME, str(os.getpid()), createFile=True) > # ####################################################################### > + # Verify they want to continue because this script will trigger sysrq events. > + # ####################################################################### > + if (not cmdLineOpts.disableQuestions): > + valid = {"yes":True, "y":True, "no":False, "n":False} > + question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?" > + prompt = " [y/n] " > + while True: > + sys.stdout.write(question + prompt) > + choice = raw_input().lower() > + if (choice in valid): > + if (valid.get(choice)): > + # If yes, or y then exit loop and continue. > + break > + else: > + message = "The script will not continue since you chose not to continue." 
> + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + exitScript(removePidFile=True, errorCode=1) > + else: > + sys.stdout.write("Please respond with '(y)es' or '(n)o'.\n") > + # ####################################################################### > # Get the clusternode name and verify that mounted GFS2 filesystems were > # found. > # ####################################################################### > @@ -976,8 +1104,6 @@ if __name__ == "__main__": > # proceeding unless it is already created from a previous run data needs > # to be analyzed. Probably could add more debugging on if file or dir. > # ####################################################################### > - message = "The gathering of the lockdumps will be performed on the clusternode \"%s\" which is part of the cluster \"%s\"." %(clusternode.getClusterNodeName(), clusternode.getClusterName()) > - logging.getLogger(MAIN_LOGGER_NAME).info(message) > pathToOutputDir = cmdLineOpts.pathToOutputDir > if (not len(pathToOutputDir) > 0): > pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), clusternode.getClusterNodeName(), os.path.basename(sys.argv[0])))) > @@ -1000,56 +1126,83 @@ if __name__ == "__main__": > # Check to see if the debug directory is mounted. If not then > # log an error. > # ####################################################################### > - result = verifyDebugFilesystemMounted(cmdLineOpts.enableMountDebugFS) > - if (not result): > - message = "Please mount the debug filesystem before running this script. For example: $ mount none -t debugfs %s" %(PATH_TO_DEBUG_DIR) > + if(mountFilesystem("debugfs", "none", PATH_TO_DEBUG_DIR)): > + message = "The debug filesystem %s is mounted." %(PATH_TO_DEBUG_DIR) > + logging.getLogger(MAIN_LOGGER_NAME).info(message) > + else: > + message = "There was a problem mounting the debug filesystem: %s" %(PATH_TO_DEBUG_DIR) > + logging.getLogger(MAIN_LOGGER_NAME).error(message) > + message = "The debug filesystem is required to be mounted for this script to run." > logging.getLogger(MAIN_LOGGER_NAME).info(message) > exitScript(errorCode=1) > - > # ####################################################################### > # Gather data and the lockdumps. > # ####################################################################### > - message = "The process of gathering all the required files will begin before capturing the lockdumps." > - logging.getLogger(MAIN_LOGGER_NAME).info(message) > - for i in range(0,cmdLineOpts.numberOfRuns): > + if (cmdLineOpts.numberOfRuns <= 0): > + message = "The number of runs should be greater than zero." > + exitScript(errorCode=1) > + for i in range(1,(cmdLineOpts.numberOfRuns + 1)): > # The current log count that will start at 1 and not zero to make it > # make sense in logs. > - currentLogRunCount = (i + 1) > # Add clusternode name under each run dir to make combining multple > # clusternode gfs2_lockgather data together and all data in each run directory. > pathToOutputRunDir = os.path.join(pathToOutputDir, "run%d/%s" %(i, clusternode.getClusterNodeName())) > + # Create the the directory that will be used to capture the data. > if (not mkdirs(pathToOutputRunDir)): > exitScript(errorCode=1) > - # Gather various bits of data from the clusternode. > - message = "Gathering some general information about the clusternode %s for run %d/%d." %(clusternode.getClusterNodeName(), currentLogRunCount, cmdLineOpts.numberOfRuns) > + # Set the handler for writing to log file for this run. 
> + currentRunFileHandler = None > + pathToLogFile = os.path.join(pathToOutputRunDir, "%s.log" %(MAIN_LOGGER_NAME)) > + if (((os.access(pathToLogFile, os.W_OK) and os.access("/tmp", os.R_OK))) or (not os.path.exists(pathToLogFile))): > + currentRunFileHandler = logging.FileHandler(pathToLogFile) > + currentRunFileHandler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s", "%Y-%m-%d %H:%M:%S")) > + logging.getLogger(MAIN_LOGGER_NAME).addHandler(currentRunFileHandler) > + message = "Pass (%d/%d): Gathering all the lockdump data." %(i, cmdLineOpts.numberOfRuns) > logging.getLogger(MAIN_LOGGER_NAME).status(message) > + > + # Gather various bits of data from the clusternode. > + message = "Pass (%d/%d): Gathering general information about the host." %(i, cmdLineOpts.numberOfRuns) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > gatherGeneralInformation(pathToOutputRunDir) > - # Trigger sysrq events to capture memory and thread information > - message = "Triggering the sysrq events for the clusternode %s for run %d/%d." %(clusternode.getClusterNodeName(), currentLogRunCount, cmdLineOpts.numberOfRuns) > - logging.getLogger(MAIN_LOGGER_NAME).status(message) > - triggerSysRQEvents() > + # Going to sleep for 2 seconds, so that TIMESTAMP should be in the > + # past in the logs so that capturing sysrq data will be guaranteed. > + time.sleep(2) > + # Gather the backtraces for all the pids, by grabbing the /proc/ + # number> or triggering sysrq events to capture task bask traces > + # from log. > + message = "Pass (%d/%d): Triggering the sysrq events for the host." %(i, cmdLineOpts.numberOfRuns) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + # Gather the data in the /proc/ directory if the file > + # /stack exists. If file exists we will not trigger > + # sysrq events. > + pathToPidData = "/proc" > + if (isProcPidStackEnabled(pathToPidData)): > + gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/"))) > + else: > + triggerSysRQEvents() > # Gather the dlm locks. > lockDumpType = "dlm" > - message = "Gathering the %s lock dumps for clusternode %s for run %d/%d." %(lockDumpType.upper(), clusternode.getClusterNodeName(), currentLogRunCount, cmdLineOpts.numberOfRuns) > - logging.getLogger(MAIN_LOGGER_NAME).status(message) > + message = "Pass (%d/%d): Gathering the %s lock dumps for the host." %(i, cmdLineOpts.numberOfRuns, lockDumpType.upper()) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > gatherDLMLockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames(includeClusterName=False)) > # Gather the glock locks from gfs2. > lockDumpType = "gfs2" > - message = "Gathering the %s lock dumps for clusternode %s for run %d/%d." %(lockDumpType.upper(), clusternode.getClusterNodeName(), currentLogRunCount, cmdLineOpts.numberOfRuns) > - logging.getLogger(MAIN_LOGGER_NAME).status(message) > + message = "Pass (%d/%d): Gathering the %s lock dumps for the host." %(i, cmdLineOpts.numberOfRuns, lockDumpType.upper()) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames()) > # Gather log files > - message = "Gathering the log files for the clusternode %s for run %d/%d." %(clusternode.getClusterNodeName(), currentLogRunCount, cmdLineOpts.numberOfRuns) > - logging.getLogger(MAIN_LOGGER_NAME).status(message) > + message = "Pass (%d/%d): Gathering the log files for the host." 
%(i, cmdLineOpts.numberOfRuns) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > gatherLogs(os.path.join(pathToOutputRunDir, "logs")) > # Sleep between each run if secondsToSleep is greater than or equal > # to 0 and current run is not the last run. > - if ((cmdLineOpts.secondsToSleep >= 0) and (i < (cmdLineOpts.numberOfRuns - 1))): > - message = "The script will sleep for %d seconds between each run of capturing the lockdumps." %(cmdLineOpts.secondsToSleep) > + if ((cmdLineOpts.secondsToSleep >= 0) and (i <= (cmdLineOpts.numberOfRuns))): > + message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(cmdLineOpts.secondsToSleep) > logging.getLogger(MAIN_LOGGER_NAME).info(message) > - message = "The script is sleeping before beginning the next run." > - logging.getLogger(MAIN_LOGGER_NAME).status(message) > time.sleep(cmdLineOpts.secondsToSleep) > + # Remove the handler: > + logging.getLogger(MAIN_LOGGER_NAME).removeHandler(currentRunFileHandler) > + > # ####################################################################### > # Archive the directory that contains all the data and archive it after > # all the information has been gathered. > diff --git a/gfs2/man/Makefile.am b/gfs2/man/Makefile.am > index 83d6251..8655a76 100644 > --- a/gfs2/man/Makefile.am > +++ b/gfs2/man/Makefile.am > @@ -7,4 +7,5 @@ dist_man_MANS = fsck.gfs2.8 \ > gfs2_grow.8 \ > gfs2_jadd.8 \ > mkfs.gfs2.8 \ > - tunegfs2.8 > + tunegfs2.8 \ > + gfs2_lockcapture.8 > diff --git a/gfs2/man/gfs2_lockcapture.8 b/gfs2/man/gfs2_lockcapture.8 > new file mode 100644 > index 0000000..854cd71 > --- /dev/null > +++ b/gfs2/man/gfs2_lockcapture.8 > @@ -0,0 +1,53 @@ > +.TH gfs2_lockcapture 8 > + > +.SH NAME > +gfs2_lockcapture \- will capture locking information from GFS2 file systems and DLM. > + > +.SH SYNOPSIS > +.B gfs2_lockcapture \fR[-dqyt] [-o \fIoutput directory]\fR [-r \fInumber of runs]\fR [-s \fIseconds to sleep]\fR [-n \fIname of GFS2 filesystem]\fP > +.PP > +.B gfs2_lockcapture \fR[-dqyi] > + > +.SH DESCRIPTION > +\fIgfs2_lockcapture\fR is used to capture all the GFS2 lockdump data and > +corresponding DLM data. The command can be configured to capture the data > +multiple times and how much time to sleep between each iteration of capturing > +the data. By default all of the mounted GFS2 filesystems will have their data > +collected unless GFS2 filesystems are specified. > +.PP > +Please note that sysrq -t and -m events are trigger or the pid directories in /proc are > +collected on each iteration of capturing the data. > + > +.SH OPTIONS > +.TP > +\fB-h, --help\fP > +Prints out a short usage message and exits. > +.TP > +\fB-d, --debug\fP > +enables debug logging. > +.TP > +\fB-q, --quiet\fP > +disables logging to console. > +.TP > +\fB-y, --no_ask\fP > +disables all questions and assumes yes. > +.TP > +\fB-i, --info\fP > +prints information about the mounted GFS2 file systems. > +.TP > +\fB-t, --archive\fP > +the output directory will be archived(tar) and compressed(.bz2). > +.TP > +\fB-o \fI, \fB--path_to_output_dir\fR=\fI\fP > +the directory where all the collect data will stored. > +.TP > +\fB-r \fI, \fB--num_of_runs\fR=\fI\fP > +number of runs capturing the lockdump data. > +.TP > +\fB-s \fI, \fB--seconds_sleep\fR=\fI\fP > +number of seconds to sleep between runs of capturing the lockdump data. > +.TP > +\fB-n \fI, \fB--fs_name\fR=\fI\fP > +name of the GFS2 filesystem(s) that will have their lockdump data captured. > +. > +.SH SEE ALSO >
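
For reference, a minimal standalone sketch of the per-pid collection the patch adds (the helper name collect_pid_data and the destination layout are illustrative; the patch's own function is gatherPidData and it copies the same three files per numeric /proc entry):

    import os
    import shutil

    def collect_pid_data(dst_dir, proc_dir="/proc"):
        # Copy cmdline, stack and status for every numeric /proc entry into
        # dst_dir/<pid>/.  Reading /proc/<pid>/stack normally needs root.
        copied = []
        for entry in os.listdir(proc_dir):
            if not entry.isdigit():
                continue
            src = os.path.join(proc_dir, entry)
            dst = os.path.join(dst_dir, entry)
            for name in ("cmdline", "stack", "status"):
                path = os.path.join(src, name)
                if os.path.isfile(path):
                    if not os.path.isdir(dst):
                        os.makedirs(dst)
                    try:
                        shutil.copy(path, os.path.join(dst, name))
                    except (shutil.Error, IOError, OSError):
                        # Processes can exit while we iterate; skip them.
                        pass
            if os.path.isdir(dst):
                copied.append(dst)
        return copied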
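
The sysrq path is only a fallback: when /proc/1/stack is absent the task backtraces cannot be read per pid, so the script asks the kernel to dump them to the kernel log instead. A compact sketch of that fallback is below; the patch itself runs echo via runCommand rather than writing the file directly, and the function name here is illustrative. Triggering sysrq needs root and kernel.sysrq enabled.

    import os

    def trigger_task_dump():
        # Prefer per-pid stacks when the kernel exposes them.
        if os.path.exists("/proc/1/stack"):
            return False
        # Otherwise request a task-state dump ("t") to the kernel log.
        fout = open("/proc/sysrq-trigger", "w")
        try:
            fout.write("t")
        finally:
            fout.close()
        return True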
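
The archiving change simply appends the node name to the tarball, so captures from several cluster nodes can be dropped into one directory without clashing. A sketch of the equivalent step (helper name illustrative, behaviour mirrors archiveData's new filename):

    import os
    import platform
    import tarfile

    def archive_output_dir(path_to_src_dir):
        # Pack <dir> into <dir>-<hostname>.tar.bz2.
        tar_path = "%s-%s.tar.bz2" % (path_to_src_dir, platform.node())
        tar = tarfile.open(tar_path, "w:bz2")
        try:
            tar.add(path_to_src_dir, arcname=os.path.basename(path_to_src_dir))
        finally:
            tar.close()
        return tar_path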
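
Likewise, the logging change replaces the single /tmp log with one log file per run directory: a FileHandler is attached before each pass and removed afterwards. A sketch of that pattern, with the logger name and path as examples rather than the patch's exact values:

    import logging
    import os

    def log_to_run_dir(logger_name, run_dir):
        # Attach a per-run handler; the caller removes it with
        # removeHandler() once the pass has finished.
        handler = logging.FileHandler(os.path.join(run_dir, "%s.log" % logger_name))
        handler.setFormatter(logging.Formatter(
            "%(asctime)s %(levelname)s %(message)s", "%Y-%m-%d %H:%M:%S"))
        logging.getLogger(logger_name).addHandler(handler)
        return handler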