From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Price Date: Mon, 19 Aug 2013 16:11:37 +0100 Subject: [Cluster-devel] [PATCH] gfs2_lockcapture: Fixed rhbz#987019, modified option -o, added private option -R, removed lsof data capture. In-Reply-To: <1376921448-3940-1-git-send-email-sbradley@redhat.com> References: <1376921448-3940-1-git-send-email-sbradley@redhat.com> Message-ID: <521235A9.1060603@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi Shane, The shortlog is far too long and references a bz#. Good shortlogs and commit logs are discussed in doc/README.contributing so please read that. On 19/08/13 15:10, sbradley at redhat.com wrote: > From: Shane Bradley > > Fix the header to use absolute path for rhbz#987019. The "-o" has been modified > so that it takes a path that will be used to create the root directory of where > the data will be written and will be location of the tarball that is > created. Added undocumented(private) "-R" option that will only capture required The patch seems to make the -P option undocumented too. Why do they have to be undocumented/"private"? Also, you've referenced -R in several places but the patch only adds an -m option...? Andy > data. Removed the data capture of the command "lsof" cause it might cause hangs > when capturing lockdumps cause of symlink lookup. > > Signed-off-by: Shane Bradley > --- > gfs2/scripts/gfs2_lockcapture | 181 +++++++++++++++++++++++------------------ > 1 files changed, 101 insertions(+), 80 deletions(-) > > diff --git a/gfs2/scripts/gfs2_lockcapture b/gfs2/scripts/gfs2_lockcapture > index 81a0aeb..f8ace76 100644 > --- a/gfs2/scripts/gfs2_lockcapture > +++ b/gfs2/scripts/gfs2_lockcapture > @@ -1,4 +1,4 @@ > -#!/usr/bin/env python > +#!/usr/bin/python > """ > The script "gfs2_lockcapture" will capture locking information from GFS2 file > systems and DLM. > @@ -13,7 +13,7 @@ import os > import os.path > import logging > import logging.handlers > -from optparse import OptionParser, Option > +from optparse import OptionParser, Option, SUPPRESS_HELP > import time > import platform > import shutil > @@ -34,7 +34,7 @@ import tarfile > sure only 1 instance of this script is running at any time. > @type PATH_TO_PID_FILENAME: String > """ > -VERSION_NUMBER = "0.9-7" > +VERSION_NUMBER = "0.9-8" > MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0])) > PATH_TO_DEBUG_DIR="/sys/kernel/debug" > PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0])) > @@ -190,12 +190,11 @@ def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, stand > commandOptionString = "" > for option in listOfCommandOptions: > commandOptionString += "%s " %(option) > - message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString) > - if (len(stdout) > 0): > - message += stdout > - message += "\n" > - if (len(stderr) > 0): > - message += stderr > + message = "An error occurred running the command: $ %s %s" %(command, commandOptionString) > + if (len(stdout.rstrip()) > 0): > + message += "\n%s" %(stdout.rstrip()) > + if (len(stderr.rstrip()) > 0): > + message += "\n%s" %(stderr.rstrip()) > logging.getLogger(MAIN_LOGGER_NAME).error(message) > return False > > @@ -232,12 +231,11 @@ def runCommandOutput(command, listOfCommandOptions, standardOut=subprocess.PIPE, > commandOptionString = "" > for option in listOfCommandOptions: > commandOptionString += "%s " %(option) > - message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString) > - if (len(stdout) > 0): > - message += stdout > - message += "\n" > - if (len(stderr) > 0): > - message += stderr > + message = "An error occurred running the command: $ %s %s" %(command, commandOptionString) > + if (len(stdout.rstrip()) > 0): > + message += "\n%s" %(stdout.rstrip()) > + if (len(stderr.rstrip()) > 0): > + message += "\n%s" %(stderr.rstrip()) > logging.getLogger(MAIN_LOGGER_NAME).error(message) > return None > return stdout.strip().rstrip() > @@ -790,12 +788,11 @@ def getLabelMapForMountedFilesystems(clusterName, listOfMountedFilesystems): > # ##################################################################### > # Gather output from command functions > # ##################################################################### > -def gatherGeneralInformation(pathToDSTDir): > +def gatherHostData(pathToDSTDir): > """ > This function will gather general information about the cluster and write > the results to a file. The following data will be captured: hostname, date, > - uname -a, uptime, contents of /proc/mounts, and ps h -AL -o tid,s,cmd. > - > + uname -a, uptime. > > @param pathToDSTDir: This is the path to directory where the files will be > written to. > @@ -811,19 +808,16 @@ def gatherGeneralInformation(pathToDSTDir): > systemString += "UPTIME=%s" %(stdout) > writeToFile(os.path.join(pathToDSTDir, "hostinformation.txt"), systemString, createFile=True) > > - # Copy misc files > - pathToSrcFile = "/proc/mounts" > - copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > - pathToSrcFile = "/proc/slabinfo" > - copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > +def gatherDiagnosticData(pathToDSTDir): > + """ > + This function will gather general information about the cluster and write (or > + copy) the results to a file. > > - # Copy the DLM hash table sizes: > - pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size", > - "/sys/kernel/config/dlm/cluster/rsbtbl_size"] > - for pathToSrcFile in pathToHashTableFiles: > - if (os.path.exists(pathToSrcFile)): > - copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > + @param pathToDSTDir: This is the path to directory where the files will be > + written to. > + @type pathToDSTDir: String > > + """ > # Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data. > # Get " ps h -AL -o tid,s,cmd > command = "ps" > @@ -837,6 +831,28 @@ def gatherGeneralInformation(pathToDSTDir): > message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput) > logging.getLogger(MAIN_LOGGER_NAME).error(message) > > + # Copy misc files > + pathToSrcFile = "/proc/mounts" > + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > + pathToSrcFile = "/proc/slabinfo" > + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > + > + # Copy the DLM hash table sizes: > + pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size", > + "/sys/kernel/config/dlm/cluster/rsbtbl_size"] > + for pathToSrcFile in pathToHashTableFiles: > + if (os.path.exists(pathToSrcFile)): > + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) > + > +def gatherOptionalDiagnosticData(pathToDSTDir): > + """ > + This function will gather optional information about the cluster and write > + the results to a file. > + > + @param pathToDSTDir: This is the path to directory where the files will be > + written to. > + @type pathToDSTDir: String > + """ > # Get df -h ouput > command = "df" > pathToCommandOutput = os.path.join(pathToDSTDir, "df-h.cmd") > @@ -848,17 +864,6 @@ def gatherGeneralInformation(pathToDSTDir): > message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput) > logging.getLogger(MAIN_LOGGER_NAME).error(message) > > - # Get lsof ouput > - command = "lsof" > - pathToCommandOutput = os.path.join(pathToDSTDir, "lsof.cmd") > - try: > - fout = open(pathToCommandOutput, "w") > - runCommand(command, [], standardOut=fout) > - fout.close() > - except IOError: > - message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput) > - logging.getLogger(MAIN_LOGGER_NAME).error(message) > - > # Write the status of all the nodes in the cluster out. > if (runCommand("which", ["cman_tool"])): > command = "cman_tool" > @@ -1087,7 +1092,12 @@ def __getOptions(version) : > cmdParser.add_option("-P", "--disable_process_gather", > action="store_true", > dest="disableProcessGather", > - help="the gathering of process information will be disabled", > + help=SUPPRESS_HELP, > + default=False) > + cmdParser.add_option("-m", "--diagnostic_data", > + action="store_true", > + dest="enableDiagnosticData", > + help=SUPPRESS_HELP, > default=False) > cmdParser.add_option("-o", "--path_to_output_dir", > action="store", > @@ -1095,7 +1105,7 @@ def __getOptions(version) : > help="the directory where all the collect data will be stored", > type="string", > metavar="", > - default="") > + default="/tmp") > cmdParser.add_option("-r", "--num_of_runs", > action="store", > dest="numberOfRuns", > @@ -1264,7 +1274,7 @@ if __name__ == "__main__": > message = "Debugging has been enabled." > logging.getLogger(MAIN_LOGGER_NAME).debug(message) > if (cmdLineOpts.disableLoggingToConsole): > - logging.disable(logging.CRITICAL) > + streamHandler.setLevel(logging.CRITICAL) > # ####################################################################### > # Check to see if pid file exists and error if it does. > # ####################################################################### > @@ -1305,7 +1315,7 @@ if __name__ == "__main__": > # ####################################################################### > # Verify they want to continue because this script will trigger sysrq events. > # ####################################################################### > - if (not cmdLineOpts.disableQuestions): > + if (not cmdLineOpts.disableQuestions and not cmdLineOpts.disableProcessGather): > valid = {"yes":True, "y":True, "no":False, "n":False} > question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?" > prompt = " [y/n] " > @@ -1326,14 +1336,11 @@ if __name__ == "__main__": > # Create the output directory to verify it can be created before > # proceeding unless it is already created from a previous run data needs > # to be analyzed. Probably could add more debugging on if file or dir. > - # ####################################################################### > - pathToOutputDir = cmdLineOpts.pathToOutputDir > - if (not len(pathToOutputDir) > 0): > - pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), os.path.basename(sys.argv[0])))) > - # ####################################################################### > + > # Backup any existing directory with same name as current output > # directory. > # ####################################################################### > + pathToOutputDir = "%s" %(os.path.join(cmdLineOpts.pathToOutputDir, "%s-%s" %(os.path.basename(sys.argv[0]), time.strftime("%Y-%m-%d")))) > if (backupOutputDirectory(pathToOutputDir)): > message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir) > logging.getLogger(MAIN_LOGGER_NAME).info(message) > @@ -1388,38 +1395,13 @@ if __name__ == "__main__": > logging.getLogger(MAIN_LOGGER_NAME).status(message) > > # Gather various bits of data from the clusternode. > - message = "Pass (%d/%d): Gathering general information about the host." %(i, cmdLineOpts.numberOfRuns) > + message = "Pass (%d/%d): Gathering simple data about the host." %(i, cmdLineOpts.numberOfRuns) > logging.getLogger(MAIN_LOGGER_NAME).debug(message) > - gatherGeneralInformation(pathToOutputRunDir) > + gatherHostData(pathToOutputRunDir) > # Write the clusternode name and id to the general information file. > writeToFile(os.path.join(pathToOutputRunDir, "hostinformation.txt"), > "NODE_NAME=%s\nNODE_ID=%d" %(clusternode.getClusterNodeName(), clusternode.getClusterNodeID()), > appendToFile=True, createFile=True) > - > - # Going to sleep for 2 seconds, so that TIMESTAMP should be in the > - # past in the logs so that capturing sysrq data will be guaranteed. > - time.sleep(2) > - > - # If enabled then gather the process data. > - if (not cmdLineOpts.disableProcessGather): > - # Gather the backtraces for all the pids, by grabbing the /proc/ - # number> or triggering sysrq events to capture task bask traces > - # from log. > - # Gather the data in the /proc/ directory if the file > - # /stack exists. If file exists we will not trigger > - # sysrq events. > - > - # Should I gather anyhow and only capture sysrq if needed. > - pathToPidData = "/proc" > - if (isProcPidStackEnabled(pathToPidData)): > - message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData) > - logging.getLogger(MAIN_LOGGER_NAME).debug(message) > - gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/"))) > - else: > - message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." %(i, cmdLineOpts.numberOfRuns) > - logging.getLogger(MAIN_LOGGER_NAME).debug(message) > - triggerSysRQEvents() > - > # ####################################################################### > # Gather the DLM data and lock-dumps > # ####################################################################### > @@ -1444,16 +1426,55 @@ if __name__ == "__main__": > logging.getLogger(MAIN_LOGGER_NAME).debug(message) > if(gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames())): > exitCode = 0 > + # If enabled then gather the process data. This will be included even if -R option is enabled. > + if (not cmdLineOpts.disableProcessGather): > + # Gather the backtraces for all the pids, by grabbing the /proc/ + # number> or triggering sysrq events to capture task bask traces > + # from log. > + # Gather the data in the /proc/ directory if the file > + # /stack exists. If file exists we will not trigger > + # sysrq events. > + > + # Should I gather anyhow and only capture sysrq if needed. > + pathToPidData = "/proc" > + if (isProcPidStackEnabled(pathToPidData)): > + message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/"))) > + else: > + message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." %(i, cmdLineOpts.numberOfRuns) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + triggerSysRQEvents() > + > # Gather log files > message = "Pass (%d/%d): Gathering the log files for the host." %(i, cmdLineOpts.numberOfRuns) > logging.getLogger(MAIN_LOGGER_NAME).debug(message) > gatherLogs(os.path.join(pathToOutputRunDir, "logs")) > + > + # Gather diagnostic data > + message = "Pass (%d/%d): Gathering diagnostic data about the host." %(i, cmdLineOpts.numberOfRuns) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + gatherDiagnosticData(pathToOutputRunDir) > + if (cmdLineOpts.enableDiagnosticData): > + # Gather diagnostic data > + message = "Pass (%d/%d): Gathering optional diagnostic data about the host." %(i, cmdLineOpts.numberOfRuns) > + logging.getLogger(MAIN_LOGGER_NAME).debug(message) > + gatherOptionalDiagnosticData(pathToOutputRunDir) > + > + # ####################################################################### > + # Sleep for X seconds between runs > + # ####################################################################### > # Sleep between each run if secondsToSleep is greater than or equal > - # to 0 and current run is not the last run. > - if ((cmdLineOpts.secondsToSleep >= 0) and (i < (cmdLineOpts.numberOfRuns))): > - message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(cmdLineOpts.secondsToSleep) > + # to 0 and current run is not the last run. Add 2 seconds to each sleep so > + # that we know that there is a timestamp difference in logs between runs. > + # The minimal sleep is 2 seconds. > + secondsToSleep = cmdLineOpts.secondsToSleep + 2 > + if (secondsToSleep < 2): > + secondsToSleep = 2 > + if (i < cmdLineOpts.numberOfRuns): > + message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(secondsToSleep) > logging.getLogger(MAIN_LOGGER_NAME).info(message) > - time.sleep(cmdLineOpts.secondsToSleep) > + time.sleep(secondsToSleep) > # Remove the handler: > logging.getLogger(MAIN_LOGGER_NAME).removeHandler(currentRunFileHandler) > >