From mboxrd@z Thu Jan 1 00:00:00 1970 From: Steven Whitehouse Date: Tue, 10 Jan 2012 09:24:44 +0000 Subject: [Cluster-devel] [PATCH] gfs2_utils: Add gfs2_lockgather data gathering script In-Reply-To: <42079BC4-5241-40D8-B30A-BB4C5B874B32@redhat.com> References: <42079BC4-5241-40D8-B30A-BB4C5B874B32@redhat.com> Message-ID: <1326187485.2717.0.camel@menhir> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi, Looks good to me. ACK. Do you need one of us to apply this or are you able to do it directly? Steve. On Mon, 2012-01-09 at 17:52 -0500, Adam Drew wrote: > I wrote a simple data gathering script for GFS2 called gfs2_lockgather. It should help in situations where data about a possible locking or performance issue involving GFS2 is required. It gathers system information, DLM data, glock data, and thread dumps. The data gather can be run on a single node or a single node can run it on all nodes. The data gathered is quite good for diagnosing performance and locking issues. 
> > - Adam > > diff --git a/configure.ac b/configure.ac > index 81ffad8..3fe1a49 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -285,6 +285,7 @@ AC_CONFIG_FILES([Makefile > gfs2/tool/Makefile > gfs2/tune/Makefile > gfs2/man/Makefile > + gfs2/lockgather/Makefile > doc/Makefile > po/Makefile.in > ]) > diff --git a/gfs2/Makefile.am b/gfs2/Makefile.am > index 9116bd3..08e59c4 100644 > --- a/gfs2/Makefile.am > +++ b/gfs2/Makefile.am > @@ -1,4 +1,4 @@ > MAINTAINERCLEANFILES = Makefile.in > > SUBDIRS = libgfs2 convert edit fsck mkfs mount quota tool man \ > - tune include #init.d > + tune include lockgather #init.d > diff --git a/gfs2/lockgather/Makefile.am b/gfs2/lockgather/Makefile.am > new file mode 100644 > index 0000000..fe8b480 > --- /dev/null > +++ b/gfs2/lockgather/Makefile.am > @@ -0,0 +1,12 @@ > +MAINTAINERCLEANFILES = Makefile.in > + > +# When an exec_prefix setting would have us install into /usr/sbin, > +# use /sbin instead. > +# Accept an existing sbindir value of /usr/sbin (probably for older automake), > +# or an empty value, for automake-1.11 and newer. > +sbindir := $(shell rpl=0; test '$(exec_prefix):$(sbindir)' = /usr:/usr/sbin \ > + || test '$(exec_prefix):$(sbindir)' = /usr: && rpl=1; \ > + test $$rpl = 1 && echo /sbin || echo '$(exec_prefix)/sbin') > + > + > +dist_sbin_SCRIPTS = gfs2_lockgather > diff --git a/gfs2/lockgather/gfs2_lockgather b/gfs2/lockgather/gfs2_lockgather > new file mode 100644 > index 0000000..ed4a0c5 > --- /dev/null > +++ b/gfs2/lockgather/gfs2_lockgather > @@ -0,0 +1,129 @@ > +#!/bin/bash > + > +# gfs2_lockgather - A script that gathers data for diagnosing GFS2 locking issues > +# Copyright 2012 Adam Drew > + > +# This program is free software: you can redistribute it and/or modify > +# it under the terms of the GNU General Public License as published by > +# the Free Software Foundation, either version 3 of the License, or > +# (at your option) any later version. 
> + > +# This program is distributed in the hope that it will be useful, > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +# GNU General Public License for more details. > + > +# You should have received a copy of the GNU General Public License > +# along with this program. If not, see <http://www.gnu.org/licenses/>. > + > + > +QUIET=false > + > +#Handle arguments > +for var in "$@" > +do > + #Handle running on all nodes > + if [ $var == "--allnodes" ] || [ $var == "-a" ] ; then > + > + for node in $(ccs_tool lsnode | tail --lines=+5 | grep -v "Cluster name" | grep -v "Nodename" | awk '{print $1}') ; do > + #We gather via SSH on all nodes, even the local node > + #We do this because determining which node name is the > + #node running the script is too much logic to be worth it > + echo "Starting data gathering on $node..." > + ssh -q -f root@$node '/sbin/gfs2_lockgather -q' > + echo "gfs2_lockgather will log a message in /var/log/messages on $node when complete or if there is an error." > + done > + exit 0 > + fi > + > + #Handle quiet mode > + if [ $var == "-q" ] || [ $var == "--quiet" ] ; then > + QUIET=true > + fi > + > + #Handle help request > + if [ $var == "--help" ] || [ $var == "--info" ] || [ $var == "-h" ] ; then > + > + echo "gfs2_lockgather, version 1" > + echo "A script that gathers data for diagnosing GFS2 locking issues." > + echo "---------------------------------------------------------------" > + echo "To gather on a single node invoke the script with no arguments." > + echo "To see this message use --help, --info, or -h." > + echo "To run with messages supressed use --quiet or -q." > + echo "To gather on all nodes invoke the script with --allnodes or -a." > + echo "Only 1 instance of gfs2_lockgather may run on a node at a time." > + echo "" > + exit 0 > + fi > + > +done > + > +#Check for the lock file. We only want one instance running at a time. 
> +if [ -e /var/run/gfs2_lockgather.lock ]; then > + echo -ne 'Error: Lock file /var/run/gfs2_lockgather.lock found.\nAnother instance of gfs2_lockgather may be running.\nAnother node may be running a gather on this node.\n' > + logger -t gfs2_lockgather 'Error: Lock file /var/run/gfs2_lockgather.lock found. Another instance may be running. Quitting.' > + exit 1 > +fi > + > +#Create the gather lock > +touch /var/run/gfs2_lockgather.lock > + > +logger -t gfs2_lockgather 'Gather started.' > + > +if [ $QUIET == false ] ; then echo -ne '[ ] Setting up for gather.\t\t\t\t\t\t\t\t\r' ; fi > +#Get the current datetime for unique naming > +DATETIME=$(date +%m%d%Y-%H%M%S) > + > +#Set up the directory structure > +mkdir /tmp/debugfs > +mount -t debugfs none /tmp/debugfs > +mkdir /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata > +mkdir /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/run1 > +mkdir /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/run2 > + > +if [ $QUIET == false ] ; then echo -ne '[# ] Gathering environment data.\t\t\t\t\t\t\t\t\r' ; fi > +#Gather some basics > +clustat > /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/clustat.out > +cman_tool services > /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/clustat.out > +mount -l > /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/mount-l.out > +ps aux > /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/ps-aux.out > +uname -a > /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/uname-a.out > + > +if [ $QUIET == false ] ; then echo -ne '[## ] Gathering GFS2 and DLM lock data: pass 1\t\t\t\t\t\t\t\t\r' ; fi > +#Glock and DLM lock dump 1 > +for dlmfile in $(ls -lsv /tmp/debugfs/dlm/ | grep -v total | awk '{print $10}') ; do dd if=/tmp/debugfs/dlm/$dlmfile bs=1024M of=/tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/run1/$dlmfile &> /dev/null; done > +for fs in $(ls -lsv /tmp/debugfs/gfs2/ | grep -v total | awk '{print $10}') ; do dd if=/tmp/debugfs/gfs2/$fs/glocks bs=1024M of=/tmp/$(hostname)-$(echo 
$DATETIME)-gfshangdata/run1/$fs-glocks &> /dev/null; done > + > +#Enable and trigger sysrq > +echo 1 > /proc/sys/kernel/sysrq > + > +#Thread Dump > +#This is much faster than waiting for syslog to dump the thread dumps to the messages log > +if [ $QUIET == false ] ; then echo -ne '[### ] Gathering thread dumps.\t\t\t\t\t\t\t\t\r' ; fi > + > +$( > +cat /proc/kmsg > /tmp/thread-dumps & > +echo 't' > /proc/sysrq-trigger > +sleep 10 > +kill -9 $! > +) > + > +if [ $QUIET == false ] ; then echo -ne '[#### ] Gathering GFS2 and DLM lock data: pass 2.\t\t\t\t\t\t\t\t\r' ; fi > +#Glock and DLM dump 2 > +for dlmfile in $(ls -lsv /tmp/debugfs/dlm/ | grep -v total | awk '{print $10}') ; do dd if=/tmp/debugfs/dlm/$dlmfile bs=1024M of=/tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/run2/$dlmfile &> /dev/null; done > +for fs in $(ls -lsv /tmp/debugfs/gfs2/ | grep -v total | awk '{print $10}') ; do dd if=/tmp/debugfs/gfs2/$fs/glocks bs=1024M of=/tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/run2/$fs-glocks &> /dev/null; done > + > +if [ $QUIET == false ] ; then echo -ne '[##### ] Gathering messages logs\t\t\t\t\t\t\t\t\r' ; fi > +#Get the messages log file > +cp /var/log/messages /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/ > + > +#Tar up the results and clean up temporary files > +if [ $QUIET == false ] ; then echo -ne '[###### ] Cleaning up... 80%.\t\t\t\t\t\t\t\t\r' ; fi > +tar cjf /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata.tar.bz /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata/ &> /dev/null > +umount /tmp/debugfs/ > +rm -f /var/run/gfs2_lockgather.lock > +rm -rf /tmp/debugfs > +rm -rf /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata > +logger -t gfs2_lockgather "Gather completed. File is /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata.tar.bz" > +if [ $QUIET == false ] ; then echo -ne "[#######] Done. 
File is /tmp/$(hostname)-$(echo $DATETIME)-gfshangdata.tar.bz\r\t\t\t\t\t\t\t\t\r\n" ; fi > +exit 0 > diff --git a/gfs2/man/Makefile.am b/gfs2/man/Makefile.am > index 0f132d6..648ed84 100644 > --- a/gfs2/man/Makefile.am > +++ b/gfs2/man/Makefile.am > @@ -9,5 +9,6 @@ dist_man_MANS = fsck.gfs2.8 \ > gfs2_quota.8 \ > gfs2_tool.8 \ > mkfs.gfs2.8 \ > + gfs2_lockgather.8 \ > mount.gfs2.8 \ > tunegfs2.8 > diff --git a/gfs2/man/gfs2_lockgather.8 b/gfs2/man/gfs2_lockgather.8 > new file mode 100644 > index 0000000..3cd8b9c > --- /dev/null > +++ b/gfs2/man/gfs2_lockgather.8 > @@ -0,0 +1,26 @@ > +.TH gfs2_lockgather 8 > + > +.SH NAME > +gfs2_lockgather - Gathers data for diagnosing GFS2 locking issues > + > +.SH SYNOPSIS > +.B gfs2_lockgather > +[\fIOPTIONS\fR] > + > +.SH DESCRIPTION > +gfs2_lockgather will gather data that is useful for diagnosing performance and locking issues > +involving GFS2 filesystems. The script gathers basic system and cluster data such as rpm output, > +kernel version, thread dumps from all processes, and 2 passes of glock and DLM locking data. After > +the data is gathered it is stored in a tarball under /tmp. The script can be invoked to gather > +data from a single node, or to gather data from all nodes via ssh. > +.SH OPTIONS > +.TP > +\fB-h, --help, --info\fP > +Display help and usage information. > +.TP > +\fB-q, --quiet\fP > +Quiet mode. Run with output suppressed. > +.TP > +\fB-a, --allnodes\fP > +Gather data from all nodes via ssh. > + > > > >