From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 4 Dec 2007 21:59:54 -0000 Subject: [Cluster-devel] cluster/rgmanager/src/resources Makefile netfs ... Message-ID: <20071204215954.7316.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: lhh at sourceware.org 2007-12-04 21:59:54 Modified files: rgmanager/src/resources: Makefile netfs.sh Added files: rgmanager/src/resources: default_event_script.sl Log message: Port force-unmount from RHEL4 branch Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/default_event_script.sl.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.13.2.6&r2=1.13.2.7 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/netfs.sh.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.7.2.2&r2=1.7.2.3 /cvs/cluster/cluster/rgmanager/src/resources/default_event_script.sl,v --> standard output revision 1.1.2.1 --- cluster/rgmanager/src/resources/default_event_script.sl +++ - 2007-12-04 21:59:54.602852000 +0000 @@ -0,0 +1,291 @@ +define node_in_set(node_list, node) +{ + variable x, len; + + len = length(node_list); + for (x = 0; x < len; x++) { + if (node_list[x] == node) + return 1; + } + + return 0; +} + +define move_or_start(service, node_list) +{ + variable len; + variable state, owner; + variable depends; + + depends = service_property(service, "depend"); + if (depends != "") { + (owner, state) = service_status(depends); + if (owner < 0) { + debug(service, " is not runnable; dependency not met"); + return ERR_DEPEND; + } + } + + (owner, state) = service_status(service); + debug("Evaluating ", service, " state=", state, " owner=", owner); + + len = length(node_list); + if (len == 0) { 
+ debug(service, " is not runnable"); + return ERR_DOMAIN; + } + + if (((event_type != EVENT_USER) and (state == "disabled")) or (state == "failed")) { + % + % Commenting out this block will -not- allow you to + % recover failed services from event scripts. Sorry. + % All it will get you is a false log message about + % starting this service. + % + % You may enable disabled services, but I recommend + % against it. + % + debug(service, " is not runnable"); + return -1; + } + + if (node_list[0] == owner) { + debug(service, " is already running on best node"); + return ERR_RUNNING; + } + + if ((owner >= 0) and (node_in_set(node_list, owner) == 1)) { + notice("Moving ", service, " from ", owner, + " to ", node_list); + if (service_stop(service) < 0) { + return ERR_ABORT; + } + } else { + notice("Starting ", service, " on ", node_list); + } + + return service_start(service, node_list); +} + + +% +% Returns the set of online nodes in preferred/shuffled order which +% are allowed to run this service. Gives highest preference to current +% owner if nofailback is specified. +% +define allowed_nodes(service) +{ + variable anodes; + variable online; + variable nodes_domain; + variable ordered, restricted, nofailback; + variable state, owner; + variable depends; + + (nofailback, restricted, ordered, nodes_domain) = + service_domain_info(service); + + (owner, state) = service_status(service); + + anodes = nodes_online(); + + % Shuffle the array so we don't start all services on the same + % node. TODO - add RR, Least-services, placement policies... + online = shuffle(anodes); + + if (restricted == 1) { + anodes = intersection(nodes_domain, online); + } else { + % Ordered failover domains (nodes_domain) unioned with the + % online nodes basically just reorders the online node list + % according to failover domain priority rules. 
+ anodes = union(intersection(nodes_domain, online), + online); + } + + if ((nofailback == 1) or (ordered == 0)) { + + if ((owner < 0) or (node_in_set(anodes, owner) == 0)) { + return anodes; + } + + % Because union takes left as priority, we can + % return the union of the current owner with the + % allowed node list. This means the service will + % remain on the same node it's currently on. + return union(owner, anodes); + } + + return anodes; +} + + +define default_node_event_handler() +{ + variable services = service_list(); + variable x; + variable nodes; + + % debug("Executing default node event handler"); + for (x = 0; x < length(services); x++) { + nodes = allowed_nodes(services[x]); + ()=move_or_start(services[x], nodes); + } +} + + +define default_service_event_handler() +{ + variable services = service_list(); + variable x; + variable depends; + variable policy; + variable nodes; + variable tmp; + variable owner; + variable state; + + % debug("Executing default service event handler"); + + if (service_state == "recovering") { + + policy = service_property(service_name, "recovery"); + debug("Recovering", + " Service: ", service_name, + " Last owner: ", service_last_owner, + " Policy: ", policy); + + if (policy == "disable") { + () = service_stop(service_name, 1); + return; + } + + nodes = allowed_nodes(service_name); + if (policy == "restart") { + tmp = union(service_last_owner, nodes); % last owner gets first try + } else { + % relocate: every other allowed node first, last owner only as fallback + tmp = subtract(nodes, service_last_owner); + nodes = tmp; + tmp = union(nodes, service_last_owner); + } + + ()=move_or_start(service_name, tmp); % tmp is the policy-ordered list built above + + return; + } + + for (x = 0; x < length(services); x++) { + if (service_name == services[x]) { + % don't do anything to ourself! 
+ continue; + } + + % + % Simplistic dependency handling + % + depends = service_property(services[x], "depend"); + + % No dependency; do nothing + if (depends != service_name) { + continue; + } + + (owner, state) = service_status(services[x]); + if ((service_state == "started") and (owner < 0)) { + info("Dependency met; starting ", services[x]); + nodes = allowed_nodes(services[x]); + ()=move_or_start(services[x], nodes); + } + + % service died - stop service(s) that depend on the dead + if ((service_owner < 0) and (owner >= 0)) { + info("Dependency lost; stopping ", services[x]); + ()=service_stop(services[x]); + } + } +} + +define default_config_event_handler() +{ + % debug("Executing default config event handler"); +} + +define default_user_event_handler() +{ + variable ret; + variable nodes; + variable reordered; + variable x; + variable target = user_target; + variable found = 0; + variable owner, state; + + nodes = allowed_nodes(service_name); + (owner, state) = service_status(service_name); + + if (user_request == USER_RESTART) { + + if (owner >= 0) { + reordered = union(owner, nodes); + nodes = reordered; + } + + notice("Stopping ", service_name, " for relocate to ", nodes); + + found = service_stop(service_name); + if (found < 0) { + return ERR_ABORT; + } + + ret = move_or_start(service_name, nodes); + + } else if ((user_request == USER_RELOCATE) or + (user_request == USER_ENABLE)) { + + if (user_target > 0) { + for (x = 0; x < length(nodes); x++) { + if (nodes[x] == user_target) { + reordered = union(user_target, nodes); + nodes = reordered; + found = 1; + } + } + + if (found == 0) { + warning("User specified node ", user_target, + " is offline"); + } + } + + if ((owner >= 0) and (user_request == USER_RELOCATE)) { + if (service_stop(service_name) < 0) { + return ERR_ABORT; + } + } + + ret = move_or_start(service_name, nodes); + + } else if (user_request == USER_DISABLE) { + + ret = service_stop(service_name, 1); + + } else if (user_request == USER_STOP) 
{ + + ret = service_stop(service_name); + + } + % todo - migrate + + return ret; +} + +if (event_type == EVENT_NODE) + default_node_event_handler(); +if (event_type == EVENT_SERVICE) + default_service_event_handler(); +if (event_type == EVENT_CONFIG) + default_config_event_handler(); +if (event_type == EVENT_USER) + user_return=default_user_event_handler(); + --- cluster/rgmanager/src/resources/Makefile 2007/07/12 11:23:16 1.13.2.6 +++ cluster/rgmanager/src/resources/Makefile 2007/12/04 21:59:54 1.13.2.7 @@ -34,6 +34,9 @@ utils/httpd-parse-config.pl utils/tomcat-parse-config.pl \ utils/member_util.sh +EVENT_TARGETS= \ + default_event_script.sl + all: install: all @@ -44,6 +47,7 @@ install $(TARGETS) ${sharedir} install $(UTIL_TARGETS) ${sharedir}/utils install -m 644 $(METADATA) ${sharedir} + install -m 644 $(EVENT_TARGETS) ${sharedir} uninstall: ${UNINSTALL} ${UTIL_TARGETS} ${sharedir}/utils --- cluster/rgmanager/src/resources/netfs.sh 2007/10/03 16:44:15 1.7.2.2 +++ cluster/rgmanager/src/resources/netfs.sh 2007/12/04 21:59:54 1.7.2.3 @@ -348,6 +348,112 @@ return $NO } +# +# killMountProcesses mount_point +# +# Using lsof or fuser try to unmount the mount by killing of the processes +# that might be keeping it busy. +# +killMountProcesses() +{ + typeset -i ret=$SUCCESS + typeset have_lsof="" + typeset have_fuser="" + typeset try + + if [ $# -ne 1 ]; then + ocf_log err \ + "Usage: killMountProcesses mount_point" + return $FAIL + fi + + typeset mp=$1 + + ocf_log notice "Forcefully unmounting $mp" + + # + # Not all distributions have lsof. If not use fuser. If it + # does, try both. 
+ # + file=$(which lsof 2>/dev/null) + if [ -f "$file" ]; then + have_lsof=$YES + fi + + file=$(which fuser 2>/dev/null) + if [ -f "$file" ]; then + have_fuser=$YES + fi + + if [ -z "$have_lsof" -a -z "$have_fuser" ]; then + ocf_log warn \ + "Cannot forcefully unmount $mp; cannot find lsof or fuser commands" + return $FAIL + fi + + for try in 1 2 3; do + if [ -n "$have_lsof" ]; then + # + # Use lsof to free up mount point + # + while read command pid user + do + if [ -z "$pid" ]; then + continue + fi + + if [ $try -eq 1 ]; then + ocf_log warn \ + "killing process $pid ($user $command $mp)" + elif [ $try -eq 3 ]; then + ocf_log crit \ + "Could not clean up mountpoint $mp" + ret=$FAIL + fi + + if [ $try -gt 1 ]; then + kill -9 $pid + else + kill -TERM $pid + fi + done < <(lsof -w -bn 2>/dev/null | \ + grep -w -E "$mp(/.*|)\$" | \ + awk '{print $1,$2,$3}' | \ + sort -u -k 1,3) + elif [ -n "$have_fuser" ]; then + # + # Use fuser to free up mount point + # + while read command pid user + do + if [ -z "$pid" ]; then + continue + fi + + if [ $try -eq 1 ]; then + ocf_log warn \ + "killing process $pid ($user $command $mp)" + elif [ $try -eq 3 ]; then + ocf_log crit \ + "Could not clean up mount point $mp" + ret=$FAIL + fi + + if [ $try -gt 1 ]; then + kill -9 $pid + else + kill -TERM $pid + fi + done < <(fuser -vm $mp | \ + grep -v PID | \ + sed 's;^'$mp';;' | \ + awk '{print $4,$2,$1}' | \ + sort -u -k 1,3) + fi + done + + return $ret +} # # startNFSFilesystem @@ -498,8 +604,8 @@ # if [ -n "$mp" ]; then case ${OCF_RESKEY_force_unmount} in - $YES_STR) force_umount="-f" ;; - 0) force_umount="-f" ;; + $YES_STR) force_umount="$YES" ;; + 1) force_umount="$YES" ;; *) force_umount="" ;; esac fi @@ -507,6 +613,7 @@ # # Unmount # + while [ ! "$done" ]; do isMounted $fullpath $mp case $? in $NO) @@ -519,26 +626,46 @@ ;; $YES) sync; sync; sync - ocf_log info "unmounting $fullpath ($mp)" + ocf_log info "unmounting $mp" - umount $force_umount $mp + umount $mp if [ $? 
-eq 0 ]; then - return $SUCCESS + umount_failed= + done=$YES + continue fi umount_failed=yes + if [ "$force_umount" ]; then + killMountProcesses $mp + fi + + if [ $try -ge $max_tries ]; then + done=$YES + else + sleep $sleep_time + let try=try+1 + fi ;; *) return $FAIL ;; esac + if [ $try -ge $max_tries ]; then + done=$YES + else + sleep $sleep_time + let try=try+1 + fi + done # while if [ -n "$umount_failed" ]; then ocf_log err "'umount $fullpath' failed ($mp), error=$ret_val" return $FAIL fi + return $SUCCESS }