From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 18 Apr 2007 18:14:56 -0000 Subject: [Cluster-devel] cluster/rgmanager/src/resources lvm.sh Message-ID: <20070418181456.18011.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: jbrassow at sourceware.org 2007-04-18 19:14:56 Modified files: rgmanager/src/resources: lvm.sh Log message: Bug 236580: [HA LVM]: Bringing site back on-line after failure causes pr... Setup: - 2 interconnected sites - each site has a disk and a machine - LVM mirroring is used to mirror the disks from the sites When one site fails, the LVM happily moves over to the second site - removing the failed disk from the VG that was part of the failed site. However, when the failed site is restored and the service attempts to move back to the original machine, it fails because of the conflicts in LVM metadata on the disks. This fix allows the LV to be reactivated on the original node by filtering out the devices which have stale metadata (i.e., the device that was removed during the failure). Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/lvm.sh.diff?cvsroot=cluster&r1=1.4&r2=1.5 --- cluster/rgmanager/src/resources/lvm.sh 2007/04/05 15:08:20 1.4 +++ cluster/rgmanager/src/resources/lvm.sh 2007/04/18 18:14:56 1.5 @@ -149,6 +149,78 @@ return $OCF_ERR_GENERIC } +# lvm_exec_resilient +# +# Sometimes, devices can come back. Their metadata will conflict +# with the good devices that remain. 
This function filters out those +# failed devices when executing the given command +# +# Finishing with vgscan resets the cache/filter +lvm_exec_resilient() +{ + declare command=$1 + declare all_pvs + + ocf_log notice "Making resilient : $command" + + if [ -z $command ]; then + ocf_log err "lvm_exec_resilient: Arguments not supplied" + return $OCF_ERR_ARGS + fi + + # pvs will print out only those devices that are valid + # If a device dies and comes back, it will not appear + # in pvs output (but you will get a Warning). + all_pvs=(`pvs --noheadings -o pv_name | grep -v Warning`) + + # Now we use those valid devices in a filter which we set up. + # The device will then be activated because there are no + # metadata conflicts. + command=$command" --config devices{filter=["; + for i in ${all_pvs[*]}; do + command=$command'"a|'$i'|",' + done + command=$command"\"r|.*|\"]}" + + ocf_log notice "Resilient command: $command" + if ! $command ; then + ocf_log err "lvm_exec_resilient failed" + vgscan + return $OCF_ERR_GENERIC + else + vgscan + return $OCF_SUCCESS + fi +} + +# lv_activate_resilient +# +# Sometimes, devices can come back. Their metadata will conflict +# with the good devices that remain. We must filter out those +# failed devices when trying to reactivate +lv_activate_resilient() +{ + declare action=$1 + declare lv_path=$2 + declare op="-ay" + + if [ -z $action ] || [ -z $lv_path ]; then + ocf_log err "lv_activate_resilient: Arguments not supplied" + return $OCF_ERR_ARGS + fi + + if [ $action != "start" ]; then + op="-an" + fi + + if ! lvm_exec_resilient "lvchange $op $lv_path" ; then + ocf_log err "lv_activate_resilient $action failed on $lv_path" + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi +} + # lv_status # # Is the LV active? 
@@ -203,7 +275,7 @@ ocf_log err "WARNING: $my_name does not own $lv_path" ocf_log err "WARNING: Attempting shutdown of $lv_path" - lvchange -an $lv_path + lv_activate_resilient "stop" $lv_path return $OCF_ERR_GENERIC fi @@ -229,15 +301,14 @@ ocf_log err "Unable to add tag to $lv_path" return $OCF_ERR_GENERIC fi - lvchange -ay $lv_path - if [ $? -ne 0 ]; then + + if ! lv_activate_resilient $action $lv_path; then ocf_log err "Unable to activate $lv_path" return $OCF_ERR_GENERIC fi else ocf_log notice "Deactivating $lv_path" - lvchange -an $lv_path - if [ $? -ne 0 ]; then + if ! lv_activate_resilient $action $lv_path; then ocf_log err "Unable to deactivate $lv_path" return $OCF_ERR_GENERIC fi