From mboxrd@z Thu Jan 1 00:00:00 1970 From: olivier arsac Date: Mon, 13 Aug 2007 09:49:23 +0000 Subject: Re: [LARTC] How to check an inactive slave in a bond? Message-Id: <46C02923.1010904@arsac.org> List-Id: References: <469F6422.5060303@arsac.org> In-Reply-To: <469F6422.5060303@arsac.org> MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit To: lartc@vger.kernel.org Thank you for this very complete answer. - Assuming I really have to implement some sort of inactive slave-link check. - Assuming it is acceptable to remove the inactive slave from the bound for the duration of the check. Could you help me check my script? It works well for me but as I'm about to deploy it for production purpose I'd rather have a double check from you guys. (Note: I'm not reliable when it comes to (among other things) routing and network related topics) Thx. Olivier --------------------------------------------------------------------------------------- #!/bin/bash # Check all nics enslaved in a bond. # This is a way to check that all nics (including inactive ones) are working properly. # # Authors: # OA: Olivier Arsac # History: # 19/04/2007: OA scratch # 31/06/2007: OA better handling of "free" IPs used during test # TODO: # remove all TODOs from the script #set -x # try to be robust -> exit if a variable is not set (probably something went wrong) set -o nounset trap clean INT TERM PATH=/exploit/local/sbin:/exploit/local/bin:/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/exploit/unix/prod/bin com=`basename "$0"` fullcom="$*" usage() { echo "Usage: $com [-q] [-i ip] [-t target] [bond]" echo " Check all nics enslaved to a bond." echo " This is a way to check that all nics (including inactive ones) are working properly. You should check that periodicaly to avoid nasty surprises when your active nic stops working and you have to fallback to your (unchecked) slave one." echo " exit 0 if all is OK (or if no bond is present)." echo " -q: quiet (no verbose message for human operator)." echo " -i: ip to use during check of inactive slaves." echo " -t: target ip to ping during checks." echo "eg: $com" echo " check all nics from all bonds." echo "eg: $com -q bond0" echo " check silently all nics from bond0." } quiet=0 ip="" target="" while getopts "qi:t:" option do case $option in q) quiet=1;; i) ip=$OPTARG;; t) target=$OPTARG;; *) usage; exit 1;; esac done # drop what has been parsed by getopts shift `expr $OPTIND - 1` # get args if [ "$#" -ne 0 ] then bonds="$@" for bond in $bonds; do if [ ! -f /proc/net/bonding/$bond ]; then echoe "Error: $bond is not a valid bond." exit 6 fi done else bonds=`ls /proc/net/bonding/ 2>/dev/null` fi #match a MAC address re_mac="([a-zA-Z0-9][a-zA-Z0-9]:){5}[a-zA-Z0-9][a-zA-Z0-9]" re_ip="(([0-9]{3}[.]){3}[0-9]{3}" function echoe(){ echo "$@" >/dev/stderr } function echoq(){ if [ $quiet -eq 0 ] ; then echo "$@"; fi } # set a valid mac to a nic # (must get a mac from the slaves in bond but one that is not currently in use by the bond) get_free_mac_ret="" function get_free_mac(){ bond=$1 nic=$2 free_mac="" macs=`grep "Permanent HW addr:" /proc/net/bonding/$bond | egrep -o $re_mac| tr 'a-z' 'A-Z'` bond_mac=`ifconfig $bond | grep HWaddr | egrep -o $re_mac` for mac in $macs; do if [ "$mac" != "$bond_mac" ]; then free_mac=$mac fi done get_free_mac_ret=$free_mac } # ping a target using a specified nic to test for IP connectivity check_nic_ret=0 function check_nic(){ target=$1 if [ $# -ge 2 ]; then nic=$2 ping -n -c 3 -I $nic $target 1>/dev/null 2>/dev/null else ping -n -c 3 $target 1>/dev/null 2>/dev/null fi if [ $? -ne 0 ]; then ma="" if [ $# -ge 3 ]; then ma="(using $3 as ip)" fi echoq " [ERROR]" echo "$nic interface on $host is not working properly! $ma" > /dev/stderr check_nic_ret=1 else echoq " [OK]" check_nic_ret=0 fi } # arping a target using a specified nic to test for IP connectivity function exercise_nic_arp(){ target=$1 nic=$2 src_ip=$3 arping -c 3 -s "$src_ip" -I "$nic" "$target" 1>/dev/null 2>/dev/null } # reset a properly configured bond if someone interrupts the script clean_bond="" clean_nic="" function clean(){ echoq echoq "Script interrupted, restoring bond." if [ ! -z $clean_bond ] && [ ! -z $clean_nic ]; then ifenslave $clean_bond $clean_nic 2>/dev/null fi exit 2 } host=`hostname -s` table 0 if [ ! -d /proc/net/bonding ]; then echoe "Warning: Module bonding not loaded. Obviously no bond to check." #trying to check a bond on a server where none is present is probably not realy an error -> exit 0 with a warning message exit 0 fi if [ -z $target ]; then # no target given as parameter -> auto-detect # get the default gateway as a ping target target=`route -n | grep UG | awk '{print $2}'` if [ -z $target ]; then echoe "Error: Unable to auto-detect the target to use during test (use -t?)." exit 3 fi fi if [ -z "$ip" ]; then # no ip given as parameter -> auto-detect ip_b1=`host "${host}-bond-t1" | grep -o "$re_ip"` ip_b2=`host "${host}-bond-t2" | grep -o "$re_ip"` if [ -z "$ip_b1" ] && [ -z "$ip_b2" ]; then echoe "Error: Unable to auto-detect an ip to use during test (use -i?)." exit 4 fi fi error_nb=0 for bond in $bonds do bond=`basename $bond` echoq "checking bond $bond" active=`grep "Active Slave" /proc/net/bonding/$bond |cut -d':' -f2` echoq -n " active slave :$active" check_nic $target error_nb=$(($error_nb + $check_nic_ret)) slaves=`grep "Slave Interface:" /proc/net/bonding/$bond |cut -d':' -f2` slave_nb=0 for slave in $slaves do if [ $slave != $active ]; then # this nic is enslaved but not active. we want to check if it is ready to work (no cable or VPN trouble that will bite us only when the active slave will change) echoq -n " inactive slave : $slave" # search for a free mac in this bond (ie a real phy MAC that is not the one used by the bond) get_free_mac $bond $slave free_mac=$get_free_mac_ret # store the bond/nic we are going to un-enslave (to be able de re-enslave it in case of interrupt) clean_bond=$bond clean_nic=$slave if [ -z "$ip" ]; then ip="$ip_b1" # TODO: use a clever way to match slave and free ip fi # free this nic from the bond ifenslave -d $bond $slave # set it up with a "free" mac ifconfig $slave hw ether $free_mac # set it up with a temp IP ifconfig $slave $ip netmask 255.255.255.255 # it seems we need a small temporisation here or the rest may fail sleep 2 exercise_nic_arp $target $slave $ip check_nic $target $slave $ip error_nb=$(($error_nb + $check_nic_ret)) # clean this temporary ip/route ifconfig $slave down # re-enslave this nic to the bond ifenslave $bond $slave clean_bond=""; clean_nic="" slave_nb=$(($slave_nb + 1)) fi done echoq -n " bond : $bond" check_nic $target $bond error_nb=$(($error_nb + $check_nic_ret)) if [ $slave_nb -eq 0 ]; then echoe "Error: No inactive slave in $bond." exit 5 fi done if [ $error_nb -ne 0 ]; then exit $((10 + $error_nb)) fi exit 0 --------------------------------------------------------------------------------------- _______________________________________________ LARTC mailing list LARTC@mailman.ds9a.nl http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc