Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH RFC] test: revise 'test' and make it easier to understand
From: Zhilong Liu @ 2017-02-28  2:47 UTC (permalink / raw)
  To: neilb, Jes.Sorensen; +Cc: linux-raid, gqjiang, Zhilong Liu
In-Reply-To: <a81137d5-3042-e983-fe0e-1b9a0de836ff@suse.com>

1. use 'Tab' as the code style.
2. arrange the testing steps and provide the 'main' entrance.
3. draft the log_save feature; it captures /proc/mdstat, the
   md superblock info, the bitmap info and the detailed dmesg.
4. modify the mdadm() func: clear the superblock when creating
   or building a new array, and exit testing when the mdadm
   command returns a non-zero value.
5. delete the no_errors() func; it is only used in
   tests/04update-uuid, and I recommend using the new mdadm()
   func instead.
6. delete fast_sync() func.
7. testdev(): check that the target is a block device; otherwise this
   command would create a regular file, which causes trouble.
8. add dmesg checking in the do_test() func; it is necessary to
   check whether dmesg printed any abnormal messages.
9. add checking conditions in main(): for example, $pwd/raid6check
   needs to exist, and there is a prompt reminding users to run
   'make everything' before testing; $targetdir should be mounted
   on an ext[2-4] FS, because the external bitmap only supports ext:
   the bmap() API used by bitmap.c doesn't exist in all filesystems,
   e.g. btrfs.

Signed-off-by: Zhilong Liu <zlliu@suse.com>

diff --git a/test b/test
index 13f1bda..e23addb 100755
--- a/test
+++ b/test
@@ -1,36 +1,21 @@
 #!/bin/bash
 #
 # run test suite for mdadm
-user=`id -un`
-if [ " $user" != " root" ]
-then echo >&2 "test: testing can only be done as 'root'."
-     exit 1;
-fi
-
-prefix='[0-9][0-9]'
-
-dir=`pwd`
+dir=$(pwd)
+DEVTYPE=loop
 mdadm=$dir/mdadm
-if [ \! -x $mdadm ]
-then
-   echo >&2 "test: $mdadm isn't usable."
-fi
-
 testdir="tests"
-logdir="$testdir/logs"
 logsave=0
 exitonerror=1
-
-echo "Testing on linux-$(uname -r) kernel"
-
-# Check whether to run multipath tests
-modprobe multipath 2> /dev/null
-if grep -s 'Personalities : .*multipath' > /dev/null /proc/mdstat ; then
-    MULTIPATH="yes"
-fi
+prefix='[0-9][0-9]'
 INTEGRITY=yes
-DEVTYPE=loop
 LVM_VOLGROUP=mdtest
+targetdir="/var/tmp/mdtest"
+[ -d "$targetdir" ] &&
+	rm -fr $targetdir
+logdir="$dir/$testdir/log"
+[ -d "$logdir" ] &&
+	rm -fr $logdir
 
 # make sure to test local mdmon, not system one
 export MDADM_NO_SYSTEMCTL=1
@@ -42,7 +27,7 @@ mdp1=/dev/md_d1
 
 # We test mdadm on loop-back block devices.
 # dir for storing files should be settable by command line maybe
-targetdir=/var/tmp
+#targetdir=/var/tmp/mdtest
 size=20000
 # super0, round down to multiple of 64 and substract 64
 mdsize0=19904
@@ -68,22 +53,64 @@ config=/tmp/mdadm.conf
 
 cleanup() {
 	udevadm settle
-	$mdadm -Ssq 2> /dev/null
-        case $DEVTYPE in
-        loop)
-	  for d in 0 1 2 3 4 5 6 7  8 9 10 11 12 13
-	  do
-	    losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
-	    rm -f /dev/disk/by-path/loop*
-	  done
-          ;;
-        lvm)
-	  for d in 0 1 2 3 4 5 6 7  8 9 10 11 12 13
-	  do
-	    eval "lvremove --quiet -f \$dev$d"
-	  done
-          ;;
-        esac
+	$mdadm -Ssq
+	case $DEVTYPE in
+		loop )
+			for d in 0 1 2 3 4 5 6 7  8 9 10 11 12 13
+			do
+				losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
+				rm -f /dev/disk/by-path/loop*
+			done
+		;;
+		lvm )
+			for d in 0 1 2 3 4 5 6 7  8 9 10 11 12 13
+			do
+				eval "lvremove --quiet -f \$dev$d"
+			done
+		;;
+	esac
+	dmesg -c > /dev/null
+}
+
+die()
+{
+	echo -e "\n\tERROR: $* \n"
+	log_save fail
+	exit 2
+}
+
+# add log_save func; $1 is a flag recording why the log is being saved.
+log_save() {
+	status=$1
+	save_log="$status""$_basename".log
+
+	echo "## $HOSTNAME: dmesg saved." >> $logdir/$save_log
+	dmesg -c >> $logdir/$save_log
+	$mdadm -As 2> /dev/null
+	echo "## $HOSTNAME: md status message saved." >> $logdir/$save_log
+	cat /proc/mdstat >> $logdir/$save_log
+
+	if [ $DEVTYPE == 'lvm' ]
+	then
+		# waiting for supporting.
+		echo
+	elif [ $DEVTYPE == 'loop' ]
+	then
+		array=($($mdadm -Ds | cut -d' ' -f2))
+		if [ ${#array[@]} -ge 1 ]; then
+			md_disks=($($mdadm -D -Y ${array[@]} | grep "/dev/$DEVTYPE" | cut -d'=' -f2))
+			echo "## $HOSTNAME: mdadm -D ${array[@]}" >> $logdir/$save_log
+			$mdadm -D ${array[@]} >> $logdir/$save_log
+			$mdadm -X $md_disks &> /dev/null
+			if [ $? -eq 0 ]
+			then
+				echo "## $HOSTNAME: mdadm -X ${md_disks[@]}" >> $logdir/$save_log
+				$mdadm -X ${md_disks[@]} >> $logdir/$save_log
+			fi
+		elif [ ${#array[@]} -lt 1 ]; then
+			echo "## $HOSTNAME: no array assembled!" >> $logdir/$save_log
+		fi
+	fi
 }
 
 ctrl_c() {
@@ -91,350 +118,374 @@ ctrl_c() {
 }
 
 do_setup() {
-  trap cleanup 0 1 3 15
-  trap ctrl_c 2
+	trap cleanup 0 1 3 15
+	trap ctrl_c 2
 
-  # make sure there are no loop devices remaining.
-  # udev started things can sometimes prevent them being stopped
-  # immediately
-  while grep loop /proc/partitions > /dev/null 2>&1
-  do
-    mdadm -Ss
-    losetup -d /dev/loop[0-9]* 2> /dev/null
-    sleep 1
-  done
-  devlist=
-  for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13
-  do
-    sz=$size
-    if [ $d -gt 7 ]; then sz=$ddfsize ; fi
-    case $DEVTYPE in
-    loop)
-      [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1
-      # make sure udev doesn't touch
-      mdadm --zero $targetdir/mdtest$d 2> /dev/null
-      [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d
-      if [ $d -eq 7 ]
-      then
-        losetup /dev/loop$d $targetdir/mdtest6 # for multipath use
-      else
-        losetup /dev/loop$d $targetdir/mdtest$d
-      fi
-      eval dev$d=/dev/loop$d
-      eval file$d=$targetdir/mdtest$d
-      ;;
-    lvm)
-      unset MULTIPATH
-      eval dev$d=/dev/mapper/${LVM_VOLGROUP}-mdtest$d
-      if ! lvcreate --quiet -L ${sz}K -n mdtest$d $LVM_VOLGROUP; then
-	  trap '' 0 # make sure lvremove is not called
-	  eval echo error creating \$dev$d
-	  exit 129
-      fi
-      ;;
-    ram)
-      unset MULTIPATH
-      eval dev$d=/dev/ram$d
-      ;;
-    esac
-    eval devlist=\"\$devlist \$dev$d\"
-    eval devlist$d=\"\$devlist\"
-   #" <-- add this quote to un-confuse vim syntax highlighting
-  done
-  path0=$dev6
-  path1=$dev7
+	# make sure there are no loop devices remaining.
+	# udev started things can sometimes prevent them being stopped
+	# immediately
+	while grep loop /proc/partitions > /dev/null 2>&1
+	do
+		mdadm -Ss
+		losetup -d /dev/loop[0-9]* 2> /dev/null
+		sleep 0.2
+	done
+	devlist=
+	for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+	do
+		sz=$size
+		[ $d -gt 7 ] && sz=$ddfsize
+		case $DEVTYPE in
+		loop )
+			[ -f $targetdir/mdtest$d ] ||
+				dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1
+			# make sure udev doesn't touch
+			mdadm --zero $targetdir/mdtest$d 2> /dev/null
+			[ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d
+			if [ $d -eq 7 ]
+			then
+				losetup /dev/loop$d $targetdir/mdtest6 # for multipath use
+			else
+				losetup /dev/loop$d $targetdir/mdtest$d
+			fi
+			eval dev$d=/dev/loop$d
+			eval file$d=$targetdir/mdtest$d
+		;;
+		lvm )
+			unset MULTIPATH
+			eval dev$d=/dev/mapper/${LVM_VOLGROUP}-mdtest$d
+			if ! lvcreate --quiet -L ${sz}K -n mdtest$d $LVM_VOLGROUP; then
+				trap '' 0 # make sure lvremove is not called
+				eval echo error creating \$dev$d
+				exit 129
+			fi
+		;;
+		ram )
+			unset MULTIPATH
+			eval dev$d=/dev/ram$d
+		;;
+		esac
+		eval devlist=\"\$devlist \$dev$d\"
+		eval devlist$d=\"\$devlist\"
+		#" <-- add this quote to un-confuse vim syntax highlighting
+	done
+	path0=$dev6
+	path1=$dev7
 
-  ulimit -c unlimited
-  [ -f /proc/mdstat ] || modprobe md_mod
-  echo 2000 > /proc/sys/dev/raid/speed_limit_max
-  echo 0 > /sys/module/md_mod/parameters/start_ro
+	ulimit -c unlimited
+	[ -f /proc/mdstat ] || modprobe md_mod
+	echo 2000 > /proc/sys/dev/raid/speed_limit_max
+	echo 0 > /sys/module/md_mod/parameters/start_ro
 }
 
 # mdadm always adds --quiet, and we want to see any unexpected messages
 mdadm() {
-    rm -f $targetdir/stderr
-    case $* in
-	*-S* ) udevadm settle
-	       p=`cat /proc/sys/dev/raid/speed_limit_max`
-	       echo 20000 > /proc/sys/dev/raid/speed_limit_max
-    esac
-    case $* in
-	*-C* ) $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes;;
-	* )    $mdadm 2> $targetdir/stderr --quiet "$@"
-    esac
-    rv=$?
-    case $* in
-	*-S* ) udevadm settle
-	       echo $p > /proc/sys/dev/raid/speed_limit_max
-    esac
-    cat >&2 $targetdir/stderr
-    return $rv
+	rm -f $targetdir/stderr
+	case $* in
+		*-S* )
+			udevadm settle
+			p=`cat /proc/sys/dev/raid/speed_limit_max`
+			echo 20000 > /proc/sys/dev/raid/speed_limit_max
+	esac
+	case $* in
+		*-C* | *--create* | *-B* | *--build* )
+			for args in $*
+			do
+				[[ $args =~ "/dev/" ]] && {
+					[[ $args =~ "md" ]] ||
+						$mdadm --zero $args > /dev/null
+					}
+			done
+			$mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes
+		;;
+		* )
+			$mdadm 2> $targetdir/stderr --quiet "$@"
+	esac
+	rv=$?
+	case $* in
+		*-S* )
+			udevadm settle
+			echo $p > /proc/sys/dev/raid/speed_limit_max
+	esac
+	cat >&2 $targetdir/stderr > $targetdir/log
+	[  $rv -ne 0 ] && exit 1
+	return $rv
 }
 
 # check various things
 check() {
-   case $1 in
-    spares )
-       spares=`tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)' || exit 0`
-       if [ $spares -ne $2 ]
-       then
-          echo >&2 "ERROR expected $2 spares, found $spares"; exit 1;
-       fi
-      ;;
-    raid* | linear )
-      grep -s "active $1 " /proc/mdstat > /dev/null || {
-		echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;}
-     ;;
-    algorithm )
-      grep -s " algorithm $2 " /proc/mdstat > /dev/null || {
-	  echo >&2 "ERROR algorithm $2 not found"; cat /proc/mdstat; exit 1;}
-     ;;
-    resync | recovery | reshape)
-	cnt=5
-	while ! grep -s $1 /proc/mdstat > /dev/null
-	do
-	    if [ $cnt -gt 0 ] && grep -v idle /sys/block/md*/md/sync_action > /dev/null
-	    then # Something isn't idle - wait a bit
-		sleep 0.5
-		cnt=$[cnt-1]
-	    else
-		echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1
-	    fi
-	done
-	;;
-
-     nosync )
-       sleep 0.5
-       # Since 4.2 we delay the close of recovery until there has been a chance for
-       # spares to be activated.  That means that a recovery that finds nothing
-       # to do can still take a little longer than expected.
-       # add an extra check: is sync_completed shows the end is reached, assume
-       # there is no recovery.
-       if grep -s -E '(resync|recovery|reshape) *=' > /dev/null /proc/mdstat ; then
-	   incomplete=`grep / /sys/block/md*/md/sync_completed 2> /dev/null | sed '/^ *\([0-9]*\) \/ \1/d'`
-	   if [ -n "$incomplete" ]; then
-		echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1;
-	   fi
-       fi
-     ;;
-
-    wait )
-      p=`cat /proc/sys/dev/raid/speed_limit_max`
-      echo 2000000 > /proc/sys/dev/raid/speed_limit_max
-      sleep 0.1
-      while grep -E '(resync|recovery|reshape|check|repair) *=' > /dev/null /proc/mdstat ||
-	      grep -v idle > /dev/null /sys/block/md*/md/sync_action
-      do sleep 0.5;
-      done
-      echo $p > /proc/sys/dev/raid/speed_limit_max
-      ;;
-
-    state )
-       grep -s "blocks.*\[$2\]\$" /proc/mdstat > /dev/null || {
-		echo >&2 "ERROR state $2 not found!"; cat /proc/mdstat ; exit 1; }
-       sleep 0.5
-      ;;
-
-    bitmap )
-       grep -s bitmap > /dev/null /proc/mdstat || {
-		echo >&2 ERROR no bitmap ; cat /proc/mdstat ; exit 1; }
-      ;;
-    nobitmap )
-       if grep -s "bitmap" > /dev/null /proc/mdstat
-       then
-		echo >&2 ERROR bitmap present ; cat /proc/mdstat ; exit 1;
-       fi
-      ;;
-
-    readonly )
-       grep -s "read-only" > /dev/null /proc/mdstat || {
-                echo >&2 "ERROR array is not read-only!"; cat /proc/mdstat ; exit 1; }
-      ;;
-
-    inactive )
-       grep -s "inactive" > /dev/null /proc/mdstat || {
-                echo >&2 "ERROR array is not inactive!"; cat /proc/mdstat ; exit 1; }
-      ;;
-    * ) echo >&2 ERROR unknown check $1 ; exit 1;
-   esac
+	case $1 in
+		spares )
+			spares=$(tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)')
+			[ $spares -ne $2 ] &&
+				die "expected $2 spares, found $spares"
+		;;
+		raid* | linear )
+			grep -s -q "active $1 " /proc/mdstat ||
+				die "active $1 not found"
+		;;
+		algorithm )
+			grep -s -q " algorithm $2 " /proc/mdstat ||
+				die "algorithm $2 not found"
+		;;
+		resync | recovery | reshape )
+			cnt=5
+			while ! grep -s $1 /proc/mdstat > /dev/null
+			do
+				if [ $cnt -gt 0 ]
+				then # Something isn't idle - wait a bit
+					sleep 0.5
+					cnt=$[cnt-1]
+				else
+					die "no $1 happening"
+				fi
+			done
+		;;
+		nosync )
+			sleep 0.5
+			# Since 4.2 we delay the close of recovery until there has been a chance for
+			# spares to be activated.  That means that a recovery that finds nothing
+			# to do can still take a little longer than expected.
+			# add an extra check: is sync_completed shows the end is reached, assume
+			# there is no recovery.
+			if grep -s -E -q '(resync|recovery|reshape) *=' /proc/mdstat
+			then
+				incomplete=`grep / /sys/block/md*/md/sync_completed 2> /dev/null | sed '/^ *\([0-9]*\) \/ \1/d'`
+				[ -n "$incomplete" ] &&
+					die "resync or recovery is happening!"
+			fi
+		;;
+		wait )
+			p=$(cat /proc/sys/dev/raid/speed_limit_max)
+			echo 2000000 > /proc/sys/dev/raid/speed_limit_max
+			sleep 0.1
+			while grep -E -q '(resync|recovery|reshape|check|repair) *=' /proc/mdstat ||
+				grep -v idle > /dev/null /sys/block/md*/md/sync_action
+			do sleep 0.5;
+			done
+			echo $p > /proc/sys/dev/raid/speed_limit_max
+		;;
+		state )
+			grep -s -q "blocks.*\[$2\]\$" /proc/mdstat ||
+				die "state $2 not found!"
+			sleep 0.5
+		;;
+		bitmap )
+			grep -s -q bitmap /proc/mdstat ||
+				die "no bitmap found in /proc/mdstat"
+		;;
+		nobitmap )
+			grep -s -q "bitmap" /proc/mdstat &&
+				die "bitmap present in /proc/mdstat"
+		;;
+		readonly )
+			grep -s -q "read-only" /proc/mdstat ||
+				die "array is not read-only!"
+		;;
+		inactive )
+			grep -s -q "inactive" /proc/mdstat ||
+				die "array is not inactive!"
+		;;
+		* )
+			die "check $1 is unknown!"
+	esac
 }
 
-no_errors() {
-  if [ -s $targetdir/stderr ]
-  then echo Bad errors from mdadm: ; cat $targetdir/stderr; exit 2;
-  fi
-}
 # basic device test
-
 testdev() {
-   udevadm settle
-   dev=$1
-   cnt=$2
-   dvsize=$3
-   chunk=$4
-   if [ -z "$5" ]; then
-      mkfs.ext3 -F -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2
-   fi
-   dsize=$[dvsize/chunk]
-   dsize=$[dsize*chunk]
-   rasize=$[dsize*2*cnt]
-   # rasize is in sectors
-   if [ -n "$DEV_ROUND_K" ]; then
-      rasize=$[rasize/DEV_ROUND_K/2]
-      rasize=$[rasize*DEV_ROUND_K*2]
-   fi
-   if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi
-   _sz=`/sbin/blockdev --getsize $dev`
-   if [ $rasize -lt $_sz -o $[rasize*4/5] -gt $_sz ]
-   then
-     echo "ERROR: size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not $_sz"
-     exit 1
-   fi
-}
-
-fast_sync() {
-  echo 200000 > /proc/sys/dev/raid/speed_limit_max
+# Check the device argument first, e.g. for "testdev /dev/md0": if /dev/md0
+# was not brought up, a regular file would be created in its place and the
+# remaining test scripts would be affected.
+	[ -f $1 ] && rm -f $1
+	[ -b $1 ] || die "$1 doesn't exist!"
+	udevadm settle
+	dev=$1
+	cnt=$2
+	dvsize=$3
+	chunk=$4
+	if [ -z "$5" ]; then
+		mkfs.ext3 -F -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2
+	fi
+	dsize=$[dvsize/chunk]
+	dsize=$[dsize*chunk]
+	rasize=$[dsize*2*cnt]
+	# rasize is in sectors
+	if [ -n "$DEV_ROUND_K" ]; then
+		rasize=$[rasize/DEV_ROUND_K/2]
+		rasize=$[rasize*DEV_ROUND_K*2]
+	fi
+	[ `/sbin/blockdev --getsize $dev` -eq 0 ] && sleep 2
+	_sz=`/sbin/blockdev --getsize $dev`
+	[ $rasize -lt $_sz -o $[rasize*4/5] -gt $_sz ] &&
+		die "size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not $_sz"
+
+# sometimes the above command would return non-0
+	return 0
 }
 
 rotest() {
-  dev=$1
-  fsck -fn $dev >&2
+	dev=$1
+	fsck -fn $dev >&2
 }
 
 do_test() {
-  _script=$1
-  _basename=`basename $_script`
-  if [ -f "$_script" ]
-  then
-    rm -f $targetdir/stderr
-    # stop all arrays, just incase some script left an array active.
-    $mdadm -Ssq 2> /dev/null
-    mdadm --zero $devlist 2> /dev/null
-    mdadm --zero $devlist 2> /dev/null
-    # this might have been reset: restore the default.
-    echo 2000 > /proc/sys/dev/raid/speed_limit_max
-    # source script in a subshell, so it has access to our
-    # namespace, but cannot change it.
-    echo -ne "$_script... "
-    if ( set -ex ; . $_script ) &> $targetdir/log
-    then
-      echo "succeeded"
-      _fail=0
-    else
-      log=log
-      cat $targetdir/stderr >> $targetdir/log
-      echo "=======================dmesg=================" >> $targetdir/log
-      dmesg | tail -n 200 >> $targetdir/log
-      if [ $exitonerror == 0 ]; then
-	  log=log-`basename $_script`
-	  mv $targetdir/log $logdir/$log
-      fi
-      echo "FAILED - see $logdir/$log for details"
-      _fail=1
-    fi
-    if [ "$savelogs" == "1" ]; then
-      cp $targetdir/log $logdir/$_basename.log
-    fi
-    if [ "$_fail" == "1" -a "$exitonerror" == "1" ]; then
-      exit 1
-    fi
-  fi
+	_script=$1
+	_basename=`basename $_script`
+	if [ -f "$_script" ]
+	then
+		rm -f $targetdir/stderr
+		# stop all arrays, just incase some script left an array active.
+		$mdadm -Ssq 2> /dev/null
+		mdadm --zero $devlist 2> /dev/null
+		# this might have been reset: restore the default.
+		echo 2000 > /proc/sys/dev/raid/speed_limit_max
+		# source script in a subshell, so it has access to our
+		# namespace, but cannot change it.
+		echo -ne "$_script... "
+		if ( set -ex ; . $_script ) &> $targetdir/log
+		then
+# put the dmesg checking here; the following keywords shouldn't appear during testing.
+			dmesg | grep -i "error\|call trace\|segfault" &&
+				die "dmesg printed error when testing $_basename!"
+			echo "succeeded"
+			_fail=0
+		else
+			log=log-$_basename
+			cat $targetdir/stderr >> $targetdir/log
+			log_save fail
+			mv $targetdir/log $logdir/$log
+			echo "FAILED - see $logdir/$log for details"
+			_fail=1
+		fi
+		[ "$savelogs" == "1" ] &&
+			cp $targetdir/log $logdir/$_basename.log
+		[ "$_fail" == "1" -a "$exitonerror" == "1" ] && exit 1
+	fi
+
+	return 0
 }
 
+# just a recommend.
 do_help() {
-  echo "Usage: $0 [options]"
-  echo " Options:"
-  echo "    --tests=<test1,test2,..>    Comma separated list of tests to run"
-  echo "    --disable-multipath         Disable any tests involving multipath"
-  echo "    --disable-integrity         Disable slow tests of RAID[56] consistency"
-  echo "    --logdir=<directory>        Directory to save logfiles in"
-  echo "    --save-logs                 Save all logs in <logdir>"
-  echo "    --keep-going                Don't stop on error, ie. run all tests"
-  echo "    --dev=[loop|lvm|ram]        Use loop devices (default), LVM, or RAM disk"
-  echo "    --volgroup=<name>           LVM volume group for LVM test"
-  echo "    setup                       Setup test environment and exit"
-  echo "    cleanup                     Cleanup test environment"
-  echo "    <prefix>                    Run tests with <prefix>"
+	cat <<-EOF
+	Usage: $0 [options]
+	Options:
+		--tests=<test1,test2,..>    Comma separated list of tests to run
+		--disable-multipath         Disable any tests involving multipath
+		--disable-integrity         Disable slow tests of RAID[56] consistency
+		--logdir=<directory>        Directory to save logfiles in
+		--save-logs                 Save all logs in <logdir>
+		--keep-going                Don't stop on error, ie. run all tests
+		--dev=[loop|lvm|ram]        Use loop devices (default), LVM, or RAM disk
+		--volgroup=<name>           LVM volume group for LVM test
+		setup                       Setup test environment and exit
+		cleanup                     Cleanup test environment
+		<prefix>                    Run tests with <prefix>
+	EOF
+	exit 0
 }
 
 parse_args() {
-  for i in $*
-  do
-    case $i in
-    [0-9]*)
-      prefix=$i
-      ;;
-    setup)
-      echo "mdadm test environment setup"
-      do_setup
-      trap 0; exit 0
-      ;;
-    cleanup)
-      cleanup
-      exit 0
-      ;;
-    --tests=*)
-      TESTLIST=`expr "x$i" : 'x[^=]*=\(.*\)' | sed -e 's/,/ /g'`
-      ;;
-    --logdir=*)
-      logdir=`expr "x$i" : 'x[^=]*=\(.*\)'`
-      ;;
-    --save-logs)
-      savelogs=1
-      ;;
-    --keep-going | --no-error)
-      exitonerror=0
-      ;;
-    --disable-multipath)
-      unset MULTIPATH
-      ;;
-    --disable-integrity)
-      unset INTEGRITY
-      ;;
-    --dev=loop)
-      DEVTYPE=loop
-      ;;
-    --dev=lvm)
-      DEVTYPE=lvm
-      ;;
-    --dev=ram)
-      DEVTYPE=ram
-      ;;
-    --volgroup=*)
-      LVM_VOLGROUP=`expr "x$i" : 'x[^=]*=\(.*\)'`
-      ;;
-    --help)
-      do_help
-      exit 0;
-      ;;
-    -*)
-      echo " $0: Unknown argument: $i"
-      do_help
-      exit 0;
-      ;;
-    esac
-done
+	for i in $*
+	do
+		case $i in
+			[0-9]* )
+				prefix=$i
+			;;
+			setup )
+				echo "mdadm test environment setup"
+				do_setup
+				trap 0; exit 0
+			;;
+			cleanup )
+				cleanup
+				exit 0
+			;;
+			--tests=* )
+				TESTLIST=`expr "x$i" : 'x[^=]*=\(.*\)' | sed -e 's/,/ /g'`
+			;;
+			--logdir=* )
+				logdir=`expr "x$i" : 'x[^=]*=\(.*\)'`
+			;;
+			--save-logs )
+				savelogs=1
+			;;
+			--keep-going | --no-error )
+				exitonerror=0
+			;;
+			--disable-multipath )
+				unset MULTIPATH
+			;;
+			--disable-integrity )
+				unset INTEGRITY
+			;;
+			--dev=loop )
+				DEVTYPE=loop
+			;;
+			--dev=lvm )
+				DEVTYPE=lvm
+			;;
+			--dev=ram )
+				DEVTYPE=ram
+			;;
+			--volgroup=* )
+				LVM_VOLGROUP=`expr "x$i" : 'x[^=]*=\(.*\)'`
+			;;
+			--help )
+				do_help
+			;;
+			-* )
+				echo " $0: Unknown argument: $i"
+				do_help
+		;;
+		esac
+	done
 }
 
-logdir=$targetdir
-parse_args $@
-
-do_setup
-mkdir -p $logdir
-
-if [ "$savelogs" == "1" ]; then
-  echo "Saving logs to $logdir"
-fi
+# draft the main func
+main() {
+	[ "X$(id -un)" != "Xroot" ] && {
+		echo "test: testing can only be done as 'root'."
+		exit 1
+	}
+	[ -x $mdadm -a -x "test" ] || {
+		echo "test: $mdadm or '$dir/test' isn't usable."
+		exit 1
+	}
+	[ -x raid6check ] || {
+		echo "test: please run 'make everything' before testing."
+		exit 1
+	}
+	mkdir -p $targetdir
+	mkdir -p $logdir
+	# such as the external bitmap only support the ext file system.
+	# users can modify the $targetdir path under ext3 mount point.
+	[[ $(df $targetdir -T) =~ ext ]] || {
+		echo "ensure that $targetdir mounted under ext[2,3,4] filesystem!"
+		exit 1
+	}
+	echo "Testing on linux-$(uname -r) kernel"
+	[ "$savelogs" == "1" ] &&
+		echo "Saving logs to $logdir"
+	# Check whether to run multipath tests
+	modprobe multipath 2> /dev/null
+	grep -s -q 'Personalities : .*multipath' /proc/mdstat &&
+		MULTIPATH="yes"
+	do_setup
+	if [ "x$TESTLIST" != "x" ]; then
+		for script in $TESTLIST
+		do
+			do_test $testdir/$script
+		done
+	else
+		for script in $testdir/$prefix $testdir/$prefix*[^~]
+		do
+			do_test $script
+		done
+	fi
+
+	exit 0
+}
 
-if [ "x$TESTLIST" != "x" ]; then
-  for script in $TESTLIST
-  do
-    do_test $testdir/$script
-  done
-else
-  for script in $testdir/$prefix $testdir/$prefix*[^~]
-  do
-    do_test $script
-  done
-fi
-exit 0
+parse_args $@
+main
-- 
2.6.6


^ permalink raw reply related

* Re: Process stuck in md_flush_request (state: D)
From: Les Stroud @ 2017-02-28  2:58 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid@vger.kernel.org
In-Reply-To: <1224510038.17134.1488242683070@vsaw28.prod.google.com>

Sent from my iPhone

> On Feb 27, 2017, at 7:44 PM, Shaohua Li <shli@kernel.org> wrote:
>
>> On Mon, Feb 27, 2017 at 01:48:00PM -0500, Les Stroud wrote:
>>
>>
>>
>>
>>> On Feb 27, 2017, at 1:28 PM, Shaohua Li <shli@kernel.org> wrote:
>>>
>>> On Mon, Feb 27, 2017 at 09:49:59AM -0500, Les Stroud wrote:
>>>> After a period of a couple of weeks with one of our test instances having this problem every other day, they were all nice enough to operate without an issue for 9 days.  It finally reoccurred last night on one of the machines.
>>>>
>>>> It exhibits the same symptoms and the call traces look as they did previously.  This particular instance is configured with a deadline scheduler.  I was able to capture the inflight you requested:
>>>>
>>>> $ cat /sys/block/xvd[abcde]/inflight
>>>>        0        0
>>>>        0        0
>>>>        0        0
>>>>        0        0
>>>>        0        0
>>>>
>>>> I’ve had this happen on instances with the deadline scheduler and the noop scheduler.  At this point, I have not had this happen on an instance that is noop and the raid filesystem (ext4) is mounted with nobarrier.  The instances with noop/nobarrier have not been running long enough for me to make any sort of conclusion that it works around the problem. Frankly, I’m not sure I understand the interaction between ext4 barriers and raid0 block flushes well enough to theorize whether it should or shouldn’t make a difference.
>>>
>>> If nobarrier, ext4 doesn't send flush request.
>>
>> So, could ext4’s flush request deadlock with an md_flush_request?  Do they share a mutex of some sort? Could one of them be failing to acquire a mutex and not handling it?
>
> No, it shouldn't deadlock. I don't have other reports of such an issue. Yours is the only one.
>
>>>
>>>> Does any of this help with identifying the bug?  Is there anymore information I can get that would be useful?
>>>
>>>
>>> Unfortunately I can't find anything fishy. Does the xcdx disk correctly
>>> handle flush requests? For example, you can do the same test with a single such
>>> disk and check if anything is wrong.
>>
I'll test a single disk config.


>> Until recently, we had a number of these systems setup without raid0.  This issue never occurred on those systems.  Unfortunately, I can’t find a way to make it happen other than stand a server up and let it run.
>>
>> I suppose I could try a different filesystem and see if that makes a difference (maybe ext3, xfs, etc).
>
> You could format a xcdx disk and do a test against it, and check if there is
> anything wrong. To be honest, I don't think it's a problem on the ext4 side either,
> but better to try other filesystems. If the xcdx is a proprietary driver, I highly
> recommend a check with a single such disk first.
>

These disks are AWS EBS. So, maybe it is an issue in the xen virtual
driver? I'll see if amazon support can give me any information about
what's happening below the OS.

Is there any other output that might tell me what the process is waiting on?

Thanx,
LES


> Thanks,
> Shaohua

^ permalink raw reply

* Re: LSI RAID
From: Hannes Reinecke @ 2017-02-28  9:06 UTC (permalink / raw)
  To: Gandalf Corvotempesta, linux-raid
In-Reply-To: <CAJH6TXhwfoWKM2=0v0+qhUWp=WppF5qO7zm5Y1-vS4zWQHA5WA@mail.gmail.com>

On 02/27/2017 05:02 PM, Gandalf Corvotempesta wrote:
> anyone?
> 
> 2017-02-13 11:33 GMT+01:00 Gandalf Corvotempesta
> <gandalf.corvotempesta@gmail.com>:
>> Hi to all
>> silly question: i've read that LSI/PERC Hardware controller supports DDF.
>>
>> Would it be possible for mdadm to use a RAID created with an LSI/PERC
>> controller supporting DDF?
Sure.
The recent mdadm should be able to create DDF metadata.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply

* Re: GRUB warning after replacing disk drive in RAID1
From: Reindl Harald @ 2017-02-28  9:23 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <002f01d29152$737b4550$5a71cff0$@wnsdev.com>



Am 28.02.2017 um 00:37 schrieb Peter Sangas:
> I have a RAID1 with 3 disks sda,sdb,sdc.  After replacing sdc and re-syncing
> it to the array I issued the following command to load grub but I get this
> warning:
>
> grub-install /dev/sdc
>
> Installing for i386-pc platform.
> grub-install: warning: Couldn't find physical volume `(null)'. Some modules
> may be missing from core image..
> grub-install: warning: Couldn't find physical volume `(null)'. Some modules
> may be missing from core image..
> Installation finished. No error reported.
>
> Does anyone know why I get this warning and how to avoid it

it's harmless and disappears after the resync has finished

^ permalink raw reply

* Re: LSI RAID
From: Gandalf Corvotempesta @ 2017-02-28  9:44 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: linux-raid
In-Reply-To: <73fdfb90-4bdc-a452-d785-eb270ad987e2@suse.de>

2017-02-28 10:06 GMT+01:00 Hannes Reinecke <hare@suse.de>:
> Sure.
> The recent mdadm should be able to create DDF metadata.

Does this mean that I'll be able to import a configuration created with an
LSI MegaRAID controller and use it with mdadm?
If yes, how?

^ permalink raw reply

* Re: [BUG] non-metadata arrays cannot use more than 27 component devices
From: ian_bruce @ 2017-02-28 10:25 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87y3wsp47n.fsf@notabene.neil.brown.name>

On Mon, 27 Feb 2017 16:55:56 +1100
NeilBrown <neilb@suse.com> wrote:

>> When assembling non-metadata arrays ("mdadm --build"), the in-kernel
>> superblock apparently defaults to the MD-RAID v0.90 type. This
>> imposes a maximum of 27 component block devices, presumably as well
>> as limits on device size.
>>
>> mdadm does not allow you to override this default, by specifying the
>> v1.2 superblock. It is not clear whether mdadm tells the kernel to
>> use the v0.90 superblock, or the kernel assumes this by itself. One
>> or other of them should be fixed; there does not appear to be any
>> reason why the v1.2 superblock should not be the default in this
>> case.
> 
> Can you see if this change improves the behavior for you?

Unfortunately, I'm not set up for kernel compilation at the moment. But
here is my test case; it shouldn't be any harder to reproduce than this,
on extremely ordinary hardware (= no actual disk RAID array):


# truncate -s 64M img64m.{00..31}   # requires no space on ext4,
#                                   # because sparse files are created
# 
# ls img64m.*
img64m.00  img64m.04  img64m.08  img64m.12  img64m.16  img64m.20  img64m.24  img64m.28
img64m.01  img64m.05  img64m.09  img64m.13  img64m.17  img64m.21  img64m.25  img64m.29
img64m.02  img64m.06  img64m.10  img64m.14  img64m.18  img64m.22  img64m.26  img64m.30
img64m.03  img64m.07  img64m.11  img64m.15  img64m.19  img64m.23  img64m.27  img64m.31
# 
# RAID=$(for x in img64m.* ; do losetup --show -f $x ; done)
# 
# echo $RAID
/dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3 /dev/loop4 /dev/loop5 /dev/loop6 /dev/loop7
/dev/loop8 /dev/loop9 /dev/loop10 /dev/loop11 /dev/loop12 /dev/loop13 /dev/loop14 /dev/loop15
/dev/loop16 /dev/loop17 /dev/loop18 /dev/loop19 /dev/loop20 /dev/loop21 /dev/loop22 /dev/loop23
/dev/loop24 /dev/loop25 /dev/loop26 /dev/loop27 /dev/loop28 /dev/loop29 /dev/loop30 /dev/loop31
# 
# mdadm --build /dev/md/md-test --level=linear --raid-devices=32 $RAID
mdadm: ADD_NEW_DISK failed for /dev/loop27: Device or resource busy
# 

kernel log:

    kernel: [109524.168624] md: nonpersistent superblock ...
    kernel: [109524.168638] md: md125: array is limited to 27 devices
    kernel: [109524.168643] md: export_rdev(loop27)
    kernel: [109524.180676] md: md125 stopped.


It appears that I was wrong in assuming that the MD-RAID v0.90
limitation of 4TB per component device would be in effect:


# truncate -s 5T img5t.{00..03}   # sparse files again
# 
# ls -l img5t.*
-rw-r--r-- 1 root root 5497558138880 Feb 28 00:09 img5t.00
-rw-r--r-- 1 root root 5497558138880 Feb 28 00:09 img5t.01
-rw-r--r-- 1 root root 5497558138880 Feb 28 00:09 img5t.02
-rw-r--r-- 1 root root 5497558138880 Feb 28 00:09 img5t.03
# 
# RAID=$(for x in img5t.* ; do losetup --show -f $x ; done)
# 
# echo $RAID
/dev/loop32 /dev/loop33 /dev/loop34 /dev/loop35
# 
# mdadm --build /dev/md/md-test --level=linear --raid-devices=4 $RAID
mdadm: array /dev/md/md-test built and started.
# 
# mdadm --detail /dev/md/md-test
/dev/md/md-test:
        Version : 
  Creation Time : Tue Feb 28 00:18:21 2017
     Raid Level : linear
     Array Size : 21474836480 (20480.00 GiB 21990.23 GB)
   Raid Devices : 4
  Total Devices : 4

          State : clean 
 Active Devices : 4
Working Devices : 4
 Failed Devices : 0
  Spare Devices : 0

       Rounding : 64K

    Number   Major   Minor   RaidDevice State
       0       7       32        0      active sync   /dev/loop32
       1       7       33        1      active sync   /dev/loop33
       2       7       34        2      active sync   /dev/loop34
       3       7       35        3      active sync   /dev/loop35
# 
# mkfs.ext4 /dev/md/md-test
mke2fs 1.43.4 (31-Jan-2017)
Discarding device blocks: done                            
Creating filesystem with 5368709120 4k blocks and 335544320 inodes
Filesystem UUID: da293fd3-b4ec-40e3-b5be-3caeef55edcf
Superblock backups stored on blocks: 
	32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, 
	4096000, 7962624, 11239424, 20480000, 23887872, 71663616, 78675968, 
	102400000, 214990848, 512000000, 550731776, 644972544, 1934917632, 
	2560000000, 3855122432

Allocating group tables: done                            
Writing inode tables: done                            
Creating journal (262144 blocks): done
Writing superblocks and filesystem accounting information: done         

# 
# fsck.ext4 -f /dev/md/md-test
e2fsck 1.43.4 (31-Jan-2017)
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
/dev/md/md-test: 11/335544320 files (0.0% non-contiguous), 21625375/5368709120 blocks
# 


> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index ba485dcf1064..e0ac7f5a8e68 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6464,9 +6464,8 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
>  	mddev->layout        = info->layout;
>  	mddev->chunk_sectors = info->chunk_size >> 9;
>  
> -	mddev->max_disks     = MD_SB_DISKS;
> -
>  	if (mddev->persistent) {
> +		mddev->max_disks     = MD_SB_DISKS;
>  		mddev->flags         = 0;
>  		mddev->sb_flags         = 0;
>  	}

What value does mddev->max_disks get in the opposite case,
(!mddev->persistent) ?

I note this comment from the top of the function:

    * set_array_info is used two different ways
    * The original usage is when creating a new array.
    * In this usage, raid_disks is > 0 and it together with
    *  level, size, not_persistent,layout,chunksize determine the
    *  shape of the array.
    *  This will always create an array with a type-0.90.0 superblock.

http://lxr.free-electrons.com/source/drivers/md/md.c#L6410

Surely there is an equivalent function which creates arrays with a
type-1 superblock?


-- Ian Bruce

^ permalink raw reply

* Re: [PATCH v1 01/14] block: introduce bio_segments_all()
From: Ming Lei @ 2017-02-28 12:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block
In-Reply-To: <20170225182216.GB26447@infradead.org>

On Sun, Feb 26, 2017 at 2:22 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> +static inline unsigned bio_segments_all(struct bio *bio)
>> +{
>> +     WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
>> +
>> +     return bio->bi_vcnt;
>> +}
>
> I don't think this helpers really adds any benefit.

IMO the first benefit is that misuse of .bi_vcnt triggers a warning, and
another is that we will have to introduce this helper once multipage bvec
is supported.

Thanks,
Ming Lei

^ permalink raw reply

* Re: [PATCH v1 02/14] block: introduce bio_remove_last_page()
From: Ming Lei @ 2017-02-28 12:18 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Shaohua Li, Jens Axboe, Linux Kernel Mailing List,
	open list:SOFTWARE RAID (Multiple Disks) SUPPORT, linux-block
In-Reply-To: <20170225182306.GC26447@infradead.org>

On Sun, Feb 26, 2017 at 2:23 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Fri, Feb 24, 2017 at 11:42:39PM +0800, Ming Lei wrote:
>> MD need this helper to remove the last added page, so introduce
>> it.
>
> If MD really has a valid use case for this it should open code the
> operation.  The semantics look deeply fishy to me.

Thinking about MD's case further, it looks like bio_add_page() won't
fail at all, provided that the queue's limits aren't applied in
bio_add_page() in the future.

So I will change MD's handling in this case and avoid introducing
bio_remove_last_page() in V2.

Thanks,
Ming Lei

^ permalink raw reply

* Re: interesting case of a hung 'recovery'
From: Eyal Lebedinsky @ 2017-02-28 12:55 UTC (permalink / raw)
  To: linux-raid@vger.kernel.org
In-Reply-To: <52ea7f37-5368-3b9d-5348-75861cc56652@eyal.emu.id.au>

Anyone?

On 23/02/17 17:17, Eyal Lebedinsky wrote:
> Can one of the experts on the list please comment on the issue.
>
> Regards,
>     Eyal

-- 
Eyal Lebedinsky (eyal@eyal.emu.id.au)

^ permalink raw reply

* Performance regression with HP HW raid (hpsa) between 3.18.21 and 3.18.22 (and later)
From: Mattias Wadenstein @ 2017-02-28 13:38 UTC (permalink / raw)
  To: linux-raid; +Cc: christian.ehrhardt, esc.storagedev

Hi!

We recently noticed a big performance hit when upgrading distro kernel 
versions (Ubuntu trusty->xenial) for bulk io ("dd if=/dev/zero of=/dev/sdb 
bs=256k" as an easily reproducible case), performance dropping from 1.8GB/s to 
0.6GB/s on our raidsets. With some fiddling we found this to come from a 
change between mainline kernels 3.18.21 and 3.18.22.

Christian Ehrhardt of the helpful Ubuntu support team helped pin this down to a 
particular change where newer kernels trust what the HW raid controller 
advertises for max_sectors_kb, even though it seems like the controller 
doesn't like getting big merged IOs larger than 1024k.

Our discussion, data and conclusion are documented in:

https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668557

Posting here to raise some awareness and maybe provide some feedback to 
the relevant maintainers on how to handle this case, where the hardware raid 
controller seems to be able to technically handle large IOs, but at a 
significant performance hit.

We're applying the workaround of capping max_sectors_kb to something 
performance-sensible like the old value of 512, and hopefully this mail 
can help someone else facing the same issues.

/Mattias Wadenstein

^ permalink raw reply

* [PATCH v2 00/14] md: cleanup on direct access to bvec table
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei

In MD's resync I/O path, there are many direct accesses to the bio's
bvec table. This patchset removes almost all of them, and the conversion
is quite straightforward. One root cause of the direct access to the bvec
table is that resync I/O uses the bio's bvec to manage pages.
In V1, as suggested by Shaohua, a new approach is used to manage
these pages for resync I/O, and it turns out the code becomes cleaner
and more readable.

Once direct access to the bvec table in MD is cleaned up, we can move
forward with multipage bvec.

V2:
	- remove the patch for introducing/applying bio_remove_last_page()

V1:
	- allocate page array to manage resync pages

Thanks,
Ming

Ming Lei (13):
  block: introduce bio_segments_all()
  md: raid1/raid10: don't handle failure of bio_add_page()
  md: move two macros into md.h
  md: prepare for managing resync I/O pages in clean way
  md: raid1: simplify r1buf_pool_free()
  md: raid1: don't use bio's vec table to manage resync pages
  md: raid1: retrieve page from pre-allocated resync page array
  md: raid1: use bio helper in process_checks()
  md: raid1: use bio_segments_all()
  md: raid10: refactor code of read reshape's .bi_end_io
  md: raid10: don't use bio's vec table to manage resync pages
  md: raid10: retrieve page from preallocated resync page array
  md: raid10: avoid direct access to bvec table in
    handle_reshape_read_error

 drivers/md/md.h     |  59 ++++++++++++++
 drivers/md/raid1.c  | 140 ++++++++++++++++++---------------
 drivers/md/raid10.c | 220 ++++++++++++++++++++++++++++------------------------
 include/linux/bio.h |   7 ++
 4 files changed, 263 insertions(+), 163 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH v2 01/13] block: introduce bio_segments_all()
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

So that we can replace the direct access to .bi_vcnt.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 include/linux/bio.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..3364b3ed90e7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -293,6 +293,13 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
 		bv->bv_len = iter.bi_bvec_done;
 }
 
+static inline unsigned bio_segments_all(struct bio *bio)
+{
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+
+	return bio->bi_vcnt;
+}
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 02/13] md: raid1/raid10: don't handle failure of bio_add_page()
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Every bio_add_page() call here adds one page to a resync bio,
which is big enough to hold RESYNC_PAGES pages, and
the current bio_add_page() doesn't check the queue limits any more,
so it won't fail at all.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c  | 21 ++++++---------------
 drivers/md/raid10.c | 41 ++++++++++-------------------------------
 2 files changed, 16 insertions(+), 46 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0628c07dd16d..b3021355c7e2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2903,21 +2903,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
 				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-				if (bio_add_page(bio, page, len, 0) == 0) {
-					/* stop here */
-					bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-					while (i > 0) {
-						i--;
-						bio = r1_bio->bios[i];
-						if (bio->bi_end_io==NULL)
-							continue;
-						/* remove last page from this bio */
-						bio->bi_vcnt--;
-						bio->bi_iter.bi_size -= len;
-						bio_clear_flag(bio, BIO_SEG_VALID);
-					}
-					goto bio_full;
-				}
+
+				/*
+				 * won't fail because the vec table is big
+				 * enough to hold all these pages
+				 */
+				bio_add_page(bio, page, len, 0);
 			}
 		}
 		nr_sectors += len>>9;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 33f6a535dc1f..ceb3acc793cf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3435,27 +3435,16 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		if (len == 0)
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
-			struct bio *bio2;
 			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-			if (bio_add_page(bio, page, len, 0))
-				continue;
-
-			/* stop here */
-			bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-			for (bio2 = biolist;
-			     bio2 && bio2 != bio;
-			     bio2 = bio2->bi_next) {
-				/* remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
-				bio_clear_flag(bio2, BIO_SEG_VALID);
-			}
-			goto bio_full;
+			/*
+			 * won't fail because the vec table is big enough
+			 * to hold all these pages
+			 */
+			bio_add_page(bio, page, len, 0);
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 	} while (biolist->bi_vcnt < RESYNC_PAGES);
- bio_full:
 	r10_bio->sectors = nr_sectors;
 
 	while (biolist) {
@@ -4528,25 +4517,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
 		for (bio = blist; bio ; bio = bio->bi_next) {
-			struct bio *bio2;
-			if (bio_add_page(bio, page, len, 0))
-				continue;
-
-			/* Didn't fit, must stop */
-			for (bio2 = blist;
-			     bio2 && bio2 != bio;
-			     bio2 = bio2->bi_next) {
-				/* Remove last page from this bio */
-				bio2->bi_vcnt--;
-				bio2->bi_iter.bi_size -= len;
-				bio_clear_flag(bio2, BIO_SEG_VALID);
-			}
-			goto bio_full;
+			/*
+			 * won't fail because the vec table is big enough
+			 * to hold all these pages
+			 */
+			bio_add_page(bio, page, len, 0);
 		}
 		sector_nr += len >> 9;
 		nr_sectors += len >> 9;
 	}
-bio_full:
 	rcu_read_unlock();
 	r10_bio->sectors = nr_sectors;
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 03/13] md: move two macros into md.h
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Both raid1 and raid10 share common resync
block size and page count, so move them into md.h.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.h     | 5 +++++
 drivers/md/raid1.c  | 2 --
 drivers/md/raid10.c | 3 ---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index b8859cbf84b6..1d63239a1be4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -715,4 +715,9 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 		mddev->queue->limits.max_write_same_sectors = 0;
 }
+
+/* Maximum size of each resync request */
+#define RESYNC_BLOCK_SIZE (64*1024)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b3021355c7e2..25c9172db639 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -91,10 +91,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 	kfree(r1_bio);
 }
 
-#define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ceb3acc793cf..c5f1a117494b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -125,9 +125,6 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
-/* Maximum size of each resync request */
-#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 /* amount of memory to reserve for resync requests */
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 04/13] md: prepare for managing resync I/O pages in clean way
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Now resync I/O uses the bio's bvec table to manage pages;
this approach is very hacky, and may not work any more
once multipage bvec is introduced.

So introduce helpers and new data structure for
managing resync I/O pages more cleanly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/md.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1d63239a1be4..b5a638d85cb4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -720,4 +720,58 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 #define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 
+/* for managing resync I/O pages */
+struct resync_pages {
+	unsigned	idx;	/* for get/put page from the pool */
+	void		*raid_bio;
+	struct page	*pages[RESYNC_PAGES];
+};
+
+static inline int resync_alloc_pages(struct resync_pages *rp,
+				     gfp_t gfp_flags)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++) {
+		rp->pages[i] = alloc_page(gfp_flags);
+		if (!rp->pages[i])
+			goto out_free;
+	}
+
+	return 0;
+
+ out_free:
+	while (--i >= 0)
+		__free_page(rp->pages[i]);
+	return -ENOMEM;
+}
+
+static inline void resync_free_pages(struct resync_pages *rp)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++)
+		__free_page(rp->pages[i]);
+}
+
+static inline void resync_get_all_pages(struct resync_pages *rp)
+{
+	int i;
+
+	for (i = 0; i < RESYNC_PAGES; i++)
+		get_page(rp->pages[i]);
+}
+
+static inline struct page *resync_fetch_page(struct resync_pages *rp)
+{
+	if (WARN_ON_ONCE(rp->idx >= RESYNC_PAGES))
+		return NULL;
+	return rp->pages[rp->idx++];
+}
+
+static inline bool resync_page_available(struct resync_pages *rp)
+{
+	return rp->idx < RESYNC_PAGES;
+}
+
 #endif /* _MD_MD_H */
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 05/13] md: raid1: simplify r1buf_pool_free()
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

This patch takes a reference on each page for every resync bio that
uses it, so r1buf_pool_free() can be simplified a lot.

The same policy has been taken in raid10's buf pool allocation/free
too.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 25c9172db639..c442b4657e2f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -139,9 +139,12 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	/* If not user-requests, copy the page pointers to all bios */
 	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
 		for (i=0; i<RESYNC_PAGES ; i++)
-			for (j=1; j<pi->raid_disks; j++)
-				r1_bio->bios[j]->bi_io_vec[i].bv_page =
+			for (j=1; j<pi->raid_disks; j++) {
+				struct page *page =
 					r1_bio->bios[0]->bi_io_vec[i].bv_page;
+				get_page(page);
+				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
+			}
 	}
 
 	r1_bio->master_bio = NULL;
@@ -166,12 +169,8 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 	struct r1bio *r1bio = __r1_bio;
 
 	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;) {
-			if (j == 0 ||
-			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
-			    r1bio->bios[0]->bi_io_vec[i].bv_page)
-				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
-		}
+		for (j = pi->raid_disks; j-- ;)
+			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 06/13] md: raid1: don't use bio's vec table to manage resync pages
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Now we allocate one page array for managing resync pages, instead
of using the bio's vec table to do that. The old way is very hacky
and won't work any more if multipage bvec is enabled.

The added cost is that we need to allocate (128 + 16) * raid_disks
bytes per r1_bio, which is fine because there shouldn't be many
in-flight r1_bios for resync, as pointed out by Shaohua.

Also, the bio_reset() in raid1_sync_request() is removed because
all bios are freshly allocated now, so resetting them is no longer necessary.

This patch can be thought of as a cleanup too.

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 83 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 53 insertions(+), 30 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c442b4657e2f..900144f39630 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -77,6 +77,16 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 #define raid1_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
 
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+	return bio->bi_private;
+}
+
+static inline struct r1bio *get_resync_r1bio(struct bio *bio)
+{
+	return get_resync_pages(bio)->raid_bio;
+}
+
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
@@ -104,12 +114,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	struct r1bio *r1_bio;
 	struct bio *bio;
 	int need_pages;
-	int i, j;
+	int j;
+	struct resync_pages *rps;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
 	if (!r1_bio)
 		return NULL;
 
+	rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
+		      gfp_flags);
+	if (!rps)
+		goto out_free_r1bio;
+
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
 	 */
@@ -129,22 +145,22 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 		need_pages = pi->raid_disks;
 	else
 		need_pages = 1;
-	for (j = 0; j < need_pages; j++) {
+	for (j = 0; j < pi->raid_disks; j++) {
+		struct resync_pages *rp = &rps[j];
+
 		bio = r1_bio->bios[j];
-		bio->bi_vcnt = RESYNC_PAGES;
-
-		if (bio_alloc_pages(bio, gfp_flags))
-			goto out_free_pages;
-	}
-	/* If not user-requests, copy the page pointers to all bios */
-	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
-		for (i=0; i<RESYNC_PAGES ; i++)
-			for (j=1; j<pi->raid_disks; j++) {
-				struct page *page =
-					r1_bio->bios[0]->bi_io_vec[i].bv_page;
-				get_page(page);
-				r1_bio->bios[j]->bi_io_vec[i].bv_page = page;
-			}
+
+		if (j < need_pages) {
+			if (resync_alloc_pages(rp, gfp_flags))
+				goto out_free_pages;
+		} else {
+			memcpy(rp, &rps[0], sizeof(*rp));
+			resync_get_all_pages(rp);
+		}
+
+		rp->idx = 0;
+		rp->raid_bio = r1_bio;
+		bio->bi_private = rp;
 	}
 
 	r1_bio->master_bio = NULL;
@@ -153,11 +169,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 out_free_pages:
 	while (--j >= 0)
-		bio_free_pages(r1_bio->bios[j]);
+		resync_free_pages(&rps[j]);
 
 out_free_bio:
 	while (++j < pi->raid_disks)
 		bio_put(r1_bio->bios[j]);
+	kfree(rps);
+
+out_free_r1bio:
 	r1bio_pool_free(r1_bio, data);
 	return NULL;
 }
@@ -165,14 +184,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 static void r1buf_pool_free(void *__r1_bio, void *data)
 {
 	struct pool_info *pi = data;
-	int i,j;
+	int i;
 	struct r1bio *r1bio = __r1_bio;
+	struct resync_pages *rp = NULL;
 
-	for (i = 0; i < RESYNC_PAGES; i++)
-		for (j = pi->raid_disks; j-- ;)
-			safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
-	for (i=0 ; i < pi->raid_disks; i++)
+	for (i = pi->raid_disks; i--; ) {
+		rp = get_resync_pages(r1bio->bios[i]);
+		resync_free_pages(rp);
 		bio_put(r1bio->bios[i]);
+	}
+
+	/* resync pages array stored in the 1st bio's .bi_private */
+	kfree(rp);
 
 	r1bio_pool_free(r1bio, data);
 }
@@ -1849,7 +1872,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 static void end_sync_read(struct bio *bio)
 {
-	struct r1bio *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = get_resync_r1bio(bio);
 
 	update_head_pos(r1_bio->read_disk, r1_bio);
 
@@ -1868,7 +1891,7 @@ static void end_sync_read(struct bio *bio)
 static void end_sync_write(struct bio *bio)
 {
 	int uptodate = !bio->bi_error;
-	struct r1bio *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = get_resync_r1bio(bio);
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
 	sector_t first_bad;
@@ -2085,6 +2108,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int size;
 		int error;
 		struct bio *b = r1_bio->bios[i];
+		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
 			continue;
 		/* fixup the bio for reuse, but preserve errno */
@@ -2097,7 +2121,8 @@ static void process_checks(struct r1bio *r1_bio)
 			conf->mirrors[i].rdev->data_offset;
 		b->bi_bdev = conf->mirrors[i].rdev->bdev;
 		b->bi_end_io = end_sync_read;
-		b->bi_private = r1_bio;
+		rp->raid_bio = r1_bio;
+		b->bi_private = rp;
 
 		size = b->bi_iter.bi_size;
 		for (j = 0; j < vcnt ; j++) {
@@ -2755,7 +2780,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
 		bio = r1_bio->bios[i];
-		bio_reset(bio);
 
 		rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev == NULL ||
@@ -2811,7 +2835,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			atomic_inc(&rdev->nr_pending);
 			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
-			bio->bi_private = r1_bio;
 			if (test_bit(FailFast, &rdev->flags))
 				bio->bi_opf |= MD_FAILFAST;
 		}
@@ -2899,7 +2922,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		for (i = 0 ; i < conf->raid_disks * 2; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
-				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+				page = resync_fetch_page(get_resync_pages(bio));
 
 				/*
 				 * won't fail because the vec table is big
@@ -2911,8 +2934,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 		sync_blocks -= (len>>9);
-	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
- bio_full:
+	} while (resync_page_available(r1_bio->bios[disk]->bi_private));
+
 	r1_bio->sectors = nr_sectors;
 
 	if (mddev_is_clustered(mddev) &&
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 07/13] md: raid1: retrieve page from pre-allocated resync page array
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Now one page array is allocated for each resync bio, and we can
retrieve pages from this array directly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 900144f39630..d0cb5c026506 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1970,6 +1970,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
 	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+	struct page **pages = get_resync_pages(bio)->pages;
 	sector_t sect = r1_bio->sector;
 	int sectors = r1_bio->sectors;
 	int idx = 0;
@@ -2003,7 +2004,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				 */
 				rdev = conf->mirrors[d].rdev;
 				if (sync_page_io(rdev, sect, s<<9,
-						 bio->bi_io_vec[idx].bv_page,
+						 pages[idx],
 						 REQ_OP_READ, 0, false)) {
 					success = 1;
 					break;
@@ -2058,7 +2059,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    WRITE) == 0) {
 				r1_bio->bios[d]->bi_end_io = NULL;
 				rdev_dec_pending(rdev, mddev);
@@ -2073,7 +2074,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 				continue;
 			rdev = conf->mirrors[d].rdev;
 			if (r1_sync_page_io(rdev, sect, s,
-					    bio->bi_io_vec[idx].bv_page,
+					    pages[idx],
 					    READ) != 0)
 				atomic_add(s, &rdev->corrected_errors);
 		}
@@ -2149,6 +2150,8 @@ static void process_checks(struct r1bio *r1_bio)
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
 		int error = sbio->bi_error;
+		struct page **ppages = get_resync_pages(pbio)->pages;
+		struct page **spages = get_resync_pages(sbio)->pages;
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
@@ -2157,11 +2160,8 @@ static void process_checks(struct r1bio *r1_bio)
 
 		if (!error) {
 			for (j = vcnt; j-- ; ) {
-				struct page *p, *s;
-				p = pbio->bi_io_vec[j].bv_page;
-				s = sbio->bi_io_vec[j].bv_page;
-				if (memcmp(page_address(p),
-					   page_address(s),
+				if (memcmp(page_address(ppages[j]),
+					   page_address(spages[j]),
 					   sbio->bi_io_vec[j].bv_len))
 					break;
 			}
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 08/13] md: raid1: use bio helper in process_checks()
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Avoid direct access to the bvec table.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d0cb5c026506..316bd6dd6cc1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2108,6 +2108,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		int size;
 		int error;
+		struct bio_vec *bi;
 		struct bio *b = r1_bio->bios[i];
 		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
@@ -2126,9 +2127,7 @@ static void process_checks(struct r1bio *r1_bio)
 		b->bi_private = rp;
 
 		size = b->bi_iter.bi_size;
-		for (j = 0; j < vcnt ; j++) {
-			struct bio_vec *bi;
-			bi = &b->bi_io_vec[j];
+		bio_for_each_segment_all(bi, b, j) {
 			bi->bv_offset = 0;
 			if (size > PAGE_SIZE)
 				bi->bv_len = PAGE_SIZE;
@@ -2152,17 +2151,22 @@ static void process_checks(struct r1bio *r1_bio)
 		int error = sbio->bi_error;
 		struct page **ppages = get_resync_pages(pbio)->pages;
 		struct page **spages = get_resync_pages(sbio)->pages;
+		struct bio_vec *bi;
+		int page_len[RESYNC_PAGES];
 
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
 		/* Now we can 'fixup' the error value */
 		sbio->bi_error = 0;
 
+		bio_for_each_segment_all(bi, sbio, j)
+			page_len[j] = bi->bv_len;
+
 		if (!error) {
 			for (j = vcnt; j-- ; ) {
 				if (memcmp(page_address(ppages[j]),
 					   page_address(spages[j]),
-					   sbio->bi_io_vec[j].bv_len))
+					   page_len[j]))
 					break;
 			}
 		} else
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 09/13] md: raid1: use bio_segments_all()
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Use this helper, instead of direct access to .bi_vcnt.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 316bd6dd6cc1..7396c99ff7b1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1091,7 +1091,8 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+	unsigned vcnt = bio_segments_all(bio);
+	struct bio_vec *bvecs = kzalloc(vcnt * sizeof(struct bio_vec),
 					GFP_NOIO);
 	if (unlikely(!bvecs))
 		return;
@@ -1107,12 +1108,12 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 		kunmap(bvec->bv_page);
 	}
 	r1_bio->behind_bvecs = bvecs;
-	r1_bio->behind_page_count = bio->bi_vcnt;
+	r1_bio->behind_page_count = vcnt;
 	set_bit(R1BIO_BehindIO, &r1_bio->state);
 	return;
 
 do_sync_io:
-	for (i = 0; i < bio->bi_vcnt; i++)
+	for (i = 0; i < vcnt; i++)
 		if (bvecs[i].bv_page)
 			put_page(bvecs[i].bv_page);
 	kfree(bvecs);
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 10/13] md: raid10: refactor code of read reshape's .bi_end_io
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

reshape read request is a bit special and requires one extra
bio which isn't allocated from r10buf_pool.

Refactor the .bi_end_io for read reshape, so that we can use
raid10's resync page management approach easily in the following
patches.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c5f1a117494b..a9ddd4f14008 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1907,17 +1907,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return err;
 }
 
-static void end_sync_read(struct bio *bio)
+static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
 {
-	struct r10bio *r10_bio = bio->bi_private;
 	struct r10conf *conf = r10_bio->mddev->private;
-	int d;
-
-	if (bio == r10_bio->master_bio) {
-		/* this is a reshape read */
-		d = r10_bio->read_slot; /* really the read dev */
-	} else
-		d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
 	if (!bio->bi_error)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1941,6 +1933,22 @@ static void end_sync_read(struct bio *bio)
 	}
 }
 
+static void end_sync_read(struct bio *bio)
+{
+	struct r10bio *r10_bio = bio->bi_private;
+	struct r10conf *conf = r10_bio->mddev->private;
+	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+
+	__end_sync_read(r10_bio, bio, d);
+}
+
+static void end_reshape_read(struct bio *bio)
+{
+	struct r10bio *r10_bio = bio->bi_private;
+
+	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
+}
+
 static void end_sync_request(struct r10bio *r10_bio)
 {
 	struct mddev *mddev = r10_bio->mddev;
@@ -4464,7 +4472,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
 			       + rdev->data_offset);
 	read_bio->bi_private = r10_bio;
-	read_bio->bi_end_io = end_sync_read;
+	read_bio->bi_end_io = end_reshape_read;
 	bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
 	read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
 	read_bio->bi_error = 0;
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 11/13] md: raid10: don't use bio's vec table to manage resync pages
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Now we allocate one page array for managing resync pages, instead
of using bio's vec table to do that, and the old way is very hacky
and won't work any more if multipage bvec is enabled.

The cost introduced is that we need to allocate (128 + 16) * copies
bytes per r10_bio, which is fine because there shouldn't be many
in-flight r10_bios for resync, as pointed out by Shaohua.

Also, the bio_reset() calls in raid10_sync_request() and
reshape_request() are removed because all bios are fresh now in these
functions, so resetting them is no longer necessary.

This patch can be thought of as a cleanup too.

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 125 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 73 insertions(+), 52 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a9ddd4f14008..f887b21332e7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -110,6 +110,16 @@ static void end_reshape(struct r10conf *conf);
 #define raid10_log(md, fmt, args...)				\
 	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
 
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+	return bio->bi_private;
+}
+
+static inline struct r10bio *get_resync_r10bio(struct bio *bio)
+{
+	return get_resync_pages(bio)->raid_bio;
+}
+
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
@@ -140,11 +150,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
-	struct page *page;
 	struct r10bio *r10_bio;
 	struct bio *bio;
-	int i, j;
-	int nalloc;
+	int j;
+	int nalloc, nalloc_rp;
+	struct resync_pages *rps;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 	if (!r10_bio)
@@ -156,6 +166,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	else
 		nalloc = 2; /* recovery */
 
+	/* allocate once for all bios */
+	if (!conf->have_replacement)
+		nalloc_rp = nalloc;
+	else
+		nalloc_rp = nalloc * 2;
+	rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+	if (!rps)
+		goto out_free_r10bio;
+
 	/*
 	 * Allocate bios.
 	 */
@@ -175,36 +194,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	 * Allocate RESYNC_PAGES data pages and attach them
 	 * where needed.
 	 */
-	for (j = 0 ; j < nalloc; j++) {
+	for (j = 0; j < nalloc; j++) {
 		struct bio *rbio = r10_bio->devs[j].repl_bio;
+		struct resync_pages *rp, *rp_repl;
+
+		rp = &rps[j];
+		if (rbio)
+			rp_repl = &rps[nalloc + j];
+
 		bio = r10_bio->devs[j].bio;
-		for (i = 0; i < RESYNC_PAGES; i++) {
-			if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
-					       &conf->mddev->recovery)) {
-				/* we can share bv_page's during recovery
-				 * and reshape */
-				struct bio *rbio = r10_bio->devs[0].bio;
-				page = rbio->bi_io_vec[i].bv_page;
-				get_page(page);
-			} else
-				page = alloc_page(gfp_flags);
-			if (unlikely(!page))
+
+		if (!j || test_bit(MD_RECOVERY_SYNC,
+				   &conf->mddev->recovery)) {
+			if (resync_alloc_pages(rp, gfp_flags))
 				goto out_free_pages;
+		} else {
+			memcpy(rp, &rps[0], sizeof(*rp));
+			resync_get_all_pages(rp);
+		}
 
-			bio->bi_io_vec[i].bv_page = page;
-			if (rbio)
-				rbio->bi_io_vec[i].bv_page = page;
+		rp->idx = 0;
+		rp->raid_bio = r10_bio;
+		bio->bi_private = rp;
+		if (rbio) {
+			memcpy(rp_repl, rp, sizeof(*rp));
+			rbio->bi_private = rp_repl;
 		}
 	}
 
 	return r10_bio;
 
 out_free_pages:
-	for ( ; i > 0 ; i--)
-		safe_put_page(bio->bi_io_vec[i-1].bv_page);
-	while (j--)
-		for (i = 0; i < RESYNC_PAGES ; i++)
-			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+	while (--j >= 0)
+		resync_free_pages(&rps[j * 2]);
+
 	j = 0;
 out_free_bio:
 	for ( ; j < nalloc; j++) {
@@ -213,30 +236,34 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		if (r10_bio->devs[j].repl_bio)
 			bio_put(r10_bio->devs[j].repl_bio);
 	}
+	kfree(rps);
+out_free_r10bio:
 	r10bio_pool_free(r10_bio, conf);
 	return NULL;
 }
 
 static void r10buf_pool_free(void *__r10_bio, void *data)
 {
-	int i;
 	struct r10conf *conf = data;
 	struct r10bio *r10bio = __r10_bio;
 	int j;
+	struct resync_pages *rp = NULL;
 
-	for (j=0; j < conf->copies; j++) {
+	for (j = conf->copies; j--; ) {
 		struct bio *bio = r10bio->devs[j].bio;
-		if (bio) {
-			for (i = 0; i < RESYNC_PAGES; i++) {
-				safe_put_page(bio->bi_io_vec[i].bv_page);
-				bio->bi_io_vec[i].bv_page = NULL;
-			}
-			bio_put(bio);
-		}
+
+		rp = get_resync_pages(bio);
+		resync_free_pages(rp);
+		bio_put(bio);
+
 		bio = r10bio->devs[j].repl_bio;
 		if (bio)
 			bio_put(bio);
 	}
+
+	/* resync pages array stored in the 1st bio's .bi_private */
+	kfree(rp);
+
 	r10bio_pool_free(r10bio, conf);
 }
 
@@ -1935,7 +1962,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
 
 static void end_sync_read(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 
@@ -1944,6 +1971,7 @@ static void end_sync_read(struct bio *bio)
 
 static void end_reshape_read(struct bio *bio)
 {
+	/* reshape read bio isn't allocated from r10buf_pool */
 	struct r10bio *r10_bio = bio->bi_private;
 
 	__end_sync_read(r10_bio, bio, r10_bio->read_slot);
@@ -1978,7 +2006,7 @@ static void end_sync_request(struct r10bio *r10_bio)
 
 static void end_sync_write(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	int d;
@@ -2058,6 +2086,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	for (i=0 ; i < conf->copies ; i++) {
 		int  j, d;
 		struct md_rdev *rdev;
+		struct resync_pages *rp;
 
 		tbio = r10_bio->devs[i].bio;
 
@@ -2099,11 +2128,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		 * First we need to fixup bv_offset, bv_len and
 		 * bi_vecs, as the read request might have corrupted these
 		 */
+		rp = get_resync_pages(tbio);
 		bio_reset(tbio);
 
 		tbio->bi_vcnt = vcnt;
 		tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
-		tbio->bi_private = r10_bio;
+		rp->raid_bio = r10_bio;
+		tbio->bi_private = rp;
 		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
 		tbio->bi_end_io = end_sync_write;
 		bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
@@ -3171,10 +3202,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 					}
 				}
 				bio = r10_bio->devs[0].bio;
-				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
-				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio_set_op_attrs(bio, REQ_OP_READ, 0);
 				if (test_bit(FailFast, &rdev->flags))
@@ -3198,10 +3227,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 				if (!test_bit(In_sync, &mrdev->flags)) {
 					bio = r10_bio->devs[1].bio;
-					bio_reset(bio);
 					bio->bi_next = biolist;
 					biolist = bio;
-					bio->bi_private = r10_bio;
 					bio->bi_end_io = end_sync_write;
 					bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 					bio->bi_iter.bi_sector = to_addr
@@ -3226,10 +3253,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				if (mreplace == NULL || bio == NULL ||
 				    test_bit(Faulty, &mreplace->flags))
 					break;
-				bio_reset(bio);
 				bio->bi_next = biolist;
 				biolist = bio;
-				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_write;
 				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 				bio->bi_iter.bi_sector = to_addr +
@@ -3351,7 +3376,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
 			bio = r10_bio->devs[i].bio;
-			bio_reset(bio);
 			bio->bi_error = -EIO;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -3376,7 +3400,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			atomic_inc(&r10_bio->remaining);
 			bio->bi_next = biolist;
 			biolist = bio;
-			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio_set_op_attrs(bio, REQ_OP_READ, 0);
 			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3395,13 +3418,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
-			bio_reset(bio);
 			bio->bi_error = -EIO;
 
 			sector = r10_bio->devs[i].addr;
 			bio->bi_next = biolist;
 			biolist = bio;
-			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_write;
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3440,7 +3461,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		if (len == 0)
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
-			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+			page = resync_fetch_page(get_resync_pages(bio));
 			/*
 			 * won't fail because the vec table is big enough
 			 * to hold all these pages
@@ -3449,7 +3470,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
-	} while (biolist->bi_vcnt < RESYNC_PAGES);
+	} while (resync_page_available(get_resync_pages(biolist)));
 	r10_bio->sectors = nr_sectors;
 
 	while (biolist) {
@@ -3457,7 +3478,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		biolist = biolist->bi_next;
 
 		bio->bi_next = NULL;
-		r10_bio = bio->bi_private;
+		r10_bio = get_resync_r10bio(bio);
 		r10_bio->sectors = nr_sectors;
 
 		if (bio->bi_end_io == end_sync_read) {
@@ -4352,6 +4373,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	struct bio *blist;
 	struct bio *bio, *read_bio;
 	int sectors_done = 0;
+	struct page **pages;
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
@@ -4502,11 +4524,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
 			continue;
 
-		bio_reset(b);
 		b->bi_bdev = rdev2->bdev;
 		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
 			rdev2->new_data_offset;
-		b->bi_private = r10_bio;
 		b->bi_end_io = end_reshape_write;
 		bio_set_op_attrs(b, REQ_OP_WRITE, 0);
 		b->bi_next = blist;
@@ -4516,8 +4536,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	/* Now add as many pages as possible to all of these bios. */
 
 	nr_sectors = 0;
+	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
 	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
-		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+		struct page *page = pages[s / (PAGE_SIZE >> 9)];
 		int len = (max_sectors - s) << 9;
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
@@ -4701,7 +4722,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 
 static void end_reshape_write(struct bio *bio)
 {
-	struct r10bio *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = get_resync_r10bio(bio);
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	int d;
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 12/13] md: raid10: retrieve page from preallocated resync page array
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

Now one page array is allocated for each resync bio, and we can
retrieve pages from this table directly.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f887b21332e7..0b97631e3905 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2065,6 +2065,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	int i, first;
 	struct bio *tbio, *fbio;
 	int vcnt;
+	struct page **tpages, **fpages;
 
 	atomic_set(&r10_bio->remaining, 1);
 
@@ -2080,6 +2081,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	fbio = r10_bio->devs[i].bio;
 	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
 	fbio->bi_iter.bi_idx = 0;
+	fpages = get_resync_pages(fbio)->pages;
 
 	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
 	/* now find blocks with errors */
@@ -2094,6 +2096,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			continue;
 		if (i == first)
 			continue;
+
+		tpages = get_resync_pages(tbio)->pages;
 		d = r10_bio->devs[i].devnum;
 		rdev = conf->mirrors[d].rdev;
 		if (!r10_bio->devs[i].bio->bi_error) {
@@ -2106,8 +2110,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 				int len = PAGE_SIZE;
 				if (sectors < (len / 512))
 					len = sectors * 512;
-				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-					   page_address(tbio->bi_io_vec[j].bv_page),
+				if (memcmp(page_address(fpages[j]),
+					   page_address(tpages[j]),
 					   len))
 					break;
 				sectors -= len/512;
@@ -2205,6 +2209,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 	int idx = 0;
 	int dr = r10_bio->devs[0].devnum;
 	int dw = r10_bio->devs[1].devnum;
+	struct page **pages = get_resync_pages(bio)->pages;
 
 	while (sectors) {
 		int s = sectors;
@@ -2220,7 +2225,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 		ok = sync_page_io(rdev,
 				  addr,
 				  s << 9,
-				  bio->bi_io_vec[idx].bv_page,
+				  pages[idx],
 				  REQ_OP_READ, 0, false);
 		if (ok) {
 			rdev = conf->mirrors[dw].rdev;
@@ -2228,7 +2233,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 			ok = sync_page_io(rdev,
 					  addr,
 					  s << 9,
-					  bio->bi_io_vec[idx].bv_page,
+					  pages[idx],
 					  REQ_OP_WRITE, 0, false);
 			if (!ok) {
 				set_bit(WriteErrorSeen, &rdev->flags);
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 13/13] md: raid10: avoid direct access to bvec table in handle_reshape_read_error
From: Ming Lei @ 2017-02-28 15:41 UTC (permalink / raw)
  To: Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig; +Cc: Ming Lei
In-Reply-To: <1488296503-4987-1-git-send-email-tom.leiming@gmail.com>

The cost is 128 bytes (8 * 16) of stack space in kernel thread context;
just use the bio helper to retrieve pages from the bio.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/md/raid10.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0b97631e3905..6ffb64ab45f8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4670,7 +4670,15 @@ static int handle_reshape_read_error(struct mddev *mddev,
 	struct r10bio *r10b = &on_stack.r10_bio;
 	int slot = 0;
 	int idx = 0;
-	struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+	struct bio_vec *bvl;
+	struct page *pages[RESYNC_PAGES];
+
+	/*
+	 * This bio is allocated in reshape_request(), and size
+	 * is still RESYNC_PAGES
+	 */
+	bio_for_each_segment_all(bvl, r10_bio->master_bio, idx)
+		pages[idx] = bvl->bv_page;
 
 	r10b->sector = r10_bio->sector;
 	__raid10_find_phys(&conf->prev, r10b);
@@ -4699,7 +4707,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
 			success = sync_page_io(rdev,
 					       addr,
 					       s << 9,
-					       bvec[idx].bv_page,
+					       pages[idx],
 					       REQ_OP_READ, 0, false);
 			rdev_dec_pending(rdev, mddev);
 			rcu_read_lock();
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH 01/14] md-cluster: remove unnecessary header files
From: Shaohua Li @ 2017-02-28 18:03 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <1487906124-20107-2-git-send-email-gqjiang@suse.com>

On Fri, Feb 24, 2017 at 11:15:11AM +0800, Guoqing Jiang wrote:
> md-cluster.h is already included in md.h, so remove
> the redundant one and we don't want to cross include
> header file too.
> 
> Reviewed-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>

It would be better if md.h didn't include md-cluster.h, and md-cluster.h
were included only in the .c files that require it.

> ---
>  drivers/md/md-cluster.c | 1 -
>  drivers/md/md-cluster.h | 2 --
>  2 files changed, 3 deletions(-)
> 
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 2b13117fb918..03c38ed222ce 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -16,7 +16,6 @@
>  #include <linux/raid/md_p.h>
>  #include "md.h"
>  #include "bitmap.h"
> -#include "md-cluster.h"
>  
>  #define LVB_SIZE	64
>  #define NEW_DEV_TIMEOUT 5000
> diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
> index e765499ba591..8f26a5e80810 100644
> --- a/drivers/md/md-cluster.h
> +++ b/drivers/md/md-cluster.h
> @@ -3,8 +3,6 @@
>  #ifndef _MD_CLUSTER_H
>  #define _MD_CLUSTER_H
>  
> -#include "md.h"
> -
>  struct mddev;
>  struct md_rdev;
>  
> -- 
> 2.6.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox