Distributed Replicated Block Device (DRBD) development
 help / color / mirror / Atom feed
* [Drbd-dev] DRBD-8: No-panic patch part I-c
@ 2006-09-18 15:13 Graham, Simon
  2006-09-19 13:15 ` Philipp Reisner
  0 siblings, 1 reply; 2+ messages in thread
From: Graham, Simon @ 2006-09-18 15:13 UTC (permalink / raw)
  To: drbd-dev

[-- Attachment #1: Type: text/plain, Size: 1094 bytes --]

Final few changes for the 1st phase of panic removal, including:

1. Updated tests - I implemented a new set of fault insertion tests in
   testing/testsuite.
   NOTE: DRBD does NOT pass these tests yet - however, one of the main
   problems I am having is with the test tool itself - it keeps on timing
   out communicating with the agent script for no apparent reason. These
   tests are worth running though - they seem to stress the failure
   handling quite nicely!

2. Need to unlock the bitmap if a failure occurs in drbd_nl_disk_conf
   after the point where the lock is established.

3. There are some cases where drbd_bm_read can be called with a
   zero-size bitmap, which leads to a crash -- test for a NULL b->bm.

4. drbd_io_error() needs to call drbd_md_sync - I had removed this but
   there are a couple of issues - first of all, the error being reported
   might not stop the bitmap from being written, and secondly drbd_md_sync
   has the side effect of canceling the md-sync timer, which we want in
   this case.

5. Minor changes in trace code

Simon


[-- Attachment #2: drbd-panic-phase1c.patch --]
[-- Type: application/octet-stream, Size: 14612 bytes --]

Index: drbd/drbd_receiver.c
===================================================================
--- drbd/drbd_receiver.c	(.../trunk)	(revision 4106)
+++ drbd/drbd_receiver.c	(.../branches/drbd-panic)	(revision 4106)
@@ -1668,7 +1668,7 @@
 
 STATIC void drbd_uuid_dump(drbd_dev *mdev,char* text,u64* uuid)
 {
-	WARN("%s %016llX:%016llX:%016llX:%016llX\n",
+	INFO("%s %016llX:%016llX:%016llX:%016llX\n",
 	     text,
 	     uuid[Current],
 	     uuid[Bitmap],
@@ -1748,13 +1748,13 @@
 	int hg;
 	drbd_conns_t rv = conn_mask;
 
-
+	//INFO("drbd_sync_handshake:\n");
 	//drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid);
 	//drbd_uuid_dump(mdev,"peer",mdev->p_uuid);
 
 	hg = drbd_uuid_compare(mdev);
 
-	//WARN("uuid_compare()=%d\n",hg);
+	//INFO("uuid_compare()=%d\n",hg);
 
 	if (hg == 100) {
 		int pcount = (mdev->state.role==Primary) + (peer_role==Primary);
Index: drbd/drbd_nl.c
===================================================================
--- drbd/drbd_nl.c	(.../trunk)	(revision 4106)
+++ drbd/drbd_nl.c	(.../branches/drbd-panic)	(revision 4106)
@@ -883,6 +883,8 @@
 	return 0;
 
  release_bdev3_fail:
+	drbd_bm_unlock(mdev);
+
 	/* The following will be freed by state change below */
 	nbc = NULL; 
 	resync_lru = NULL;
Index: drbd/drbd_bitmap.c
===================================================================
--- drbd/drbd_bitmap.c	(.../trunk)	(revision 4106)
+++ drbd/drbd_bitmap.c	(.../branches/drbd-panic)	(revision 4106)
@@ -806,11 +806,15 @@
 int drbd_bm_read(struct Drbd_Conf *mdev)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
+	int err=0;
 
-	int err = drbd_bm_rw(mdev, READ);
+	if (b->bm) {
+	    // bitmap size > 0
+	    err = drbd_bm_rw(mdev, READ);
 
-	if (err == 0)
-	    b->bm[b->bm_words] = DRBD_MAGIC;
+	    if (err == 0)
+		b->bm[b->bm_words] = DRBD_MAGIC;
+	}
 
 	return err;
 }
Index: drbd/drbd_main.c
===================================================================
--- drbd/drbd_main.c	(.../trunk)	(revision 4106)
+++ drbd/drbd_main.c	(.../branches/drbd-panic)	(revision 4106)
@@ -378,18 +378,12 @@
 	if (ok) WARN("Notified peer that my disk is broken.\n");
 	else ERR("Sending state in drbd_io_error() failed\n");
 
-#if 0
-// warning SPG
-// This code seems wrong -- we only get here if we are set to
-// detach in which case we have no local disk, so there's no
-// point asserting that a full sync is needed.
-// Flushing the meta data is probably also wrong -- we want
-// this node to appear out of date so we should deliberately
-// NOT update the meta data with the latest epoch info!
-	D_ASSERT(drbd_md_test_flag(mdev->bc,MDF_FullSync));
-	D_ASSERT(!drbd_md_test_flag(mdev->bc,MDF_Consistent));
+	// Make sure we try to flush meta-data to disk - we come
+	// in here because of a local disk error so it might fail
+	// but we still need to try -- both because the error might
+	// be in the data portion of the disk and because we need
+	// to ensure the md-sync-timer is stopped if running.
 	drbd_md_sync(mdev);
-#endif
 
 	/* Releasing the backing device is done in after_state_ch() */
 
@@ -2903,10 +2905,10 @@
 
 	case ReportUUIDs:
 		INFOP("%s Curr:%016llX, Bitmap:%016llX, HisSt:%016llX, HisEnd:%016llX\n", cmdname(cmd),
-		      p->GenCnt.uuid[Current],
-		      p->GenCnt.uuid[Bitmap],
-		      p->GenCnt.uuid[History_start],
-		      p->GenCnt.uuid[History_end]);
+		      be64_to_cpu(p->GenCnt.uuid[Current]),
+		      be64_to_cpu(p->GenCnt.uuid[Bitmap]),
+		      be64_to_cpu(p->GenCnt.uuid[History_start]),
+		      be64_to_cpu(p->GenCnt.uuid[History_end]));
 		break;
 		      
 	case ReportSizes:
Index: testing/testsuite/testfaults.conf
===================================================================
--- testing/testsuite/testfaults.conf	(.../trunk)	(revision 0)
+++ testing/testsuite/testfaults.conf	(.../branches/drbd-panic)	(revision 4106)
@@ -0,0 +1,299 @@
+# Configuration file for testsuite.pl
+#
+# Used to configure both nodes and to change the default commands of the testsuite.
+# Important: terminate each argument with ';' -- otherwise it will not be set, which can cause errors!
+
+# Configuration of one Node
+node node1 {
+  # Syntax: addr <ip>
+  # IP Address of the node that is visible to the internet. If this is left undefined
+  # the testsuite will not start.
+  #
+  # Example: addr 192.168.1.10;
+  #
+  addr  192.168.1.10;
+  
+  # Syntax: port <p>
+  # The port the testsuite will try to connect to. If this is left undefined
+  # the testsuite will not start.
+  #
+  # Example: port 4000;
+  #
+  port  4000; 
+}
+
+# Configuration of second Node
+node node2 {
+  # Syntax: addr <ip>
+  # IP Address of the node that is visible to the internet. If this is left undefined
+  # the testsuite will not start.
+  #
+  # Example: addr 192.168.1.10;
+  #
+  addr  192.168.1.10;
+  
+  # Syntax: port <p>
+  # The port the testsuite will try to connect to. If this is left undefined
+  # the testsuite will not start.
+  #
+  # Example: port 4000;
+  #
+  port  4000;
+}
+
+# Set of default parameters of the testsuite
+defaults {
+  # Syntax: timeout <n>
+  # General timeout limit of <n> seconds for the response of the executed command.
+  #
+  # If this is left undefined default is to wait 0 seconds = wait forever.
+  #
+  # Example: timeout 20;
+  #
+  timeout  10;
+  
+  # Syntax: latency <l>
+  # 
+  # The maximum time/clock difference between the two agents. If it's above
+  # the given value, FIXME => sync
+  #
+  # If this is left undefined default latency is 0.5 seconds
+  #
+  # Example: latency 0.05;
+  #
+  latency  0.1;
+
+  # Syntax: connect_timeout <n>
+  # Give up trying to establish a connection after <n> seconds
+  #
+  # If this is left undefined default is to wait 3 seconds.
+  #
+  # Example: connect_timeout 1;
+  #
+  connect_timeout 1;
+
+  # Syntax: timeserver <ip/host>
+  # If nodes are out of sync, ntpdate <ip/host> will be sent to each node.
+  #
+  # If no timeserver is given, the testsuite will abort instead of trying
+  # to synchronize the agents.
+  #
+  timeserver 10.25.91.12;
+}
+
+# Set of sequence commands to be executed on the nodes. See README for detailed
+# information about available commands.
+seq-commands {
+
+  # check if we are in a stable state
+  if ((get 'drbdsetup /dev/drbd0 state', on node1) =~ /Unknown/) {
+    VERBOSE ("trouble with drbd on agents");
+    die();
+  }
+
+  cmd '/sbin/drbdadm detach {resource}', on node1;
+
+  # simulate meta data read failures during attach
+  INFO "1. Simulate local meta data read failures during attach";
+  cmd set_fr, on node1;
+  cmd set_md_rd, on node1;
+
+  # attach/detach a few times (want at least one failure!)
+  for (my $i = 0; $i < 5; ++$i) {
+    cmd '/sbin/drbdadm attach {resource}', on node1;
+
+    sleep 2;
+
+    if ((get state_ds, on node1) =~ /Diskless/) {
+	VERBOSE ("goodness: meta data read fault fired");
+    }
+    else {
+	# might need to wait for resync here... disable
+	# faults whilst we wait
+	cmd clr_fr, on node1;
+
+	expected 'cs', state 'Connected', timeout 500;
+	expected 'ds', state 'UpToDate', timeout 500;
+
+	cmd '/sbin/drbdadm detach {resource}', on node1;
+
+	cmd set_fr, on node1;
+	cmd set_md_rd, on node1;
+    }
+
+    # everyone should be in connected state
+    expected 'cs', state 'Connected', timeout 15;
+  }
+
+  cmd clr_fr, on node1;
+
+  # simulate meta data write failures during attach
+  INFO "2. Simulate local meta data write failures during attach";
+  cmd set_fr, on node1;
+  cmd set_md_wr, on node1;
+
+  # attach/detach a few times (want at least one failure!)
+  for (my $i = 0; $i < 5; ++$i) {
+    cmd '/sbin/drbdadm attach {resource}', on node1;
+
+    sleep 2;
+
+    if ((get state_ds, on node1) =~ /Diskless/) {
+	VERBOSE ("goodness: meta data write fault fired");
+    }
+    else {
+	# might need to wait for resync here... disable
+	# faults whilst we wait
+	cmd clr_fr, on node1;
+
+	expected 'cs', state 'Connected', timeout 500;
+	expected 'ds', state 'UpToDate', timeout 500;
+
+	cmd '/sbin/drbdadm detach {resource}', on node1;
+
+	cmd set_fr, on node1;
+	cmd set_md_wr, on node1;
+    }
+  }
+
+  cmd clr_fr, on node1;
+
+  cmd '/sbin/drbdadm attach {resource}', on node1;
+
+  # everyone should be in connected cstate, uptodate dstate
+  expected 'cs', state 'Connected', timeout 15;
+  expected 'ds', state 'UpToDate', timeout 15;
+
+  # switch to primary
+  cmd '/sbin/drbdadm primary {resource}', on node1;
+
+  # Check node1 went primary...
+  expected 'st', state 'Primary', timeout 15, on node1;
+
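+  # simulate meta data failures on partner node (NOTE(review): set_md_rd below enables the md *read* fault although the messages say "write" -- confirm intent)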
+  INFO "3. Simulate remote meta data write failures during attach";
+  cmd set_fr, on node2;
+  cmd set_md_rd, on node2;
+
+  cmd '/sbin/drbdadm detach {resource}', on node1;
+
+  # attach/detach a few times (want to see at least one failure!)
+  for (my $i = 0; $i < 5; ++$i) {
+    cmd '/sbin/drbdadm attach {resource}', on node1;
+
+    sleep 2;
+
+    if ((get state_ds, on node1) =~ /Diskless/) {
+	VERBOSE ("goodness: meta data write fault fired");
+    }
+    else {
+	# might need to wait for resync here... disable
+	# faults whilst we wait
+	cmd clr_fr, on node2;
+
+	expected 'cs', state 'Connected', timeout 500;
+	expected 'ds', state 'UpToDate', timeout 500;
+
+	cmd '/sbin/drbdadm detach {resource}', on node1;
+
+	cmd set_fr, on node2;
+	cmd set_md_rd, on node2;
+    }
+  }
+
+  cmd clr_fr, on node2;
+
+  # make sure partner is attached...
+  cmd '/sbin/drbdadm attach {resource}', on node2;
+
+  # everyone should be in connected cstate, uptodate dstate
+  expected 'cs', state 'Connected', timeout 500;
+  expected 'ds', state 'UpToDate', timeout 500;
+
+  # simulate read errors locally
+  INFO "4. Simulate local user data read failures";
+  cmd set_fr, on node1;
+  cmd set_dt_rd, on node1;
+
+  # mount file system a few times.
+  for (my $i = 0; $i < 5; ++$i) {
+      cmd 'mount /dev/{device} {mountpoint}', on node1;
+      cmd 'umount /dev/{device}', on node1;
+
+      sleep 2;
+
+      # state should not change - i.e. no resync (YET! Should get resync of failed block
+      # eventually)
+      expected 'cs', state 'Connected', timeout 15;
+  }
+
+  cmd clr_fr, on node1;
+
+  # simulate write errors locally
+  INFO "5. Simulate local user data write failures";
+  cmd set_fr, on node1;
+  cmd set_dt_wr, on node1;
+
+  # mount file system and modify - check for errors!
+  for (my $i = 0; $i < 5; ++$i) {
+      cmd 'mount /dev/{device} {mountpoint}', on node1;
+
+      # create some files - should get some errors...
+      cmd 'cp -f /boot/* {mountpoint}', on node1;
+  
+      cmd 'umount /dev/{device}', on node1;
+
+      sleep 2;
+
+      # state should not change - i.e. no resync (YET!)
+      expected 'cs', state 'Connected', timeout 15;
+  }
+
+  cmd clr_fr, on node1;
+
+  # simulate read errors remotely
+  INFO "6. Simulate remote user data read failures";
+  cmd 'drbdadm detach {resource}', on node1;
+
+  cmd set_fr, on node2;
+  cmd set_dt_rd, on node2;
+
+  # mount file system - check for errors!
+  for (my $i = 0; $i < 5; ++$i) {
+      cmd 'mount /dev/{device} {mountpoint}', on node1;
+      cmd 'umount /dev/{device}', on node1;
+
+      sleep 2;
+
+      # state should not change - i.e. no resync
+      expected 'cs', state 'Connected', timeout 15;
+  }
+
+  cmd clr_fr, on node2;
+
+  cmd 'drbdadm attach {resource}', on node1;
+
+  # wait for resync to complete
+  expected 'cs', state 'Connected', timeout 15;
+  expected 'ds', state 'UpToDate', timeout 15;
+
+  # simulate write errors remotely
+  INFO "6. Simulate remote user data write failures";
+  cmd set_fr, on node2;
+  cmd set_dt_wr, on node2;
+
+  for (my $i = 0; $i < 5; ++$i) {
+      # mount file system - check for errors!
+      cmd 'mount /dev/{device} {mountpoint}', on node1;
+
+      # create some files - should get some errors...
+      cmd 'cp -f /boot/* {mountpoint}', on node1;
+  
+      cmd 'umount /dev/{device}', on node1;
+
+      sleep 2;
+  }
+
+  cmd clr_fr, on node2;
+}
+
Index: testing/testsuite/testsuite.pl
===================================================================
--- testing/testsuite/testsuite.pl	(.../trunk)	(revision 4106)
+++ testing/testsuite/testsuite.pl	(.../branches/drbd-panic)	(revision 4106)
@@ -83,6 +83,16 @@
 #FILESYSTEM:
 $commands{'fs_make'} = 'mkfs.{filesystem} /dev/{device}'; #FIXME FileSystem - agent.conf!!'
 
+#FAULTS
+$commands{'set_fr'} = 'echo 10 >/sys/module/drbd/parameters/fault_rate';
+$commands{'clr_fr'} = 'echo 0 >/sys/module/drbd/parameters/fault_rate; echo 0 >/sys/module/drbd/parameters/enable_faults';
+$commands{'set_md_wr'} = 'echo 1 >/sys/module/drbd/parameters/enable_faults';
+$commands{'set_md_rd'} = 'echo 2 >/sys/module/drbd/parameters/enable_faults';
+$commands{'set_rs_wr'} = 'echo 4 >/sys/module/drbd/parameters/enable_faults';
+$commands{'set_rs_rd'} = 'echo 8 >/sys/module/drbd/parameters/enable_faults';
+$commands{'set_dt_wr'} = 'echo 16 >/sys/module/drbd/parameters/enable_faults';
+$commands{'set_dt_rd'} = 'echo 32 >/sys/module/drbd/parameters/enable_faults';
+
 ###############################################################################
 
 require 'getopts.pl';
@@ -154,10 +164,10 @@
     elsif ($section == 4) {
       push @seqcommands, $_;
       if (/{/) {
-        $seqsection = 1;
+        $seqsection += 1;
       }
       if (/}/) {
-        $seqsection = 0;
+        $seqsection -= 1;
       }
     }
     elsif ($section == 1 or $section == 2) {
@@ -210,7 +220,7 @@
       $section = 4;
     }
     else {
-      ERROR ("unknown configuration");
+      ERROR ("unknown configuration: ".$_);
     }
   }
 }
@@ -335,7 +345,6 @@
   return $reply;
 }
 
-
 ###############################################################################
 ######  functions
 ###############################################################################
@@ -603,7 +612,7 @@
     my $yday;
 
     print LOGFILE "--------- TestSuite --------\n";
-    foreach(sort(@logList)) {
+    foreach(@logList) {
       ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = (localtime($$_[0]));
       printf LOGFILE "%s %02d %02d:%02d:%02d ", $mnames[$mon], $mday, $hour, $min, $sec;
       print LOGFILE $$_[1]."\n";    
@@ -674,6 +683,7 @@
   
   while(($key, $value) = each(%commands)) {
     $seqcommands_eval =~ s/cmd $key/cmd '$value'/g;
+    $seqcommands_eval =~ s/get $key/get '$value'/g;
   }
   
   set_default_vars();
@@ -757,6 +767,19 @@
 }
 
 
+# print info messages
+sub INFO {
+  my ($msg) = @_;
+  
+ if (defined($opt_l)) {
+       LOG($msg);
+  }
+
+  print $msg. "\n";
+
+  return;
+}
+
 # print warn messages
 sub WARN {
   my ($msg) = @_;

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [Drbd-dev] DRBD-8: No-panic patch part I-c
  2006-09-18 15:13 [Drbd-dev] DRBD-8: No-panic patch part I-c Graham, Simon
@ 2006-09-19 13:15 ` Philipp Reisner
  0 siblings, 0 replies; 2+ messages in thread
From: Philipp Reisner @ 2006-09-19 13:15 UTC (permalink / raw)
  To: drbd-dev

Am Montag, 18. September 2006 17:13 schrieb Graham, Simon:
> Final few changes for the 1st phase of panic removal, including:
>

Gone to SVN.

-Phil
-- 
: Dipl-Ing Philipp Reisner                      Tel +43-1-8178292-50 :
: LINBIT Information Technologies GmbH          Fax +43-1-8178292-82 :
: Schönbrunnerstr 244, 1120 Vienna, Austria    http://www.linbit.com :

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2006-09-19 13:15 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-09-18 15:13 [Drbd-dev] DRBD-8: No-panic patch part I-c Graham, Simon
2006-09-19 13:15 ` Philipp Reisner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox