From mboxrd@z Thu Jan  1 00:00:00 1970
From: jbrassow@sourceware.org <jbrassow@sourceware.org>
Date: 25 Jan 2008 16:23:25 -0000
Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-clog.c
Message-ID: <20080125162325.23412.qmail@sourceware.org>
List-Id: <cluster-devel.redhat.com>
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	jbrassow at sourceware.org	2008-01-25 16:23:25

Modified files:
	cmirror-kernel/src: dm-clog.c 

Log message:
	- calling dm_get_device fixes rename bug 205641
	
	- caching extra state in the kernel reduces cluster traffic by 90%,
	which improves performance

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-clog.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.2.7&r2=1.2.2.8

--- cluster/cmirror-kernel/src/dm-clog.c	2008/01/23 21:22:28	1.2.2.7
+++ cluster/cmirror-kernel/src/dm-clog.c	2008/01/25 16:23:24	1.2.2.8
@@ -23,8 +23,12 @@
 	char *ctr_str; /* Gives ability to restart if userspace dies */
 	uint32_t ctr_size;
 
+	uint32_t in_sync_hint;
+
 	spinlock_t flush_lock;
 	struct list_head flush_list;  /* only for clear and mark requests */
+
+	struct dm_dev *disk_log;
 };
 
 static mempool_t *flush_entry_pool = NULL;
@@ -78,21 +82,21 @@
 }
 
 static int cluster_ctr(struct dirty_log *log, struct dm_target *ti,
-		       unsigned int argc, char **argv, int disk_log)
+		       unsigned int argc, char **argv,
+		       struct dm_dev *disk_log)
 {
 	int i;
 	int r = 0;
 	int str_size;
+	int offset = (disk_log) ? 1 : 0;
 	char *ctr_str = NULL;
 	struct log_c *lc = NULL;
 	uint32_t region_size;
 	region_t region_count;
 
 	/* Already checked argument count */
-	if (disk_log != 0 && disk_log != 1)
-		return -EINVAL;
 
-	if (sscanf(argv[disk_log], "%u", &region_size) != 1) {
+	if (sscanf(argv[offset], "%u", &region_size) != 1) {
 		DMWARN("Invalid region size string");
 		return -EINVAL;
 	}
@@ -108,9 +112,10 @@
 	lc->ti = ti;
 	lc->region_size = region_size;
 	lc->region_count = region_count;
+	lc->disk_log = disk_log;
 
 	/* FIXME: Need to check size of uuid arg */
-	memcpy(lc->uuid, argv[1 + disk_log], DM_UUID_LEN);
+	memcpy(lc->uuid, argv[1 + offset], DM_UUID_LEN);
 	spin_lock_init(&lc->flush_lock);
 	INIT_LIST_HEAD(&lc->flush_list);
 
@@ -174,7 +179,7 @@
 		return -EINVAL;
 	}
 
-	r = cluster_ctr(log, ti, argc, argv, 0);
+	r = cluster_ctr(log, ti, argc, argv, NULL);
 
 	return r;
 }
@@ -195,7 +200,9 @@
 static int cluster_disk_ctr(struct dirty_log *log, struct dm_target *ti,
 			    unsigned int argc, char **argv)
 {
-	int i;
+	int r, i;
+	struct dm_dev *dev;
+
 	if ((argc < 4) || (argc > 5)) {
 		DMERR("Too %s arguments to clustered-disk mirror log type.",
 		      (argc < 3) ? "few" : "many");
@@ -205,7 +212,15 @@
 		return -EINVAL;
 	}
 
-	return cluster_ctr(log, ti, argc, argv, 1);
+	r = dm_get_device(ti, argv[0], 0, 0, FMODE_READ | FMODE_WRITE, &dev);
+	if (r)
+		return r;
+
+	r = cluster_ctr(log, ti, argc, argv, dev);
+	if (r)
+		dm_put_device(ti, dev);
+
+	return r;
 }
 
 /*
@@ -222,6 +237,8 @@
 				   NULL, NULL);
 
 	/* FIXME: What do we do on failure? */
+	if (lc->disk_log)
+		dm_put_device(lc->ti, lc->disk_log);
 	kfree(lc->ctr_str);
 	kfree(lc);
 
@@ -269,6 +286,7 @@
 	int r;
 	struct log_c *lc = (struct log_c *)log->context;
 
+	lc->in_sync_hint = 0;
 	r = dm_clog_consult_server(lc->uuid, DM_CLOG_RESUME,
 				   NULL, 0,
 				   NULL, NULL);
@@ -335,6 +353,19 @@
 	int rdata_size;
 	struct log_c *lc = (struct log_c *)log->context;
 
+	/*
+	 * We can never respond directly - even if in_sync_hint is
+	 * set.  This is because another machine could see a device
+	 * failure and mark the region out-of-sync.  If we don't go
+	 * to userspace to ask, we might think the region is in-sync
+	 * and allow a read to pick up data that is stale.  (This is
+	 * very unlikely if a device actually fails; but it is very
+	 * likely if a connection to one device from one machine fails.)
+	 *
+	 * There still might be a problem if the mirror caches the region
+	 * state as in-sync... but then this call would not be made.  So,
+	 * that is a mirror problem.
+	 */
 	if (!can_block)
 		return -EWOULDBLOCK;
 
@@ -559,7 +590,19 @@
 			       NULL, 0,
 			       (char *)&sync_count, &rdata_size);
 
-	return (r) ? 0 : sync_count;
+	if (r)
+		return 0;
+
+	if (sync_count == lc->region_count)
+		lc->in_sync_hint = 1;
+	/*
+	 * get_sync_count is never called after the
+	 * initial sync=1
+	else
+		lc->in_sync_hint = 0;
+	*/
+
+	return sync_count;
 }
 
 /*
@@ -610,6 +653,16 @@
 	int rdata_size;
 	struct log_c *lc = (struct log_c *)log->context;
 
+	/*
+	 * Once the mirror has been reported to be in-sync,
+	 * it will never again ask for recovery work.  So,
+	 * we can safely say there is not a remote machine
+	 * recovering if the device is in-sync.  (in_sync_hint
+	 * must be reset at resume time.)
+	 */
+	if (lc->in_sync_hint)
+		return 0;
+
 	rdata_size = sizeof(is_recovering);
 	r = cluster_do_request(lc, lc->uuid, DM_CLOG_IS_REMOTE_RECOVERING,
 			       (char *)&region, sizeof(region),