From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 20 Feb 2007 19:35:13 -0000 Subject: [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ... Message-ID: <20070220193513.11090.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: jbrassow at sourceware.org 2007-02-20 19:35:10 Modified files: cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c dm-cmirror-cman.h dm-cmirror-server.c Log message: Bug 217895: lost election results from cmirror server cause mirror ... There was a race happening as a result of simultaneous cman-issued 'starts'. The client receives the start requests, but the server processes them. So, it was possible for the server to reset the event id/type while the client was trying to set them. This would cause the next kcl_start_done command issued by the server to fail. The bug can be interpreted in many different ways depending on which machine in the cluster you are looking at when it happens. The fix was to have the client wait to set the event id/type until it knows the server has completed the previous request. This fix may resolve other bugs as well, but I will test them individually. 
Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.37&r2=1.1.2.38 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.5&r2=1.1.2.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.22&r2=1.1.2.23 --- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/02/19 16:29:42 1.1.2.37 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/02/20 19:35:10 1.1.2.38 @@ -45,6 +45,7 @@ static int shutting_down=0; static atomic_t suspend_client; static wait_queue_head_t suspend_client_queue; +static wait_queue_head_t event_queue; static DECLARE_MUTEX(consult_server_lock); @@ -1228,8 +1229,11 @@ kcl_get_node_by_nodeid(0, &node); my_id = node.node_id; + /* Wait for any outstanding starts to complete */ + suspend_on(&event_queue, atomic_read(&restart_event_type)); + restart_event_id = event_id; - restart_event_type = type; + atomic_set(&restart_event_type, type); switch(type){ case SERVICE_NODE_LEAVE: @@ -1391,6 +1395,7 @@ } init_waitqueue_head(&suspend_client_queue); + init_waitqueue_head(&event_queue); r = dm_register_dirty_log_type(&_clustered_core_type); if (r) { --- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c 2006/06/15 19:48:00 1.1.2.5 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c 2007/02/20 19:35:10 1.1.2.6 @@ -27,7 +27,7 @@ int global_count=0; uint32_t *global_nodeids=NULL; -int restart_event_type=0; +atomic_t restart_event_type = ATOMIC_INIT(0); int restart_event_id=0; uint32_t nodeid_to_ipaddr(uint32_t nodeid){ --- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.h 2005/07/27 16:09:31 1.1.2.1 +++ 
cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.h 2007/02/20 19:35:10 1.1.2.2 @@ -12,7 +12,7 @@ extern int global_count; extern uint32_t *global_nodeids; -extern int restart_event_type; +extern atomic_t restart_event_type; extern int restart_event_id; uint32_t nodeid_to_ipaddr(uint32_t nodeid); --- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/02/19 16:29:42 1.1.2.22 +++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/02/20 19:35:10 1.1.2.23 @@ -1067,7 +1067,7 @@ } suspend_on(&_suspend_queue, atomic_read(&_suspend)); - switch(restart_event_type){ + switch(atomic_read(&restart_event_type)){ case SERVICE_NODE_LEAVE: /* ATTENTION -- may wish to check if regions ** ** are still in use by this node. For now, ** @@ -1076,7 +1076,7 @@ ** leaving node, it won't hurt anything - and** ** if there is, they will be recovered. */ case SERVICE_NODE_FAILED: - if (restart_event_type == SERVICE_NODE_FAILED) + if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED) DMINFO("A cluster mirror log member has failed."); list_for_each_entry(lc, &log_list_head, log_list){ @@ -1095,10 +1095,13 @@ } - if(restart_event_type){ + if(atomic_read(&restart_event_type)){ /* finish the start phase */ kcl_start_done(local_id, restart_event_id); - restart_event_id = restart_event_type = 0; + restart_event_id = 0; + + /* Trigger any waiting starts to proceed */ + atomic_set(&restart_event_type, 0); } else if (atomic_read(&_do_requests)) { /* ATTENTION -- what to do with error ? */ if(process_log_request(sock))