From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 14 Jan 2008 15:57:47 -0000 Subject: [Cluster-devel] cluster/dlm-kernel/src lockqueue.c Message-ID: <20080114155747.4852.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL4 Changes by: teigland at sourceware.org 2008-01-14 15:57:46 Modified files: dlm-kernel/src : lockqueue.c Log message: bz 351321 add_to_requestqueue() can add a new message to the requestqueue just after process_requestqueue() checks it and determines it's empty. This means dlm_recvd will spin forever in wait_requestqueue() waiting for the message to be removed. The same problem was found and fixed in the RHEL5 code (and then subsequently changed again). This patch is the RHEL4 equivalent of the original RHEL5 fix. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/dlm-kernel/src/lockqueue.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.37.2.11&r2=1.37.2.12 --- cluster/dlm-kernel/src/Attic/lockqueue.c 2008/01/04 16:12:05 1.37.2.11 +++ cluster/dlm-kernel/src/Attic/lockqueue.c 2008/01/14 15:57:46 1.37.2.12 @@ -112,22 +112,23 @@ * request queue and processed when recovery is complete. */ -void add_to_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) +int add_to_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) { struct rq_entry *entry; int length = hd->rh_length; + int rv; if (test_bit(LSFL_REQUEST_WARN, &ls->ls_flags)) log_error(ls, "request during recovery from %u", nodeid); if (in_nodes_gone(ls, nodeid)) - return; + return 0; entry = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); if (!entry) { // TODO something better printk("dlm: add_to_requestqueue: out of memory\n"); - return; + return 0; } log_debug(ls, "add_to_requestq cmd %d fr %d", hd->rh_cmd, nodeid); @@ -135,8 +136,22 @@ memcpy(entry->rqe_request, hd, length); down(&ls->ls_requestqueue_lock); - list_add_tail(&entry->rqe_list, &ls->ls_requestqueue); + + /* We need to check LS_RUN after taking the mutex to + avoid a race where dlm_recoverd enables locking and runs + process_requestqueue between our earlier LS_RUN check + and this addition to the requestqueue. (From RHEL5 code). */ + + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) { + list_add_tail(&entry->rqe_list, &ls->ls_requestqueue); + rv = 0; + } else { + log_debug(ls, "add_to_requestq skip fr %d", nodeid); + kfree(entry); + rv = -EAGAIN; + } up(&ls->ls_requestqueue_lock); + return rv; } int process_requestqueue(struct dlm_ls *ls) @@ -819,6 +834,7 @@ struct dlm_request *freq = (struct dlm_request *) req; struct dlm_reply *rp = (struct dlm_reply *) req; struct dlm_reply reply; + int error; lspace = find_lockspace_by_global_id(req->rh_lockspace); @@ -840,8 +856,11 @@ */ retry: if (!test_bit(LSFL_LS_RUN, &lspace->ls_flags)) { - if (!recovery) - add_to_requestqueue(lspace, nodeid, req); + if (!recovery) { + error = add_to_requestqueue(lspace, nodeid, req); + if (error == -EAGAIN) + goto retry; + } status = -EINTR; goto out; }