From mboxrd@z Thu Jan 1 00:00:00 1970 From: jbrassow@sourceware.org Date: 4 Feb 2008 18:27:22 -0000 Subject: [Cluster-devel] cluster/cmirror/src cluster.c functions.c loca ... Message-ID: <20080204182722.19913.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: jbrassow at sourceware.org 2008-02-04 18:27:20 Modified files: cmirror/src : cluster.c functions.c local.c logging.h Log message: - change priority of some log statements - fix potential OOB memory op by macro - add reference counting to log... fixes some issues with mirror conversion - plug a memory leak... likely related to bug 383291 Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror/src/cluster.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.11&r2=1.1.2.12 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror/src/functions.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.10&r2=1.1.2.11 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror/src/local.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.10&r2=1.1.2.11 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror/src/logging.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.4&r2=1.1.2.5 --- cluster/cmirror/src/Attic/cluster.c 2008/01/23 21:21:06 1.1.2.11 +++ cluster/cmirror/src/Attic/cluster.c 2008/02/04 18:27:20 1.1.2.12 @@ -335,7 +335,7 @@ } if (rv == SA_AIS_ERR_EXIST) { - LOG_ERROR("export_checkpoint: checkpoint already exists"); + LOG_DBG("export_checkpoint: checkpoint already exists"); EXIT(); return -EEXIST; } @@ -361,7 +361,7 @@ } if (rv == SA_AIS_ERR_EXIST) { - LOG_ERROR("export_checkpoint: sync checkpoint section already exists"); + LOG_DBG("export_checkpoint: sync checkpoint section already exists"); EXIT(); return -EEXIST; } @@ -588,8 +588,8 @@ } for (cp = entry->checkpoint_list; cp;) { - LOG_ERROR("Checkpoint data available for node %u", - cp->requester); + LOG_DBG("[%s] Checkpoint data available for node %u", + SHORT_UUID(entry->name.value), cp->requester); /* * FIXME: Check return code. Could send failure @@ -747,24 +747,25 @@ ENTER(); - LOG_PRINT("****** CPG config callback ****************"); + LOG_DBG("****** CPG config callback **[%s]**", + SHORT_UUID(gname->value)); - LOG_PRINT("* JOINING (%d):", joined_list_entries); + LOG_DBG("* JOINING (%d):", joined_list_entries); for (i = 0; i < joined_list_entries; i++) - LOG_PRINT("* nodeid: %d, pid: %d", - joined_list[i].nodeid, joined_list[i].pid); + LOG_DBG("* nodeid: %d, pid: %d", + joined_list[i].nodeid, joined_list[i].pid); - LOG_PRINT("* MEMBERS (%d):", member_list_entries); + LOG_DBG("* MEMBERS (%d):", member_list_entries); for (i = 0; i < member_list_entries; i++) - LOG_PRINT("* nodeid: %d, pid: %d", - member_list[i].nodeid, member_list[i].pid); + LOG_DBG("* nodeid: %d, pid: %d", + member_list[i].nodeid, member_list[i].pid); - LOG_PRINT("* LEAVING (%d):", left_list_entries); + LOG_DBG("* LEAVING (%d):", left_list_entries); for (i = 0; i < left_list_entries; i++) - LOG_PRINT("* nodeid: %d, pid: %d", - left_list[i].nodeid, left_list[i].pid); + LOG_DBG("* nodeid: %d, pid: %d", + left_list[i].nodeid, left_list[i].pid); - LOG_PRINT("*****************************************"); + LOG_DBG("*****************************************"); list_for_each_entry_safe(match, tmp, &clog_cpg_list, list) { LOG_DBG("Given handle: %llu", (unsigned long long)handle); --- cluster/cmirror/src/Attic/functions.c 2008/01/23 21:21:06 1.1.2.10 +++ cluster/cmirror/src/Attic/functions.c 2008/02/04 18:27:20 1.1.2.11 @@ -43,7 +43,9 @@ struct log_c { struct list_head list; + char uuid[DM_UUID_LEN]; + uint32_t ref_count; int touched; uint32_t region_size; @@ -350,6 +352,7 @@ uint64_t region_count; uint32_t bitset_size; struct log_c *lc = NULL; + struct log_c *dup; enum sync sync = DEFAULTSYNC; int disk_log = 0; @@ -422,9 +425,19 @@ lc->sync = sync; lc->sync_search = 0; lc->recovering_region = (uint64_t)-1; - strncpy(lc->uuid, argv[1 + disk_log], DM_UUID_LEN); lc->disk_fd = -1; lc->log_dev_failed = 0; + lc->ref_count = 1; + strncpy(lc->uuid, argv[1 + disk_log], DM_UUID_LEN); + + if ((dup = get_log(lc->uuid)) || + (dup = get_pending_log(lc->uuid))) { + LOG_PRINT("[%s] Inc reference count on cluster log", + SHORT_UUID(lc->uuid)); + free(lc); + dup->ref_count++; + return 0; + } INIT_LIST_HEAD(&lc->mark_list); @@ -561,7 +574,8 @@ if (r) LOG_ERROR("Failed to create cluster log (%s)", tfr->uuid); else - LOG_PRINT("Cluster log created (%s)", tfr->uuid); + LOG_PRINT("[%s] Cluster log created", + SHORT_UUID(tfr->uuid)); return r; } @@ -575,23 +589,31 @@ { struct log_c *lc = get_log(tfr->uuid); - if (!lc) { - /* Is the log in the pending list? */ - lc = get_pending_log(tfr->uuid); - if (!lc) { - LOG_ERROR("clog_dtr called on log that is not official or pending"); - return -EINVAL; - } - } else { - LOG_DBG("[%s] clog_dtr: leaving CPG", SHORT_UUID(lc->uuid)); + if (lc) { /* - * If postsuspend had done the destroy_cluster_cpg, - * the log context would be in the pending list + * The log should not be on the official list. There + * should have been a suspend first. */ - destroy_cluster_cpg(tfr->uuid); + lc->ref_count--; + if (!lc->ref_count) { + LOG_ERROR("[%s] DTR before SUS: leaving CPG", + SHORT_UUID(tfr->uuid)); + destroy_cluster_cpg(tfr->uuid); + } + } else if ((lc = get_pending_log(tfr->uuid))) { + lc->ref_count--; + } else { + LOG_ERROR("clog_dtr called on log that is not official or pending"); + return -EINVAL; + } + + if (lc->ref_count) { + LOG_PRINT("[%s] Dec reference count on cluster log", + SHORT_UUID(lc->uuid)); + return 0; } - LOG_PRINT("Cluster log removed (%s)", lc->uuid); + LOG_PRINT("[%s] Cluster log removed", SHORT_UUID(lc->uuid)); list_del_init(&lc->list); if (lc->disk_fd != -1) @@ -638,7 +660,7 @@ if (!lc) return -EINVAL; - LOG_DBG("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(lc->uuid)); + LOG_PRINT("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(lc->uuid)); destroy_cluster_cpg(tfr->uuid); return 0; @@ -656,7 +678,7 @@ if (!lc) return -EINVAL; - LOG_DBG("[%s] clog_postsuspend: finalizing", SHORT_UUID(lc->uuid)); + LOG_PRINT("[%s] clog_postsuspend: finalizing", SHORT_UUID(lc->uuid)); lc->resume_override = 0; /* move log to pending list */ @@ -686,10 +708,12 @@ switch (lc->resume_override) { case 1000: - LOG_ERROR("ERROR:: Additional resume issued before suspend"); + LOG_ERROR("[%s] ERROR:: Additional resume issued before suspend", + SHORT_UUID(tfr->uuid)); return 0; case 0: - LOG_PRINT("Master resume: reading disk log"); + LOG_PRINT("[%s] Master resume: reading disk log", + SHORT_UUID(lc->uuid)); lc->resume_override = 1000; break; case 1: @@ -699,8 +723,8 @@ LOG_ERROR("Error:: partial bit loading (just clean_bits)"); return -EINVAL; case 3: - LOG_DBG("[%s] Non-master resume: bits pre-loaded", - SHORT_UUID(lc->uuid)); + LOG_PRINT("[%s] Non-master resume: bits pre-loaded", + SHORT_UUID(lc->uuid)); lc->resume_override = 1000; lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); LOG_DBG("[%s] Initial sync_count = %llu", @@ -1232,6 +1256,9 @@ struct log_c *lc = get_log(tfr->uuid); if (!lc) + lc = get_pending_log(tfr->uuid); + + if (!lc) return -EINVAL; if (lc->disk_fd == -1) @@ -1287,6 +1314,9 @@ struct log_c *lc = get_log(tfr->uuid); if (!lc) + lc = get_pending_log(tfr->uuid); + + if (!lc) return -EINVAL; if (lc->disk_fd == -1) --- cluster/cmirror/src/Attic/local.c 2008/01/25 16:24:47 1.1.2.10 +++ cluster/cmirror/src/Attic/local.c 2008/02/04 18:27:20 1.1.2.11 @@ -16,8 +16,6 @@ #include "local.h" static int cn_fd; /* Connector (netlink) socket fd */ -static int request_array[20]; /* for request counting (debugging) */ -static int cluster_array[20]; /* for request counting (debugging) */ static int kernel_recv_helper(void *data, int in_size) { @@ -145,7 +143,6 @@ */ static int do_local_work(void *data) { - static int request_count = 0; int r, i; struct clog_tfr *tfr = NULL; @@ -154,9 +151,6 @@ if (r) return r; - request_array[tfr->request_type]++; - request_count++; - LOG_DBG("Request from kernel recieved [%s/%s/%llu]", RQ_TYPE(tfr->request_type), SHORT_UUID(tfr->uuid), (unsigned long long)tfr->seq); @@ -208,13 +202,22 @@ } /* ELSE, fall through to default */ default: - cluster_array[tfr->request_type]++; - /* Add before send_to_cluster, so cluster code can find it */ - queue_add_tail(tfr, cluster_queue); r = cluster_send(tfr); - if (r) - LOG_ERROR("Unable to send request to cluster: %s", - strerror(-r)); + if (r) { + LOG_ERROR("[%s] Unable to send %s to cluster: %s", + SHORT_UUID(tfr->uuid), + RQ_TYPE(tfr->request_type), strerror(-r)); + tfr->error = r; + kernel_send(tfr); + } else { + /* + * If this was multi-threaded, we would have to + * add the 'tfr' to the queue before doing + * the cluster_send + */ + queue_add_tail(tfr, cluster_queue); + } + break; } @@ -223,14 +226,6 @@ tfr->error = r; } - if (!(request_count % 10000)) { - LOG_PRINT("Total requests (%d):", request_count); - for (i = 0; i < 20; i++) - LOG_PRINT(" %s: %d", RQ_TYPE(i), request_array[i]); - LOG_PRINT("Cluster-bound requests:"); - for (i = 0; i < 20; i++) - LOG_PRINT(" %s: %d", RQ_TYPE(i), cluster_array[i]); - } EXIT(); return r; } @@ -303,9 +298,6 @@ ENTER(); - memset(request_array, 0, sizeof(int)*20); - memset(cluster_array, 0, sizeof(int)*20); - cn_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); if (cn_fd < 0) { EXIT(); --- cluster/cmirror/src/Attic/logging.h 2008/01/18 17:11:07 1.1.2.4 +++ cluster/cmirror/src/Attic/logging.h 2008/02/04 18:27:20 1.1.2.5 @@ -29,7 +29,7 @@ #endif /* SHORT_UUID - print last 8 chars of a string */ -#define SHORT_UUID(x) ((x) + (strlen(x) - 8)) +#define SHORT_UUID(x) (strlen(x) > 8) ? ((x) + (strlen(x) - 8)) : (x) extern int log_tabbing; extern int log_is_open;