From mboxrd@z Thu Jan 1 00:00:00 1970 From: Fabio M. Di Nitto Date: Mon, 2 Jul 2012 15:55:04 +0200 Subject: [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away Message-ID: <1341237304-4691-1-git-send-email-fdinitto@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit From: Lon Hohberger Qdiskd hsitorically has required significant tuning to work around delays which occur during multipath failover, overloaded I/O, and LUN trespasses in both device-mapper-multipath and EMC PowerPath environments. This patch goes a very long way towards eliminating false evictions when these conditions occur by making qdiskd whine to the other cluster members when it detects hung system calls. When a cluster member whines, it indicates the source of the problem (which system call is hung), and the act of receiving a whine from a host indicates that qdiskd is operational, but that I/O is hung. Hung I/O is different from losing storage entirely (where you get I/O errors). Possible problems: - Receive queue getting very full, causing messages to become blocked on a node where I/O is hung. 1) that would take a very long time, and 2) node should get evicted at that point anyway. Resolves: rhbz#782900 this version of the patch is a backport of: e2937eb33f224f86904fead08499a6178868ca6a 34d2872fb7e60be1594158acaaeb8acd74f78d22 There is a minor change vs original patch based on how qdiskd in RHEL5 handles cman connection. We add an extra call to cman_alive in main qdisk_loop to make sure data are not stalled on the cman port, and data_callback to qdiskd_whine executed. Signed-off-by: Lon Hohberger Signed-off-by: Fabio M. Di Nitto --- cman/daemon/cnxman-socket.h | 1 + cman/qdisk/Makefile | 2 +- cman/qdisk/disk.h | 6 ++++ cman/qdisk/iostate.c | 17 +++++++++++-- cman/qdisk/iostate.h | 4 ++- cman/qdisk/main.c | 54 +++++++++++++++++++++++++++++++++++++++---- 6 files changed, 74 insertions(+), 10 deletions(-) diff --git a/cman/daemon/cnxman-socket.h b/cman/daemon/cnxman-socket.h index 351c97c..1d01b44 100644 --- a/cman/daemon/cnxman-socket.h +++ b/cman/daemon/cnxman-socket.h @@ -79,6 +79,7 @@ #define CLUSTER_PORT_SERVICES 2 #define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */ #define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */ +#define CLUSTER_PORT_QDISKD 178 /* Quorum disk daemon */ /* Port numbers above this will be blocked when the cluster is inquorate or in * transition */ diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile index f58806b..9bfc486 100644 --- a/cman/qdisk/Makefile +++ b/cman/qdisk/Makefile @@ -32,7 +32,7 @@ qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \ gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs -lrt mkqdisk: disk.o crc32.o disk_util.o iostate.o \ - proc.o mkqdisk.o scandisk.o clulog.o gettid.o + proc.o mkqdisk.o scandisk.o clulog.o gettid.o ../lib/libcman.a gcc -o $@ $^ -lrt %.o: %.c diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h index b784220..d491de1 100644 --- a/cman/qdisk/disk.h +++ b/cman/qdisk/disk.h @@ -290,6 +290,12 @@ typedef struct { status_block_t ni_status; } node_info_t; +typedef struct { + qd_ctx *ctx; + node_info_t *ni; + size_t ni_len; +} qd_priv_t; + int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state, disk_msg_t *msg, memb_mask_t mask, memb_mask_t master); int qd_read_print_status(target_info_t *disk, int nid); diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c index 65b4d50..eb74ad2 100644 --- a/cman/qdisk/iostate.c +++ b/cman/qdisk/iostate.c @@ -1,10 +1,14 @@ #include +#include #include #include #include #include #include +#include +#include "platform.h" #include "iostate.h" +#include "../daemon/cnxman-socket.h" static iostate_t main_state = 0; static int main_incarnation = 0; @@ -26,7 +30,7 @@ static struct state_table io_state_table[] = { { STATE_LSEEK, "seek" }, { -1, NULL } }; -static const char * +const char * state_to_string(iostate_t state) { static const char *ret = "unknown"; @@ -65,6 +69,8 @@ io_nanny_thread(void *arg) iostate_t last_main_state = 0, current_main_state = 0; int last_main_incarnation = 0, current_main_incarnation = 0; int logged_incarnation = 0; + cman_handle_t ch = (cman_handle_t)arg; + int32_t whine_state; /* Start with wherever we're at now */ pthread_mutex_lock(&state_mutex); @@ -96,6 +102,11 @@ io_nanny_thread(void *arg) continue; } + /* Whine on CMAN api */ + whine_state = (int32_t)current_main_state; + swab32(whine_state); + cman_send_data(ch, &whine_state, sizeof(int32_t), 0, CLUSTER_PORT_QDISKD, 0); + /* Don't log things twice */ if (logged_incarnation == current_main_incarnation) continue; @@ -114,7 +125,7 @@ io_nanny_thread(void *arg) int -io_nanny_start(int timeout) +io_nanny_start(cman_handle_t ch, int timeout) { int ret; @@ -124,7 +135,7 @@ io_nanny_start(int timeout) qdisk_timeout = timeout; thread_active = 1; - ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL); + ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch); pthread_mutex_unlock(&state_mutex); return ret; diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h index 7dd7bf6..a65b1d4 100644 --- a/cman/qdisk/iostate.h +++ b/cman/qdisk/iostate.h @@ -11,7 +11,9 @@ typedef enum { void io_state(iostate_t state); -int io_nanny_start(int timeout); +int io_nanny_start(cman_handle_t ch, int timeout); int io_nanny_stop(void); +const char * state_to_string(iostate_t state); + #endif diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c index 0d7bb3d..90d00ab 100644 --- a/cman/qdisk/main.c +++ b/cman/qdisk/main.c @@ -48,6 +48,7 @@ (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2)) #include #endif +#include "../daemon/cnxman-socket.h" int daemon_init(char *); int check_process_running(char *, pid_t *); @@ -892,6 +893,11 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) _running = 1; while (_running) { + /* perform a forceful cman dispatch */ + if (cman_alive(ctx->qc_ch) < 0) { + clulog(LOG_ERR, "cman: %s\n", strerror(errno)); + } + /* XXX this was getuptime() in clumanager */ get_time(&oldtime, (ctx->qc_flags&RF_UPTIME)); @@ -1514,6 +1520,31 @@ check_stop_cman(qd_ctx *ctx) } } +static void +qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len, + uint8_t port, int nodeid) +{ + int32_t dstate; + qd_priv_t *qp = (qd_priv_t *)privdata; + node_info_t *ni = qp->ni; + + if (len != sizeof(dstate)) { + return; + } + + dstate = *((int32_t*)buf); + + if (nodeid == (qp->ctx->qc_my_id)) + return; + + swab32(dstate); + + if (dstate) { + clulog(LOG_CRIT, "qdiskd on node %d reports hung %s()\n", + state_to_string(dstate)); + ni[nodeid-1].ni_misses = 0; + } +} int main(int argc, char **argv) @@ -1528,6 +1559,7 @@ main(int argc, char **argv) char device[128]; pid_t pid; quorum_header_t qh; + qd_priv_t qp; if (check_process_running(argv[0], &pid) && pid !=getpid()) { printf("QDisk services already running\n"); @@ -1559,10 +1591,16 @@ main(int argc, char **argv) } } + /* For cman notifications we need two sockets - one for events, + one for config change callbacks */ + qp.ctx = &ctx; + qp.ni = &ni[0]; + qp.ni_len = MAX_NODES_DISK; + #if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2) - ch = cman_admin_init(NULL); + ch = cman_admin_init(&qp); #else - ch = cman_init(NULL); + ch = cman_init(&qp); #endif if (!ch) { if (!foreground && !forked) { @@ -1577,13 +1615,19 @@ main(int argc, char **argv) do { sleep(5); #if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2) - ch = cman_admin_init(NULL); + ch = cman_admin_init(&qp); #else - ch = cman_init(NULL); + ch = cman_init(&qp); #endif } while (!ch); } + if (cman_start_recv_data(ch, qdisk_whine, CLUSTER_PORT_QDISKD) != 0) { + clulog_and_print(LOG_CRIT, "Could not register with CMAN: %s\n", + strerror(errno)); + goto out; + } + memset(&me, 0, sizeof(me)); while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) { if (!foreground && !forked) { @@ -1696,7 +1740,7 @@ main(int argc, char **argv) } } - io_nanny_start(ctx.qc_tko * ctx.qc_interval); + io_nanny_start(ch, ctx.qc_tko * ctx.qc_interval); if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) cman_unregister_quorum_device(ctx.qc_ch); -- 1.7.7.6