* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away [RHEL6, take2]
@ 2012-02-17 21:39 Lon Hohberger
2012-02-18 3:57 ` Fabio M. Di Nitto
0 siblings, 1 reply; 2+ messages in thread
From: Lon Hohberger @ 2012-02-17 21:39 UTC (permalink / raw)
To: cluster-devel.redhat.com
Qdiskd historically has required significant tuning to work around
delays which occur during multipath failover, overloaded I/O, and LUN
trespasses in both device-mapper-multipath and EMC PowerPath
environments.
This patch goes a very long way towards eliminating false evictions
when these conditions occur by making qdiskd whine to the other
cluster members when it detects hung system calls. When a cluster
member whines, it indicates the source of the problem (which system
call is hung), and the act of receiving a whine from a host indicates
that qdiskd is operational, but that I/O is hung. Hung I/O is different
from losing storage entirely (where you get I/O errors).
Possible problems:
- Receive queue getting very full, causing messages to become blocked on
a node where I/O is hung. 1) that would take a very long time, and 2)
node should get evicted at that point anyway.
Resolves: rhbz#678372
Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
cman/daemon/cnxman-socket.h | 1 +
cman/qdisk/Makefile | 2 +-
cman/qdisk/disk.h | 6 +++++
cman/qdisk/iostate.c | 16 ++++++++++++--
cman/qdisk/iostate.h | 4 ++-
cman/qdisk/main.c | 46 ++++++++++++++++++++++++++++++++++++++++--
6 files changed, 67 insertions(+), 8 deletions(-)
diff --git a/cman/daemon/cnxman-socket.h b/cman/daemon/cnxman-socket.h
index d243b40..e6bb5dd 100644
--- a/cman/daemon/cnxman-socket.h
+++ b/cman/daemon/cnxman-socket.h
@@ -68,6 +68,7 @@
#define CLUSTER_PORT_SERVICES 2
#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
+#define CLUSTER_PORT_QDISKD 178 /* Quorum disk daemon */
/* Port numbers above this will be blocked when the cluster is inquorate or in
* transition */
diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index 68e20cd..e3bb5f7 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -40,7 +40,7 @@ ${TARGET1}: ${SHAREDOBJS} ${OBJS1}
$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
${TARGET2}: ${SHAREDOBJS} ${OBJS2}
- $(CC) -o $@ $^ $(LDFLAGS)
+ $(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
depends:
$(MAKE) -C ../lib all
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index 93d15fe..fd80fa6 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -273,6 +273,12 @@ typedef struct {
status_block_t ni_status;
} node_info_t;
+typedef struct {
+ qd_ctx *ctx;
+ node_info_t *ni;
+ size_t ni_len;
+} qd_priv_t;
+
int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
int qd_init(qd_ctx *ctx, cman_handle_t ch_admin,
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
index 0199da4..06c6831 100644
--- a/cman/qdisk/iostate.c
+++ b/cman/qdisk/iostate.c
@@ -1,9 +1,12 @@
#include <pthread.h>
+#include <libcman.h>
#include <iostate.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include <liblogthread.h>
+#include <stdint.h>
+#include "platform.h"
#include "iostate.h"
static iostate_t main_state = 0;
@@ -26,7 +29,7 @@ static struct state_table io_state_table[] = {
{ STATE_LSEEK, "seek" },
{ -1, NULL } };
-static const char *
+const char *
state_to_string(iostate_t state)
{
static const char *ret = "unknown";
@@ -65,6 +68,8 @@ io_nanny_thread(void *arg)
iostate_t last_main_state = 0, current_main_state = 0;
int last_main_incarnation = 0, current_main_incarnation = 0;
int logged_incarnation = 0;
+ cman_handle_t ch = (cman_handle_t)arg;
+ int32_t whine_state;
/* Start with wherever we're at now */
pthread_mutex_lock(&state_mutex);
@@ -96,6 +101,11 @@ io_nanny_thread(void *arg)
continue;
}
+ /* Whine on CMAN api */
+ whine_state = (int32_t)current_main_state;
+ swab32(whine_state);
+ cman_send_data(ch, &whine_state, sizeof(int32_t), 0, 178, 0);
+
/* Don't log things twice */
if (logged_incarnation == current_main_incarnation)
continue;
@@ -114,7 +124,7 @@ io_nanny_thread(void *arg)
int
-io_nanny_start(int timeout)
+io_nanny_start(cman_handle_t ch, int timeout)
{
int ret;
@@ -124,7 +134,7 @@ io_nanny_start(int timeout)
qdisk_timeout = timeout;
thread_active = 1;
- ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+ ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
pthread_mutex_unlock(&state_mutex);
return ret;
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
index 7dd7bf6..a65b1d4 100644
--- a/cman/qdisk/iostate.h
+++ b/cman/qdisk/iostate.h
@@ -11,7 +11,9 @@ typedef enum {
void io_state(iostate_t state);
-int io_nanny_start(int timeout);
+int io_nanny_start(cman_handle_t ch, int timeout);
int io_nanny_stop(void);
+const char * state_to_string(iostate_t state);
+
#endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index a90e82c..fc714f6 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -25,6 +25,7 @@
#include <liblogthread.h>
#include "score.h"
#include "../daemon/cman.h"
+#include "../daemon/cnxman-socket.h"
#include <sys/syslog.h>
#define LOG_DAEMON_NAME "qdiskd"
@@ -915,7 +916,8 @@ cman_wait(cman_handle_t ch, struct timeval *_tv)
static void
process_cman_event(cman_handle_t handle, void *private, int reason, int arg)
{
- qd_ctx *ctx = (qd_ctx *)private;
+ qd_priv_t *qp = (qd_priv_t *)private;
+ qd_ctx *ctx = qp->ctx;
switch(reason) {
case CMAN_REASON_PORTOPENED:
@@ -1926,6 +1928,33 @@ check_stop_cman(qd_ctx *ctx)
do { static int _logged=0; if (!_logged) { _logged=1; logt_print(level, fmt, ##args); } } while(0)
+static void
+qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
+ uint8_t port, int nodeid)
+{
+ int32_t dstate;
+ qd_priv_t *qp = (qd_priv_t *)privdata;
+ node_info_t *ni = qp->ni;
+
+ if (len != sizeof(dstate)) {
+ return;
+ }
+
+ dstate = *((int32_t*)buf);
+
+ if (nodeid == (qp->ctx->qc_my_id))
+ return;
+
+ swab32(dstate);
+
+ if (dstate) {
+ logt_print(LOG_NOTICE, "qdiskd on node %d reports hung %s()\n", nodeid,
+ state_to_string(dstate));
+ ni[nodeid-1].ni_misses = 0;
+ }
+}
+
+
int
main(int argc, char **argv)
{
@@ -1939,6 +1968,7 @@ main(int argc, char **argv)
char device[128];
pid_t pid;
quorum_header_t qh;
+ qd_priv_t qp;
if (check_process_running(argv[0], &pid) && pid !=getpid()) {
printf("QDisk services already running\n");
@@ -2001,13 +2031,23 @@ main(int argc, char **argv)
/* For cman notifications we need two sockets - one for events,
one for config change callbacks */
- ch_user = cman_init(&ctx);
+ qp.ctx = &ctx;
+ qp.ni = &ni[0];
+ qp.ni_len = MAX_NODES_DISK;
+
+ ch_user = cman_init(&qp);
if (cman_start_notification(ch_user, process_cman_event) != 0) {
logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
strerror(errno));
goto out;
}
+ if (cman_start_recv_data(ch_user, qdisk_whine, CLUSTER_PORT_QDISKD) != 0) {
+ logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
+ strerror(errno));
+ goto out;
+ }
+
memset(&me, 0, sizeof(me));
if (cman_get_node(ch_admin, CMAN_NODEID_US, &me) < 0) {
logt_print(LOG_CRIT, "Could not determine local node ID: %s\n",
@@ -2108,7 +2148,7 @@ main(int argc, char **argv)
}
}
- io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+ io_nanny_start(ch_user, ctx.qc_tko * ctx.qc_interval);
if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) {
/* Only clean up if we're exiting w/o error) */
--
1.7.7.6
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away [RHEL6, take2]
2012-02-17 21:39 [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away [RHEL6, take2] Lon Hohberger
@ 2012-02-18 3:57 ` Fabio M. Di Nitto
0 siblings, 0 replies; 2+ messages in thread
From: Fabio M. Di Nitto @ 2012-02-18 3:57 UTC (permalink / raw)
To: cluster-devel.redhat.com
Signed-Off.
I also sent a very small cleanup after ionanny/178 port to STABLE32.
Thanks
Fabio
On 02/17/2012 10:39 PM, Lon Hohberger wrote:
> Qdiskd historically has required significant tuning to work around
> delays which occur during multipath failover, overloaded I/O, and LUN
> trespasses in both device-mapper-multipath and EMC PowerPath
> environments.
>
> This patch goes a very long way towards eliminating false evictions
> when these conditions occur by making qdiskd whine to the other
> cluster members when it detects hung system calls. When a cluster
> member whines, it indicates the source of the problem (which system
> call is hung), and the act of receiving a whine from a host indicates
> that qdiskd is operational, but that I/O is hung. Hung I/O is different
> from losing storage entirely (where you get I/O errors).
>
> Possible problems:
>
> - Receive queue getting very full, causing messages to become blocked on
> a node where I/O is hung. 1) that would take a very long time, and 2)
> node should get evicted at that point anyway.
>
> Resolves: rhbz#678372
>
> Signed-off-by: Lon Hohberger <lhh@redhat.com>
> ---
> cman/daemon/cnxman-socket.h | 1 +
> cman/qdisk/Makefile | 2 +-
> cman/qdisk/disk.h | 6 +++++
> cman/qdisk/iostate.c | 16 ++++++++++++--
> cman/qdisk/iostate.h | 4 ++-
> cman/qdisk/main.c | 46 ++++++++++++++++++++++++++++++++++++++++--
> 6 files changed, 67 insertions(+), 8 deletions(-)
>
> diff --git a/cman/daemon/cnxman-socket.h b/cman/daemon/cnxman-socket.h
> index d243b40..e6bb5dd 100644
> --- a/cman/daemon/cnxman-socket.h
> +++ b/cman/daemon/cnxman-socket.h
> @@ -68,6 +68,7 @@
> #define CLUSTER_PORT_SERVICES 2
> #define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
> #define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
> +#define CLUSTER_PORT_QDISKD 178 /* Quorum disk daemon */
>
> /* Port numbers above this will be blocked when the cluster is inquorate or in
> * transition */
> diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
> index 68e20cd..e3bb5f7 100644
> --- a/cman/qdisk/Makefile
> +++ b/cman/qdisk/Makefile
> @@ -40,7 +40,7 @@ ${TARGET1}: ${SHAREDOBJS} ${OBJS1}
> $(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
>
> ${TARGET2}: ${SHAREDOBJS} ${OBJS2}
> - $(CC) -o $@ $^ $(LDFLAGS)
> + $(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
>
> depends:
> $(MAKE) -C ../lib all
> diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
> index 93d15fe..fd80fa6 100644
> --- a/cman/qdisk/disk.h
> +++ b/cman/qdisk/disk.h
> @@ -273,6 +273,12 @@ typedef struct {
> status_block_t ni_status;
> } node_info_t;
>
> +typedef struct {
> + qd_ctx *ctx;
> + node_info_t *ni;
> + size_t ni_len;
> +} qd_priv_t;
> +
> int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
> disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
> int qd_init(qd_ctx *ctx, cman_handle_t ch_admin,
> diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
> index 0199da4..06c6831 100644
> --- a/cman/qdisk/iostate.c
> +++ b/cman/qdisk/iostate.c
> @@ -1,9 +1,12 @@
> #include <pthread.h>
> +#include <libcman.h>
> #include <iostate.h>
> #include <unistd.h>
> #include <time.h>
> #include <sys/time.h>
> #include <liblogthread.h>
> +#include <stdint.h>
> +#include "platform.h"
> #include "iostate.h"
>
> static iostate_t main_state = 0;
> @@ -26,7 +29,7 @@ static struct state_table io_state_table[] = {
> { STATE_LSEEK, "seek" },
> { -1, NULL } };
>
> -static const char *
> +const char *
> state_to_string(iostate_t state)
> {
> static const char *ret = "unknown";
> @@ -65,6 +68,8 @@ io_nanny_thread(void *arg)
> iostate_t last_main_state = 0, current_main_state = 0;
> int last_main_incarnation = 0, current_main_incarnation = 0;
> int logged_incarnation = 0;
> + cman_handle_t ch = (cman_handle_t)arg;
> + int32_t whine_state;
>
> /* Start with wherever we're at now */
> pthread_mutex_lock(&state_mutex);
> @@ -96,6 +101,11 @@ io_nanny_thread(void *arg)
> continue;
> }
>
> + /* Whine on CMAN api */
> + whine_state = (int32_t)current_main_state;
> + swab32(whine_state);
> + cman_send_data(ch, &whine_state, sizeof(int32_t), 0, 178, 0);
> +
> /* Don't log things twice */
> if (logged_incarnation == current_main_incarnation)
> continue;
> @@ -114,7 +124,7 @@ io_nanny_thread(void *arg)
>
>
> int
> -io_nanny_start(int timeout)
> +io_nanny_start(cman_handle_t ch, int timeout)
> {
> int ret;
>
> @@ -124,7 +134,7 @@ io_nanny_start(int timeout)
> qdisk_timeout = timeout;
> thread_active = 1;
>
> - ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
> + ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
> pthread_mutex_unlock(&state_mutex);
>
> return ret;
> diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
> index 7dd7bf6..a65b1d4 100644
> --- a/cman/qdisk/iostate.h
> +++ b/cman/qdisk/iostate.h
> @@ -11,7 +11,9 @@ typedef enum {
>
> void io_state(iostate_t state);
>
> -int io_nanny_start(int timeout);
> +int io_nanny_start(cman_handle_t ch, int timeout);
> int io_nanny_stop(void);
>
> +const char * state_to_string(iostate_t state);
> +
> #endif
> diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
> index a90e82c..fc714f6 100644
> --- a/cman/qdisk/main.c
> +++ b/cman/qdisk/main.c
> @@ -25,6 +25,7 @@
> #include <liblogthread.h>
> #include "score.h"
> #include "../daemon/cman.h"
> +#include "../daemon/cnxman-socket.h"
> #include <sys/syslog.h>
>
> #define LOG_DAEMON_NAME "qdiskd"
> @@ -915,7 +916,8 @@ cman_wait(cman_handle_t ch, struct timeval *_tv)
> static void
> process_cman_event(cman_handle_t handle, void *private, int reason, int arg)
> {
> - qd_ctx *ctx = (qd_ctx *)private;
> + qd_priv_t *qp = (qd_priv_t *)private;
> + qd_ctx *ctx = qp->ctx;
>
> switch(reason) {
> case CMAN_REASON_PORTOPENED:
> @@ -1926,6 +1928,33 @@ check_stop_cman(qd_ctx *ctx)
> do { static int _logged=0; if (!_logged) { _logged=1; logt_print(level, fmt, ##args); } } while(0)
>
>
> +static void
> +qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
> + uint8_t port, int nodeid)
> +{
> + int32_t dstate;
> + qd_priv_t *qp = (qd_priv_t *)privdata;
> + node_info_t *ni = qp->ni;
> +
> + if (len != sizeof(dstate)) {
> + return;
> + }
> +
> + dstate = *((int32_t*)buf);
> +
> + if (nodeid == (qp->ctx->qc_my_id))
> + return;
> +
> + swab32(dstate);
> +
> + if (dstate) {
> + logt_print(LOG_NOTICE, "qdiskd on node %d reports hung %s()\n", nodeid,
> + state_to_string(dstate));
> + ni[nodeid-1].ni_misses = 0;
> + }
> +}
> +
> +
> int
> main(int argc, char **argv)
> {
> @@ -1939,6 +1968,7 @@ main(int argc, char **argv)
> char device[128];
> pid_t pid;
> quorum_header_t qh;
> + qd_priv_t qp;
>
> if (check_process_running(argv[0], &pid) && pid !=getpid()) {
> printf("QDisk services already running\n");
> @@ -2001,13 +2031,23 @@ main(int argc, char **argv)
>
> /* For cman notifications we need two sockets - one for events,
> one for config change callbacks */
> - ch_user = cman_init(&ctx);
> + qp.ctx = &ctx;
> + qp.ni = &ni[0];
> + qp.ni_len = MAX_NODES_DISK;
> +
> + ch_user = cman_init(&qp);
> if (cman_start_notification(ch_user, process_cman_event) != 0) {
> logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
> strerror(errno));
> goto out;
> }
>
> + if (cman_start_recv_data(ch_user, qdisk_whine, CLUSTER_PORT_QDISKD) != 0) {
> + logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
> + strerror(errno));
> + goto out;
> + }
> +
> memset(&me, 0, sizeof(me));
> if (cman_get_node(ch_admin, CMAN_NODEID_US, &me) < 0) {
> logt_print(LOG_CRIT, "Could not determine local node ID: %s\n",
> @@ -2108,7 +2148,7 @@ main(int argc, char **argv)
> }
> }
>
> - io_nanny_start(ctx.qc_tko * ctx.qc_interval);
> + io_nanny_start(ch_user, ctx.qc_tko * ctx.qc_interval);
>
> if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) {
> /* Only clean up if we're exiting w/o error) */
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2012-02-18 3:57 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-02-17 21:39 [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away [RHEL6, take2] Lon Hohberger
2012-02-18 3:57 ` Fabio M. Di Nitto
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).