From mboxrd@z Thu Jan 1 00:00:00 1970 From: lhh@sourceware.org Date: 4 Dec 2007 20:24:45 -0000 Subject: [Cluster-devel] cluster/cman/qdisk disk.c disk.h disk_util.c m ... Message-ID: <20071204202445.27135.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Branch: RHEL5 Changes by: lhh at sourceware.org 2007-12-04 20:24:43 Modified files: cman/qdisk : disk.c disk.h disk_util.c main.c mkqdisk.c proc.c Log message: Make qdiskd work with sector sizes other than 512 bytes. Import patch from Fabio M. Di Nitto to make qdiskd use (node_count - 1) for votes if there's none specified in cluster.conf Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.4.1&r2=1.4.4.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.3&r2=1.4.2.4 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk_util.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2.4.2&r2=1.2.4.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.6&r2=1.4.2.7 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/mkqdisk.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.4.1&r2=1.3.4.2 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/proc.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.4.1 --- cluster/cman/qdisk/disk.c 2007/10/29 17:54:25 1.4.4.1 +++ cluster/cman/qdisk/disk.c 2007/12/04 20:24:43 1.4.4.2 @@ -43,8 +43,9 @@ #include #include #include +#include -static int diskRawRead(int fd, char *buf, int len); +static int diskRawRead(target_info_t *disk, char *buf, int len); uint32_t clu_crc32(const char *data, size_t count); @@ -211,49 +212,58 @@ * Returns - (the file descriptor), a value >= 0 on success. */ int -qdisk_open(char *name) +qdisk_open(char *name, target_info_t *disk) { - int fd; - int retval; + int ret; + unsigned long ssz; /* * Open for synchronous writes to insure all writes go directly * to disk. */ - fd = open(name, O_RDWR | O_SYNC | O_DIRECT); - if (fd < 0) { - return fd; - } + disk->d_fd = open(name, O_RDWR | O_SYNC | O_DIRECT); + if (disk->d_fd < 0) + return disk->d_fd; + + disk->d_blksz = 512; + ret = ioctl(disk->d_fd, BLKSSZGET, &ssz); + if (ret < 0) + perror("qdisk_open: ioctl(BLKSSZGET)"); + else + /* Sorry, no sector sizes >4GB please */ + disk->d_blksz = (uint32_t)ssz; - /* Check to verify that the partition is large enough.*/ - retval = lseek(fd, END_OF_DISK, SEEK_SET); + disk->d_pagesz = sysconf(_SC_PAGESIZE); - if (retval < 0) { + /* Check to verify that the partition is large enough.*/ + ret = lseek(disk->d_fd, END_OF_DISK(disk->d_blksz), SEEK_SET); + if (ret < 0) { perror("open_partition: seek"); return -1; } - if (retval < END_OF_DISK) { + if (ret < END_OF_DISK(disk->d_blksz)) { fprintf(stderr, "Partition %s too small\n", name); errno = EINVAL; return -1; } /* Set close-on-exec bit */ - retval = fcntl(fd, F_GETFD, 0); - if (retval < 0) { - close(fd); + ret = fcntl(disk->d_fd, F_GETFD, 0); + if (ret < 0) { + perror("open_partition: fcntl(F_GETFD)"); + close(disk->d_fd); return -1; } - retval |= FD_CLOEXEC; - if (fcntl(fd, F_SETFD, retval) < 0) { - perror("open_partition: fcntl"); - close(fd); + ret |= FD_CLOEXEC; + if (fcntl(disk->d_fd, F_SETFD, ret) < 0) { + perror("open_partition: fcntl(F_SETFD)"); + close(disk->d_fd); return -1; } - return fd; + return 0; } @@ -263,17 +273,17 @@ * Returns - value from close syscall. */ int -qdisk_close(int *fd) +qdisk_close(target_info_t *disk) { int retval; - if (!fd || *fd < 0) { + if (!disk || disk->d_fd < 0) { errno = EINVAL; return -1; } - retval = close(*fd); - *fd = -1; + retval = close(disk->d_fd); + disk->d_fd = -1; return retval; } @@ -288,7 +298,7 @@ qdisk_validate(char *name) { struct stat stat_st, *stat_ptr; - int fd; + target_info_t disk; stat_ptr = &stat_st; if (stat(name, stat_ptr) < 0) { @@ -310,26 +320,25 @@ /* * Verify read/write permission. */ - fd = qdisk_open(name); - if (fd < 0) { + if (qdisk_open(name, &disk) < 0) { fprintf(stderr, "%s: open of %s for RDWR failed: %s\n", __FUNCTION__, name, strerror(errno)); return -1; } - qdisk_close(&fd); + qdisk_close(&disk); return 0; } static int -diskRawReadShadow(int fd, off_t readOffset, char *buf, int len) +diskRawReadShadow(target_info_t *disk, off_t readOffset, char *buf, int len) { int ret; shared_header_t *hdrp; char *data; int datalen; - ret = lseek(fd, readOffset, SEEK_SET); + ret = lseek(disk->d_fd, readOffset, SEEK_SET); if (ret != readOffset) { #if 0 fprintf(stderr, @@ -340,7 +349,7 @@ return -1; } - ret = diskRawRead(fd, buf, len); + ret = diskRawRead(disk, buf, len); if (ret != len) { #if 0 fprintf(stderr, "diskRawReadShadow: aligned read " @@ -375,7 +384,7 @@ * Here we check for alignment and do a bounceio if necessary. */ static int -diskRawRead(int fd, char *buf, int len) +diskRawRead(target_info_t *disk, char *buf, int len) { char *alignedBuf; int readret; @@ -383,21 +392,24 @@ int readlen; int bounceNeeded = 1; - if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) && - ((len % 512) == 0)) { + + /* was 3ff, which is (512<<1-1) */ + if ((((unsigned long) buf & + (unsigned long) ((disk->d_blksz << 1) -1)) == 0) && + ((len % (disk->d_blksz)) == 0)) { bounceNeeded = 0; } if (bounceNeeded == 0) { /* Already aligned and even multiple of 512, no bounceio * required. */ - return (read(fd, buf, len)); + return (read(disk->d_fd, buf, len)); } - if (len > 512) { + if (len > disk->d_blksz) { fprintf(stderr, "diskRawRead: not setup for reads larger than %d.\n", - 512); + (int)disk->d_blksz); return (-1); } /* @@ -406,8 +418,8 @@ * XXX - if the on-disk offsets don't provide enough room we're cooked! */ extraLength = 0; - if (len % 512) { - extraLength = 512 - (len % 512); + if (len % disk->d_blksz) { + extraLength = disk->d_blksz - (len % disk->d_blksz); } readlen = len; @@ -415,18 +427,18 @@ readlen += extraLength; } - readret = posix_memalign((void **)&alignedBuf, 512, 512); + readret = posix_memalign((void **)&alignedBuf, disk->d_pagesz, disk->d_blksz); if (readret < 0) { return -1; } - readret = read(fd, alignedBuf, readlen); + readret = read(disk->d_fd, alignedBuf, readlen); if (readret > 0) { if (readret > len) { - bcopy(alignedBuf, buf, len); + memcpy(alignedBuf, buf, len); readret = len; } else { - bcopy(alignedBuf, buf, readret); + memcpy(alignedBuf, buf, readret); } } @@ -445,7 +457,7 @@ * Here we check for alignment and do a bounceio if necessary. */ static int -diskRawWrite(int fd, char *buf, int len) +diskRawWrite(target_info_t *disk, char *buf, int len) { char *alignedBuf; int ret; @@ -453,31 +465,33 @@ int writelen; int bounceNeeded = 1; - if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) && - ((len % 512) == 0)) { + /* was 3ff, which is (512<<1-1) */ + if ((((unsigned long) buf & + (unsigned long) ((disk->d_blksz << 1) -1)) == 0) && + ((len % (disk->d_blksz)) == 0)) { bounceNeeded = 0; } + if (bounceNeeded == 0) { /* Already aligned and even multiple of 512, no bounceio * required. */ - return (write(fd, buf, len)); + return (write(disk->d_fd, buf, len)); } - if (len > 512) { + if (len > disk->d_blksz) { fprintf(stderr, - "diskRawWrite: not setup for larger than %d.\n", - 512); + "diskRawRead: not setup for reads larger than %d.\n", + (int)disk->d_blksz); return (-1); } - /* * All IOs must be of size which is a multiple of 512. Here we * just add in enough extra to accommodate. * XXX - if the on-disk offsets don't provide enough room we're cooked! */ extraLength = 0; - if (len % 512) { - extraLength = 512 - (len % 512); + if (len % disk->d_blksz) { + extraLength = disk->d_blksz - (len % disk->d_blksz); } writelen = len; @@ -485,13 +499,20 @@ writelen += extraLength; } - ret = posix_memalign((void **)&alignedBuf, 512,512); + ret = posix_memalign((void **)&alignedBuf, disk->d_pagesz, disk->d_blksz); if (ret < 0) { + return -1; + } + + if (len > disk->d_blksz) { + fprintf(stderr, + "diskRawWrite: not setup for larger than %d.\n", + (int)disk->d_blksz); return (-1); } - bcopy(buf, alignedBuf, len); - ret = write(fd, alignedBuf, writelen); + memcpy(buf, alignedBuf, len); + ret = write(disk->d_fd, alignedBuf, writelen); if (ret > len) { ret = len; } @@ -507,7 +528,7 @@ static int -diskRawWriteShadow(int fd, __off64_t writeOffset, char *buf, int len) +diskRawWriteShadow(target_info_t *disk, __off64_t writeOffset, char *buf, int len) { off_t retval_seek; ssize_t retval_write; @@ -519,7 +540,7 @@ return (-1); } - retval_seek = lseek(fd, writeOffset, SEEK_SET); + retval_seek = lseek(disk->d_fd, writeOffset, SEEK_SET); if (retval_seek != writeOffset) { fprintf(stderr, "diskRawWriteShadow: can't seek to offset %d\n", @@ -527,7 +548,7 @@ return (-1); } - retval_write = diskRawWrite(fd, buf, len); + retval_write = diskRawWrite(disk, buf, len); if (retval_write != len) { if (retval_write == -1) { fprintf(stderr, "%s: %s\n", __FUNCTION__, @@ -544,7 +565,7 @@ int -qdisk_read(int fd, __off64_t offset, void *buf, int count) +qdisk_read(target_info_t *disk, __off64_t offset, void *buf, int count) { shared_header_t *hdrp; char *data; @@ -556,15 +577,15 @@ * Raw blocks are 512 byte aligned. */ total = count + sizeof(shared_header_t); - if (total < 512) - total = 512; + if (total < disk->d_blksz) + total = disk->d_blksz; /* Round it up */ - if (total % 512) - total = total + (512 * !!(total % 512)) - (total % 512); + if (total % disk->d_blksz) + total = total + (disk->d_blksz * !!(total % disk->d_blksz)) - (total % disk->d_blksz); hdrp = NULL; - rv = posix_memalign((void **)&hdrp, sysconf(_SC_PAGESIZE), total); + rv = posix_memalign((void **)&hdrp, disk->d_pagesz, disk->d_blksz); if (rv < 0) return -1; @@ -573,7 +594,7 @@ data = (char *)hdrp + sizeof(shared_header_t); - rv = diskRawReadShadow(fd, offset, (char *)hdrp, total); + rv = diskRawReadShadow(disk, offset, (char *)hdrp, disk->d_blksz); if (rv == -1) { return -1; @@ -594,12 +615,12 @@ int -qdisk_write(int fd, __off64_t offset, const void *buf, int count) +qdisk_write(target_info_t *disk, __off64_t offset, const void *buf, int count) { size_t maxsize; shared_header_t *hdrp; char *data; - size_t total = 0, rv = -1, psz = 512; //sysconf(_SC_PAGESIZE); + size_t total = 0, rv = -1, psz = disk->d_blksz; //sysconf(_SC_PAGESIZE); maxsize = psz - (sizeof(shared_header_t)); if (count >= (maxsize + sizeof(shared_header_t))) { @@ -611,7 +632,6 @@ /* * Calculate the total length of the buffer, including the header. - * Raw blocks are 512 byte aligned. */ total = count + sizeof(shared_header_t); if (total < psz) @@ -622,7 +642,7 @@ total = total + (psz * !!(total % psz)) - (total % psz); hdrp = NULL; - rv = posix_memalign((void **)&hdrp, sysconf(_SC_PAGESIZE), total); + rv = posix_memalign((void **)&hdrp, disk->d_pagesz, total); if (rv < 0) { perror("posix_memalign"); return -1; @@ -645,7 +665,7 @@ * about locking here. */ if (total == psz) - rv = diskRawWriteShadow(fd, offset, (char *)hdrp, psz); + rv = diskRawWriteShadow(disk, offset, (char *)hdrp, psz); if (rv == -1) perror("diskRawWriteShadow"); @@ -658,11 +678,11 @@ static int -header_init(int fd, char *label) +header_init(target_info_t *disk, char *label) { quorum_header_t qh; - if (qdisk_read(fd, OFFSET_HEADER, &qh, sizeof(qh)) == sizeof(qh)) { + if (qdisk_read(disk, OFFSET_HEADER, &qh, sizeof(qh)) == sizeof(qh)) { swab_quorum_header_t(&qh); if (qh.qh_magic == HEADER_MAGIC_OLD) { printf("Warning: Red Hat Cluster Manager 1.2.x " @@ -681,14 +701,18 @@ /* Copy in the cluster/label name */ snprintf(qh.qh_cluster, sizeof(qh.qh_cluster)-1, "%s", label); + qh.qh_version = VERSION_MAGIC_V2; if ((qh.qh_timestamp = (uint64_t)time(NULL)) <= 0) { perror("time"); return -1; } qh.qh_magic = HEADER_MAGIC_NUMBER; + qh.qh_blksz = disk->d_blksz; + qh.qh_pad = 0; + swab_quorum_header_t(&qh); - if (qdisk_write(fd, OFFSET_HEADER, &qh, sizeof(qh)) != sizeof(qh)) { + if (qdisk_write(disk, OFFSET_HEADER, &qh, sizeof(qh)) != sizeof(qh)) { return -1; } @@ -699,24 +723,24 @@ int qdisk_init(char *partname, char *label) { - int fd; + target_info_t disk; status_block_t ps, wps; - int nid; + int nid, ret; time_t t; - fd = qdisk_validate(partname); - if (fd < 0) { + ret = qdisk_validate(partname); + if (ret < 0) { perror("qdisk_verify"); return -1; } - fd = qdisk_open(partname); - if (fd < 0) { + ret = qdisk_open(partname, &disk); + if (ret < 0) { perror("qdisk_open"); return -1; } - if (header_init(fd, label) < 0) { + if (header_init(&disk, label) < 0) { return -1; } @@ -744,14 +768,14 @@ wps = ps; swab_status_block_t(&wps); - if (qdisk_write(fd, qdisk_nodeid_offset(nid), &wps, sizeof(wps)) < 0) { + if (qdisk_write(&disk, qdisk_nodeid_offset(nid, disk.d_blksz), &wps, sizeof(wps)) < 0) { printf("Error writing node ID block %d\n", nid); - qdisk_close(&fd); + qdisk_close(&disk); return -1; } } - qdisk_close(&fd); + qdisk_close(&disk); return 0; } --- cluster/cman/qdisk/disk.h 2007/02/21 20:22:53 1.4.2.3 +++ cluster/cman/qdisk/disk.h 2007/12/04 20:24:43 1.4.2.4 @@ -72,7 +72,8 @@ RF_DEBUG = 0x4, RF_PARANOID = 0x8, RF_ALLOW_KILL = 0x10, - RF_UPTIME = 0x20 + RF_UPTIME = 0x20, + RF_CMAN_LABEL = 0x40 } run_flag_t; @@ -86,6 +87,9 @@ #define STATE_MAGIC_NUMBER 0x47bacef8 /* Status block */ #define SHARED_HEADER_MAGIC 0x00DEBB1E /* Per-block headeer */ +/* Version magic. */ +#define VERSION_MAGIC_V2 0x389fabc4 + typedef struct __attribute__ ((packed)) { uint32_t ps_magic; @@ -152,16 +156,21 @@ */ typedef struct __attribute__ ((packed)) { uint32_t qh_magic; - uint32_t qh_align; // 64-bit-ism: alignment fixer. + uint32_t qh_version; // uint64_t qh_timestamp; // time of last update char qh_updatehost[128];// Hostname who put this here... - char qh_cluster[128]; // Cluster name + char qh_cluster[120]; // Cluster name; CMAN only + // supports 16 chars. + uint32_t qh_blksz; // Known block size @ creation + uint32_t qh_pad; } quorum_header_t; #define swab_quorum_header_t(ptr) \ {\ swab32((ptr)->qh_magic); \ - swab32((ptr)->qh_align); \ + swab32((ptr)->qh_version); \ + swab32((ptr)->qh_blksz); \ + swab32((ptr)->qh_pad); \ swab64((ptr)->qh_timestamp); \ } @@ -196,31 +205,35 @@ /* Offsets from RHCM 1.2.x */ #define OFFSET_HEADER 0 -#define HEADER_SIZE 4096 /* Page size for now */ +#define HEADER_SIZE(ssz) (ssz<4096?4096:ssz) -#define OFFSET_FIRST_STATUS_BLOCK (OFFSET_HEADER + HEADER_SIZE) -#define SPACE_PER_STATUS_BLOCK 4096 /* Page size for now */ +#define OFFSET_FIRST_STATUS_BLOCK(ssz) (OFFSET_HEADER + HEADER_SIZE(ssz)) +#define SPACE_PER_STATUS_BLOCK(ssz) (ssz<4096?4096:ssz) #define STATUS_BLOCK_COUNT MAX_NODES_DISK -#define SPACE_PER_MESSAGE_BLOCK (4096) -#define MESSAGE_BLOCK_COUNT MAX_NODES_DISK - -#define END_OF_DISK (OFFSET_FIRST_STATUS_BLOCK + \ +#define END_OF_DISK(ssz) (OFFSET_FIRST_STATUS_BLOCK(ssz) + \ (MAX_NODES_DISK + 1) * \ - SPACE_PER_STATUS_BLOCK) \ + SPACE_PER_STATUS_BLOCK(ssz)) \ +typedef struct { + int d_fd; + int _pad_; + size_t d_blksz; + size_t d_pagesz; +} target_info_t; + /* From disk.c */ -int qdisk_open(char *name); -int qdisk_close(int *fd); +int qdisk_open(char *name, target_info_t *disk); +int qdisk_close(target_info_t *disk); int qdisk_init(char *name, char *clustername); int qdisk_validate(char *name); -int qdisk_read(int fd, __off64_t ofs, void *buf, int len); -int qdisk_write(int fd, __off64_t ofs, const void *buf, int len); +int qdisk_read(target_info_t *disk, __off64_t ofs, void *buf, int len); +int qdisk_write(target_info_t *disk, __off64_t ofs, const void *buf, int len); -#define qdisk_nodeid_offset(nodeid) \ - (OFFSET_FIRST_STATUS_BLOCK + (SPACE_PER_STATUS_BLOCK * (nodeid - 1))) +#define qdisk_nodeid_offset(nodeid, ssz) \ + (OFFSET_FIRST_STATUS_BLOCK(ssz) + (SPACE_PER_STATUS_BLOCK(ssz) * (nodeid - 1))) /* From disk_utils.c */ #define HISTORY_LENGTH 60 @@ -231,11 +244,12 @@ uint16_t pad0; } disk_msg_t; + typedef struct { uint64_t qc_incarnation; struct timeval qc_average; struct timeval qc_last[HISTORY_LENGTH]; - int qc_fd; + target_info_t qc_disk; int qc_my_id; int qc_writes; int qc_interval; @@ -250,12 +264,14 @@ disk_node_state_t qc_disk_status; disk_node_state_t qc_status; int qc_master; /* Master?! */ - int _pad_; + int qc_status_sock; run_flag_t qc_flags; cman_handle_t qc_ch; char *qc_device; char *qc_label; char *qc_status_file; + char *qc_cman_label; + char *qc_status_sockname; } qd_ctx; typedef struct { @@ -272,14 +288,15 @@ int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state, disk_msg_t *msg, memb_mask_t mask, memb_mask_t master); -int qd_read_print_status(int fd, int nid); +int qd_read_print_status(target_info_t *disk, int nid); int qd_init(qd_ctx *ctx, cman_handle_t ch, int me); void qd_destroy(qd_ctx *ctx); /* proc.c */ int find_partitions(const char *partfile, const char *label, char *devname, size_t devlen, int print); -int check_device(char *device, char *label, quorum_header_t *qh); +int check_device(char *device, char *label, int *ssz, quorum_header_t *qh, + int flags); #endif --- cluster/cman/qdisk/disk_util.c 2007/01/26 14:34:55 1.2.4.2 +++ cluster/cman/qdisk/disk_util.c 2007/12/04 20:24:43 1.2.4.3 @@ -201,8 +201,9 @@ if (get_time(&start, ctx->qc_flags&RF_UPTIME) < 0) utime_ok = 0; swab_status_block_t(&ps); - if (qdisk_write(ctx->qc_fd, qdisk_nodeid_offset(nid), &ps, - sizeof(ps)) < 0) { + if (qdisk_write(&ctx->qc_disk, + qdisk_nodeid_offset(nid, ctx->qc_disk.d_blksz), + &ps, sizeof(ps)) < 0) { printf("Error writing node ID block %d\n", nid); return -1; } @@ -223,12 +224,12 @@ int -qd_print_status(status_block_t *ps) +qd_print_status(target_info_t *disk, status_block_t *ps) { int x; printf("Data @ offset %d:\n", - (int)qdisk_nodeid_offset(ps->ps_nodeid)); + (int)qdisk_nodeid_offset(ps->ps_nodeid, disk->d_blksz)); printf("status_block_t {\n"); printf("\t.ps_magic = %08x;\n", (int)ps->ps_magic); printf("\t.ps_nodeid = %d;\n", (int)ps->ps_nodeid); @@ -261,11 +262,11 @@ int -qd_read_print_status(int fd, int nid) +qd_read_print_status(target_info_t *disk, int nid) { status_block_t ps; - if (fd < 0) { + if (!disk || disk->d_fd < 0) { errno = EINVAL; return -1; } @@ -275,13 +276,13 @@ return -1; } - if (qdisk_read(fd, qdisk_nodeid_offset(nid), &ps, + if (qdisk_read(disk, qdisk_nodeid_offset(nid, disk->d_blksz), &ps, sizeof(ps)) < 0) { printf("Error reading node ID block %d\n", nid); return -1; } swab_status_block_t(&ps); - qd_print_status(&ps); + qd_print_status(disk, &ps); return 0; } @@ -322,6 +323,7 @@ ctx->qc_incarnation = generate_token(); ctx->qc_ch = ch; ctx->qc_my_id = me; + ctx->qc_status_sock = -1; return 0; } @@ -339,6 +341,5 @@ free(ctx->qc_device); ctx->qc_device = NULL; } - close(ctx->qc_fd); - ctx->qc_fd = -1; + qdisk_close(&ctx->qc_disk); } --- cluster/cman/qdisk/main.c 2007/03/20 19:37:04 1.4.2.6 +++ cluster/cman/qdisk/main.c 2007/12/04 20:24:43 1.4.2.7 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -147,7 +148,8 @@ sb = &ni[x].ni_status; - if (qdisk_read(ctx->qc_fd, qdisk_nodeid_offset(x+1), + if (qdisk_read(&ctx->qc_disk, + qdisk_nodeid_offset(x+1, ctx->qc_disk.d_blksz), sb, sizeof(*sb)) < 0) { clulog(LOG_WARNING,"Error reading node ID block %d\n", x+1); @@ -452,6 +454,10 @@ quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh) { int x = 0, score, maxscore, score_req; + char buf[64]; +#if 0 + struct sockaddr_un sun; +#endif clulog(LOG_INFO, "Quorum Daemon Initializing\n"); @@ -462,12 +468,28 @@ if (qdisk_validate(ctx->qc_device) < 0) return -1; - ctx->qc_fd = qdisk_open(ctx->qc_device); - if (ctx->qc_fd < 0) { + if (qdisk_open(ctx->qc_device, &ctx->qc_disk) < 0) { clulog(LOG_CRIT, "Failed to open %s: %s\n", ctx->qc_device, strerror(errno)); return -1; } + + if (strlen(ctx->qc_device) > 15 && !(ctx->qc_flags & RF_CMAN_LABEL)) { + if (ctx->qc_label && strlen(ctx->qc_label) <= 15) { + ctx->qc_cman_label = strdup(ctx->qc_label); + } else { + snprintf(buf, sizeof(buf), "QDisk[%d]", + (int)strlen(ctx->qc_device)); + ctx->qc_cman_label = strdup(buf); + } + + ctx->qc_flags |= RF_CMAN_LABEL; + clulog(LOG_DEBUG, "Device too long! Setting CMAN label to: %s\n", + ctx->qc_cman_label); + } + + clulog(LOG_DEBUG, "I/O Size: %d Page Size: %d\n", + ctx->qc_disk.d_blksz, ctx->qc_disk.d_pagesz); if (h && maxh) { start_score_thread(ctx, h, maxh); @@ -484,6 +506,42 @@ return -1; } +#if 0 + if (ctx->qc_status_sockname) { + ctx->qc_status_sock = socket(PF_LOCAL, SOCK_STREAM, 0); + + if (ctx->qc_status_sockname < 0) { + clulog(LOG_ERR, + "Could not create local socket %s: %s\n", + qc->qc_status_sockname, strerror(errno)); + free(qc->qc_status_sockname); + qc->qc_status_sockname = NULL; + } else { + sun.sun_family = PF_LOCAL; + snprintf(sun.sun_path, sizeof(sun.sun_path), + qc->qc_status_sockname); + unlink(qc->qc_status_sockname); + if (bind(ctx->qc_status_sock, + (struct sockaddr *)&sun, sizeof(sun)) < 0) { + clulog(LOG_ERR, "Could not bind to local " + "socket %s: %s\n", + qc->qc_status_sockname, + strerror(errno)); + free(qc->qc_status_sockname); + qc->qc_status_sockname = NULL; + close(qc->qc_status_sock); + qc->qc_status_sock = -1; + } + } + } else { + qc->qc_status_sock = -1; + } + + if (qc->qc_status_sock >= 0) { + listen(qc->qc_status_sock, 5); + } +#endif + while (++x <= ctx->qc_tko && _running) { read_node_blocks(ctx, ni, max); check_transitions(ctx, ni, max, NULL); @@ -622,23 +680,7 @@ char * -state_str(disk_node_state_t s) -{ - switch (s) { - case S_NONE: - return "None"; - case S_EVICT: - return "Evicted"; - case S_INIT: - return "Initializing"; - case S_RUN: - return "Running"; - case S_MASTER: - return "Master"; - default: - return "ILLEGAL"; - } -} +state_str(disk_node_state_t s); void @@ -1237,6 +1279,12 @@ ctx->qc_status_file = val; } + /* Get status socket */ + snprintf(query, sizeof(query), "/cluster/quorumd/@status_sock"); + if (ccs_get(ccsfd, query, &val) == 0) { + ctx->qc_status_sockname = val; + } + /* Get min score */ snprintf(query, sizeof(query), "/cluster/quorumd/@min_score"); if (ccs_get(ccsfd, query, &val) == 0) { @@ -1285,6 +1333,15 @@ ctx->qc_flags &= ~RF_REBOOT; free(val); } + + /* Get cman_label */ + snprintf(query, sizeof(query), "/cluster/quorumd/@cman_label"); + if (ccs_get(ccsfd, query, &val) == 0) { + if (strlen(val) > 0 && strlen(val) <= 15) { + ctx->qc_flags |= RF_CMAN_LABEL; + ctx->qc_cman_label = val; + } + } /* * Get flag to see if we're supposed to kill cman if qdisk is not @@ -1384,21 +1441,25 @@ main(int argc, char **argv) { cman_node_t me; - int cfh, rv, forked = 0, nfd = -1; + int cfh, rv, forked = 0, nfd = -1, ret = -1; +#if 0 + int status_run = 0; +#endif qd_ctx ctx; - cman_handle_t ch; + cman_handle_t ch = NULL; node_info_t ni[MAX_NODES_DISK]; struct h_data h[10]; char debug = 0, foreground = 0; char device[128]; pid_t pid; + quorum_header_t qh; if (check_process_running(argv[0], &pid) && pid !=getpid()) { printf("QDisk services already running\n"); return 0; } - while ((rv = getopt(argc, argv, "fdQ")) != EOF) { + while ((rv = getopt(argc, argv, "fdQs")) != EOF) { switch (rv) { case 'd': debug = 1; @@ -1418,11 +1479,15 @@ dup2(nfd, 2); close(nfd); break; +#if 0 + case 's': + status_run = 1; +#endif default: break; } } - + #if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2) ch = cman_admin_init(NULL); #else @@ -1431,7 +1496,7 @@ if (!ch) { if (!foreground && !forked) { if (daemon_init(argv[0]) < 0) - return -1; + goto out; else forked = 1; } @@ -1452,7 +1517,7 @@ while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) { if (!foreground && !forked) { if (daemon_init(argv[0]) < 0) - return -1; + goto out; else forked = 1; } @@ -1472,7 +1537,7 @@ if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) { clulog_and_print(LOG_CRIT, "Configuration failed\n"); check_stop_cman(&ctx); - return -1; + goto out; } if (ctx.qc_label) { @@ -1483,7 +1548,7 @@ " '%s' to any device\n", ctx.qc_label); check_stop_cman(&ctx); - return -1; + goto out; } if (ctx.qc_device) @@ -1494,18 +1559,29 @@ clulog(LOG_INFO, "Quorum Partition: %s Label: %s\n", ctx.qc_device, ctx.qc_label); } else if (ctx.qc_device) { - if (check_device(ctx.qc_device, NULL, NULL) != 0) { + if (check_device(ctx.qc_device, NULL, &rv, &qh, 0) != 0) { clulog(LOG_CRIT, "Specified partition %s does not have a " "qdisk label\n", ctx.qc_device); check_stop_cman(&ctx); - return -1; + goto out; + } + + if (qh.qh_version == VERSION_MAGIC_V2 && + qh.qh_blksz != rv) { + clulog(LOG_CRIT, + "Specified device %s does match kernel's " + "reported sector size (%d != %d)\n", + ctx.qc_device, + ctx.qc_disk.d_blksz, rv); + check_stop_cman(&ctx); + goto out; } } if (!foreground && !forked) { if (daemon_init(argv[0]) < 0) - return -1; + goto out; } set_priority(ctx.qc_sched, ctx.qc_sched_prio); @@ -1513,13 +1589,19 @@ if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) { clulog_and_print(LOG_CRIT, "Initialization failed\n"); check_stop_cman(&ctx); - return -1; + goto out; } + ret = 0; + if (!_running) - return 0; + goto out; - cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes); + cman_register_quorum_device(ctx.qc_ch, + (ctx.qc_flags&RF_CMAN_LABEL)? + ctx.qc_cman_label: + ctx.qc_device, + ctx.qc_votes); /* XXX this always returns -1 / EBUSY even when it works?!!! @@ -1529,16 +1611,18 @@ "Could not register %s with CMAN; " "return = %d; error = %s\n", ctx.qc_device, rv, strerror(errno)); - return -1; + goto out; } */ - if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) cman_unregister_quorum_device(ctx.qc_ch); quorum_logout(&ctx); + /* free cman handle to avoid leak in cman */ +out: + cman_finish(ctx.qc_ch); qd_destroy(&ctx); - return 0; + return ret; } --- cluster/cman/qdisk/mkqdisk.c 2006/11/21 14:50:30 1.3.4.1 +++ cluster/cman/qdisk/mkqdisk.c 2007/12/04 20:24:43 1.3.4.2 @@ -37,23 +37,26 @@ { char device[128]; char *newdev = NULL, *newlabel = NULL; - int rv; + int rv, debug_level = 1; - printf("mkqdisk v0.5.1\n"); + printf("mkqdisk v0.5.2\n"); - while ((rv = getopt(argc, argv, "Lf:c:l:h")) != EOF) { + while ((rv = getopt(argc, argv, "Ldf:c:l:h")) != EOF) { switch (rv) { + case 'd': + ++debug_level; + break; case 'L': /* List */ close(2); return find_partitions("/proc/partitions", - NULL, NULL, 0, 1); + NULL, NULL, 0, debug_level); break; case 'f': close(2); return find_partitions("/proc/partitions", optarg, device, - sizeof(device), 1); + sizeof(device), debug_level); case 'c': newdev = optarg; break; --- cluster/cman/qdisk/proc.c 2006/06/23 16:05:33 1.2 +++ cluster/cman/qdisk/proc.c 2007/12/04 20:24:43 1.2.4.1 @@ -32,27 +32,33 @@ int -check_device(char *device, char *label, quorum_header_t *qh) +check_device(char *device, char *label, int *ssz, quorum_header_t *qh, + int flags) { - int fd = -1, ret = -1; + int ret = -1; quorum_header_t qh_local; + target_info_t disk; if (!qh) qh = &qh_local; - fd = qdisk_validate(device); - if (fd < 0) { + ret = qdisk_validate(device); + if (ret < 0) { perror("qdisk_verify"); return -1; } - fd = qdisk_open(device); - if (fd < 0) { + ret = qdisk_open(device, &disk); + if (ret < 0) { perror("qdisk_open"); return -1; } - if (qdisk_read(fd, OFFSET_HEADER, qh, sizeof(*qh)) == sizeof(*qh)) { + if (ssz) + *ssz = disk.d_blksz; + + ret = -1; + if (qdisk_read(&disk, OFFSET_HEADER, qh, sizeof(*qh)) == sizeof(*qh)) { swab_quorum_header_t(qh); if (qh->qh_magic == HEADER_MAGIC_NUMBER) { if (!label || !strcmp(qh->qh_cluster, label)) { @@ -61,12 +67,91 @@ } } - qdisk_close(&fd); + /* only flag now is 'strict device check'; i.e., + "block size recorded must match kernel's reported size" */ + if (flags && qh->qh_version == VERSION_MAGIC_V2 && + disk.d_blksz != qh->qh_blksz) { + ret = -1; + } + + qdisk_close(&disk); return ret; } +char * +state_str(disk_node_state_t s) +{ + switch (s) { + case S_NONE: + return "None"; + case S_EVICT: + return "Evicted"; + case S_INIT: + return "Initializing"; + case S_RUN: + return "Running"; + case S_MASTER: + return "Master"; + default: + return "ILLEGAL"; + } +} + + +void +print_status_block(status_block_t *sb) +{ + if (sb->ps_state == S_NONE) + return; + printf("Status block for node %d\n", sb->ps_nodeid); + printf("\tLast updated by node %d\n", sb->ps_updatenode); + printf("\tLast updated on %s", ctime((time_t *)&sb->ps_timestamp)); + printf("\tState: %s\n", state_str(sb->ps_state)); + printf("\tFlags: %04x\n", sb->ps_flags); + printf("\tScore: %d/%d\n", sb->ps_score, sb->ps_scoremax); + printf("\tAverage Cycle speed: %d.%06d seconds\n", + sb->ps_ca_sec, sb->ps_ca_usec); + printf("\tLast Cycle speed: %d.%06d seconds\n", + sb->ps_lc_sec, sb->ps_lc_usec); + printf("\tIncarnation: %08x%08x\n", + (int)(sb->ps_incarnation>>32&0xffffffff), + (int)(sb->ps_incarnation&0xffffffff)); + +} + + +void +read_info(char *dev) +{ + target_info_t ti; + int x; + status_block_t sb; + + if (qdisk_open(dev, &ti) < 0) { + printf("Could not read from %s: %s\n", + dev, strerror(errno)); + return; + } + + for (x = 0; x < MAX_NODES_DISK; x++) { + + if (qdisk_read(&ti, + qdisk_nodeid_offset(x+1, ti.d_blksz), + &sb, sizeof(sb)) < 0) { + printf("Error reading node ID block %d\n", + x+1); + continue; + } + swab_status_block_t(&sb); + print_status_block(&sb); + } + + qdisk_close(&ti); +} + + int find_partitions(const char *partfile, const char *label, char *devname, size_t devlen, int print) @@ -78,6 +163,7 @@ char device[128]; char realdev[256]; quorum_header_t qh; + int ssz; fp = fopen(partfile, "r"); if (!fp) @@ -96,16 +182,35 @@ if (strlen(device)) { snprintf(realdev, sizeof(realdev), "/dev/%s", device); - if (check_device(realdev, (char *)label, &qh) != 0) + + /* If we're not "just printing", then + then reject devices which don't match + the recorded sector size */ + if (check_device(realdev, (char *)label, &ssz, + &qh, !print) != 0) continue; if (print) { printf("%s:\n", realdev); - printf("\tMagic: %08x\n", qh.qh_magic); - printf("\tLabel: %s\n", qh.qh_cluster); - printf("\tCreated: %s", + printf("\tMagic: %08x\n", qh.qh_magic); + printf("\tLabel: %s\n", qh.qh_cluster); + printf("\tCreated: %s", ctime((time_t *)&qh.qh_timestamp)); - printf("\tHost: %s\n\n", qh.qh_updatehost); + printf("\tHost: %s\n", qh.qh_updatehost); + printf("\tKernel Sector Size: %d\n", ssz); + if (qh.qh_version == VERSION_MAGIC_V2) { + printf("\tRecorded Sector Size: %d\n\n", (int)qh.qh_blksz); + if (qh.qh_blksz != ssz) { + printf("WARNING: Sector size mismatch: Header: %d Kernel: %d\n", + (int)qh.qh_blksz, ssz); + } + } else + printf("\n"); + } + + if (print >= 2) { + /* Print node stuff */ + read_info(realdev); } if (devname && devlen) {