All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch 4/4] chunkd: add self-checking
@ 2009-12-27 23:59 Pete Zaitcev
  2010-01-12 14:21 ` Jeff Garzik
  0 siblings, 1 reply; 3+ messages in thread
From: Pete Zaitcev @ 2009-12-27 23:59 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Project Hail List

This patch adds a background process that periodically verifies the
integrity of all known objects. Objects that are found found faulty
are made invisible to applications. This way, only known good objects
remain visible and applications may implement integrity checking that
is run asynchronously with the self-checking. The intent is to let
the performance of checking to scale with the number of objects and
amount of data stored in the cell.

Signed-off-by: Pete Zaitcev <zaitcev@redhat.com>

---
 server/Makefile.am    |    2 
 server/be-fs.c        |  126 +++++++++++++++-
 server/chunkd.h       |    8 +
 server/config.c       |   28 +++
 server/selfcheck.c    |  208 ++++++++++++++++++++++++++
 server/server.c       |    2 
 test/.gitignore       |    1 
 test/Makefile.am      |    4 
 test/selfcheck-unit.c |  313 ++++++++++++++++++++++++++++++++++++++++
 test/server-test.cfg  |    2 
 test/test.h           |    2 
 11 files changed, 693 insertions(+), 3 deletions(-)

commit ded36ccc14d7356ceaa523c43e6d91efc06c51cd
Author: Master <zaitcev@lembas.zaitcev.lan>
Date:   Sun Dec 27 16:36:17 2009 -0700

    Selfcheck (with a test).

diff --git a/server/Makefile.am b/server/Makefile.am
index 824c0b3..f2a66a1 100644
--- a/server/Makefile.am
+++ b/server/Makefile.am
@@ -6,7 +6,7 @@ sbin_PROGRAMS	= chunkd
 
 chunkd_SOURCES	= chunkd.h		\
 		  ../lib/chunksrv.c	\
-		  be-fs.c object.c server.c config.c cldu.c util.c \
+		  be-fs.c object.c server.c selfcheck.c config.c cldu.c util.c \
 		  objcache.c
 chunkd_LDADD	= \
 		  @CLDC_LIBS@ @GLIB_LIBS@ @CRYPTO_LIBS@ \
diff --git a/server/be-fs.c b/server/be-fs.c
index 2c63541..652650d 100644
--- a/server/be-fs.c
+++ b/server/be-fs.c
@@ -39,8 +39,8 @@
 #include <tchdb.h>
 #include "chunkd.h"
 
-#define MDB_TABLE_ID	"__chunkd_table_id"
 #define MDB_TPATH_FMT	"%s/%X"
+#define BAD_TPATH_FMT	"%s/bad"
 
 struct fs_obj {
 	struct backend_obj	bo;
@@ -262,6 +262,47 @@ err_out:
 	return NULL;
 }
 
+static char *fs_obj_badname(unsigned long tag)
+{
+	char *s;
+	struct stat st;
+	int rc;
+
+	rc = asprintf(&s, BAD_TPATH_FMT, chunkd_srv.vol_path);
+	if (rc < 0)
+		return NULL;
+
+	/* create subdir on the fly, if not already exists */
+	if (stat(s, &st) < 0) {
+		if (errno != ENOENT) {
+			syslogerr(s);
+			free(s);
+			return NULL;
+		}
+		if (mkdir(s, 0777) < 0) {
+			if (errno != EEXIST) {
+				syslogerr(s);
+				free(s);
+				return NULL;
+			}
+		}
+	} else {
+		if (!S_ISDIR(st.st_mode)) {
+			applog(LOG_WARNING,
+			       "%s: not a dir, fs_obj_badname go boom", s);
+			free(s);
+			return NULL;
+		}
+	}
+	free(s);
+
+	rc = asprintf(&s, BAD_TPATH_FMT "/%lu", chunkd_srv.vol_path, tag);
+	if (rc < 0)
+		return NULL;
+
+	return s;
+}
+
 static bool key_valid(const void *key, size_t key_len)
 {
 	if (!key || key_len < 1 || key_len > CHD_KEY_SZ)
@@ -691,6 +732,27 @@ err_out:
 	return false;
 }
 
+int fs_obj_disable(const char *fn)
+{
+	struct stat st;
+	char *bad;
+	int rc;
+
+	if (stat(fn, &st) < 0)
+		return -errno;
+
+	bad = fs_obj_badname(st.st_ino);
+
+	if (rename(fn, bad) < 0) {
+		rc = errno;
+		free(bad);
+		return -rc;
+	}
+
+	free(bad);
+	return 0;
+}
+
 int fs_list_objs_open(struct fs_obj_lister *t,
 		      const char *root_path, uint32_t table_id)
 {
@@ -951,3 +1013,65 @@ int fs_obj_hdr_read(const char *fn, char **owner, char **csum,
 	return -1;
 }
 
+int fs_obj_do_sum(const char *fn, unsigned int klen, char **csump)
+{
+	enum { BUFLEN = 128 * 1024 };
+	void *buf;
+	int fd;
+	ssize_t rrc;
+	int rc;
+	SHA_CTX hash;
+	char *csum;
+	unsigned char md[SHA_DIGEST_LENGTH];
+
+	rc = ENOMEM;
+	buf = malloc(BUFLEN);
+	if (!buf)
+		goto err_alloc;
+
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		rc = errno;
+		goto err_open;
+	}
+	if (lseek(fd, sizeof(struct be_fs_obj_hdr) + klen, SEEK_SET) == (off_t)-1) {
+		rc = errno;
+		goto err_seek;
+	}
+
+	SHA1_Init(&hash);
+	for (;;) {
+		rrc = read(fd, buf, BUFLEN);
+		if (rrc < 0) {
+			rc = errno;
+			goto err_read;
+		}
+		if (rrc != 0)
+			SHA1_Update(&hash, buf, rrc);
+		if (rrc < BUFLEN)
+			break;
+	}
+	SHA1_Final(md, &hash);
+
+	csum = malloc(SHA_DIGEST_LENGTH*2 + 1);
+	if (!csum) {
+		rc = ENOMEM;
+		goto err_retdup;
+	}
+	hexstr(md, SHA_DIGEST_LENGTH, csum);
+
+	close(fd);
+	free(buf);
+	*csump = csum;
+	return 0;
+
+ err_retdup:
+ err_read:
+	close(fd);
+ err_seek:
+ err_open:
+	free(buf);
+ err_alloc:
+	return -rc;
+}
+
diff --git a/server/chunkd.h b/server/chunkd.h
index 18dd31a..fd3e28f 100644
--- a/server/chunkd.h
+++ b/server/chunkd.h
@@ -210,6 +210,7 @@ struct server {
 	char			*cell;
 	uint32_t		nid;
 	struct geo		loc;
+	time_t			chk_period;
 
 	TCHDB			*tbl_master;
 	struct objcache		actives;
@@ -217,6 +218,8 @@ struct server {
 	struct server_stats	stats;		/* global statistics */
 };
 
+#define MDB_TABLE_ID	"__chunkd_table_id"
+
 extern struct hail_log cldu_hail_log;
 
 /* be-fs.c */
@@ -245,6 +248,7 @@ extern bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
 extern bool fs_obj_delete(uint32_t table_id, const char *user,
 		          const void *kbuf, size_t klen,
 			  enum chunk_errcode *err_code);
+extern int fs_obj_disable(const char *fn);
 extern ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len);
 extern int fs_list_objs_open(struct fs_obj_lister *t,
 			     const char *root_path, uint32_t table_id);
@@ -257,6 +261,7 @@ extern GList *fs_list_objs(uint32_t table_id, const char *user);
 extern bool fs_table_open(const char *user, const void *kbuf, size_t klen,
 		   bool tbl_creat, bool excl_creat, uint32_t *table_id,
 		   enum chunk_errcode *err_code);
+extern int fs_obj_do_sum(const char *fn, unsigned int klen, char **csump);
 
 /* object.c */
 extern bool object_del(struct client *cli);
@@ -320,6 +325,9 @@ extern void resp_init_req(struct chunksrv_resp *resp,
 /* config.c */
 extern void read_config(void);
 
+/* selfcheck.c */
+extern void chk_init(void);
+
 static inline bool use_sendfile(struct client *cli)
 {
 #if defined(HAVE_SENDFILE) && defined(HAVE_SYS_SENDFILE_H)
diff --git a/server/config.c b/server/config.c
index 674e237..55ef327 100644
--- a/server/config.c
+++ b/server/config.c
@@ -442,6 +442,24 @@ static void cfg_elm_end (GMarkupParseContext *context,
 		cc->text = NULL;
 	}
 
+	else if (!strcmp(element_name, "SelfCheckPeriod")) {
+		if (!cc->text) {
+			applog(LOG_WARNING, "SelfCheckPeriod element empty");
+			return;
+		}
+		n = strtol(cc->text, NULL, 10);
+		if (n < 0 || n >= LONG_MAX) {
+			applog(LOG_ERR, "SelfCheckPeriod '%s' is invalid",
+			       cc->text);
+			free(cc->text);
+			cc->text = NULL;
+			return;
+		}
+		chunkd_srv.chk_period = n;
+		free(cc->text);
+		cc->text = NULL;
+	}
+
 	else {
 		applog(LOG_WARNING, "Unknown element \"%s\"", element_name);
 	}
@@ -527,6 +545,16 @@ void read_config(void)
 #endif
 	}
 
+	/*
+	 * The check period determines how quickly the self-check notices
+	 * bad keys when there are few keys or they are small (otherwise the
+	 * time to rescan determines the observation latency). To save
+	 * power, the default is several hours. Notice that it's randomized.
+	 * Zero means no self-check.
+	 */
+	if (!chunkd_srv.chk_period)
+		chunkd_srv.chk_period = 3 * 60 * 60;
+
 	free(ctx.geo_area);
 	free(ctx.geo_zone);
 	free(ctx.geo_rack);
diff --git a/server/selfcheck.c b/server/selfcheck.c
new file mode 100644
index 0000000..218516f
--- /dev/null
+++ b/server/selfcheck.c
@@ -0,0 +1,208 @@
+#define _GNU_SOURCE
+#include "chunkd-config.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <syslog.h>
+#include <tcutil.h>
+#include <tchdb.h>
+#include "chunkd.h"
+
+struct chk_arg {
+	time_t period;
+	TCHDB *hdb;
+	// GThread *gthread;
+};
+
+struct chk_tls {
+	struct chk_arg *arg;
+
+	int stat_ok;
+	int stat_conflict;
+};
+
+static void chk_list_objs(struct chk_tls *tls, uint32_t table_id)
+{
+	struct fs_obj_lister lister;
+	char *fn;
+	char *csum, *csum_act;
+	char *owner;
+	unsigned long long size;
+	time_t mtime;
+	void *key_in;
+	size_t klen_in;
+	struct objcache_entry *cep;
+	int rc;
+
+	memset(&lister, 0, sizeof(struct fs_obj_lister));
+	rc = fs_list_objs_open(&lister, chunkd_srv.vol_path, table_id);
+	if (rc) {
+		applog(LOG_WARNING, "Cannot open table %u: %s", table_id,
+		       strerror(-rc));
+		return;
+	}
+
+	while (fs_list_objs_next(&lister, &fn) > 0) {
+
+		rc = fs_obj_hdr_read(fn, &owner, &csum, &key_in, &klen_in,
+				     &size, &mtime);
+		if (rc < 0) {
+			free(fn);
+			break;
+		}
+
+		cep = objcache_get(&chunkd_srv.actives, key_in, klen_in);
+		if (!cep) {
+			/* This is pretty much impossible unless OOM */
+			applog(LOG_ERR, "chk: objcache_get failed");
+			free(owner);
+			free(csum);
+			free(key_in);
+			free(fn);
+			break;
+		}
+
+		rc = fs_obj_do_sum(fn, klen_in, &csum_act);
+		if (rc) {
+			applog(LOG_INFO, "Cannot compute checksum for %s", fn);
+		} else {
+			if (!objcache_test_dirty(&chunkd_srv.actives, cep)) {
+				if (strcmp(csum, csum_act)) {
+					applog(LOG_INFO,
+					       "Checksum mismatch for %s: "
+					       "expected %s actual %s",
+					       fn, csum, csum_act);
+					fs_obj_disable(fn);
+					/*
+					 * FIXME Suicide the whole server if
+					 * fs_obj_disable fails a few times,
+					 * maybe? But what about races?
+					 */
+				} else {
+					tls->stat_ok++;
+				}
+			} else {
+				tls->stat_conflict++;
+			}
+			free(csum_act);
+		}
+
+		objcache_put(&chunkd_srv.actives, cep);
+		free(owner);
+		free(csum);
+		free(key_in);
+
+		free(fn);
+	}
+}
+
+static void chk_dbscan(struct chk_tls *tls)
+{
+	TCHDB *hdb = tls->arg->hdb;
+	void *kbuf;
+	int klen;
+	uint32_t *val_p;
+	int vlen;
+
+	tchdbiterinit(hdb);
+	while ((kbuf = tchdbiternext(hdb, &klen)) != NULL) {
+		if (!strcmp(kbuf, MDB_TABLE_ID)) {
+			free(kbuf);
+			continue;
+		}
+		val_p = tchdbget(hdb, kbuf, klen, &vlen);
+		if (!val_p) {
+			free(kbuf);
+			continue;
+		}
+		if (vlen != sizeof(int32_t)) {
+			applog(LOG_INFO, "table %s bad size %d", kbuf, vlen);
+			free(val_p);
+			free(kbuf);
+			continue;
+		}
+
+		chk_list_objs(tls, GUINT32_FROM_LE(*val_p));
+
+		free(val_p);
+		free(kbuf);
+	}
+}
+
+static gpointer chk_thread_func(gpointer data)
+{
+	struct chk_tls _tls;
+	struct chk_tls *tls;
+	time_t dt;
+	struct timespec sleepreq;
+
+	tls = &_tls;
+	memset(tls, 0, sizeof(struct chk_tls));
+	tls->arg = data;
+
+	for (;;) {
+		/*
+		 * Without the randomization, all systems in a low loaded
+		 * datacenter are virtually guaranteed to start checks in the
+		 * same time, blow fuses and/or disrupt applications.
+		 */
+		dt = tls->arg->period;
+		if (dt >= 3)
+			dt += rand() % (dt/3);
+		if (debugging)
+			applog(LOG_DEBUG, "chk: sleeping %lu s", dt);
+		sleepreq.tv_sec = dt;
+		sleepreq.tv_nsec = 0;
+		nanosleep(&sleepreq, NULL);
+
+		tls->stat_ok = 0;
+		tls->stat_conflict = 0;
+
+		chk_dbscan(tls);
+ 
+		if (debugging)
+			applog(LOG_INFO, "chk: done ok %d busy %d",
+			       tls->stat_ok,
+			       tls->stat_conflict);
+	}
+	return NULL;
+}
+
+/*
+ * Mind that we cannot have two threads scanning the master db,
+ * as long as Tokyo Cabinet embeds one and only one iterator into
+ * an instance of open database with tchdbiterinit().
+ */
+static struct chk_arg *thread;
+
+void chk_init()
+{
+	GError *error;
+	struct chk_arg *arg;
+	GThread *gthread;
+
+	if (!chunkd_srv.chk_period) {
+		if (debugging)
+			applog(LOG_DEBUG, "chk: disabled by configuration");
+		return;
+	}
+
+	arg = malloc(sizeof(struct chk_arg));
+	if (!arg) {
+		applog(LOG_ERR, "No core");
+		exit(1);
+	}
+	arg->period = chunkd_srv.chk_period;
+	arg->hdb = chunkd_srv.tbl_master;
+
+	gthread = g_thread_create(chk_thread_func, arg, FALSE, &error);
+	if (!gthread) {
+		applog(LOG_ERR, "Failed to start replication thread: %s",
+		       error->message);
+		exit(1);
+	}
+
+	// arg->gthread = gthread;
+	thread = arg;
+}
diff --git a/server/server.c b/server/server.c
index e955dc8..fddc556 100644
--- a/server/server.c
+++ b/server/server.c
@@ -1678,6 +1678,8 @@ int main (int argc, char *argv[])
 		goto err_out_cld;
 	}
 
+	chk_init();
+
 	/* set up server networking */
 	for (tmpl = chunkd_srv.listeners; tmpl; tmpl = tmpl->next) {
 		rc = net_open(tmpl->data);
diff --git a/test/.gitignore b/test/.gitignore
index a929882..dd7d312 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -5,6 +5,7 @@ large-object
 lotsa-objects
 nop
 objcache-unit
+selfcheck-unit
 
 .libs
 libtest.a
diff --git a/test/Makefile.am b/test/Makefile.am
index 84b4837..e1f7ac4 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -26,11 +26,12 @@ TESTS =				\
 	auth			\
 	large-object		\
 	lotsa-objects		\
+	selfcheck-unit		\
 	stop-daemon		\
 	clean-db
 
 check_PROGRAMS		= auth basic-object it-works large-object \
-			  lotsa-objects nop objcache-unit
+			  lotsa-objects nop objcache-unit selfcheck-unit
 
 TESTLDADD		= ../lib/libchunkdc.la	\
 			  libtest.a		\
@@ -42,6 +43,7 @@ it_works_LDADD		= $(TESTLDADD)
 large_object_LDADD	= $(TESTLDADD)
 lotsa_objects_LDADD	= $(TESTLDADD)
 nop_LDADD		= $(TESTLDADD)
+selfcheck_unit_LDADD	= $(TESTLDADD)
 
 objcache_unit_LDADD	= @GLIB_LIBS@
 
diff --git a/test/selfcheck-unit.c b/test/selfcheck-unit.c
new file mode 100644
index 0000000..7b6ca0d
--- /dev/null
+++ b/test/selfcheck-unit.c
@@ -0,0 +1,313 @@
+
+/*
+ * Copyright 2009 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define _GNU_SOURCE
+#include "chunkd-config.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <glib.h>
+#include <chunkc.h>
+#include "test.h"
+
+enum { BUFLEN = 8192 };
+
+struct config_context {
+	char *text;
+
+	char *path;
+	long period;
+};
+
+static void cfg_elm_start(GMarkupParseContext *context,
+			  const gchar	 *element_name,
+			  const gchar    **attribute_names,
+			  const gchar    **attribute_values,
+			  gpointer       user_data,
+			  GError         **error)
+{
+	;
+}
+
+static void cfg_elm_end(GMarkupParseContext *context,
+			const gchar	*element_name,
+			gpointer	user_data,
+			GError		**error)
+{
+	struct config_context *cc = user_data;
+	long n;
+
+	if (!strcmp(element_name, "Path")) {
+		OK(cc->text);
+		free(cc->path);
+		cc->path = cc->text;
+		cc->text = NULL;
+	} else if (!strcmp(element_name, "SelfCheckPeriod")) {
+		OK(cc->text);
+		n = strtol(cc->text, NULL, 10);
+		OK(n >= 0 && n < LONG_MAX);
+		cc->period = n;
+		free(cc->text);
+		cc->text = NULL;
+	} else {
+		free(cc->text);
+		cc->text = NULL;
+	}
+}
+
+static bool str_n_isspace(const char *s, size_t n)
+{
+	char c;
+	size_t i;
+
+	for (i = 0; i < n; i++) {
+		c = *s++;
+		if (!isspace(c))
+			return false;
+	}
+	return true;
+}
+
+static void cfg_elm_text(GMarkupParseContext *context,
+			 const gchar	*text,
+			 gsize		text_len,
+			 gpointer	user_data,
+			 GError		**error)
+{
+	struct config_context *cc = user_data;
+
+	free(cc->text);
+	if (str_n_isspace(text, text_len))
+		cc->text = NULL;
+	else
+		cc->text = g_strndup(text, text_len);
+}
+
+static const GMarkupParser cfg_parse_ops = {
+	.start_element		= cfg_elm_start,
+	.end_element		= cfg_elm_end,
+	.text			= cfg_elm_text,
+};
+
+static void read_config(struct config_context *cc)
+{
+	GMarkupParseContext* parser;
+	char *top, *cfg;
+	char *text;
+	gsize len;
+	int rc;
+
+	top = getenv("top_srcdir");
+	OK(top);
+
+	rc = asprintf(&cfg, "%s/test/" TEST_CHUNKD_CFG, top);
+	OK(rc > 0);
+
+	rc = g_file_get_contents(cfg, &text, &len, NULL);
+	OK(rc);
+
+	parser = g_markup_parse_context_new(&cfg_parse_ops, 0, cc, NULL);
+	OK(parser);
+
+	rc = g_markup_parse_context_parse(parser, text, len, NULL);
+	OK(rc);
+
+	g_markup_parse_context_free(parser);
+	free(text);
+	free(cfg);
+}
+
+static void hexstr(const unsigned char *buf, size_t buf_len, char *outstr)
+{
+	static const char hex[] = "0123456789abcdef";
+	int i;
+
+	for (i = 0; i < buf_len; i++) {
+		outstr[i * 2]       = hex[(buf[i] & 0xF0) >> 4];
+		outstr[(i * 2) + 1] = hex[(buf[i] & 0x0F)     ];
+	}
+
+	outstr[buf_len * 2] = 0;
+}
+
+#define MDB_TPATH_FMT	"%s/%X"
+
+static char *fs_obj_pathname(const char *path, uint32_t table_id,
+			     const void *key, size_t key_len)
+{
+	char *s = NULL;
+	char prefix[5];
+	unsigned char md[SHA256_DIGEST_LENGTH];
+	char mdstr[(SHA256_DIGEST_LENGTH * 2) + 1];
+	int rc;
+
+	if (!table_id || !key || !key_len)
+		return NULL;
+
+	SHA256(key, key_len, md);
+	hexstr(md, SHA256_DIGEST_LENGTH, mdstr);
+
+	memcpy(prefix, mdstr, 4);
+	prefix[4] = 0;
+
+	rc = asprintf(&s, MDB_TPATH_FMT "/%s/%s", path, table_id,
+		      prefix, mdstr + 4);
+	OK(rc != -1);
+
+	return s;
+}
+
+static bool be_file_verify(const char *fn)
+{
+	int fd;
+
+	/* stat(2) is nice and all, but whatever. */
+	fd = open(fn, O_RDONLY);
+	if (fd < 0)
+		return false;
+	close(fd);
+	return true;
+}
+
+static bool be_file_damage(const char *fn)
+{
+	int fd;
+	char buf[1];
+	ssize_t rcs;
+	off_t rco;
+
+	fd = open(fn, O_WRONLY);
+	if (fd < 0)
+		return false;
+
+	/*
+	 * This puts the damage at data size minus the mysterious header size.
+	 */
+	rco = lseek(fd, BUFLEN, SEEK_SET);
+	if (rco == (off_t)-1) {
+		close(fd);
+		return false;
+	}
+
+	buf[0] = 0;
+	rcs = write(fd, buf, 1);
+	if (rcs <= 0) {
+		close(fd);
+		return false;
+	}
+
+	close(fd);
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	static char key[] = "selfcheck-test-key";
+	int port;
+	char *buf;
+	struct config_context ctx;
+	struct st_client *stc;
+	char *fn;
+	size_t len;
+	void *mem;
+	bool rcb;
+
+	setlocale(LC_ALL, "C");
+
+	stc_init();
+	SSL_library_init();
+
+	buf = malloc(BUFLEN);
+	OK(buf);
+	memset(buf, 0x55, BUFLEN);
+
+	port = stc_readport(TEST_PORTFILE);
+	OK(port > 0);
+
+	/*
+	 * Step 0: read and parse the configuration.
+	 */
+	memset(&ctx, 0, sizeof(struct config_context));
+	read_config(&ctx);
+	OK(ctx.path);		/* must have a path */
+	OK(ctx.period);		/* must have an explicit SelfCheckPeriod */
+	OK(ctx.period <= 100);	/* A minute is already too long for tests */
+
+	/*
+	 * Step 1: create the object
+	 */
+	stc = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, false);
+	OK(stc);
+	rcb = stc_table_openz(stc, TEST_TABLE, 0);
+	OK(rcb);
+	rcb = stc_put_inline(stc, key, sizeof(key), buf, BUFLEN, 0);
+	OK(rcb);
+	stc_free(stc);
+
+	/*
+	 * Step 2: verify the back-end file is created
+	 * N.B. We guess the tabled ID to be 1, sice all tests use the same
+	 *      table and they are numbered sequentially on a fresh DB.
+	 */
+	fn = fs_obj_pathname(ctx.path, 1, key, sizeof(key));
+	OK(fn);
+	rcb = be_file_verify(fn);
+	OK(rcb);
+
+	/*
+	 * Step 3: damage the back-end file and wait
+	 * We sleep for longer than the self-check period because:
+	 * 1) the server sleeps for up to period*1.5
+	 * 2) the server may be quite busy walking numerous objects
+	 * 3) the build system may be overloaded
+	 */
+	rcb = be_file_damage(fn);
+	OK(rcb);
+	sleep(ctx.period * 3);
+
+	/*
+	 * Step 4: verify that the damaged object is removed
+	 * This is, strictly speaking, not necessary. The true test is
+	 * trying to access the keyed object through the chunkserver's API.
+	 * But since we have the function already, might as well use it.
+	 */
+	rcb = be_file_verify(fn);
+	OK(!rcb);
+
+	free(fn);
+	fn = NULL;
+
+	/*
+	 * Step 5: verify that we didn't crash the chunkserver and that
+	 * the object we created and damaged is not considered present anymore.
+	 */
+	stc = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, false);
+	OK(stc);
+	rcb = stc_table_openz(stc, TEST_TABLE, 0);
+	OK(rcb);
+	mem = stc_get_inline(stc, key, sizeof(key), &len);
+	OK(!mem);
+	stc_free(stc);
+
+	return 0;
+}
diff --git a/test/server-test.cfg b/test/server-test.cfg
index 80833d2..a487556 100644
--- a/test/server-test.cfg
+++ b/test/server-test.cfg
@@ -27,3 +27,5 @@
   <Host>localhost</Host>
 </CLD>
 
+<SelfCheckPeriod>35</SelfCheckPeriod>
+
diff --git a/test/test.h b/test/test.h
index c3c48b7..75aa250 100644
--- a/test/test.h
+++ b/test/test.h
@@ -38,6 +38,8 @@
 #define TEST_PORTFILE		"chunkd.port"
 #define TEST_PORTFILE_SSL	"chunkd-ssl.port"
 
+#define TEST_CHUNKD_CFG		"server-test.cfg"
+
 #define OK(expr)				\
 	do {					\
 		if (!(expr)) {			\

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [Patch 4/4] chunkd: add self-checking
  2009-12-27 23:59 [Patch 4/4] chunkd: add self-checking Pete Zaitcev
@ 2010-01-12 14:21 ` Jeff Garzik
  2010-01-12 16:52   ` Pete Zaitcev
  0 siblings, 1 reply; 3+ messages in thread
From: Jeff Garzik @ 2010-01-12 14:21 UTC (permalink / raw)
  To: Pete Zaitcev; +Cc: Project Hail List

On 12/27/2009 06:59 PM, Pete Zaitcev wrote:
> This patch adds a background process that periodically verifies the
> integrity of all known objects. Objects that are found found faulty
> are made invisible to applications. This way, only known good objects
> remain visible and applications may implement integrity checking that
> is run asynchronously with the self-checking. The intent is to let
> the performance of checking to scale with the number of objects and
> amount of data stored in the cell.
>
> Signed-off-by: Pete Zaitcev<zaitcev@redhat.com>
>
> ---
>   server/Makefile.am    |    2
>   server/be-fs.c        |  126 +++++++++++++++-
>   server/chunkd.h       |    8 +
>   server/config.c       |   28 +++
>   server/selfcheck.c    |  208 ++++++++++++++++++++++++++
>   server/server.c       |    2
>   test/.gitignore       |    1
>   test/Makefile.am      |    4
>   test/selfcheck-unit.c |  313 ++++++++++++++++++++++++++++++++++++++++
>   test/server-test.cfg  |    2
>   test/test.h           |    2
>   11 files changed, 693 insertions(+), 3 deletions(-)

OK.  The objcache (patches 3 & 5) look good.  I was thinking that the 
"table of open files" was better suited for server/be-fs.c, thus keeping 
the backend / frontend separation a bit more distinct.  However, this is 
a matter of taste, and I can also see advantages in how you implemented 
it.  So, no changes necessary, we'll how that works out.

The reason why I think about backend / frontend separation, when we only 
have one storage backend, "the filesystem", is that I see a lot of merit 
in a second type of backend:  single-file (aka raw block device mode). 
I don't want to stray too far from being able to implement a second type 
of storage backend.  On the flip side, of course, I don't want imaginary 
future ideas to constraint us in the present.

On to this patch, self-checking.  Regarding this patch's implementation, 
the code appears to be correct.  However, I wanted to discuss the design 
a little bit.

I am definitely concerned that the current implementation does not 
really scale at all with the datastore.  With this change, the admin 
would have to guess at the self-check period, with not much feedback 
from the system about how long a scan takes, when a scan begins, or when 
a scan ends.  I also think a

	sleep(n)
	self_check()

algorithm seems less useful to the average admin than a slightly more 
complex one that solves the problem defined as "guarantee an object is 
checked at least every N days."  Because, as the dataset grows beyond a 
test database measures in megabytes, the dataset scan time will dwarf 
the self-check sleep period.  The behavior becomes one of constant 
scanning, with a tiny period of rest in between.

It seems like either (a) tracking the total dataset size and total 
dataset scan time, or (b) tracking and modifying per-object 
last-self-check times, would be needed.

Also of relevance to admins is scan start time.  This patch would have a 
scan begin at essentially random times throughout the day or night, in 
particular occurring during the most heavily-trafficked portion of the 
day.  An algorithm that occurs on hour N every day (or hour N, day M of 
each week) is much more predictable, regardless of dataset size.  That 
would be pretty easy:

	while (1)
		calc time until next desired interval
			(example: day 0, hour 5 (every sunday at 5am))
		timer_init()
		timer_add(scan_timer)

	scan_timer()
		if (scan thread exists)
			log ("still scanning")
			return
		start scan thread

	scan_thread()
		dbscan()
		end thread (do not loop)

Another option is to add an administrative command "START SCAN", and 
permit an external utility, scheduled by cron or executed by "make 
check", to be the entity that starts the scan thread in the background. 
  That would permit maximum flexibility for both the admin and our 
testsuite.

Comments on all this?  I will happily apply the objcache stuff once all 
the self-checking details are settled.

	Jeff



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [Patch 4/4] chunkd: add self-checking
  2010-01-12 14:21 ` Jeff Garzik
@ 2010-01-12 16:52   ` Pete Zaitcev
  0 siblings, 0 replies; 3+ messages in thread
From: Pete Zaitcev @ 2010-01-12 16:52 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Project Hail List

On Tue, 12 Jan 2010 09:21:24 -0500
Jeff Garzik <jeff@garzik.org> wrote:

> 	sleep(n)
> 	self_check()
> 
> algorithm seems less useful to the average admin than a slightly more 
> complex one that solves the problem defined as "guarantee an object is 
> checked at least every N days."  Because, as the dataset grows beyond a 
> test database measures in megabytes, the dataset scan time will dwarf 
> the self-check sleep period.  The behavior becomes one of constant 
> scanning, with a tiny period of rest in between.

That's obvious. You also forgot to recall that your "fat" node
exhacerbates the problem, but conversely, splitting them up helps.
The single-threaded design is on purpose: it provides a crude
method of load control. But that aside, two things about your
scheme:
 - how do you select N? It is no different from n in its arbitrariness.
 - "as the data set grows beyond a test database", it is going to
   take more work to satisfy N.
What you are proposing is actually no different. It is not more
adaptive.

> It seems like either (a) tracking the total dataset size and total 
> dataset scan time, or (b) tracking and modifying per-object 
> last-self-check times, would be needed.

Well, that would be nice.

Still, we are powerless to keep the scan time down when the dataset
grows. At best, we can sacrificy the mean detection time and constrain
the power that the scan consumes.

I'm thinking about doing just that actually. But first I'm going to
implement a reporting scheme in tabled.

> Also of relevance to admins is scan start time.  This patch would have a 
> scan begin at essentially random times throughout the day or night, in 
> particular occurring during the most heavily-trafficked portion of the 
> day.  An algorithm that occurs on hour N every day (or hour N, day M of 
> each week) is much more predictable, regardless of dataset size.

So, you want the mean time to detect to be at least 12 hours.
I can do that if you insist.

> Another option is to add an administrative command "START SCAN", and 
> permit an external utility, scheduled by cron or executed by "make 
> check", to be the entity that starts the scan thread in the background. 
>   That would permit maximum flexibility for both the admin and our 
> testsuite.

We can do that too, I guess. Just don't know when.

-- Pete

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2010-01-12 16:52 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-12-27 23:59 [Patch 4/4] chunkd: add self-checking Pete Zaitcev
2010-01-12 14:21 ` Jeff Garzik
2010-01-12 16:52   ` Pete Zaitcev

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.