* [PATCH] chunkd: add support for multiple key/value tables
@ 2009-11-10 11:24 Jeff Garzik
2009-11-10 16:33 ` Pete Zaitcev
0 siblings, 1 reply; 6+ messages in thread
From: Jeff Garzik @ 2009-11-10 11:24 UTC (permalink / raw)
To: hail-devel
This is fully fleshed out and working, but I have not committed it yet,
in case there is feedback or major objections.
The following patch presents the last major chunkd conceptual change
I felt was needed in the chunkd API: multiple key/value tables.
Applications and users will often want to be partitioned away from
each other, or will simply need something other than a
flat namespace. With this change, chunkd's API is now much like Amazon
S3, which has a shared namespace for buckets, and then each bucket has
its own namespace.
This is made possible with a single API addition, TABLE OPEN.
Here is what a chunkd session now looks like, with this patch:
LOGIN(user="jgarzik")
TABLE-OPEN(name="tabled")
GET...
GET...
GET...
PUT...
PUT...
PUT...
The corresponding C API call is stc_table_open().
With this change, some of my upper layer projects (NFS, SQL) become
more manageable. This change enables me to dedicate several tables
to each application and/or user.
configure.ac | 1
doc/chcli.cfg | 3 +
include/chunk_msg.h | 4 +
include/chunkc.h | 8 ++
lib/chunkdc.c | 42 ++++++++++++++
server/Makefile.am | 6 +-
server/be-fs.c | 152 ++++++++++++++++++++++++++++++++++++++++++++-------
server/chunkd.h | 15 +++--
server/object.c | 7 +-
server/server.c | 64 +++++++++++++++++++--
test/auth.c | 6 ++
test/basic-object.c | 3 +
test/it-works.c | 9 +++
test/large-object.c | 3 +
test/lotsa-objects.c | 3 +
test/nop.c | 3 +
test/test.h | 2
tools/chcli.c | 38 +++++++++++-
18 files changed, 332 insertions(+), 37 deletions(-)
diff --git a/configure.ac b/configure.ac
index 8c32383..f54cf27 100644
--- a/configure.ac
+++ b/configure.ac
@@ -79,6 +79,7 @@ AC_CHECK_LIB(event, event_base_new, EVENT_LIBS=-levent,
AC_CHECK_LIB(argp, argp_parse, ARGP_LIBS=-largp)
AC_CHECK_LIB(socket, bind, SOCKET_LIBS=-lsocket)
PKG_CHECK_MODULES(CLDC, libcldc)
+PKG_CHECK_MODULES(TOKYOCABINET, tokyocabinet)
dnl -----------------------------
dnl Check for cld program, used
diff --git a/doc/chcli.cfg b/doc/chcli.cfg
index c27b956..63ef148 100644
--- a/doc/chcli.cfg
+++ b/doc/chcli.cfg
@@ -9,6 +9,9 @@
## provide the host:port pair of the chunkd service
# host=127.0.0.1:9191
+## provide the initial table to open and communicate with
+# table=my_table_name
+
## provide the username and secret key password for authentication.
## password is ready from CHCLI_PASSWORD env var, if not supplied here.
# username=guest
diff --git a/include/chunk_msg.h b/include/chunk_msg.h
index 90272ca..4d3d208 100644
--- a/include/chunk_msg.h
+++ b/include/chunk_msg.h
@@ -21,6 +21,8 @@ enum chunksrv_ops {
CHO_DEL = 4,
CHO_LIST = 5,
CHO_LOGIN = 6,
+ CHO_TABLE_OPEN = 7,
+ CHO_TABLE_DEL = 8,
};
enum chunk_errcode {
@@ -32,10 +34,12 @@ enum chunk_errcode {
che_NoSuchKey = 5,
che_SignatureDoesNotMatch = 6,
che_InvalidKey = 7,
+ che_InvalidTable = 8,
};
enum chunk_flags {
CHF_SYNC = (1 << 0), /* force write to media */
+ CHF_TABLE_NEW = (1 << 1), /* create table */
};
struct chunksrv_req {
diff --git a/include/chunkc.h b/include/chunkc.h
index 768eecd..04cbd87 100644
--- a/include/chunkc.h
+++ b/include/chunkc.h
@@ -44,6 +44,8 @@ extern void stc_init(void);
extern struct st_client *stc_new(const char *service_host, int port,
const char *user, const char *secret_key,
bool encrypt);
+extern bool stc_table_open(struct st_client *stc, const void *key, size_t key_len,
+ uint32_t flags);
extern bool stc_get(struct st_client *stc, const void *key, size_t key_len,
size_t (*write_cb)(void *, size_t, size_t, void *),
@@ -103,4 +105,10 @@ static inline bool stc_delz(struct st_client *stc, const char *key)
return stc_del(stc, key, strlen(key) + 1);
}
+static inline bool stc_table_openz(struct st_client *stc, const char *key,
+ uint32_t flags)
+{
+ return stc_table_open(stc, key, strlen(key) + 1, flags);
+}
+
#endif /* __STC_H__ */
diff --git a/lib/chunkdc.c b/lib/chunkdc.c
index 1597e91..c9606a8 100644
--- a/lib/chunkdc.c
+++ b/lib/chunkdc.c
@@ -453,6 +453,48 @@ size_t stc_get_recv(struct st_client *stc, void *data, size_t data_len)
return done_cnt;
}
+bool stc_table_open(struct st_client *stc, const void *key, size_t key_len,
+ uint32_t flags)
+{
+ struct chunksrv_resp resp;
+ struct chunksrv_req *req = (struct chunksrv_req *) stc->req_buf;
+
+ if (stc->verbose)
+ fprintf(stderr, "libstc: TABLE OPEN(%u, %u)\n",
+ (unsigned int) key_len,
+ flags);
+
+ if (!key_valid(key, key_len))
+ return false;
+
+ /* initialize request */
+ req_init(stc, req);
+ req->op = CHO_TABLE_OPEN;
+ req->flags = (flags & CHF_TABLE_NEW);
+ req_set_key(req, key, key_len);
+
+ /* sign request */
+ chreq_sign(req, stc->key, req->sig);
+
+ /* write request */
+ if (!net_write(stc, req, req_len(req)))
+ return false;
+
+ /* read response header */
+ if (!net_read(stc, &resp, sizeof(resp)))
+ return false;
+
+ /* check response code */
+ if (resp.resp_code != che_Success) {
+ if (stc->verbose)
+ fprintf(stderr, "TABLE OPEN resp code: %d\n",
+ resp.resp_code);
+ return false;
+ }
+
+ return true;
+}
+
bool stc_put(struct st_client *stc, const void *key, size_t key_len,
size_t (*read_cb)(void *, size_t, size_t, void *),
uint64_t len, void *user_data, uint32_t flags)
diff --git a/server/Makefile.am b/server/Makefile.am
index 70fd066..7589a38 100644
--- a/server/Makefile.am
+++ b/server/Makefile.am
@@ -1,5 +1,6 @@
-INCLUDES = -I$(top_srcdir)/include @GLIB_CFLAGS@ @CLDC_CFLAGS@
+INCLUDES = -I$(top_srcdir)/include @GLIB_CFLAGS@ @CLDC_CFLAGS@ \
+ @TOKYOCABINET_CFLAGS@
sbin_PROGRAMS = chunkd
@@ -8,4 +9,5 @@ chunkd_SOURCES = chunkd.h \
be-fs.c object.c server.c config.c cldu.c util.c
chunkd_LDADD = \
@CLDC_LIBS@ @GLIB_LIBS@ @CRYPTO_LIBS@ \
- @SSL_LIBS@ @EVENT_LIBS@ @ARGP_LIBS@ @SOCKET_LIBS@
+ @SSL_LIBS@ @EVENT_LIBS@ @ARGP_LIBS@ @SOCKET_LIBS@ \
+ @TOKYOCABINET_LIBS@
diff --git a/server/be-fs.c b/server/be-fs.c
index fb301b8..48ce698 100644
--- a/server/be-fs.c
+++ b/server/be-fs.c
@@ -16,10 +16,14 @@
#include <string.h>
#include <errno.h>
#include <syslog.h>
+#include <tcutil.h>
+#include <tchdb.h>
#include "chunkd.h"
#define BE_NAME "fs"
+#define MDB_TABLE_ID "__chunkd_table_id"
+
struct fs_obj {
struct backend_obj bo;
@@ -37,6 +41,103 @@ struct be_fs_obj_hdr {
uint32_t key_len;
};
+bool fs_table_open(const char *user, const void *kbuf, size_t klen,
+ bool create_tbl, uint32_t *table_id,
+ enum chunk_errcode *err_code)
+{
+ TCHDB *hdb;
+ char *db_fn = NULL, *table_path = NULL;
+ int omode, osize = 0, next_num;
+ bool rc = false;
+ uint32_t *val_p, table_id_le;
+
+ *err_code = che_InternalError;
+
+ /* validate table name */
+ if (klen < 1 || klen > CHD_KEY_SZ ||
+ (klen >= strlen(MDB_TABLE_ID) &&
+ !memcmp(kbuf, MDB_TABLE_ID, strlen(MDB_TABLE_ID)))) {
+ *err_code = che_InvalidArgument;
+ return false;
+ }
+
+ /*
+ * open master database
+ */
+ if (asprintf(&db_fn, "%s/master.tch", chunkd_srv.vol_path) < 0)
+ return false;
+
+ hdb = tchdbnew();
+ if (!hdb)
+ goto out;
+
+ omode = HDBOREADER | HDBONOLCK;
+ if (create_tbl)
+ omode |= HDBOWRITER | HDBOCREAT | HDBOTSYNC;
+ if (!tchdbopen(hdb, db_fn, omode)) {
+ applog(LOG_ERR, "failed to open master table %s", db_fn);
+ goto out_hdb;
+ }
+
+ /*
+ * lookup table name. if found, return immediately
+ */
+ val_p = tchdbget(hdb, kbuf, klen, &osize);
+ if (val_p) {
+ if (create_tbl) {
+ *err_code = che_InvalidArgument;
+ goto out_close;
+ }
+
+ *table_id = GUINT32_FROM_LE(*val_p);
+ goto out_ok;
+ }
+
+ /*
+ * otherwise, we now begin the process of table creation
+ */
+
+ if (!create_tbl) {
+ *err_code = che_InvalidArgument;
+ goto out_close;
+ }
+
+ /* allocate unique integer id for table */
+ next_num = tchdbaddint(hdb, MDB_TABLE_ID, strlen(MDB_TABLE_ID)+1, 1);
+ if (next_num == INT_MIN)
+ goto out_close;
+
+ *table_id = next_num;
+ table_id_le = GUINT32_TO_LE(next_num);
+
+ /*
+ * create table directory, $BASE_PATH/table-id
+ */
+ if (asprintf(&table_path, "%s/%d", chunkd_srv.vol_path, next_num) < 0)
+ goto out_close;
+
+ if ((mkdir(table_path, 0777) < 0) && (errno != EEXIST)) {
+ applog(LOG_ERR, "mkdir(%s): %s", table_path, strerror(errno));
+ goto out_close;
+ }
+
+ /* finally, store in table_name->table_id map */
+ if (!tchdbput(hdb, kbuf, klen, &table_id_le, sizeof(table_id_le)))
+ goto out_close;
+
+out_ok:
+ *err_code = che_Success;
+ rc = true;
+out_close:
+ tchdbclose(hdb);
+out_hdb:
+ tchdbdel(hdb);
+out:
+ free(db_fn);
+ free(table_path);
+ return rc;
+}
+
static struct fs_obj *fs_obj_alloc(void)
{
struct fs_obj *obj;
@@ -53,7 +154,7 @@ static struct fs_obj *fs_obj_alloc(void)
return obj;
}
-static char *fs_obj_pathname(const void *key, size_t key_len)
+static char *fs_obj_pathname(uint32_t table_id,const void *key, size_t key_len)
{
char *s = NULL;
char prefix[5] = "";
@@ -62,19 +163,23 @@ static char *fs_obj_pathname(const void *key, size_t key_len)
unsigned char md[SHA256_DIGEST_LENGTH];
char mdstr[(SHA256_DIGEST_LENGTH * 2) + 1];
+ if (!table_id || !key || !key_len)
+ return NULL;
+
SHA256(key, key_len, md);
hexstr(md, SHA256_DIGEST_LENGTH, mdstr);
memcpy(prefix, mdstr, 4);
- slen = strlen(chunkd_srv.vol_path) + 1 +
- strlen(prefix) + 1 +
- strlen(mdstr) + 1;
+ slen = strlen(chunkd_srv.vol_path) + 1 + /* volume */
+ 16 + /* table id */
+ strlen(prefix) + 1 + /* prefix */
+ strlen(mdstr) + 1; /* filename */
s = malloc(slen);
if (!s)
return NULL;
- sprintf(s, "%s/%s", chunkd_srv.vol_path, prefix);
+ sprintf(s, "%s/%u/%s", chunkd_srv.vol_path, table_id, prefix);
/* create subdir on the fly, if not already exists */
if (stat(s, &st) < 0) {
@@ -97,7 +202,8 @@ static char *fs_obj_pathname(const void *key, size_t key_len)
goto err_out;
}
- sprintf(s, "%s/%s/%s", chunkd_srv.vol_path, prefix, mdstr + 4);
+ sprintf(s, "%s/%u/%s/%s", chunkd_srv.vol_path, table_id,
+ prefix, mdstr + 4);
return s;
@@ -114,7 +220,8 @@ static bool key_valid(const void *key, size_t key_len)
return true;
}
-struct backend_obj *fs_obj_new(const void *key, size_t key_len,
+struct backend_obj *fs_obj_new(uint32_t table_id,
+ const void *key, size_t key_len,
enum chunk_errcode *err_code)
{
struct fs_obj *obj;
@@ -136,7 +243,7 @@ struct backend_obj *fs_obj_new(const void *key, size_t key_len,
}
/* build local fs pathname */
- fn = fs_obj_pathname(key, key_len);
+ fn = fs_obj_pathname(table_id, key, key_len);
if (!fn) {
applog(LOG_ERR, "OOM in object_put");
*err_code = che_InternalError;
@@ -194,8 +301,9 @@ err_out:
return NULL;
}
-struct backend_obj *fs_obj_open(const char *user, const void *key,
- size_t key_len, enum chunk_errcode *err_code)
+struct backend_obj *fs_obj_open(uint32_t table_id, const char *user,
+ const void *key, size_t key_len,
+ enum chunk_errcode *err_code)
{
struct fs_obj *obj;
struct stat st;
@@ -214,7 +322,7 @@ struct backend_obj *fs_obj_open(const char *user, const void *key,
}
/* build local fs pathname */
- obj->in_fn = fs_obj_pathname(key, key_len);
+ obj->in_fn = fs_obj_pathname(table_id, key, key_len);
if (!obj->in_fn) {
*err_code = che_InternalError;
goto err_out;
@@ -457,7 +565,8 @@ bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
return true;
}
-bool fs_obj_delete(const char *user, const void *key, size_t key_len,
+bool fs_obj_delete(uint32_t table_id, const char *user,
+ const void *key, size_t key_len,
enum chunk_errcode *err_code)
{
char *fn = NULL;
@@ -473,7 +582,7 @@ bool fs_obj_delete(const char *user, const void *key, size_t key_len,
}
/* build local fs pathname */
- fn = fs_obj_pathname(key, key_len);
+ fn = fs_obj_pathname(table_id, key, key_len);
if (!fn)
goto err_out;
@@ -532,18 +641,22 @@ err_out:
return false;
}
-GList *fs_list_objs(const char *user)
+GList *fs_list_objs(uint32_t table_id, const char *user)
{
GList *res = NULL;
struct dirent *de, *root_de;
DIR *d, *root;
- char *sub;
+ char *sub, *table_path = NULL;
+
+ sub = alloca(strlen(chunkd_srv.vol_path) + 1 + 16 + 4 + 1);
- sub = alloca(strlen(chunkd_srv.vol_path) + 1 + 4 + 1);
+ if (asprintf(&table_path, "%s/%u", chunkd_srv.vol_path, table_id) < 0)
+ return NULL;
- root = opendir(chunkd_srv.vol_path);
+ root = opendir(table_path);
if (!root) {
- syslogerr(chunkd_srv.vol_path);
+ syslogerr(table_path);
+ free(table_path);
return NULL;
}
@@ -555,7 +668,7 @@ GList *fs_list_objs(const char *user)
if (strlen(root_de->d_name) != 4)
continue;
- sprintf(sub, "%s/%s", chunkd_srv.vol_path, root_de->d_name);
+ sprintf(sub, "%s/%s", table_path, root_de->d_name);
d = opendir(sub);
if (!d) {
syslogerr(sub);
@@ -688,6 +801,7 @@ GList *fs_list_objs(const char *user)
closedir(root);
+ free(table_path);
return res;
}
diff --git a/server/chunkd.h b/server/chunkd.h
index d6b37c6..2058144 100644
--- a/server/chunkd.h
+++ b/server/chunkd.h
@@ -78,6 +78,9 @@ struct client {
char user[CHD_USER_SZ + 1];
+ size_t table_len;
+ uint32_t table_id;
+
SSL *ssl;
bool read_want_write;
bool write_want_read;
@@ -107,6 +110,7 @@ struct client {
char netbuf[CLI_DATA_BUF_SZ];
char netbuf_out[CLI_DATA_BUF_SZ];
char key[CHD_KEY_SZ];
+ char table[CHD_KEY_SZ];
};
struct backend_obj {
@@ -192,9 +196,9 @@ struct server {
};
/* be-fs.c */
-extern struct backend_obj *fs_obj_new(const void *kbuf, size_t klen,
+extern struct backend_obj *fs_obj_new(uint32_t table_id, const void *kbuf, size_t klen,
enum chunk_errcode *err_code);
-extern struct backend_obj *fs_obj_open(const char *user,
+extern struct backend_obj *fs_obj_open(uint32_t table_id, const char *user,
const void *kbuf, size_t klen,
enum chunk_errcode *err_code);
extern ssize_t fs_obj_write(struct backend_obj *bo, const void *ptr, size_t len);
@@ -202,11 +206,14 @@ extern ssize_t fs_obj_read(struct backend_obj *bo, void *ptr, size_t len);
extern void fs_obj_free(struct backend_obj *bo);
extern bool fs_obj_write_commit(struct backend_obj *bo, const char *user,
const char *hashstr, bool sync_data);
-extern bool fs_obj_delete(const char *user,
+extern bool fs_obj_delete(uint32_t table_id, const char *user,
const void *kbuf, size_t klen,
enum chunk_errcode *err_code);
-extern GList *fs_list_objs(const char *user);
extern ssize_t fs_obj_sendfile(struct backend_obj *bo, int out_fd, size_t len);
+extern GList *fs_list_objs(uint32_t table_id, const char *user);
+extern bool fs_table_open(const char *user, const void *kbuf, size_t klen,
+ bool create_tbl, uint32_t *table_id,
+ enum chunk_errcode *err_code);
/* object.c */
extern bool object_del(struct client *cli);
diff --git a/server/object.c b/server/object.c
index 23b0aa9..027ce2b 100644
--- a/server/object.c
+++ b/server/object.c
@@ -30,7 +30,8 @@ bool object_del(struct client *cli)
resp_init_req(resp, &cli->creq);
- rcb = fs_obj_delete(cli->user, cli->key, cli->key_len, &err);
+ rcb = fs_obj_delete(cli->table_id, cli->user,
+ cli->key, cli->key_len, &err);
if (!rcb)
return cli_err(cli, err, true);
@@ -196,7 +197,7 @@ bool object_put(struct client *cli)
if (!user)
return cli_err(cli, che_AccessDenied, true);
- cli->out_bo = fs_obj_new(cli->key, cli->key_len, &err);
+ cli->out_bo = fs_obj_new(cli->table_id, cli->key, cli->key_len, &err);
if (!cli->out_bo)
return cli_err(cli, err, true);
@@ -286,7 +287,7 @@ bool object_get(struct client *cli, bool want_body)
resp_init_req(&get_resp->resp, &cli->creq);
- cli->in_obj = obj = fs_obj_open(cli->user, cli->key,
+ cli->in_obj = obj = fs_obj_open(cli->table_id, cli->user, cli->key,
cli->key_len, &err);
if (!obj) {
free(get_resp);
diff --git a/server/server.c b/server/server.c
index 2d9095f..a145706 100644
--- a/server/server.c
+++ b/server/server.c
@@ -117,6 +117,10 @@ static struct {
[che_InvalidKey] =
{ "che_InvalidKey", 400,
"Invalid key presented" },
+
+ [che_InvalidTable] =
+ { "che_InvalidTable", 400,
+ "Invalid table requested, or table not open" },
};
void applog(int prio, const char *fmt, ...)
@@ -728,7 +732,7 @@ static bool volume_list(struct client *cli)
bool rcb;
GList *res = NULL;
- res = fs_list_objs(cli->user);
+ res = fs_list_objs(cli->table_id, cli->user);
s = g_markup_printf_escaped(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
@@ -784,6 +788,23 @@ static bool volume_list(struct client *cli)
return rcb;
}
+static bool volume_open(struct client *cli)
+{
+ enum chunk_errcode err = che_Success;
+
+ if (!fs_table_open(cli->user, cli->key, cli->key_len,
+ (cli->creq.flags & CHF_TABLE_NEW),
+ &cli->table_id, &err))
+ goto out;
+
+ memset(cli->table, 0, sizeof(cli->table));
+ memcpy(cli->table, cli->key, cli->key_len);
+ cli->table_len = cli->key_len;
+
+out:
+ return cli_err(cli, err, true);
+}
+
static bool authcheck(const struct chunksrv_req *req, const void *key,
size_t key_len, const char *secret_key)
{
@@ -854,6 +875,8 @@ static const char *op2str(enum chunksrv_ops op)
case CHO_DEL: return "CHO_DEL";
case CHO_LIST: return "CHO_LIST";
case CHO_LOGIN: return "CHO_LOGIN";
+ case CHO_TABLE_OPEN: return "CHO_TABLE_OPEN";
+ case CHO_TABLE_DEL: return "CHO_TABLE_DEL";
default:
return "BUG/UNKNOWN!";
@@ -867,14 +890,13 @@ static bool cli_evt_exec_req(struct client *cli, unsigned int events)
{
struct chunksrv_req *req = &cli->creq;
bool rcb;
- enum chunk_errcode err;
+ enum chunk_errcode err = che_InvalidArgument;
bool logged_in = (cli->user[0] != 0);
+ bool have_table = (cli->table_len > 0);
/* validate request header */
- if (!valid_req_hdr(req)) {
- err = che_InvalidArgument;
+ if (!valid_req_hdr(req))
goto err_out;
- }
if (debugging)
applog(LOG_DEBUG, "REQ(op %s, key %s (%u), user %s) "
@@ -905,10 +927,31 @@ static bool cli_evt_exec_req(struct client *cli, unsigned int events)
}
/*
+ * verify open-table requirement, for the operations that need it
+ */
+ switch (req->op) {
+ case CHO_GET:
+ case CHO_GET_META:
+ case CHO_PUT:
+ case CHO_DEL:
+ case CHO_LIST:
+ if (!have_table) {
+ err = che_InvalidTable;
+ goto err_out;
+ }
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+
+ /*
* operations on objects
*/
switch (req->op) {
case CHO_LOGIN:
+ if (logged_in)
+ goto err_out;
rcb = login_user(cli);
break;
case CHO_NOP:
@@ -929,6 +972,17 @@ static bool cli_evt_exec_req(struct client *cli, unsigned int events)
case CHO_LIST:
rcb = volume_list(cli);
break;
+ case CHO_TABLE_OPEN:
+ rcb = volume_open(cli);
+ break;
+ case CHO_TABLE_DEL:
+#if 0
+ /* not implemented yet */
+ rcv = volume_del(cli);
+#else
+ rcb = cli_err(cli, che_InternalError, true);
+#endif
+ break;
default:
rcb = cli_err(cli, che_InvalidURI, true);
break;
diff --git a/test/auth.c b/test/auth.c
index 232efa7..ae28620 100644
--- a/test/auth.c
+++ b/test/auth.c
@@ -29,9 +29,15 @@ static void test(bool encrypt)
stc1 = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, encrypt);
OK(stc1);
+ rcb = stc_table_openz(stc1, TEST_TABLE, 0);
+ OK(rcb);
+
stc2 = stc_new(TEST_HOST, port, TEST_USER2, TEST_USER2_KEY, encrypt);
OK(stc2);
+ rcb = stc_table_openz(stc2, TEST_TABLE, 0);
+ OK(rcb);
+
/* store object 1 */
rcb = stc_put_inlinez(stc1, key1, val1, strlen(val1), 0);
OK(rcb);
diff --git a/test/basic-object.c b/test/basic-object.c
index 8f4c040..05b8630 100644
--- a/test/basic-object.c
+++ b/test/basic-object.c
@@ -27,6 +27,9 @@ static void test(bool encrypt)
stc = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, encrypt);
OK(stc);
+ rcb = stc_table_openz(stc, TEST_TABLE, 0);
+ OK(rcb);
+
/* store object */
rcb = stc_put_inlinez(stc, key, val, strlen(val), 0);
OK(rcb);
diff --git a/test/it-works.c b/test/it-works.c
index ef923d8..87233bc 100644
--- a/test/it-works.c
+++ b/test/it-works.c
@@ -21,6 +21,15 @@ static void test(bool ssl)
stc = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, ssl);
OK(stc);
+ /*
+ * we must supply CHF_TABLE_NEW on the first iteration of
+ * this test, because we are the first test in the testsuite,
+ * and must create the database to be used by all other tests.
+ */
+ rcb = stc_table_openz(stc, TEST_TABLE,
+ ssl ? 0 : CHF_TABLE_NEW);
+ OK(rcb);
+
rcb = stc_ping(stc);
OK(rcb);
diff --git a/test/large-object.c b/test/large-object.c
index f49ca64..f0884b8 100644
--- a/test/large-object.c
+++ b/test/large-object.c
@@ -93,6 +93,9 @@ static void test(bool encrypt)
stc = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, encrypt);
OK(stc);
+ rcb = stc_table_openz(stc, TEST_TABLE, 0);
+ OK(rcb);
+
sync();
gettimeofday(&ta, NULL);
diff --git a/test/lotsa-objects.c b/test/lotsa-objects.c
index bf6b96d..fbf5f81 100644
--- a/test/lotsa-objects.c
+++ b/test/lotsa-objects.c
@@ -33,6 +33,9 @@ static void test(int n_objects, bool encrypt)
stc = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, encrypt);
OK(stc);
+ rcb = stc_table_openz(stc, TEST_TABLE, 0);
+ OK(rcb);
+
fprintf(stderr, " lotsa-objects syncing...\n");
sync();
diff --git a/test/nop.c b/test/nop.c
index c683dc4..5771a2c 100644
--- a/test/nop.c
+++ b/test/nop.c
@@ -28,6 +28,9 @@ static void test(int n_nops, bool encrypt)
stc = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, encrypt);
OK(stc);
+ rcb = stc_table_openz(stc, TEST_TABLE, 0);
+ OK(rcb);
+
gettimeofday(&ta, NULL);
/* send NOP messages */
diff --git a/test/test.h b/test/test.h
index dd99843..3321587 100644
--- a/test/test.h
+++ b/test/test.h
@@ -8,6 +8,8 @@
#define TEST_HOST "localhost"
+#define TEST_TABLE "test"
+
#define TEST_USER "testuser"
#define TEST_USER_KEY "testuser"
diff --git a/tools/chcli.c b/tools/chcli.c
index bc8fad5..0d995a9 100644
--- a/tools/chcli.c
+++ b/tools/chcli.c
@@ -36,6 +36,8 @@ static struct argp_option options[] = {
"Send GET output to FILE, rather than stdout" },
{ "ssl", 'S', NULL, 0,
"Enable SSL channel security" },
+ { "table", 't', "TABLE", 0,
+ "Set table for storage and retrieval" },
{ "user", 'u', "USER", 0,
"Set username to USER" },
{ "verbose", 'v', NULL, 0,
@@ -43,6 +45,8 @@ static struct argp_option options[] = {
{ "list-cmds", 1001, NULL, 0,
"List supported commands" },
+ { "create", 1002, NULL, 0,
+ "Create new table (required, if table does not exist)" },
{ }
};
@@ -78,6 +82,9 @@ static char *password;
static char *output_fn;
static char *key_data;
static gsize key_data_len;
+static char *table_name;
+static size_t table_name_len;
+static bool table_create;
static char *password_env = "CHCLI_PASSWORD";
static bool chcli_verbose;
static bool use_ssl;
@@ -197,6 +204,10 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
free(s);
}
+ table_name = g_key_file_get_string(config, "global", "table",
+ NULL);
+ if (table_name)
+ table_name_len = strlen(table_name) + 1;
password = g_key_file_get_string(config, "global", "password",
NULL);
@@ -245,16 +256,23 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
case 'o':
output_fn = arg;
break;
- case 'v':
- chcli_verbose = true;
- break;
case 'S':
use_ssl = true;
break;
+ case 't':
+ table_name = arg;
+ table_name_len = strlen(arg) + 1;
+ break;
+ case 'v':
+ chcli_verbose = true;
+ break;
case 1001: /* --list-cmds */
show_cmds();
break;
+ case 1002: /* --create */
+ table_create = true;
+ break;
case ARGP_KEY_ARG:
if (cmd_mode != CHC_NONE)
@@ -298,6 +316,15 @@ static struct st_client *chcli_stc_new(void)
stc->verbose = chcli_verbose;
+ if (!stc_table_open(stc, table_name, table_name_len,
+ table_create ? CHF_TABLE_NEW : 0)) {
+ fprintf(stderr, "%s:%u: failed to open table\n",
+ host->name,
+ host->port);
+ stc_free(stc);
+ return NULL;
+ }
+
return stc;
}
@@ -527,7 +554,10 @@ int main (int argc, char *argv[])
fprintf(stderr, "no host specified\n");
return 1;
}
-
+ if (!table_name || !table_name_len) {
+ fprintf(stderr, "no table name specified\n");
+ return 1;
+ }
if (strlen(username) == 0) {
fprintf(stderr, "no username specified\n");
return 1;
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] chunkd: add support for multiple key/value tables
2009-11-10 11:24 [PATCH] chunkd: add support for multiple key/value tables Jeff Garzik
@ 2009-11-10 16:33 ` Pete Zaitcev
2009-11-10 19:45 ` Jeff Garzik
0 siblings, 1 reply; 6+ messages in thread
From: Pete Zaitcev @ 2009-11-10 16:33 UTC (permalink / raw)
To: Jeff Garzik; +Cc: hail-devel
On Tue, 10 Nov 2009 06:24:09 -0500, Jeff Garzik <jeff@garzik.org> wrote:
> LOGIN(user="jgarzik")
> TABLE-OPEN(name="tabled")
> GET...
2 more turnarounds per session? Brilliant!
The theory behind this is sound: let's not saddle chunkd with caching
authentication results, which is ineffective anyway, but provide
a way for application to amortize the cost of authentication over
a number of requests explicitly. But in practice it means tabled
needs to keep inactive sessions open, which is a chunk of code for
me to write (and debug!). I guess I'll do it in a few months...
> @@ -29,9 +29,15 @@ static void test(bool encrypt)
> stc1 = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, encrypt);
> OK(stc1);
>
> + rcb = stc_table_openz(stc1, TEST_TABLE, 0);
> + OK(rcb);
> +
> stc2 = stc_new(TEST_HOST, port, TEST_USER2, TEST_USER2_KEY, encrypt);
> OK(stc2);
Having a default table? Naah, those lazy application programmers have
it too easy already!
Again, from the point of view of chunkd, this makes complete sense:
why carry an extra (default) table in cases when application does
in fact set its own tables, right?
> + /*
> + * we must supply CHF_TABLE_NEW on the first iteration of
> + * this test, because we are the first test in the testsuite,
> + * and must create the database to be used by all other tests.
> + */
> + rcb = stc_table_openz(stc, TEST_TABLE,
> + ssl ? 0 : CHF_TABLE_NEW);
> + OK(rcb);
You've got to be kidding me. How is tabled supposed to know that
the request it's making is "first"?! I guess I have to supply
CHF_TABLE_NEW to every call now, or else retry if InvalidTable
is returned, I haven't decided what workaround to apply yet.
-- Pete
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] chunkd: add support for multiple key/value tables
2009-11-10 16:33 ` Pete Zaitcev
@ 2009-11-10 19:45 ` Jeff Garzik
2009-11-10 19:50 ` Jeff Garzik
0 siblings, 1 reply; 6+ messages in thread
From: Jeff Garzik @ 2009-11-10 19:45 UTC (permalink / raw)
To: Pete Zaitcev; +Cc: hail-devel
On 11/10/2009 11:33 AM, Pete Zaitcev wrote:
> On Tue, 10 Nov 2009 06:24:09 -0500, Jeff Garzik<jeff@garzik.org> wrote:
>
>> LOGIN(user="jgarzik")
>> TABLE-OPEN(name="tabled")
>> GET...
>
> 2 more turnarounds per session? Brilliant!
>
> The theory behind this is sound: let's not saddle chunkd with caching
> authentication results, which is ineffective anyway, but provide
> a way for application to amortize the cost of authentication over
> a number of requests explicitly. But in practice it means tabled
> needs to keep inactive sessions open, which is a chunk of code for
> me to write (and debug!). I guess I'll do it in a few months...
chunkd protocol was never intended to be a connect+request+disconnect
model... HTTP 1.0 proved that was a bad model, which is why the world
moved on to pipelined, multiple-request protocols.
connect+request+disconnect protocols waste kernel, IPVS, firewall and
router resources, and are distinctly network unfriendly.
So yeah... tabled does need to keep chunkd sessions open, after a
chunkd request completes. That was always true, regardless of the
multi-kv API change in $Subject.
As a matter of fact, libchunkdc is actually a limiting factor here:
even though the chunkd network protocol is pipeline-able, libchunkdc
always waits for a response before returning control back to the
application. That is not strictly necessary: an application could
choose to submit 10 'DEL' requests in a single write(2), and then wait
for 10 responses from the server, if it so wished.
>> @@ -29,9 +29,15 @@ static void test(bool encrypt)
>> stc1 = stc_new(TEST_HOST, port, TEST_USER, TEST_USER_KEY, encrypt);
>> OK(stc1);
>>
>> + rcb = stc_table_openz(stc1, TEST_TABLE, 0);
>> + OK(rcb);
>> +
>> stc2 = stc_new(TEST_HOST, port, TEST_USER2, TEST_USER2_KEY, encrypt);
>> OK(stc2);
>
> Having a default table? Naah, those lazy application programmers have
> it too easy already!
>
> Again, from the point of view of chunkd, this makes complete sense:
> why carry an extra (default) table in cases when application does
> in fact set its own tables, right?
If people want a default table, I can put it in. MySQL tries to connect
to a database $Username, if database name is not supplied, for example.
But yes, from point of view of chunkd simplicity, no-default-table is
certainly more simple, which makes me reluctant to add it.
If the separate API call is bothersome, we could pass table name to
stc_new().
>> + /*
>> + * we must supply CHF_TABLE_NEW on the first iteration of
>> + * this test, because we are the first test in the testsuite,
>> + * and must create the database to be used by all other tests.
>> + */
>> + rcb = stc_table_openz(stc, TEST_TABLE,
>> + ssl ? 0 : CHF_TABLE_NEW);
>> + OK(rcb);
>
> You've got to be kidding me. How is tabled supposed to know that
> the request it's making is "first"?! I guess I have to supply
> CHF_TABLE_NEW to every call now, or else retry if InvalidTable
> is returned, I haven't decided what workaround to apply yet.
A fair question... It seemed logical to create the table at the time a
new chunkd node comes online, and that the application would want to
always run in normal mode WITHOUT CHF_TABLE_NEW -- thus making a table's
unexpected absence a hard error, mirroring real life.
Easy alternatives include (a) create on demand and never worry about
this detail, and (b) add an 'exclusive' flag analogous to O_EXCL. This
complements CHF_TABLE_NEW, which is analogous to O_CREAT.
Jeff
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] chunkd: add support for multiple key/value tables
2009-11-10 19:45 ` Jeff Garzik
@ 2009-11-10 19:50 ` Jeff Garzik
2009-11-10 20:46 ` chunkd design genesis, storage tech, and " Jeff Garzik
0 siblings, 1 reply; 6+ messages in thread
From: Jeff Garzik @ 2009-11-10 19:50 UTC (permalink / raw)
To: Pete Zaitcev; +Cc: hail-devel
On 11/10/2009 02:45 PM, Jeff Garzik wrote:
> But yes, from point of view of chunkd simplicity, no-default-table is
> certainly more simple, which makes me reluctant to add it.
English parse error :)
no-default-table is more simple, and imposes zero policy on the
namespace, which makes me reluctant to add default-table.
Jeff
^ permalink raw reply [flat|nested] 6+ messages in thread
* chunkd design genesis, storage tech, and support for multiple key/value tables
2009-11-10 19:50 ` Jeff Garzik
@ 2009-11-10 20:46 ` Jeff Garzik
2009-11-11 1:48 ` Pete Zaitcev
0 siblings, 1 reply; 6+ messages in thread
From: Jeff Garzik @ 2009-11-10 20:46 UTC (permalink / raw)
To: Pete Zaitcev; +Cc: hail-devel
You wrote this insightful and pointed comment on IRC...
> Comparing with "every k/v service out there" assumes that you're
> growing a generic key/value service out of Chunk. You're essentially
> admitting it openly.
This is an excellent point to raise. So let me "begin at the
beginning", cover the chunkd design thought process, and hope to explain
how this matches up.
Let us consider storage technology, at the level I'm used to: ATA,
SCSI, and nbd protocols.
For decades, storage has been a run of fixed-length records (sectors and
blocks), with the following API:
key = offset + data length
<-- "key" is minimum amount of data required to
uniquely describe a run of data
PUT key, data
data = GET key
Now the world has figured out giving a storage device the flexibility to
manage data on a per-object granular basis simplifies applications, and
gives underlying storage more ability to optimize. Thus was born the
object-based storage device (SCSI OSD), with the API
key = 64-bit object id
PUT key, data, data length
data, data length = GET key
A key design decision of Project Hail was to follow this object-based
storage model, when considering the two alternatives:
1) Build cloud apps on top of multiple block devices. My conclusion:
this is undesirable for the same reason why sector-based storage is
undesirable: applications want more granularity, and with sector-based
systems, must build their own filesystem-like data structures just to
keep their own objects separated from one another.
2) Build cloud apps on top of filesystems. I think(?) GlusterFS is
taking this route. This approach is workable, but may create a lot of
unnecessary overhead. Filesystem protocols are much more complicated
than storage protocols, in particular.
Object-based storage devices sit in the middle: not as complex as
filesystems, but more useful than sector-based storage.
chunkd is thus designed to be a simple, straightforward, easy-to-use
replacement for SCSI OSD, which has already been proven useful in
distributed storage (Lustre, pNFS).
That is why chunkd originally used fixed-length hexadecimal keys: It
was modelled on the SCSI OSD object id. However, it quickly became
evident in practice that EVERY chunkd application would create its own
scheme to map internal_object_id to chunkd_object_id.
Thus, moving to generic key/value storage actually simplified
applications, by eliminating that mapping.
However, one glaring difference from SCSI OSD was chunkd's lack of
administrative partitions. SCSI OSDs provide "partitions" within each
logical unit (LUN), each of which contains a set of objects within a single
object id namespace. Therefore, if you consider SCSI OSD object id as
the key, then SCSI OSD definitely has multiple key/value tables.
As you pointed out on IRC, it is possible to create administrative
partitioning by running multiple chunkd instances.
But I think the Real World(tm) has shown that in-protocol partitioning
of object namespace is the way to go. Being able to create and destroy
partitions within the protocol, on-demand, has a lot of value.
So, just as SCSI OSD has
[ target + logical unit + ] partition + object
With chunkd we can have
[ host + port + ] table + object
Amazon S3 has buckets. Pretty much every protocol in production tends
to have some sort of administrative separation ability.
Jeff
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: chunkd design genesis, storage tech, and support for multiple key/value tables
2009-11-10 20:46 ` chunkd design genesis, storage tech, and " Jeff Garzik
@ 2009-11-11 1:48 ` Pete Zaitcev
0 siblings, 0 replies; 6+ messages in thread
From: Pete Zaitcev @ 2009-11-11 1:48 UTC (permalink / raw)
To: Jeff Garzik; +Cc: hail-devel, zaitcev
On Tue, 10 Nov 2009 15:46:27 -0500, Jeff Garzik <jeff@garzik.org> wrote:
> Now the world has figured out giving a storage device the flexibility to
> manage data on a per-object granular basis simplifies applications, and
> gives underlying storage more ability to optimize.
This is a sleight of hand by OSD vendors, interested in selling for
more dollars per gigabyte.
> Thus, moving to generic key/value storage actually simplified
> applications, by eliminating that mapping.
You're so sure about this, I wonder where it comes from.
The fact in case of tabled is, it must maintain a database of keys
of its own, primarily because (a) it cannot afford round-trips into
Chunk for every operation, and (b) to locate the chunks. Both of
these databases may be in RAM, but that does not make them nonexistent.
> However, one glaring difference from SCSI OSD was chunkd's lack of
> administrative partitions. SCSI OSDs provide "partitions" within each
> logical unit (LUN), each of contains a set of objects within a single
> object id namespace. Therefore, if you consider SCSI OSD object id as
> the key, then SCSI OSD definitely has multiple key/value tables.
This is a completely bogus analogy. OSD vendors want to push their
wares into PC space, where one unit is all a computer has. But in
the cloud we have thousands of Chunk nodes per each application.
That is your partitioning right there: it's called <Cell></Cell>.
Look, I would not mind if all this partition stuff was free, but
it's not. You decided to embed a partition into a session, so
- There's a round trip that you excuse by telling applications
to keep long-living connections, thanks a lot
- requests to different partitions cannot be pipelined (well,
not easily).
-- Pete
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2009-11-11 1:48 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-11-10 11:24 [PATCH] chunkd: add support for multiple key/value tables Jeff Garzik
2009-11-10 16:33 ` Pete Zaitcev
2009-11-10 19:45 ` Jeff Garzik
2009-11-10 19:50 ` Jeff Garzik
2009-11-10 20:46 ` chunkd design genesis, storage tech, and " Jeff Garzik
2009-11-11 1:48 ` Pete Zaitcev
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.