From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jeff Garzik Subject: Re: [patch tabled 6/8] Add filesystem back-end Date: Mon, 13 Dec 2010 16:30:59 -0500 Message-ID: <4D069093.2060108@garzik.org> References: <20101128184108.2172f590@lembas.zaitcev.lan> Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Return-path: DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:received:received:sender:message-id:date:from :user-agent:mime-version:to:cc:subject:references:in-reply-to :content-type:content-transfer-encoding; bh=D9ElCl2KPJtXJ1/FBfklEJ17+aaQTGSPq8Muvd3rmxw=; b=skHFn4ohdlmJBhrU4EZaM9cjAxIMSO4jOzHgkewNhf/bUZpUIZViZTRIG70UuvonMq 395ZJ4etTlfZ21C7I5uAT1jXoLijt9wW39u40eumEQ3Zgii4WphTSTqMvDNPPdJU8bQI jBoUN5wXqQTX6QFbydFpTEKx+rImk/AawKtxc= In-Reply-To: <20101128184108.2172f590@lembas.zaitcev.lan> Sender: hail-devel-owner@vger.kernel.org List-ID: Content-Type: text/plain; charset="us-ascii"; format="flowed" To: Pete Zaitcev Cc: Project Hail List On 11/28/2010 08:41 PM, Pete Zaitcev wrote: > This patch adds the first new back-end and makes some changes to the way > nodes are added, to make the invariants of storage_node more sensible. > > The filesystem back-end itself is not intended for production use, > so it makes no attempt to run any asynchronous transfers. > > We also add a test. Note that this differs from the preliminary versions > of this patch. We used to add both chunk and fs back-ends, so that tabled > replicates to both. This makes sense as a test of store path, but on > retrieval tabled selects any one of available storage nodes with the > object, randomly. It creates gaps in test coverage in any given run. > Therefore, we test two back-end types sequentially now. > > Signed-off-by: Pete Zaitcev > > --- > server/Makefile.am | 2 > server/stor_chunk.c | 21 - > server/stor_fs.c | 498 +++++++++++++++++++++++++++++++++++++++++ > server/storage.c | 157 ++++++++++-- > server/storparse.c | 97 +++++++ > server/tabled.h | 31 ++ > test/Makefile.am | 3 > test/be_fs-test.conf | 5 > test/combo-redux | 74 ++++++ > test/prep-db | 4 > test/start-daemon | 1 > test/stop-daemon | 9 > 12 files changed, 835 insertions(+), 67 deletions(-) > > commit bccedeedabbe713e4053afa185314b3f57f3d204 > Author: Pete Zaitcev > Date: Sun Nov 28 17:58:05 2010 -0700 > > Add fs back-end, with a test. > > diff --git a/server/Makefile.am b/server/Makefile.am > index 52beec4..71bcb35 100644 > --- a/server/Makefile.am > +++ b/server/Makefile.am > @@ -6,7 +6,7 @@ sbin_PROGRAMS = tabled tdbadm > tabled_SOURCES = tabled.h \ > bucket.c cldu.c config.c metarep.c object.c replica.c \ > server.c status.c storage.c storparse.c \ > - stor_chunk.c util.c > + stor_chunk.c stor_fs.c util.c > tabled_LDADD = ../lib/libtdb.a \ > @HAIL_LIBS@ @PCRE_LIBS@ @GLIB_LIBS@ \ > @CRYPTO_LIBS@ @DB4_LIBS@ @EVENT_LIBS@ @SSL_LIBS@ > diff --git a/server/stor_chunk.c b/server/stor_chunk.c > index 815adcf..7462a9c 100644 > --- a/server/stor_chunk.c > +++ b/server/stor_chunk.c > @@ -31,8 +31,7 @@ > #include > #include "tabled.h" > > -static const char stor_key_fmt[] = "%016llx"; > -#define STOR_KEY_SLEN 16 > +static const char stor_key_fmt[] = STOR_KEY_FMT; > > static int stor_new_stc(struct storage_node *stn, struct st_client **stcp) > { > @@ -66,24 +65,6 @@ static int stor_new_stc(struct storage_node *stn, struct st_client **stcp) > return 0; > } > > -static void stor_read_event(int fd, short events, void *userdata) > -{ > - struct open_chunk *cep = userdata; > - > - cep->r_armed = false; /* no EV_PERSIST */ > - if (cep->ocb) > - (*cep->ocb)(cep); > -} > - > -static void stor_write_event(int fd, short events, void *userdata) > -{ > - struct open_chunk *cep = userdata; > - > - cep->w_armed = false; /* no EV_PERSIST */ > - if (cep->ocb) > - (*cep->ocb)(cep); > -} > - > /* > * Open *cep using stn, set up chunk session if needed. > */ > diff --git a/server/stor_fs.c b/server/stor_fs.c > new file mode 100644 > index 0000000..b433a67 > --- /dev/null > +++ b/server/stor_fs.c > @@ -0,0 +1,498 @@ > + > +/* > + * Copyright 2010 Red Hat, Inc. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; see the file COPYING. If not, write to > + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. > + * > + */ > + > +#define _GNU_SOURCE > +#include "tabled-config.h" > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include "tabled.h" > + > +static const char stor_key_fmt[] = STOR_KEY_FMT; > + > +static char *fs_obj_pathname(const char *base, uint64_t key) > +{ > + enum { PREFIX_LEN = 3 }; > + char prefix[PREFIX_LEN + 1]; > + char stckey[STOR_KEY_SLEN+1]; > + char *s; > + int rc; > + > + /* we know that stckey is going to be longer than PREFIX_LEN */ > + sprintf(stckey, stor_key_fmt, (unsigned long long) key); > + memcpy(prefix, stckey, PREFIX_LEN); > + prefix[PREFIX_LEN] = 0; > + > + rc = asprintf(&s, "%s/%s/%s", base, prefix, stckey + PREFIX_LEN); > + if (rc< 0) > + goto err_out; > + > + return s; > + > +err_out: > + return NULL; > +} > + > +static char *fs_ctl_pathname(const char *base, const char *file) > +{ > + char *s; > + int rc; > + > + rc = asprintf(&s, "%s/%s", base, file); > + if (rc< 0) > + return NULL; > + return s; > +} > + > +static int fs_obj_mkpath(const char *path) > +{ > + struct stat statb; > + char *s; > + int rc; > + > + /* one dir is enough */ > + /* not using dirname because on some platforms it modifies its arg. */ > + s = strrchr(path, '/'); > + if (s == NULL) > + return -EINVAL; > + s = strndup(path, s-path); > + if (!s) > + return -ENOMEM; > + > + /* create subdir on the fly, if not already exists */ > + if (stat(s,&statb)< 0) { > + rc = errno; > + if (rc != ENOENT) > + goto err_out; > + if (mkdir(s, 0777)< 0) { > + rc = errno; > + /* > + * Directory already exists, perhaps > + * because we raced with another thread. > + */ > + if (rc != EEXIST) > + goto err_out; > + } > + } else { > + if (!S_ISDIR(statb.st_mode)) { > + rc = EINVAL; > + goto err_out; > + } > + } > + > + free(s); > + return 0; > + > +err_out: > + free(s); > + return -rc; > +} > + > +static int fs_open(struct open_chunk *cep, struct storage_node *stn, > + struct event_base *ev_base) > +{ > + if (cep->node) > + return -EBUSY; > + > + if (!stn->basepath) { > + applog(LOG_WARNING, > + "No base path for Posix chunk, nid %u", stn->id); > + return -EINVAL; > + } > + > + cep->evbase = ev_base; > + cep->node = stor_node_get(stn); > + cep->pfd = -1; > + > + return 0; > +} > + > +static int fs_open_read(struct open_chunk *cep, > + void (*cb)(struct open_chunk *), > + uint64_t key, uint64_t *psize) > +{ > + char *objpath; > + struct stat statb; > + uint64_t size; > + int rc; > + > + if (!cep->node || cep->key) > + return -EBUSY; > + > + objpath = fs_obj_pathname(cep->node->basepath, key); > + if (!objpath) { > + applog(LOG_WARNING, "No core"); > + return -ENOMEM; > + } > + > + rc = open(objpath, O_RDONLY); > + if (rc == -1) { > + rc = errno; > + applog(LOG_WARNING, "Cannot open file %s oid %llX: %s", > + objpath, (long long) key, strerror(rc)); > + free(objpath); > + return -rc; > + } > + cep->pfd = rc; > + > + if (fstat(cep->pfd,&statb)< 0) { > + rc = errno; > + applog(LOG_WARNING, "Cannot stat file %s: %s", > + objpath, strerror(rc)); > + close(cep->pfd); > + cep->pfd = -1; > + free(objpath); > + return -rc; > + } > + size = statb.st_size; > + > + *psize = size; > + cep->size = size; > + cep->done = 0; > + cep->key = key; > + cep->ocb = cb; > + > + /* > + * We cannot call cep->ocb directly. Instead, we steal the > + * arm-disarm mechanism from chunk. This works because in Linux > + * regular files can be polled and always return ready. > + */ > + event_set(&cep->revt, cep->pfd, EV_READ, stor_read_event, cep); > + event_base_set(cep->evbase,&cep->revt); > + > + free(objpath); > + return 0; > +} > + > +static void fs_close(struct open_chunk *cep) > +{ > + if (cep->node) { > + stor_node_put(cep->node); > + cep->node = NULL; > + if (cep->pfd != -1) { > + close(cep->pfd); > + cep->pfd = -1; > + } > + } > + > + cep->done = 0; > + cep->size = 0; > + > + if (cep->r_armed) { > + event_del(&cep->revt); > + cep->r_armed = false; > + } > + > + if (cep->w_armed) { > + event_del(&cep->wevt); > + cep->w_armed = false; > + } > + > + cep->key = 0; > +} > + > +static void fs_abort(struct open_chunk *cep) > +{ > + if (cep->r_armed) { > + event_del(&cep->revt); > + cep->r_armed = false; > + } > + if (cep->w_armed) { > + event_del(&cep->wevt); > + cep->w_armed = false; > + } > + /* XXX delete the unfinished object under write */ > + cep->key = 0; > +} > + > +static int fs_put_start(struct open_chunk *cep, > + void (*cb)(struct open_chunk *), > + uint64_t key, uint64_t size) > +{ > + char *objpath; > + int rc; > + > + if (!cep->node || cep->key) > + return -EBUSY; > + > + objpath = fs_obj_pathname(cep->node->basepath, key); > + if (!objpath) { > + applog(LOG_WARNING, "No core"); > + return -ENOMEM; > + } > + > + rc = fs_obj_mkpath(objpath); > + if (rc) { > + applog(LOG_WARNING, "Cannot create a directory for %s: %s", > + objpath, strerror(-rc)); > + free(objpath); > + return rc; > + } > + > + rc = open(objpath, O_WRONLY|O_TRUNC|O_CREAT, 0666); > + if (rc == -1) { > + rc = errno; > + applog(LOG_WARNING, "Cannot create file %s: %s", > + objpath, strerror(rc)); > + free(objpath); > + return -rc; > + } Current chunkd intentionally prevents two random users from put'ing the same object. The FS backend should do the same.