* [PATCH] ceph: Add FScache support
@ 2013-06-29 3:52 Milosz Tanski
2013-06-29 3:58 ` Milosz Tanski
0 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-06-29 3:52 UTC (permalink / raw)
To: ceph-devel; +Cc: Sage Weil, Yan, Zheng, linux-cachefs
Adding support for fscache to the Ceph filesystem. This would bring it on
par with some of the other network filesystems in Linux (like NFS, AFS, etc.).
This code uses existing ceph capabilities (cache & lazy io) to determine
if an inode is cacheable.
In order to mount the filesystem with fscache the 'fsc' mount option must be
passed.
Signed-off-by: Milosz Tanski <milosz@adfin.com>
---
fs/ceph/Kconfig | 9 ++
fs/ceph/Makefile | 2 +
fs/ceph/addr.c | 84 ++++++++++----
fs/ceph/cache.c | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/cache.h | 115 +++++++++++++++++++
fs/ceph/caps.c | 12 ++
fs/ceph/file.c | 8 ++
fs/ceph/inode.c | 23 +++-
fs/ceph/super.c | 39 ++++++-
fs/ceph/super.h | 13 +++
10 files changed, 616 insertions(+), 23 deletions(-)
create mode 100644 fs/ceph/cache.c
create mode 100644 fs/ceph/cache.h
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 49bc782..ac9a2ef 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -16,3 +16,12 @@ config CEPH_FS
If unsure, say N.
+if CEPH_FS
+config CEPH_FSCACHE
+ bool "Enable Ceph client caching support"
+ depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
+ help
+ Choose Y here to enable persistent, read-only local
+ caching support for Ceph clients using FS-Cache
+
+endif
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index bd35212..0af0678 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -9,3 +9,5 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o
+ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac1..b22610f 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -11,6 +11,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/osd_client.h>
/*
@@ -149,11 +150,26 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc = page_snap_context(page);
- BUG_ON(!PageLocked(page));
- BUG_ON(!PagePrivate(page));
BUG_ON(!page->mapping);
inode = page->mapping->host;
+ ci = ceph_inode(inode);
+
+ if (offset != 0) {
+ dout("%p invalidatepage %p idx %lu partial dirty page\n",
+ inode, page, page->index);
+ return;
+ }
+
+#ifdef CONFIG_CEPH_FSCACHE
+ if (PageFsCache(page))
+ ceph_invalidate_fscache_page(inode, page);
+#endif
+
+ if (!PagePrivate(page))
+ return;
+
+ BUG_ON(!PageLocked(page));
/*
* We can get non-dirty pages here due to races between
@@ -163,31 +179,32 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0)
- ClearPageChecked(page);
+ ClearPageChecked(page);
- ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
- page->private = 0;
- ClearPagePrivate(page);
- } else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
- }
+ dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+ inode, page, page->index, offset);
+
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
}
-/* just a sanity check */
static int ceph_releasepage(struct page *page, gfp_t g)
{
struct inode *inode = page->mapping ? page->mapping->host : NULL;
dout("%p releasepage %p idx %lu\n", inode, page, page->index);
WARN_ON(PageDirty(page));
- WARN_ON(PagePrivate(page));
- return 0;
+
+#ifdef CONFIG_CEPH_FSCACHE
+ /* Can we release the page from the cache? */
+ if (PageFsCache(page) && ceph_release_fscache_page(page, g) == 0)
+ return 0;
+#endif
+ if (PagePrivate(page))
+ return 0;
+
+ return 1;
}
/*
@@ -202,6 +219,13 @@ static int readpage_nounlock(struct file *filp, struct page *page)
int err = 0;
u64 len = PAGE_CACHE_SIZE;
+#ifdef CONFIG_CEPH_FSCACHE
+ err = ceph_readpage_from_fscache(inode, page);
+
+ if (err == 0)
+ goto out;
+#endif
+
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
@@ -219,6 +243,11 @@ static int readpage_nounlock(struct file *filp, struct page *page)
}
SetPageUptodate(page);
+#ifdef CONFIG_CEPH_FSCACHE
+ if (err == 0)
+ ceph_readpage_to_fscache(inode, page);
+#endif
+
out:
return err < 0 ? err : 0;
}
@@ -261,6 +290,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
page->index);
flush_dcache_page(page);
SetPageUptodate(page);
+#ifdef CONFIG_CEPH_FSCACHE
+ ceph_readpage_to_fscache(inode, page);
+#endif
unlock_page(page);
page_cache_release(page);
bytes -= PAGE_CACHE_SIZE;
@@ -330,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
page = list_entry(page_list->prev, struct page, lru);
BUG_ON(PageLocked(page));
list_del(&page->lru);
-
+
dout("start_read %p adding %p idx %lu\n", inode, page,
page->index);
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
@@ -377,6 +409,14 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
int rc = 0;
int max = 0;
+#ifdef CONFIG_CEPH_FSCACHE
+ rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
+ &nr_pages);
+
+ if (rc == 0)
+ goto out;
+#endif
+
if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
@@ -490,6 +530,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+#ifdef CONFIG_CEPH_FSCACHE
+ ceph_readpage_to_fscache(inode, page);
+#endif
+
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
new file mode 100644
index 0000000..1ee3461
--- /dev/null
+++ b/fs/ceph/cache.c
@@ -0,0 +1,334 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include "super.h"
+#include "cache.h"
+
+struct ceph_aux_inode {
+ struct timespec mtime;
+ loff_t size;
+};
+
+struct fscache_netfs ceph_cache_netfs = {
+ .name = "ceph",
+ .version = 0,
+};
+
+static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct ceph_fs_client* fsc = cookie_netfs_data;
+ uint16_t klen;
+
+ klen = sizeof(fsc->client->fsid);
+ if (klen > maxbuf)
+ return 0;
+
+ memcpy(buffer, &fsc->client->fsid, klen);
+ return klen;
+}
+
+static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
+ .name = "CEPH.fsid",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = ceph_fscache_session_get_key,
+};
+
+void ceph_fscache_register_fsid_cookie(struct ceph_fs_client* fsc)
+{
+ fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
+ &ceph_fscache_fsid_object_def,
+ fsc);
+}
+
+void ceph_fscache_unregister_fsid_cookie(struct ceph_fs_client* fsc)
+{
+ fscache_relinquish_cookie(fsc->fscache, 0);
+ fsc->fscache = NULL;
+}
+
+static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ uint16_t klen;
+
+ /* use ceph virtual inode (id + snapshot) */
+ klen = sizeof(ci->i_vino);
+ if (klen > maxbuf)
+ return 0;
+
+ memcpy(buffer, &ci->i_vino, klen);
+ return klen;
+}
+
+static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ struct ceph_aux_inode aux;
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ const struct inode* inode = &ci->vfs_inode;
+
+ memset(&aux, 0, sizeof(aux));
+ aux.mtime = inode->i_mtime;
+ aux.size = inode->i_size;
+
+ memcpy(buffer, &aux, sizeof(aux));
+
+ return sizeof(aux);
+}
+
+static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
+{
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ const struct inode* inode = &ci->vfs_inode;
+
+ *size = inode->i_size;
+}
+
+static enum fscache_checkaux ceph_fscache_inode_check_aux(
+ void *cookie_netfs_data, const void *data, uint16_t dlen)
+{
+ struct ceph_aux_inode aux;
+ struct ceph_inode_info* ci = cookie_netfs_data;
+ struct inode* inode = &ci->vfs_inode;
+
+ if (dlen != sizeof(aux))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ memset(&aux, 0, sizeof(aux));
+ aux.mtime = inode->i_mtime;
+ aux.size = inode->i_size;
+
+ if (memcmp(data, &aux, sizeof(aux)) != 0)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ dout("ceph inode 0x%p cached okay", ci);
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
+{
+ struct ceph_inode_info* ci = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ dout("ceph inode 0x%p now uncached", ci);
+
+ while (1) {
+ nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
+ .name = "CEPH.inode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = ceph_fscache_inode_get_key,
+ .get_attr = ceph_fscache_inode_get_attr,
+ .get_aux = ceph_fscache_inode_get_aux,
+ .check_aux = ceph_fscache_inode_check_aux,
+ .now_uncached = ceph_fscache_inode_now_uncached,
+};
+
+
+static int get_caps_issued(struct ceph_inode_info* ci)
+{
+ int issued;
+ int implemented = 0;
+
+ issued = __ceph_caps_issued(ci, &implemented);
+ issued |= implemented | __ceph_caps_dirty(ci);
+ return issued;
+}
+
+
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+ struct ceph_inode_info* ci)
+{
+ const int want = (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO);
+
+ /* No fsid cookie was registered for this client, so nothing to attach to */
+ if (fsc->fscache == NULL)
+ return;
+ /* Regular files only; a bare S_IFREG bit test also matches symlinks/sockets */
+ if (!S_ISREG(ci->vfs_inode.i_mode))
+ return;
+
+ if (ci->fscache)
+ return;
+ if ((get_caps_issued(ci) & want) == 0) {
+ dout("No caps for caching inode: %p", &ci->vfs_inode);
+ return;
+ }
+
+ ci->fscache = fscache_acquire_cookie(fsc->fscache,
+ &ceph_fscache_inode_object_def,
+ ci);
+}
+
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+ if (ci->fscache == NULL)
+ return;
+
+ fscache_relinquish_cookie(ci->fscache, 0);
+ ci->fscache = NULL;
+}
+
+void ceph_fscache_revoke_inode_cookie(struct ceph_inode_info* ci)
+{
+ if (ci->fscache == NULL)
+ return;
+
+ fscache_invalidate(ci->fscache);
+ /* Make sure the cache is cleared after we close the handle */
+ fscache_relinquish_cookie(ci->fscache, 1);
+ ci->fscache = NULL;
+}
+
+void __ceph_fscache_async_uncache_inode(struct ceph_inode_info* ci)
+{
+ fscache_uncache_all_inode_pages(ci->fscache, &ci->vfs_inode);
+}
+
+static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+}
+
+static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+
+ unlock_page(page); /* unlock on error too, or the page stays locked forever */
+}
+
+/* Attempt to read from the fscache.
+ *
+ * This function is called from the readpage_nounlock context. DO NOT attempt to
+ * unlock the page here (or in the callback).
+ */
+int __ceph_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ const struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ ret = fscache_read_or_alloc_page(ci->fscache, page,
+ ceph_vfs_readpage_complete, NULL,
+ GFP_KERNEL);
+
+ switch (ret) {
+ case 0: /* Page found */
+ dout("page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Pages were not found, and can't be */
+ case -ENODATA: /* Pages were not found */
+ dout("page/inode not in cache\n");
+ return 1;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+int __ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int issued = get_caps_issued(ci);
+ const int want = (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO);
+ int ret;
+
+ /* Check if we have cached read caps */
+ if ((issued & want) == 0) {
+ return -ENOBUFS;
+ }
+
+ ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
+ ceph_vfs_readpage_complete_unlock,
+ NULL, mapping_gfp_mask(mapping));
+
+ switch (ret) {
+ case 0: /* All pages found */
+ dout("all-page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Some pages were not found, and can't be */
+ case -ENODATA: /* some pages were not found */
+ dout("page/inode not in cache\n");
+ return 1;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+void __ceph_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+ const struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
+ if (ret)
+ fscache_uncache_page(ci->fscache, page);
+}
+
+void __ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
+{
+ const struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ci->fscache;
+
+ fscache_wait_on_page_write(cookie, page);
+ fscache_uncache_page(cookie, page);
+}
+
+int __ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ struct inode* inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ci->fscache;
+
+ return fscache_maybe_release_page(cookie, page, gfp);
+}
+
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
new file mode 100644
index 0000000..7d24151
--- /dev/null
+++ b/fs/ceph/cache.h
@@ -0,0 +1,115 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#ifndef _CEPH_CACHE_H
+#define _CEPH_CACHE_H
+#ifdef CONFIG_CEPH_FSCACHE
+
+#include <linux/fscache.h>
+
+
+extern struct fscache_netfs ceph_cache_netfs;
+
+
+void ceph_fscache_inode_get_cookie(struct inode *inode);
+
+void ceph_fscache_register_fsid_cookie(struct ceph_fs_client* fsc);
+void ceph_fscache_unregister_fsid_cookie(struct ceph_fs_client* fsc);
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
+ struct ceph_inode_info* ci);
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+void ceph_fscache_revoke_inode_cookie(struct ceph_inode_info* ci);
+void __ceph_fscache_async_uncache_inode(struct ceph_inode_info* ci);
+
+int __ceph_readpage_from_fscache(struct inode *inode, struct page *page);
+int __ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages);
+void __ceph_readpage_to_fscache(struct inode *inode, struct page *page);
+void __ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
+int __ceph_release_fscache_page(struct page *page, gfp_t gfp);
+
+static inline void ceph_fsxache_async_uncache_inode(struct inode* inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (ci->fscache == NULL)
+ return;
+
+ __ceph_fscache_async_uncache_inode(ci);
+}
+
+static inline int ceph_readpage_from_fscache(struct inode *inode,
+ struct page *page)
+{
+ if (ceph_inode(inode)->fscache == NULL)
+ return -ENOBUFS;
+
+ return __ceph_readpage_from_fscache(inode, page);
+}
+
+static inline int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ if (ceph_inode(inode)->fscache == NULL)
+ return -ENOBUFS;
+
+ return __ceph_readpages_from_fscache(inode, mapping, pages, nr_pages);
+}
+
+static inline void ceph_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{
+ if (ceph_inode(inode)->fscache == NULL)
+ return;
+ /* Only pages flagged PageFsCache are handed to the cache writer */
+ if (PageFsCache(page))
+ __ceph_readpage_to_fscache(inode, page);
+}
+
+static inline void ceph_invalidate_fscache_page(struct inode *inode,
+ struct page *page)
+{
+ if (ceph_inode(inode)->fscache == NULL)
+ return;
+ /* Only pages flagged PageFsCache have anything to invalidate */
+ if (PageFsCache(page))
+ __ceph_invalidate_fscache_page(inode, page);
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ struct inode* inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (ci->fscache == NULL)
+ return 1;
+
+ return __ceph_release_fscache_page(page, gfp);
+}
+
+#endif
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da0f9b8..5379f41 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -10,6 +10,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>
@@ -2366,6 +2367,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
+
+#ifdef CONFIG_CEPH_FSCACHE
+ /* Close the fscache on inode */
+ ceph_fscache_unregister_inode_cookie(ci);
+#endif
}
/* side effects now are allowed */
@@ -2425,6 +2431,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
wake = 1;
}
+#ifdef CONFIG_CEPH_FSCACHE
+ /* Register cache (if needed); perform this after any size change. */
+ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
+ ceph_fscache_register_inode_cookie(session->s_mdsc->fsc, ci);
+#endif
+
/* check cap bits */
wanted = __ceph_caps_wanted(ci);
used = __ceph_caps_used(ci);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e169..2162b35 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -11,6 +11,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
/*
* Ceph file operations
@@ -67,10 +68,17 @@ out:
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
struct ceph_file_info *cf;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
int ret = 0;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
+#ifdef CONFIG_CEPH_FSCACHE
+ spin_lock(&ci->i_ceph_lock);
+ ceph_fscache_register_inode_cookie(fsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
+#endif
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be0f7e2..5144b36 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -12,6 +12,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/decode.h>
/*
@@ -377,6 +378,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+#ifdef CONFIG_CEPH_FSCACHE
+ ci->fscache = NULL;
+#endif
+
return &ci->vfs_inode;
}
@@ -396,6 +401,10 @@ void ceph_destroy_inode(struct inode *inode)
dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+#ifdef CONFIG_CEPH_FSCACHE
+ ceph_fscache_unregister_inode_cookie(ci);
+#endif
+
ceph_queue_caps_release(inode);
/*
@@ -430,7 +439,6 @@ void ceph_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, ceph_i_callback);
}
-
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
* careful because either the client or MDS may have more up to date
@@ -633,6 +641,14 @@ static int fill_inode(struct inode *inode,
le32_to_cpu(info->time_warp_seq),
&ctime, &mtime, &atime);
+#ifdef CONFIG_CEPH_FSCACHE
+ /* Notify the cache that size has changed */
+ if (queue_trunc && ci->fscache) {
+ pr_info("size changed inode: %p cap flags\n", &ci->vfs_inode);
+ fscache_attr_changed(ci->fscache);
+ }
+#endif
+
/* only update max_size on auth cap */
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
ci->i_max_size != le64_to_cpu(info->max_size)) {
@@ -1430,6 +1446,11 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
+#ifdef CONFIG_CEPH_FSCACHE
+ dout("cache invalidating inode: %p cap flags\n", &ci->vfs_inode);
+ fscache_invalidate(ci->fscache);
+#endif
+
truncate_inode_pages(&inode->i_data, 0);
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9..850c161 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -17,6 +17,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h>
@@ -142,6 +143,8 @@ enum {
Opt_nodcache,
Opt_ino32,
Opt_noino32,
+ Opt_fscache,
+ Opt_nofscache
};
static match_table_t fsopt_tokens = {
@@ -167,6 +170,8 @@ static match_table_t fsopt_tokens = {
{Opt_nodcache, "nodcache"},
{Opt_ino32, "ino32"},
{Opt_noino32, "noino32"},
+ {Opt_fscache, "fsc"},
+ {Opt_nofscache, "nofsc"},
{-1, NULL}
};
@@ -260,6 +265,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noino32:
fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
break;
+ case Opt_fscache:
+ fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ break;
+ case Opt_nofscache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
+ break;
default:
BUG_ON(token);
}
@@ -422,6 +433,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",dcache");
else
seq_puts(m, ",nodcache");
+ if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
+ seq_puts(m, ",fsc");
+ else
+ seq_puts(m, ",nofsc");
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -530,6 +545,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq;
+#ifdef CONFIG_CEPH_FSCACHE
+ if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE))
+ ceph_fscache_register_fsid_cookie(fsc);
+#endif
+
/* caps */
fsc->min_caps = fsopt->max_readdir;
@@ -554,6 +574,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
+#ifdef CONFIG_CEPH_FSCACHE
+ ceph_fscache_unregister_fsid_cookie(fsc);
+#endif
+
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
@@ -588,6 +612,8 @@ static void ceph_inode_init_once(void *foo)
static int __init init_caches(void)
{
+ int error = -ENOMEM;
+
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
@@ -611,15 +637,19 @@ static int __init init_caches(void)
if (ceph_file_cachep == NULL)
goto bad_file;
- return 0;
+#ifdef CONFIG_CEPH_FSCACHE
+ if ((error = fscache_register_netfs(&ceph_cache_netfs)))
+ goto bad_file;
+#endif
+ return 0;
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
- return -ENOMEM;
+ return error;
}
static void destroy_caches(void)
@@ -629,10 +659,15 @@ static void destroy_caches(void)
* destroy cache.
*/
rcu_barrier();
+
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
+
+#ifdef CONFIG_CEPH_FSCACHE
+ fscache_unregister_netfs(&ceph_cache_netfs);
+#endif
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7ccfdb4..5ddaad5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -16,6 +16,10 @@
#include <linux/ceph/libceph.h>
+#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/fscache.h>
+#endif
+
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC 0x00c36400
@@ -29,6 +33,7 @@
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
+#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
@@ -90,6 +95,10 @@ struct ceph_fs_client {
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
#endif
+
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+#endif
};
@@ -319,6 +328,10 @@ struct ceph_inode_info {
struct work_struct i_vmtruncate_work;
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+#endif
+
struct inode vfs_inode; /* at end */
};
--
1.7.10.4
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH] ceph: Add FScache support
2013-06-29 3:52 [PATCH] ceph: Add FScache support Milosz Tanski
@ 2013-06-29 3:58 ` Milosz Tanski
2013-07-01 15:55 ` Milosz Tanski
0 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-06-29 3:58 UTC (permalink / raw)
To: ceph-devel
On our side we're pretty happy with where this is at now. In our
pre-production environment we are unable to find any issues with the
current patch (not that there aren't any).
Changes since the last patch:
- Combine the two patches into one.
- Fixed typos you guys found.
- Fixed a bug where we were disposing of the cookie and then calling
page invalidate on the inode, leaving us unable to invalidate
pages marked PRIVATE_2. The simple fix was to move unregistering the
cookies past the invalidate.
- Fixed some accidental whitespace changes that snuck in.
As always I welcome you guys' feedback. At this point in time I feel
pretty good about the state this is in.
-- Milosz
On Fri, Jun 28, 2013 at 11:51 PM, Milosz Tanski <milosz@adfin.com> wrote:
> Adding support for fscache to the Ceph filesystem. This would bring it to on
> par with some of the other network filesystems in Linux (like NFS, AFS, etc...)
>
> This code uses uses existing ceph capabilities (cache & lazy io) to determine
> if an inode is cacheable.
>
> In order to mount the filesystem with fscache the 'fsc' mount option must be
> passed.
>
> Signed-off-by: Milosz Tanski <milosz@adfin.com>
> ---
> fs/ceph/Kconfig | 9 ++
> fs/ceph/Makefile | 2 +
> fs/ceph/addr.c | 84 ++++++++++----
> fs/ceph/cache.c | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/ceph/cache.h | 115 +++++++++++++++++++
> fs/ceph/caps.c | 12 ++
> fs/ceph/file.c | 8 ++
> fs/ceph/inode.c | 23 +++-
> fs/ceph/super.c | 39 ++++++-
> fs/ceph/super.h | 13 +++
> 10 files changed, 616 insertions(+), 23 deletions(-)
> create mode 100644 fs/ceph/cache.c
> create mode 100644 fs/ceph/cache.h
>
> diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
> index 49bc782..ac9a2ef 100644
> --- a/fs/ceph/Kconfig
> +++ b/fs/ceph/Kconfig
> @@ -16,3 +16,12 @@ config CEPH_FS
>
> If unsure, say N.
>
> +if CEPH_FS
> +config CEPH_FSCACHE
> + bool "Enable Ceph client caching support"
> + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
> + help
> + Choose Y here to enable persistent, read-only local
> + caching support for Ceph clients using FS-Cache
> +
> +endif
> diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
> index bd35212..0af0678 100644
> --- a/fs/ceph/Makefile
> +++ b/fs/ceph/Makefile
> @@ -9,3 +9,5 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
> mds_client.o mdsmap.o strings.o ceph_frag.o \
> debugfs.o
>
> +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
> +
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 3e68ac1..b22610f 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -11,6 +11,7 @@
>
> #include "super.h"
> #include "mds_client.h"
> +#include "cache.h"
> #include <linux/ceph/osd_client.h>
>
> /*
> @@ -149,11 +150,26 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
> struct ceph_inode_info *ci;
> struct ceph_snap_context *snapc = page_snap_context(page);
>
> - BUG_ON(!PageLocked(page));
> - BUG_ON(!PagePrivate(page));
> BUG_ON(!page->mapping);
>
> inode = page->mapping->host;
> + ci = ceph_inode(inode);
> +
> + if (offset != 0) {
> + dout("%p invalidatepage %p idx %lu partial dirty page\n",
> + inode, page, page->index);
> + return;
> + }
> +
> +#ifdef CONFIG_CEPH_FSCACHE
> + if (PageFsCache(page))
> + ceph_invalidate_fscache_page(inode, page);
> +#endif
> +
> + if (!PagePrivate(page))
> + return;
> +
> + BUG_ON(!PageLocked(page));
>
> /*
> * We can get non-dirty pages here due to races between
> @@ -163,31 +179,32 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
> if (!PageDirty(page))
> pr_err("%p invalidatepage %p page not dirty\n", inode, page);
>
> - if (offset == 0)
> - ClearPageChecked(page);
> + ClearPageChecked(page);
>
> - ci = ceph_inode(inode);
> - if (offset == 0) {
> - dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
> - inode, page, page->index, offset);
> - ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
> - ceph_put_snap_context(snapc);
> - page->private = 0;
> - ClearPagePrivate(page);
> - } else {
> - dout("%p invalidatepage %p idx %lu partial dirty page\n",
> - inode, page, page->index);
> - }
> + dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
> + inode, page, page->index, offset);
> +
> + ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
> + ceph_put_snap_context(snapc);
> + page->private = 0;
> + ClearPagePrivate(page);
> }
>
> -/* just a sanity check */
> static int ceph_releasepage(struct page *page, gfp_t g)
> {
> struct inode *inode = page->mapping ? page->mapping->host : NULL;
> dout("%p releasepage %p idx %lu\n", inode, page, page->index);
> WARN_ON(PageDirty(page));
> - WARN_ON(PagePrivate(page));
> - return 0;
> +
> +#ifdef CONFIG_CEPH_FSCACHE
> + /* Can we release the page from the cache? */
> + if (PageFsCache(page) && ceph_release_fscache_page(page, g) == 0)
> + return 0;
> +#endif
> + if (PagePrivate(page))
> + return 0;
> +
> + return 1;
> }
>
> /*
> @@ -202,6 +219,13 @@ static int readpage_nounlock(struct file *filp, struct page *page)
> int err = 0;
> u64 len = PAGE_CACHE_SIZE;
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + err = ceph_readpage_from_fscache(inode, page);
> +
> + if (err == 0)
> + goto out;
> +#endif
> +
> dout("readpage inode %p file %p page %p index %lu\n",
> inode, filp, page, page->index);
> err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
> @@ -219,6 +243,11 @@ static int readpage_nounlock(struct file *filp, struct page *page)
> }
> SetPageUptodate(page);
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + if (err == 0)
> + ceph_readpage_to_fscache(inode, page);
> +#endif
> +
> out:
> return err < 0 ? err : 0;
> }
> @@ -261,6 +290,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
> page->index);
> flush_dcache_page(page);
> SetPageUptodate(page);
> +#ifdef CONFIG_CEPH_FSCACHE
> + ceph_readpage_to_fscache(inode, page);
> +#endif
> unlock_page(page);
> page_cache_release(page);
> bytes -= PAGE_CACHE_SIZE;
> @@ -330,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
> page = list_entry(page_list->prev, struct page, lru);
> BUG_ON(PageLocked(page));
> list_del(&page->lru);
> -
> +
> dout("start_read %p adding %p idx %lu\n", inode, page,
> page->index);
> if (add_to_page_cache_lru(page, &inode->i_data, page->index,
> @@ -377,6 +409,14 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
> int rc = 0;
> int max = 0;
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
> + &nr_pages);
> +
> + if (rc == 0)
> + goto out;
> +#endif
> +
> if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
> max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
> >> PAGE_SHIFT;
> @@ -490,6 +530,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
> CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
> set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + ceph_readpage_to_fscache(inode, page);
> +#endif
> +
> set_page_writeback(page);
> err = ceph_osdc_writepages(osdc, ceph_vino(inode),
> &ci->i_layout, snapc,
> diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
> new file mode 100644
> index 0000000..1ee3461
> --- /dev/null
> +++ b/fs/ceph/cache.c
> @@ -0,0 +1,334 @@
> +/*
> + * Ceph cache definitions.
> + *
> + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
> + * Written by Milosz Tanski (milosz@adfin.com)
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to:
> + * Free Software Foundation
> + * 51 Franklin Street, Fifth Floor
> + * Boston, MA 02111-1301 USA
> + *
> + */
> +
> +#include "super.h"
> +#include "cache.h"
> +
> +struct ceph_aux_inode {
> + struct timespec mtime;
> + loff_t size;
> +};
> +
> +struct fscache_netfs ceph_cache_netfs = {
> + .name = "ceph",
> + .version = 0,
> +};
> +
> +static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
> + void *buffer, uint16_t maxbuf)
> +{
> + const struct ceph_fs_client* fsc = cookie_netfs_data;
> + uint16_t klen;
> +
> + klen = sizeof(fsc->client->fsid);
> + if (klen > maxbuf)
> + return 0;
> +
> + memcpy(buffer, &fsc->client->fsid, klen);
> + return klen;
> +}
> +
> +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
> + .name = "CEPH.fsid",
> + .type = FSCACHE_COOKIE_TYPE_INDEX,
> + .get_key = ceph_fscache_session_get_key,
> +};
> +
> +void ceph_fscache_register_fsid_cookie(struct ceph_fs_client* fsc)
> +{
> + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
> + &ceph_fscache_fsid_object_def,
> + fsc);
> +}
> +
> +void ceph_fscache_unregister_fsid_cookie(struct ceph_fs_client* fsc)
> +{
> + fscache_relinquish_cookie(fsc->fscache, 0);
> + fsc->fscache = NULL;
> +}
> +
> +static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
> + void *buffer, uint16_t maxbuf)
> +{
> + const struct ceph_inode_info* ci = cookie_netfs_data;
> + uint16_t klen;
> +
> + /* use ceph virtual inode (id + snaphot) */
> + klen = sizeof(ci->i_vino);
> + if (klen > maxbuf)
> + return 0;
> +
> + memcpy(buffer, &ci->i_vino, klen);
> + return klen;
> +}
> +
> +static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
> + void *buffer, uint16_t bufmax)
> +{
> + struct ceph_aux_inode aux;
> + const struct ceph_inode_info* ci = cookie_netfs_data;
> + const struct inode* inode = &ci->vfs_inode;
> +
> + memset(&aux, 0, sizeof(aux));
> + aux.mtime = inode->i_mtime;
> + aux.size = inode->i_size;
> +
> + memcpy(buffer, &aux, sizeof(aux));
> +
> + return sizeof(aux);
> +}
> +
> +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
> + uint64_t *size)
> +{
> + const struct ceph_inode_info* ci = cookie_netfs_data;
> + const struct inode* inode = &ci->vfs_inode;
> +
> + *size = inode->i_size;
> +}
> +
> +static enum fscache_checkaux ceph_fscache_inode_check_aux(
> + void *cookie_netfs_data, const void *data, uint16_t dlen)
> +{
> + struct ceph_aux_inode aux;
> + struct ceph_inode_info* ci = cookie_netfs_data;
> + struct inode* inode = &ci->vfs_inode;
> +
> + if (dlen != sizeof(aux))
> + return FSCACHE_CHECKAUX_OBSOLETE;
> +
> + memset(&aux, 0, sizeof(aux));
> + aux.mtime = inode->i_mtime;
> + aux.size = inode->i_size;
> +
> + if (memcmp(data, &aux, sizeof(aux)) != 0)
> + return FSCACHE_CHECKAUX_OBSOLETE;
> +
> + dout("ceph inode 0x%p cached okay", ci);
> + return FSCACHE_CHECKAUX_OKAY;
> +}
> +
> +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
> +{
> + struct ceph_inode_info* ci = cookie_netfs_data;
> + struct pagevec pvec;
> + pgoff_t first;
> + int loop, nr_pages;
> +
> + pagevec_init(&pvec, 0);
> + first = 0;
> +
> + dout("ceph inode 0x%p now uncached", ci);
> +
> + while (1) {
> + nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
> + PAGEVEC_SIZE - pagevec_count(&pvec));
> +
> + if (!nr_pages)
> + break;
> +
> + for (loop = 0; loop < nr_pages; loop++)
> + ClearPageFsCache(pvec.pages[loop]);
> +
> + first = pvec.pages[nr_pages - 1]->index + 1;
> +
> + pvec.nr = nr_pages;
> + pagevec_release(&pvec);
> + cond_resched();
> + }
> +}
> +
> +static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
> + .name = "CEPH.inode",
> + .type = FSCACHE_COOKIE_TYPE_DATAFILE,
> + .get_key = ceph_fscache_inode_get_key,
> + .get_attr = ceph_fscache_inode_get_attr,
> + .get_aux = ceph_fscache_inode_get_aux,
> + .check_aux = ceph_fscache_inode_check_aux,
> + .now_uncached = ceph_fscache_inode_now_uncached,
> +};
> +
> +
> +static int get_caps_issued(struct ceph_inode_info* ci)
> +{
> + int issued;
> + int implemented = 0;
> +
> + issued = __ceph_caps_issued(ci, &implemented);
> + issued |= implemented | __ceph_caps_dirty(ci);
> + return issued;
> +}
> +
> +
> +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
> + struct ceph_inode_info* ci)
> +{
> + const int want = (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO);
> +
> + /* No caching for filesystem */
> + if (fsc->fscache == NULL)
> + return;
> + /* Only do it for data files */
> + if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
> + return;
> +
> + if (ci->fscache)
> + return;
> + if ((get_caps_issued(ci) & want) == 0) {
> + dout("No caps for caching inode: %p", &ci->vfs_inode);
> + return;
> + }
> +
> + ci->fscache = fscache_acquire_cookie(fsc->fscache,
> + &ceph_fscache_inode_object_def,
> + ci);
> +}
> +
> +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
> +{
> + if (ci->fscache == NULL)
> + return;
> +
> + fscache_relinquish_cookie(ci->fscache, 0);
> + ci->fscache = NULL;
> +}
> +
> +void ceph_fscache_revoke_inode_cookie(struct ceph_inode_info* ci)
> +{
> + if (ci->fscache == NULL)
> + return;
> +
> + fscache_invalidate(ci->fscache);
> + /* Make sure the cache is cleared after we close the handle */
> + fscache_relinquish_cookie(ci->fscache, 1);
> + ci->fscache = NULL;
> +}
> +
> +void __ceph_fscache_async_uncache_inode(struct ceph_inode_info* ci)
> +{
> + fscache_uncache_all_inode_pages(ci->fscache, &ci->vfs_inode);
> +}
> +
> +static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
> +{
> + if (!error)
> + SetPageUptodate(page);
> +}
> +
> +static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
> +{
> + if (!error) {
> + SetPageUptodate(page);
> + unlock_page(page);
> + }
> +}
> +
> +/* Atempt to read from the fscache,
> + *
> + * This function is called from the readpage_nounlock context. DO NOT attempt to
> + * unlock the page here (or in the callback).
> + */
> +int __ceph_readpage_from_fscache(struct inode *inode, struct page *page)
> +{
> + const struct ceph_inode_info *ci = ceph_inode(inode);
> + int ret;
> +
> + ret = fscache_read_or_alloc_page(ci->fscache, page,
> + ceph_vfs_readpage_complete, NULL,
> + GFP_KERNEL);
> +
> + switch (ret) {
> + case 0: /* Page found */
> + dout("page read submitted\n");
> + return 0;
> + case -ENOBUFS: /* Pages were not found, and can't be */
> + case -ENODATA: /* Pages were not found */
> + dout("page/inode not in cache\n");
> + return 1;
> + default:
> + dout("%s: unknown error ret = %i\n", __func__, ret);
> + return ret;
> + }
> +}
> +
> +int __ceph_readpages_from_fscache(struct inode *inode,
> + struct address_space *mapping,
> + struct list_head *pages,
> + unsigned *nr_pages)
> +{
> + struct ceph_inode_info *ci = ceph_inode(inode);
> + int issued = get_caps_issued(ci);
> + const int want = (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO);
> + int ret;
> +
> + /* Check if we have cached read caps */
> + if ((issued & want) == 0) {
> + return -ENOBUFS;
> + }
> +
> + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
> + ceph_vfs_readpage_complete_unlock,
> + NULL, mapping_gfp_mask(mapping));
> +
> + switch (ret) {
> + case 0: /* All pages found */
> + dout("all-page read submitted\n");
> + return 0;
> + case -ENOBUFS: /* Some pages were not found, and can't be */
> + case -ENODATA: /* some pages were not found */
> + dout("page/inode not in cache\n");
> + return 1;
> + default:
> + dout("%s: unknown error ret = %i\n", __func__, ret);
> + return ret;
> + }
> +}
> +
> +void __ceph_readpage_to_fscache(struct inode *inode, struct page *page)
> +{
> + const struct ceph_inode_info *ci = ceph_inode(inode);
> + int ret;
> +
> + ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
> + if (ret)
> + fscache_uncache_page(ci->fscache, page);
> +}
> +
> +void __ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
> +{
> + const struct ceph_inode_info *ci = ceph_inode(inode);
> + struct fscache_cookie *cookie = ci->fscache;
> +
> + fscache_wait_on_page_write(cookie, page);
> + fscache_uncache_page(cookie, page);
> +}
> +
> +int __ceph_release_fscache_page(struct page *page, gfp_t gfp)
> +{
> + struct inode* inode = page->mapping->host;
> + struct ceph_inode_info *ci = ceph_inode(inode);
> + struct fscache_cookie *cookie = ci->fscache;
> +
> + return fscache_maybe_release_page(cookie, page, gfp);
> +}
> +
> diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
> new file mode 100644
> index 0000000..7d24151
> --- /dev/null
> +++ b/fs/ceph/cache.h
> @@ -0,0 +1,115 @@
> +/*
> + * Ceph cache definitions.
> + *
> + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
> + * Written by Milosz Tanski (milosz@adfin.com)
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2
> + * as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to:
> + * Free Software Foundation
> + * 51 Franklin Street, Fifth Floor
> + * Boston, MA 02111-1301 USA
> + *
> + */
> +
> +#ifndef _CEPH_CACHE_H
> +#define _CEPH_CACHE_H
> +#ifdef CONFIG_CEPH_FSCACHE
> +
> +#include <linux/fscache.h>
> +
> +
> +extern struct fscache_netfs ceph_cache_netfs;
> +
> +
> +void ceph_fscache_inode_get_cookie(struct inode *inode);
> +
> +void ceph_fscache_register_fsid_cookie(struct ceph_fs_client* fsc);
> +void ceph_fscache_unregister_fsid_cookie(struct ceph_fs_client* fsc);
> +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
> + struct ceph_inode_info* ci);
> +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
> +void ceph_fscache_revoke_inode_cookie(struct ceph_inode_info* ci);
> +void __ceph_fscache_async_uncache_inode(struct ceph_inode_info* ci);
> +
> +int __ceph_readpage_from_fscache(struct inode *inode, struct page *page);
> +int __ceph_readpages_from_fscache(struct inode *inode,
> + struct address_space *mapping,
> + struct list_head *pages,
> + unsigned *nr_pages);
> +void __ceph_readpage_to_fscache(struct inode *inode, struct page *page);
> +void __ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
> +int __ceph_release_fscache_page(struct page *page, gfp_t gfp);
> +
> +static inline void ceph_fsxache_async_uncache_inode(struct inode* inode)
> +{
> + struct ceph_inode_info *ci = ceph_inode(inode);
> +
> + if (ci->fscache == NULL)
> + return;
> +
> + __ceph_fscache_async_uncache_inode(ci);
> +}
> +
> +static inline int ceph_readpage_from_fscache(struct inode *inode,
> + struct page *page)
> +{
> + if (ceph_inode(inode)->fscache == NULL)
> + return -ENOBUFS;
> +
> + return __ceph_readpage_from_fscache(inode, page);
> +}
> +
> +static inline int ceph_readpages_from_fscache(struct inode *inode,
> + struct address_space *mapping,
> + struct list_head *pages,
> + unsigned *nr_pages)
> +{
> + if (ceph_inode(inode)->fscache == NULL)
> + return -ENOBUFS;
> +
> + return __ceph_readpages_from_fscache(inode, mapping, pages, nr_pages);
> +}
> +
> +static inline void ceph_readpage_to_fscache(struct inode *inode,
> + struct page *page)
> +{
> + if (ceph_inode(inode)->fscache == NULL)
> + return;
> +
> + if (PageFsCache(page))
> + return __ceph_readpage_to_fscache(inode, page);
> +}
> +
> +static inline void ceph_invalidate_fscache_page(struct inode *inode,
> + struct page *page)
> +{
> + if (ceph_inode(inode)->fscache == NULL)
> + return;
> +
> + if (PageFsCache(page))
> + return __ceph_invalidate_fscache_page(inode, page);
> +}
> +
> +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
> +{
> + struct inode* inode = page->mapping->host;
> + struct ceph_inode_info *ci = ceph_inode(inode);
> +
> + if (ci->fscache == NULL)
> + return 1;
> +
> + return __ceph_release_fscache_page(page, gfp);
> +}
> +
> +#endif
> +#endif
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index da0f9b8..5379f41 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -10,6 +10,7 @@
>
> #include "super.h"
> #include "mds_client.h"
> +#include "cache.h"
> #include <linux/ceph/decode.h>
> #include <linux/ceph/messenger.h>
>
> @@ -2366,6 +2367,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
> ci->i_rdcache_revoking = ci->i_rdcache_gen;
> }
> }
> +
> +#ifdef CONFIG_CEPH_FSCACHE
> + /* Close the fscache on inode */
> + ceph_fscache_unregister_inode_cookie(ci);
> +#endif
> }
>
> /* side effects now are allowed */
> @@ -2425,6 +2431,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
> wake = 1;
> }
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + /* Register cache (if needed); perform this after any size change. */
> + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
> + ceph_fscache_register_inode_cookie(session->s_mdsc->fsc, ci);
> +#endif
> +
> /* check cap bits */
> wanted = __ceph_caps_wanted(ci);
> used = __ceph_caps_used(ci);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 656e169..2162b35 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -11,6 +11,7 @@
>
> #include "super.h"
> #include "mds_client.h"
> +#include "cache.h"
>
> /*
> * Ceph file operations
> @@ -67,10 +68,17 @@ out:
> static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> {
> struct ceph_file_info *cf;
> + struct ceph_inode_info *ci = ceph_inode(inode);
> + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> int ret = 0;
>
> switch (inode->i_mode & S_IFMT) {
> case S_IFREG:
> +#ifdef CONFIG_CEPH_FSCACHE
> + spin_lock(&ci->i_ceph_lock);
> + ceph_fscache_register_inode_cookie(fsc, ci);
> + spin_unlock(&ci->i_ceph_lock);
> +#endif
> case S_IFDIR:
> dout("init_file %p %p 0%o (regular)\n", inode, file,
> inode->i_mode);
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index be0f7e2..5144b36 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -12,6 +12,7 @@
>
> #include "super.h"
> #include "mds_client.h"
> +#include "cache.h"
> #include <linux/ceph/decode.h>
>
> /*
> @@ -377,6 +378,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>
> INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + ci->fscache = NULL;
> +#endif
> +
> return &ci->vfs_inode;
> }
>
> @@ -396,6 +401,10 @@ void ceph_destroy_inode(struct inode *inode)
>
> dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + ceph_fscache_unregister_inode_cookie(ci);
> +#endif
> +
> ceph_queue_caps_release(inode);
>
> /*
> @@ -430,7 +439,6 @@ void ceph_destroy_inode(struct inode *inode)
> call_rcu(&inode->i_rcu, ceph_i_callback);
> }
>
> -
> /*
> * Helpers to fill in size, ctime, mtime, and atime. We have to be
> * careful because either the client or MDS may have more up to date
> @@ -633,6 +641,14 @@ static int fill_inode(struct inode *inode,
> le32_to_cpu(info->time_warp_seq),
> &ctime, &mtime, &atime);
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + /* Notify the cache that size has changed */
> + if (queue_trunc && ci->fscache) {
> + pr_info("size changed inode: %p cap flags\n", &ci->vfs_inode);
> + fscache_attr_changed(ci->fscache);
> + }
> +#endif
> +
> /* only update max_size on auth cap */
> if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
> ci->i_max_size != le64_to_cpu(info->max_size)) {
> @@ -1430,6 +1446,11 @@ static void ceph_invalidate_work(struct work_struct *work)
> orig_gen = ci->i_rdcache_gen;
> spin_unlock(&ci->i_ceph_lock);
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + dout("cache invalidating inode: %p cap flags\n", &ci->vfs_inode);
> + fscache_invalidate(ci->fscache);
> +#endif
> +
> truncate_inode_pages(&inode->i_data, 0);
>
> spin_lock(&ci->i_ceph_lock);
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 7d377c9..850c161 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -17,6 +17,7 @@
>
> #include "super.h"
> #include "mds_client.h"
> +#include "cache.h"
>
> #include <linux/ceph/ceph_features.h>
> #include <linux/ceph/decode.h>
> @@ -142,6 +143,8 @@ enum {
> Opt_nodcache,
> Opt_ino32,
> Opt_noino32,
> + Opt_fscache,
> + Opt_nofscache
> };
>
> static match_table_t fsopt_tokens = {
> @@ -167,6 +170,8 @@ static match_table_t fsopt_tokens = {
> {Opt_nodcache, "nodcache"},
> {Opt_ino32, "ino32"},
> {Opt_noino32, "noino32"},
> + {Opt_fscache, "fsc"},
> + {Opt_nofscache, "nofsc"},
> {-1, NULL}
> };
>
> @@ -260,6 +265,12 @@ static int parse_fsopt_token(char *c, void *private)
> case Opt_noino32:
> fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
> break;
> + case Opt_fscache:
> + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
> + break;
> + case Opt_nofscache:
> + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
> + break;
> default:
> BUG_ON(token);
> }
> @@ -422,6 +433,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
> seq_puts(m, ",dcache");
> else
> seq_puts(m, ",nodcache");
> + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
> + seq_puts(m, ",fsc");
> + else
> + seq_puts(m, ",nofsc");
>
> if (fsopt->wsize)
> seq_printf(m, ",wsize=%d", fsopt->wsize);
> @@ -530,6 +545,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
> if (!fsc->wb_pagevec_pool)
> goto fail_trunc_wq;
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE))
> + ceph_fscache_register_fsid_cookie(fsc);
> +#endif
> +
> /* caps */
> fsc->min_caps = fsopt->max_readdir;
>
> @@ -554,6 +574,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
> {
> dout("destroy_fs_client %p\n", fsc);
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + ceph_fscache_unregister_fsid_cookie(fsc);
> +#endif
> +
> destroy_workqueue(fsc->wb_wq);
> destroy_workqueue(fsc->pg_inv_wq);
> destroy_workqueue(fsc->trunc_wq);
> @@ -588,6 +612,8 @@ static void ceph_inode_init_once(void *foo)
>
> static int __init init_caches(void)
> {
> + int error = -ENOMEM;
> +
> ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
> sizeof(struct ceph_inode_info),
> __alignof__(struct ceph_inode_info),
> @@ -611,15 +637,19 @@ static int __init init_caches(void)
> if (ceph_file_cachep == NULL)
> goto bad_file;
>
> - return 0;
> +#ifdef CONFIG_CEPH_FSCACHE
> + if ((error = fscache_register_netfs(&ceph_cache_netfs)))
> + goto bad_file;
> +#endif
>
> + return 0;
> bad_file:
> kmem_cache_destroy(ceph_dentry_cachep);
> bad_dentry:
> kmem_cache_destroy(ceph_cap_cachep);
> bad_cap:
> kmem_cache_destroy(ceph_inode_cachep);
> - return -ENOMEM;
> + return error;
> }
>
> static void destroy_caches(void)
> @@ -629,10 +659,15 @@ static void destroy_caches(void)
> * destroy cache.
> */
> rcu_barrier();
> +
> kmem_cache_destroy(ceph_inode_cachep);
> kmem_cache_destroy(ceph_cap_cachep);
> kmem_cache_destroy(ceph_dentry_cachep);
> kmem_cache_destroy(ceph_file_cachep);
> +
> +#ifdef CONFIG_CEPH_FSCACHE
> + fscache_unregister_netfs(&ceph_cache_netfs);
> +#endif
> }
>
>
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 7ccfdb4..5ddaad5 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -16,6 +16,10 @@
>
> #include <linux/ceph/libceph.h>
>
> +#ifdef CONFIG_CEPH_FSCACHE
> +#include <linux/fscache.h>
> +#endif
> +
> /* f_type in struct statfs */
> #define CEPH_SUPER_MAGIC 0x00c36400
>
> @@ -29,6 +33,7 @@
> #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
> #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
> #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
> +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
>
> #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
>
> @@ -90,6 +95,10 @@ struct ceph_fs_client {
> struct dentry *debugfs_bdi;
> struct dentry *debugfs_mdsc, *debugfs_mdsmap;
> #endif
> +
> +#ifdef CONFIG_CEPH_FSCACHE
> + struct fscache_cookie *fscache;
> +#endif
> };
>
>
> @@ -319,6 +328,10 @@ struct ceph_inode_info {
>
> struct work_struct i_vmtruncate_work;
>
> +#ifdef CONFIG_CEPH_FSCACHE
> + struct fscache_cookie *fscache;
> +#endif
> +
> struct inode vfs_inode; /* at end */
> };
>
> --
> 1.7.10.4
>
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] ceph: Add FScache support
2013-06-29 3:58 ` Milosz Tanski
@ 2013-07-01 15:55 ` Milosz Tanski
2013-07-02 19:14 ` David Howells
0 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-07-01 15:55 UTC (permalink / raw)
To: ceph-devel; +Cc: Sage Weil, Yan, Zheng, linux-cachefs, David Howells
Now that 3.10 is out I hope that David Howells's fscache improvement
branch will go into the next kernel. Without that, the ceph fscache
code seems to run into an assertion in the fscache with relative ease.
With those in place on our systems we're not able to find any
additional fscache issues with the latest patches I submitted. It's my
hope that after that hurdle is cleared you guys can accept the fscache
changes into your tree (and thus into the mainline kernel down the
road).
Thanks,
- Milosz
On Fri, Jun 28, 2013 at 11:58 PM, Milosz Tanski <milosz@adfin.com> wrote:
> On our side we're pretty happy where this is at now. In our
> pre-production environment we are unable to find any issues with the
> current patch (not that there aren't any).
>
> Changes since the last patch:
> - Combine the two patches into one.
> - Fixed typos you guys found.
> - Fixed a bug where we were disposing of the cookie and then calling
> page invalidate on the inode, leading to being unable to invalidate
> pages marked PRIVATE_2. The simple fix was to move unregistering the
> cookies past the invalidate.
> - Fixed some accidental whitespace changes that snuck in.
>
> As always I welcome you guys' feedback. At this point in time I feel
> pretty good about the state this is in.
>
> -- Milosz
>
> On Fri, Jun 28, 2013 at 11:51 PM, Milosz Tanski <milosz@adfin.com> wrote:
>> Adding support for fscache to the Ceph filesystem. This would bring it on
>> par with some of the other network filesystems in Linux (like NFS, AFS, etc.).
>>
>> This code uses existing ceph capabilities (cache & lazy io) to determine
>> if an inode is cacheable.
>>
>> In order to mount the filesystem with fscache the 'fsc' mount option must be
>> passed.
>>
>> Signed-off-by: Milosz Tanski <milosz@adfin.com>
>> ---
>> fs/ceph/Kconfig | 9 ++
>> fs/ceph/Makefile | 2 +
>> fs/ceph/addr.c | 84 ++++++++++----
>> fs/ceph/cache.c | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/ceph/cache.h | 115 +++++++++++++++++++
>> fs/ceph/caps.c | 12 ++
>> fs/ceph/file.c | 8 ++
>> fs/ceph/inode.c | 23 +++-
>> fs/ceph/super.c | 39 ++++++-
>> fs/ceph/super.h | 13 +++
>> 10 files changed, 616 insertions(+), 23 deletions(-)
>> create mode 100644 fs/ceph/cache.c
>> create mode 100644 fs/ceph/cache.h
>>
>> diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
>> index 49bc782..ac9a2ef 100644
>> --- a/fs/ceph/Kconfig
>> +++ b/fs/ceph/Kconfig
>> @@ -16,3 +16,12 @@ config CEPH_FS
>>
>> If unsure, say N.
>>
>> +if CEPH_FS
>> +config CEPH_FSCACHE
>> + bool "Enable Ceph client caching support"
>> + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
>> + help
>> + Choose Y here to enable persistent, read-only local
>> + caching support for Ceph clients using FS-Cache
>> +
>> +endif
>> diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
>> index bd35212..0af0678 100644
>> --- a/fs/ceph/Makefile
>> +++ b/fs/ceph/Makefile
>> @@ -9,3 +9,5 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
>> mds_client.o mdsmap.o strings.o ceph_frag.o \
>> debugfs.o
>>
>> +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
>> +
>> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>> index 3e68ac1..b22610f 100644
>> --- a/fs/ceph/addr.c
>> +++ b/fs/ceph/addr.c
>> @@ -11,6 +11,7 @@
>>
>> #include "super.h"
>> #include "mds_client.h"
>> +#include "cache.h"
>> #include <linux/ceph/osd_client.h>
>>
>> /*
>> @@ -149,11 +150,26 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
>> struct ceph_inode_info *ci;
>> struct ceph_snap_context *snapc = page_snap_context(page);
>>
>> - BUG_ON(!PageLocked(page));
>> - BUG_ON(!PagePrivate(page));
>> BUG_ON(!page->mapping);
>>
>> inode = page->mapping->host;
>> + ci = ceph_inode(inode);
>> +
>> + if (offset != 0) {
>> + dout("%p invalidatepage %p idx %lu partial dirty page\n",
>> + inode, page, page->index);
>> + return;
>> + }
>> +
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + if (PageFsCache(page))
>> + ceph_invalidate_fscache_page(inode, page);
>> +#endif
>> +
>> + if (!PagePrivate(page))
>> + return;
>> +
>> + BUG_ON(!PageLocked(page));
>>
>> /*
>> * We can get non-dirty pages here due to races between
>> @@ -163,31 +179,32 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
>> if (!PageDirty(page))
>> pr_err("%p invalidatepage %p page not dirty\n", inode, page);
>>
>> - if (offset == 0)
>> - ClearPageChecked(page);
>> + ClearPageChecked(page);
>>
>> - ci = ceph_inode(inode);
>> - if (offset == 0) {
>> - dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
>> - inode, page, page->index, offset);
>> - ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
>> - ceph_put_snap_context(snapc);
>> - page->private = 0;
>> - ClearPagePrivate(page);
>> - } else {
>> - dout("%p invalidatepage %p idx %lu partial dirty page\n",
>> - inode, page, page->index);
>> - }
>> + dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
>> + inode, page, page->index, offset);
>> +
>> + ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
>> + ceph_put_snap_context(snapc);
>> + page->private = 0;
>> + ClearPagePrivate(page);
>> }
>>
>> -/* just a sanity check */
>> static int ceph_releasepage(struct page *page, gfp_t g)
>> {
>> struct inode *inode = page->mapping ? page->mapping->host : NULL;
>> dout("%p releasepage %p idx %lu\n", inode, page, page->index);
>> WARN_ON(PageDirty(page));
>> - WARN_ON(PagePrivate(page));
>> - return 0;
>> +
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + /* Can we release the page from the cache? */
>> + if (PageFsCache(page) && ceph_release_fscache_page(page, g) == 0)
>> + return 0;
>> +#endif
>> + if (PagePrivate(page))
>> + return 0;
>> +
>> + return 1;
>> }
>>
>> /*
>> @@ -202,6 +219,13 @@ static int readpage_nounlock(struct file *filp, struct page *page)
>> int err = 0;
>> u64 len = PAGE_CACHE_SIZE;
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + err = ceph_readpage_from_fscache(inode, page);
>> +
>> + if (err == 0)
>> + goto out;
>> +#endif
>> +
>> dout("readpage inode %p file %p page %p index %lu\n",
>> inode, filp, page, page->index);
>> err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
>> @@ -219,6 +243,11 @@ static int readpage_nounlock(struct file *filp, struct page *page)
>> }
>> SetPageUptodate(page);
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + if (err == 0)
>> + ceph_readpage_to_fscache(inode, page);
>> +#endif
>> +
>> out:
>> return err < 0 ? err : 0;
>> }
>> @@ -261,6 +290,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
>> page->index);
>> flush_dcache_page(page);
>> SetPageUptodate(page);
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + ceph_readpage_to_fscache(inode, page);
>> +#endif
>> unlock_page(page);
>> page_cache_release(page);
>> bytes -= PAGE_CACHE_SIZE;
>> @@ -330,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
>> page = list_entry(page_list->prev, struct page, lru);
>> BUG_ON(PageLocked(page));
>> list_del(&page->lru);
>> -
>> +
>> dout("start_read %p adding %p idx %lu\n", inode, page,
>> page->index);
>> if (add_to_page_cache_lru(page, &inode->i_data, page->index,
>> @@ -377,6 +409,14 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
>> int rc = 0;
>> int max = 0;
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
>> + &nr_pages);
>> +
>> + if (rc == 0)
>> + goto out;
>> +#endif
>> +
>> if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
>> max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> >> PAGE_SHIFT;
>> @@ -490,6 +530,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
>> CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
>> set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + ceph_readpage_to_fscache(inode, page);
>> +#endif
>> +
>> set_page_writeback(page);
>> err = ceph_osdc_writepages(osdc, ceph_vino(inode),
>> &ci->i_layout, snapc,
>> diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
>> new file mode 100644
>> index 0000000..1ee3461
>> --- /dev/null
>> +++ b/fs/ceph/cache.c
>> @@ -0,0 +1,334 @@
>> +/*
>> + * Ceph cache definitions.
>> + *
>> + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
>> + * Written by Milosz Tanski (milosz@adfin.com)
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2
>> + * as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to:
>> + * Free Software Foundation
>> + * 51 Franklin Street, Fifth Floor
>> + * Boston, MA 02111-1301 USA
>> + *
>> + */
>> +
>> +#include "super.h"
>> +#include "cache.h"
>> +
>> +struct ceph_aux_inode {
>> + struct timespec mtime;
>> + loff_t size;
>> +};
>> +
>> +struct fscache_netfs ceph_cache_netfs = {
>> + .name = "ceph",
>> + .version = 0,
>> +};
>> +
>> +static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
>> + void *buffer, uint16_t maxbuf)
>> +{
>> + const struct ceph_fs_client* fsc = cookie_netfs_data;
>> + uint16_t klen;
>> +
>> + klen = sizeof(fsc->client->fsid);
>> + if (klen > maxbuf)
>> + return 0;
>> +
>> + memcpy(buffer, &fsc->client->fsid, klen);
>> + return klen;
>> +}
>> +
>> +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
>> + .name = "CEPH.fsid",
>> + .type = FSCACHE_COOKIE_TYPE_INDEX,
>> + .get_key = ceph_fscache_session_get_key,
>> +};
>> +
>> +void ceph_fscache_register_fsid_cookie(struct ceph_fs_client* fsc)
>> +{
>> + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
>> + &ceph_fscache_fsid_object_def,
>> + fsc);
>> +}
>> +
>> +void ceph_fscache_unregister_fsid_cookie(struct ceph_fs_client* fsc)
>> +{
>> + fscache_relinquish_cookie(fsc->fscache, 0);
>> + fsc->fscache = NULL;
>> +}
>> +
>> +static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
>> + void *buffer, uint16_t maxbuf)
>> +{
>> + const struct ceph_inode_info* ci = cookie_netfs_data;
>> + uint16_t klen;
>> +
>> + /* use ceph virtual inode (id + snaphot) */
>> + klen = sizeof(ci->i_vino);
>> + if (klen > maxbuf)
>> + return 0;
>> +
>> + memcpy(buffer, &ci->i_vino, klen);
>> + return klen;
>> +}
>> +
>> +static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
>> + void *buffer, uint16_t bufmax)
>> +{
>> + struct ceph_aux_inode aux;
>> + const struct ceph_inode_info* ci = cookie_netfs_data;
>> + const struct inode* inode = &ci->vfs_inode;
>> +
>> + memset(&aux, 0, sizeof(aux));
>> + aux.mtime = inode->i_mtime;
>> + aux.size = inode->i_size;
>> +
>> + memcpy(buffer, &aux, sizeof(aux));
>> +
>> + return sizeof(aux);
>> +}
>> +
>> +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
>> + uint64_t *size)
>> +{
>> + const struct ceph_inode_info* ci = cookie_netfs_data;
>> + const struct inode* inode = &ci->vfs_inode;
>> +
>> + *size = inode->i_size;
>> +}
>> +
>> +static enum fscache_checkaux ceph_fscache_inode_check_aux(
>> + void *cookie_netfs_data, const void *data, uint16_t dlen)
>> +{
>> + struct ceph_aux_inode aux;
>> + struct ceph_inode_info* ci = cookie_netfs_data;
>> + struct inode* inode = &ci->vfs_inode;
>> +
>> + if (dlen != sizeof(aux))
>> + return FSCACHE_CHECKAUX_OBSOLETE;
>> +
>> + memset(&aux, 0, sizeof(aux));
>> + aux.mtime = inode->i_mtime;
>> + aux.size = inode->i_size;
>> +
>> + if (memcmp(data, &aux, sizeof(aux)) != 0)
>> + return FSCACHE_CHECKAUX_OBSOLETE;
>> +
>> + dout("ceph inode 0x%p cached okay", ci);
>> + return FSCACHE_CHECKAUX_OKAY;
>> +}
>> +
>> +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
>> +{
>> + struct ceph_inode_info* ci = cookie_netfs_data;
>> + struct pagevec pvec;
>> + pgoff_t first;
>> + int loop, nr_pages;
>> +
>> + pagevec_init(&pvec, 0);
>> + first = 0;
>> +
>> + dout("ceph inode 0x%p now uncached", ci);
>> +
>> + while (1) {
>> + nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
>> + PAGEVEC_SIZE - pagevec_count(&pvec));
>> +
>> + if (!nr_pages)
>> + break;
>> +
>> + for (loop = 0; loop < nr_pages; loop++)
>> + ClearPageFsCache(pvec.pages[loop]);
>> +
>> + first = pvec.pages[nr_pages - 1]->index + 1;
>> +
>> + pvec.nr = nr_pages;
>> + pagevec_release(&pvec);
>> + cond_resched();
>> + }
>> +}
>> +
>> +static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
>> + .name = "CEPH.inode",
>> + .type = FSCACHE_COOKIE_TYPE_DATAFILE,
>> + .get_key = ceph_fscache_inode_get_key,
>> + .get_attr = ceph_fscache_inode_get_attr,
>> + .get_aux = ceph_fscache_inode_get_aux,
>> + .check_aux = ceph_fscache_inode_check_aux,
>> + .now_uncached = ceph_fscache_inode_now_uncached,
>> +};
>> +
>> +
>> +static int get_caps_issued(struct ceph_inode_info* ci)
>> +{
>> + int issued;
>> + int implemented = 0;
>> +
>> + issued = __ceph_caps_issued(ci, &implemented);
>> + issued |= implemented | __ceph_caps_dirty(ci);
>> + return issued;
>> +}
>> +
>> +
>> +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
>> + struct ceph_inode_info* ci)
>> +{
>> + const int want = (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO);
>> +
>> + /* No caching for filesystem */
>> + if (fsc->fscache == NULL)
>> + return;
>> + /* Only do it for data files */
>> + if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
>> + return;
>> +
>> + if (ci->fscache)
>> + return;
>> + if ((get_caps_issued(ci) & want) == 0) {
>> + dout("No caps for caching inode: %p", &ci->vfs_inode);
>> + return;
>> + }
>> +
>> + ci->fscache = fscache_acquire_cookie(fsc->fscache,
>> + &ceph_fscache_inode_object_def,
>> + ci);
>> +}
>> +
>> +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
>> +{
>> + if (ci->fscache == NULL)
>> + return;
>> +
>> + fscache_relinquish_cookie(ci->fscache, 0);
>> + ci->fscache = NULL;
>> +}
>> +
>> +void ceph_fscache_revoke_inode_cookie(struct ceph_inode_info* ci)
>> +{
>> + if (ci->fscache == NULL)
>> + return;
>> +
>> + fscache_invalidate(ci->fscache);
>> + /* Make sure the cache is cleared after we close the handle */
>> + fscache_relinquish_cookie(ci->fscache, 1);
>> + ci->fscache = NULL;
>> +}
>> +
>> +void __ceph_fscache_async_uncache_inode(struct ceph_inode_info* ci)
>> +{
>> + fscache_uncache_all_inode_pages(ci->fscache, &ci->vfs_inode);
>> +}
>> +
>> +static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
>> +{
>> + if (!error)
>> + SetPageUptodate(page);
>> +}
>> +
>> +static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
>> +{
>> + if (!error) {
>> + SetPageUptodate(page);
>> + unlock_page(page);
>> + }
>> +}
>> +
>> +/* Atempt to read from the fscache,
>> + *
>> + * This function is called from the readpage_nounlock context. DO NOT attempt to
>> + * unlock the page here (or in the callback).
>> + */
>> +int __ceph_readpage_from_fscache(struct inode *inode, struct page *page)
>> +{
>> + const struct ceph_inode_info *ci = ceph_inode(inode);
>> + int ret;
>> +
>> + ret = fscache_read_or_alloc_page(ci->fscache, page,
>> + ceph_vfs_readpage_complete, NULL,
>> + GFP_KERNEL);
>> +
>> + switch (ret) {
>> + case 0: /* Page found */
>> + dout("page read submitted\n");
>> + return 0;
>> + case -ENOBUFS: /* Pages were not found, and can't be */
>> + case -ENODATA: /* Pages were not found */
>> + dout("page/inode not in cache\n");
>> + return 1;
>> + default:
>> + dout("%s: unknown error ret = %i\n", __func__, ret);
>> + return ret;
>> + }
>> +}
>> +
>> +int __ceph_readpages_from_fscache(struct inode *inode,
>> + struct address_space *mapping,
>> + struct list_head *pages,
>> + unsigned *nr_pages)
>> +{
>> + struct ceph_inode_info *ci = ceph_inode(inode);
>> + int issued = get_caps_issued(ci);
>> + const int want = (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO);
>> + int ret;
>> +
>> + /* Check if we have cached read caps */
>> + if ((issued & want) == 0) {
>> + return -ENOBUFS;
>> + }
>> +
>> + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
>> + ceph_vfs_readpage_complete_unlock,
>> + NULL, mapping_gfp_mask(mapping));
>> +
>> + switch (ret) {
>> + case 0: /* All pages found */
>> + dout("all-page read submitted\n");
>> + return 0;
>> + case -ENOBUFS: /* Some pages were not found, and can't be */
>> + case -ENODATA: /* some pages were not found */
>> + dout("page/inode not in cache\n");
>> + return 1;
>> + default:
>> + dout("%s: unknown error ret = %i\n", __func__, ret);
>> + return ret;
>> + }
>> +}
>> +
>> +void __ceph_readpage_to_fscache(struct inode *inode, struct page *page)
>> +{
>> + const struct ceph_inode_info *ci = ceph_inode(inode);
>> + int ret;
>> +
>> + ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
>> + if (ret)
>> + fscache_uncache_page(ci->fscache, page);
>> +}
>> +
>> +void __ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
>> +{
>> + const struct ceph_inode_info *ci = ceph_inode(inode);
>> + struct fscache_cookie *cookie = ci->fscache;
>> +
>> + fscache_wait_on_page_write(cookie, page);
>> + fscache_uncache_page(cookie, page);
>> +}
>> +
>> +int __ceph_release_fscache_page(struct page *page, gfp_t gfp)
>> +{
>> + struct inode* inode = page->mapping->host;
>> + struct ceph_inode_info *ci = ceph_inode(inode);
>> + struct fscache_cookie *cookie = ci->fscache;
>> +
>> + return fscache_maybe_release_page(cookie, page, gfp);
>> +}
>> +
>> diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
>> new file mode 100644
>> index 0000000..7d24151
>> --- /dev/null
>> +++ b/fs/ceph/cache.h
>> @@ -0,0 +1,115 @@
>> +/*
>> + * Ceph cache definitions.
>> + *
>> + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
>> + * Written by Milosz Tanski (milosz@adfin.com)
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2
>> + * as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to:
>> + * Free Software Foundation
>> + * 51 Franklin Street, Fifth Floor
>> + * Boston, MA 02111-1301 USA
>> + *
>> + */
>> +
>> +#ifndef _CEPH_CACHE_H
>> +#define _CEPH_CACHE_H
>> +#ifdef CONFIG_CEPH_FSCACHE
>> +
>> +#include <linux/fscache.h>
>> +
>> +
>> +extern struct fscache_netfs ceph_cache_netfs;
>> +
>> +
>> +void ceph_fscache_inode_get_cookie(struct inode *inode);
>> +
>> +void ceph_fscache_register_fsid_cookie(struct ceph_fs_client* fsc);
>> +void ceph_fscache_unregister_fsid_cookie(struct ceph_fs_client* fsc);
>> +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
>> + struct ceph_inode_info* ci);
>> +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
>> +void ceph_fscache_revoke_inode_cookie(struct ceph_inode_info* ci);
>> +void __ceph_fscache_async_uncache_inode(struct ceph_inode_info* ci);
>> +
>> +int __ceph_readpage_from_fscache(struct inode *inode, struct page *page);
>> +int __ceph_readpages_from_fscache(struct inode *inode,
>> + struct address_space *mapping,
>> + struct list_head *pages,
>> + unsigned *nr_pages);
>> +void __ceph_readpage_to_fscache(struct inode *inode, struct page *page);
>> +void __ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
>> +int __ceph_release_fscache_page(struct page *page, gfp_t gfp);
>> +
>> +static inline void ceph_fsxache_async_uncache_inode(struct inode* inode)
>> +{
>> + struct ceph_inode_info *ci = ceph_inode(inode);
>> +
>> + if (ci->fscache == NULL)
>> + return;
>> +
>> + __ceph_fscache_async_uncache_inode(ci);
>> +}
>> +
>> +static inline int ceph_readpage_from_fscache(struct inode *inode,
>> + struct page *page)
>> +{
>> + if (ceph_inode(inode)->fscache == NULL)
>> + return -ENOBUFS;
>> +
>> + return __ceph_readpage_from_fscache(inode, page);
>> +}
>> +
>> +static inline int ceph_readpages_from_fscache(struct inode *inode,
>> + struct address_space *mapping,
>> + struct list_head *pages,
>> + unsigned *nr_pages)
>> +{
>> + if (ceph_inode(inode)->fscache == NULL)
>> + return -ENOBUFS;
>> +
>> + return __ceph_readpages_from_fscache(inode, mapping, pages, nr_pages);
>> +}
>> +
>> +static inline void ceph_readpage_to_fscache(struct inode *inode,
>> + struct page *page)
>> +{
>> + if (ceph_inode(inode)->fscache == NULL)
>> + return;
>> +
>> + if (PageFsCache(page))
>> + return __ceph_readpage_to_fscache(inode, page);
>> +}
>> +
>> +static inline void ceph_invalidate_fscache_page(struct inode *inode,
>> + struct page *page)
>> +{
>> + if (ceph_inode(inode)->fscache == NULL)
>> + return;
>> +
>> + if (PageFsCache(page))
>> + return __ceph_invalidate_fscache_page(inode, page);
>> +}
>> +
>> +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
>> +{
>> + struct inode* inode = page->mapping->host;
>> + struct ceph_inode_info *ci = ceph_inode(inode);
>> +
>> + if (ci->fscache == NULL)
>> + return 1;
>> +
>> + return __ceph_release_fscache_page(page, gfp);
>> +}
>> +
>> +#endif
>> +#endif
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index da0f9b8..5379f41 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -10,6 +10,7 @@
>>
>> #include "super.h"
>> #include "mds_client.h"
>> +#include "cache.h"
>> #include <linux/ceph/decode.h>
>> #include <linux/ceph/messenger.h>
>>
>> @@ -2366,6 +2367,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
>> ci->i_rdcache_revoking = ci->i_rdcache_gen;
>> }
>> }
>> +
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + /* Close the fscache on inode */
>> + ceph_fscache_unregister_inode_cookie(ci);
>> +#endif
>> }
>>
>> /* side effects now are allowed */
>> @@ -2425,6 +2431,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
>> wake = 1;
>> }
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + /* Register cache (if needed); perform this after any size change. */
>> + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
>> + ceph_fscache_register_inode_cookie(session->s_mdsc->fsc, ci);
>> +#endif
>> +
>> /* check cap bits */
>> wanted = __ceph_caps_wanted(ci);
>> used = __ceph_caps_used(ci);
>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> index 656e169..2162b35 100644
>> --- a/fs/ceph/file.c
>> +++ b/fs/ceph/file.c
>> @@ -11,6 +11,7 @@
>>
>> #include "super.h"
>> #include "mds_client.h"
>> +#include "cache.h"
>>
>> /*
>> * Ceph file operations
>> @@ -67,10 +68,17 @@ out:
>> static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>> {
>> struct ceph_file_info *cf;
>> + struct ceph_inode_info *ci = ceph_inode(inode);
>> + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
>> int ret = 0;
>>
>> switch (inode->i_mode & S_IFMT) {
>> case S_IFREG:
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + spin_lock(&ci->i_ceph_lock);
>> + ceph_fscache_register_inode_cookie(fsc, ci);
>> + spin_unlock(&ci->i_ceph_lock);
>> +#endif
>> case S_IFDIR:
>> dout("init_file %p %p 0%o (regular)\n", inode, file,
>> inode->i_mode);
>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>> index be0f7e2..5144b36 100644
>> --- a/fs/ceph/inode.c
>> +++ b/fs/ceph/inode.c
>> @@ -12,6 +12,7 @@
>>
>> #include "super.h"
>> #include "mds_client.h"
>> +#include "cache.h"
>> #include <linux/ceph/decode.h>
>>
>> /*
>> @@ -377,6 +378,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>>
>> INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + ci->fscache = NULL;
>> +#endif
>> +
>> return &ci->vfs_inode;
>> }
>>
>> @@ -396,6 +401,10 @@ void ceph_destroy_inode(struct inode *inode)
>>
>> dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + ceph_fscache_unregister_inode_cookie(ci);
>> +#endif
>> +
>> ceph_queue_caps_release(inode);
>>
>> /*
>> @@ -430,7 +439,6 @@ void ceph_destroy_inode(struct inode *inode)
>> call_rcu(&inode->i_rcu, ceph_i_callback);
>> }
>>
>> -
>> /*
>> * Helpers to fill in size, ctime, mtime, and atime. We have to be
>> * careful because either the client or MDS may have more up to date
>> @@ -633,6 +641,14 @@ static int fill_inode(struct inode *inode,
>> le32_to_cpu(info->time_warp_seq),
>> &ctime, &mtime, &atime);
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + /* Notify the cache that size has changed */
>> + if (queue_trunc && ci->fscache) {
>> + pr_info("size changed inode: %p cap flags\n", &ci->vfs_inode);
>> + fscache_attr_changed(ci->fscache);
>> + }
>> +#endif
>> +
>> /* only update max_size on auth cap */
>> if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
>> ci->i_max_size != le64_to_cpu(info->max_size)) {
>> @@ -1430,6 +1446,11 @@ static void ceph_invalidate_work(struct work_struct *work)
>> orig_gen = ci->i_rdcache_gen;
>> spin_unlock(&ci->i_ceph_lock);
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + dout("cache invalidating inode: %p cap flags\n", &ci->vfs_inode);
>> + fscache_invalidate(ci->fscache);
>> +#endif
>> +
>> truncate_inode_pages(&inode->i_data, 0);
>>
>> spin_lock(&ci->i_ceph_lock);
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index 7d377c9..850c161 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -17,6 +17,7 @@
>>
>> #include "super.h"
>> #include "mds_client.h"
>> +#include "cache.h"
>>
>> #include <linux/ceph/ceph_features.h>
>> #include <linux/ceph/decode.h>
>> @@ -142,6 +143,8 @@ enum {
>> Opt_nodcache,
>> Opt_ino32,
>> Opt_noino32,
>> + Opt_fscache,
>> + Opt_nofscache
>> };
>>
>> static match_table_t fsopt_tokens = {
>> @@ -167,6 +170,8 @@ static match_table_t fsopt_tokens = {
>> {Opt_nodcache, "nodcache"},
>> {Opt_ino32, "ino32"},
>> {Opt_noino32, "noino32"},
>> + {Opt_fscache, "fsc"},
>> + {Opt_nofscache, "nofsc"},
>> {-1, NULL}
>> };
>>
>> @@ -260,6 +265,12 @@ static int parse_fsopt_token(char *c, void *private)
>> case Opt_noino32:
>> fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
>> break;
>> + case Opt_fscache:
>> + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
>> + break;
>> + case Opt_nofscache:
>> + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
>> + break;
>> default:
>> BUG_ON(token);
>> }
>> @@ -422,6 +433,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
>> seq_puts(m, ",dcache");
>> else
>> seq_puts(m, ",nodcache");
>> + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
>> + seq_puts(m, ",fsc");
>> + else
>> + seq_puts(m, ",nofsc");
>>
>> if (fsopt->wsize)
>> seq_printf(m, ",wsize=%d", fsopt->wsize);
>> @@ -530,6 +545,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>> if (!fsc->wb_pagevec_pool)
>> goto fail_trunc_wq;
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE))
>> + ceph_fscache_register_fsid_cookie(fsc);
>> +#endif
>> +
>> /* caps */
>> fsc->min_caps = fsopt->max_readdir;
>>
>> @@ -554,6 +574,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>> {
>> dout("destroy_fs_client %p\n", fsc);
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + ceph_fscache_unregister_fsid_cookie(fsc);
>> +#endif
>> +
>> destroy_workqueue(fsc->wb_wq);
>> destroy_workqueue(fsc->pg_inv_wq);
>> destroy_workqueue(fsc->trunc_wq);
>> @@ -588,6 +612,8 @@ static void ceph_inode_init_once(void *foo)
>>
>> static int __init init_caches(void)
>> {
>> + int error = -ENOMEM;
>> +
>> ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
>> sizeof(struct ceph_inode_info),
>> __alignof__(struct ceph_inode_info),
>> @@ -611,15 +637,19 @@ static int __init init_caches(void)
>> if (ceph_file_cachep == NULL)
>> goto bad_file;
>>
>> - return 0;
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + if ((error = fscache_register_netfs(&ceph_cache_netfs)))
>> + goto bad_file;
>> +#endif
>>
>> + return 0;
>> bad_file:
>> kmem_cache_destroy(ceph_dentry_cachep);
>> bad_dentry:
>> kmem_cache_destroy(ceph_cap_cachep);
>> bad_cap:
>> kmem_cache_destroy(ceph_inode_cachep);
>> - return -ENOMEM;
>> + return error;
>> }
>>
>> static void destroy_caches(void)
>> @@ -629,10 +659,15 @@ static void destroy_caches(void)
>> * destroy cache.
>> */
>> rcu_barrier();
>> +
>> kmem_cache_destroy(ceph_inode_cachep);
>> kmem_cache_destroy(ceph_cap_cachep);
>> kmem_cache_destroy(ceph_dentry_cachep);
>> kmem_cache_destroy(ceph_file_cachep);
>> +
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + fscache_unregister_netfs(&ceph_cache_netfs);
>> +#endif
>> }
>>
>>
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 7ccfdb4..5ddaad5 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -16,6 +16,10 @@
>>
>> #include <linux/ceph/libceph.h>
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> +#include <linux/fscache.h>
>> +#endif
>> +
>> /* f_type in struct statfs */
>> #define CEPH_SUPER_MAGIC 0x00c36400
>>
>> @@ -29,6 +33,7 @@
>> #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
>> #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
>> #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
>> +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
>>
>> #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
>>
>> @@ -90,6 +95,10 @@ struct ceph_fs_client {
>> struct dentry *debugfs_bdi;
>> struct dentry *debugfs_mdsc, *debugfs_mdsmap;
>> #endif
>> +
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + struct fscache_cookie *fscache;
>> +#endif
>> };
>>
>>
>> @@ -319,6 +328,10 @@ struct ceph_inode_info {
>>
>> struct work_struct i_vmtruncate_work;
>>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + struct fscache_cookie *fscache;
>> +#endif
>> +
>> struct inode vfs_inode; /* at end */
>> };
>>
>> --
>> 1.7.10.4
>>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-01 15:55 ` Milosz Tanski
@ 2013-07-02 19:14 ` David Howells
2013-07-02 19:20 ` Milosz Tanski
0 siblings, 1 reply; 20+ messages in thread
From: David Howells @ 2013-07-02 19:14 UTC (permalink / raw)
To: Milosz Tanski; +Cc: Yan, Zheng, ceph-devel, linux-cachefs, Sage Weil
Milosz Tanski <milosz@adfin.com> wrote:
> Now that 3.10 is out I hope that David Howell's fscache improvement
> branch will go into the next kernel.
It has just been pulled.
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-02 19:14 ` David Howells
@ 2013-07-02 19:20 ` Milosz Tanski
2013-07-02 19:39 ` David Howells
0 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-07-02 19:20 UTC (permalink / raw)
To: David Howells; +Cc: ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
David,
I just looked and saw that it's been pulled a couple hours ago.
Can I also trouble you into looking at my patches for Ceph for the
FSCache? We're using it (in production starting today actually); we're
not able to find any bugs with the current iteration. But it's always
nice to have an extra pair of eyes (esp from the maintainer of
FSCache).
Thanks for everything David,
-- Milosz
On Tue, Jul 2, 2013 at 3:14 PM, David Howells <dhowells@redhat.com> wrote:
> Milosz Tanski <milosz@adfin.com> wrote:
>
>> Now that 3.10 is out I hope that David Howell's fscache improvement
>> branch will go into the next kernel.
>
> It has just been pulled.
>
> David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-02 19:20 ` Milosz Tanski
@ 2013-07-02 19:39 ` David Howells
2013-07-02 19:56 ` Milosz Tanski
0 siblings, 1 reply; 20+ messages in thread
From: David Howells @ 2013-07-02 19:39 UTC (permalink / raw)
To: Milosz Tanski; +Cc: Yan, Zheng, ceph-devel, linux-cachefs, Sage Weil
Milosz Tanski <milosz@adfin.com> wrote:
> I just looked and saw that it's been pulled a couple hours ago.
>
> Can I also trouble you into looking at my patches for Ceph for the
> FSCache? We're using it (in production starting today actually); we're
> not able to find any bugs with the current iteration. But it's always
> nice to have an extra pair of eyes (esp from the maintainer of
> FSCache).
I'll have a look, but I can't run them as I don't have Ceph set up.
Can you point me at or email me your current patches, just so as I'm looking
at up to date stuff?
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-02 19:39 ` David Howells
@ 2013-07-02 19:56 ` Milosz Tanski
2013-07-02 20:49 ` David Howells
0 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-07-02 19:56 UTC (permalink / raw)
To: David Howells; +Cc: ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
David,
It hasn't changed since the patch I posted inline 4 days ago (same as
the one that went out to linux-cachefs mailing list). You can also get
the 'wip-ceph-fscache' branch from my gitrepo:
https://bitbucket.org/adfin/linux-fs.git. Finally, you can take a look
at the changes in the browser here:
https://bitbucket.org/adfin/linux-fs/commits/61908aea3ce3faf8eda7bdbbbd604068aa3283c9?at=wip-ceph-fscache
I understand you won't be able to run it, but it's still worth it to
see if I'm doing anything that feels wrong.
Thanks,
- Milosz
On Tue, Jul 2, 2013 at 3:39 PM, David Howells <dhowells@redhat.com> wrote:
> Milosz Tanski <milosz@adfin.com> wrote:
>
>> I just looked and saw that it's been pulled a couple hours ago.
>>
>> Can I also trouble you into looking at my patches for Ceph for the
>> FSCache? We're using it (in production starting today actually); we're
>> not able to find any bugs with the current iteration. But it's always
>> nice to have an extra pair of eyes (esp from the maintainer of
>> FSCache).
>
> I'll have a look, but I can't run them as I don't have Ceph set up.
>
> Can you point me at or email me your current patches, just so as I'm looking
> at up to date stuff?
>
> David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-02 19:56 ` Milosz Tanski
@ 2013-07-02 20:49 ` David Howells
2013-07-02 21:14 ` Milosz Tanski
0 siblings, 1 reply; 20+ messages in thread
From: David Howells @ 2013-07-02 20:49 UTC (permalink / raw)
To: Milosz Tanski; +Cc: Yan, Zheng, ceph-devel, linux-cachefs, Sage Weil
Milosz Tanski <milosz@adfin.com> wrote:
> You can also get the 'wip-ceph-fscache' branch from my gitrepo:
There's only one patch from you there. Shouldn't there be at least two as you
posted?
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-02 20:49 ` David Howells
@ 2013-07-02 21:14 ` Milosz Tanski
2013-07-02 23:40 ` David Howells
0 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-07-02 21:14 UTC (permalink / raw)
To: David Howells; +Cc: ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
I've combined them into one.
On Tue, Jul 2, 2013 at 4:49 PM, David Howells <dhowells@redhat.com> wrote:
> Milosz Tanski <milosz@adfin.com> wrote:
>
>> You can also get the 'wip-ceph-fscache' branch from my gitrepo:
>
> There's only one patch from you there. Shouldn't there be at least two as you
> posted?
>
> David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-02 21:14 ` Milosz Tanski
@ 2013-07-02 23:40 ` David Howells
2013-07-03 19:02 ` Milosz Tanski
0 siblings, 1 reply; 20+ messages in thread
From: David Howells @ 2013-07-02 23:40 UTC (permalink / raw)
To: Milosz Tanski; +Cc: dhowells, ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
Okay, my analysis of the patch:
Looking at your index structure, ceph has per-fsid indices under the top
"ceph" index and then per-inode indices under those? Are fsids universally
unique - or just for a given server/cell/whatever?
> +#ifdef CONFIG_CEPH_FSCACHE
> + if (PageFsCache(page))
> + ceph_invalidate_fscache_page(inode, page);
> +#endif
The PageFsCache() test here should be folded into the header file. You
actually have a redundant test:
+static inline void ceph_invalidate_fscache_page(struct inode *inode,
+ struct page *page)
+{
+ if (ceph_inode(inode)->fscache == NULL)
+ return;
+
+ if (PageFsCache(page))
+ return __ceph_invalidate_fscache_page(inode, page);
+}
> +#ifdef CONFIG_CEPH_FSCACHE
> + /* Can we release the page from the cache? */
> + if (PageFsCache(page) && ceph_release_fscache_page(page, g) == 0)
> + return 0;
> +#endif
The PageFsCache() test here is also redundant as fscache_maybe_release_page()
also does it - though I acknowledge it does it after doing the "is this cookie
valid" test. The reason I put that test first is that if CONFIG_FSCACHE=n
then the "is this cookie valid" test just evaluates immediately to false at
compile time.
> +void ceph_fscache_inode_get_cookie(struct inode *inode);
No such function?
> +static inline void ceph_fsxache_async_uncache_inode(struct inode* inode)
Misspelling?
> +static inline int ceph_readpage_from_fscache(struct inode *inode,
> + struct page *page)
> +{
> + if (ceph_inode(inode)->fscache == NULL)
> + return -ENOBUFS;
> +
> + return __ceph_readpage_from_fscache(inode, page);
> +}
Looking at functions like this, if you wrap:
ceph_inode(inode)->fscache
as, say:
struct fscache_cookie *ceph_inode_cookie(struct inode *inode)
{
#ifdef CONFIG_CEPH_FSCACHE
return ceph_inode(inode)->fscache;
#else
return NULL;
#endif
}
then you can get rid of a lot of cpp conditionals and just rely on gcc's
optimiser (see what I did in include/linux/fscache.h). Note that anything in
fs/ceph/cache.c wouldn't need to use this.
> static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> ...
> +#ifdef CONFIG_CEPH_FSCACHE
> + spin_lock(&ci->i_ceph_lock);
> + ceph_fscache_register_inode_cookie(fsc, ci);
> + spin_unlock(&ci->i_ceph_lock);
> +#endif
Ummm... ceph_fscache_register_inode_cookie() calls fscache_acquire_cookie()
which allocates with GFP_KERNEL and grabs fscache_addremove_sem, both of which
may sleep. You can't wrap this call in a spinlock. Do you use lockdep?
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-02 23:40 ` David Howells
@ 2013-07-03 19:02 ` Milosz Tanski
2013-07-03 23:52 ` David Howells
2013-07-08 14:46 ` Milosz Tanski
0 siblings, 2 replies; 20+ messages in thread
From: Milosz Tanski @ 2013-07-03 19:02 UTC (permalink / raw)
To: David Howells; +Cc: ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
David,
I took your suggestions and updated my wip branch (on bitbucket) with
a handful of fixes except for the locking around registering the
cookie. I'm not sure what's the correct thing to do there.
On Tue, Jul 2, 2013 at 7:40 PM, David Howells <dhowells@redhat.com> wrote:
>
> Okay, my analysis of the patch:
>
> Looking at your index structure, ceph has per-fsid indices under the top
> "ceph" index and then per-inode indices under those? Are fsids universally
> unique - or just for a given server/cell/whatever?
It's my understanding that it's a UUID assigned to the cluster.
>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + if (PageFsCache(page))
>> + ceph_invalidate_fscache_page(inode, page);
>> +#endif
>
> The PageFsCache() test here should be folded into the header file. You
> actually have a redundant test:
>
> +static inline void ceph_invalidate_fscache_page(struct inode *inode,
> + struct page *page)
> +{
> + if (ceph_inode(inode)->fscache == NULL)
> + return;
> +
> + if (PageFsCache(page))
> + return __ceph_invalidate_fscache_page(inode, page);
> +}
>
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + /* Can we release the page from the cache? */
>> + if (PageFsCache(page) && ceph_release_fscache_page(page, g) == 0)
>> + return 0;
>> +#endif
>
> The PageFsCache() test here is also redundant as fscache_maybe_release_page()
> also does it - though I acknowledge it does it after doing the "is this cookie
> valid" test. The reason I put that test first is that if CONFIG_FSCACHE=n
> then the "is this cookie valid" test just evaluates immediately to false at
> compile time.
>
>> +void ceph_fscache_inode_get_cookie(struct inode *inode);
>
> No such function?
Fixed.
>
>> +static inline void ceph_fsxache_async_uncache_inode(struct inode* inode)
>
> Misspelling?
Fixed.
>
>> +static inline int ceph_readpage_from_fscache(struct inode *inode,
>> + struct page *page)
>> +{
>> + if (ceph_inode(inode)->fscache == NULL)
>> + return -ENOBUFS;
>> +
>> + return __ceph_readpage_from_fscache(inode, page);
>> +}
>
> Looking at functions like this, if you wrap:
>
> ceph_inode(inode)->fscache
>
> as, say:
>
> struct fscache_cookie *ceph_inode_cookie(struct inode *inode)
> {
> #ifdef CONFIG_CEPH_FSCACHE
> return ceph_inode(inode)->fscache;
> #else
> return NULL;
> #endif
> }
>
> then you can get rid of a lot of cpp conditionals and just rely on gcc's
> optimiser (see what I did in include/linux/fscache.h). Note that anything in
> fs/ceph/cache.c wouldn't need to use this.
Per your suggestion I implemented this. This helped me to get rid of
some of the CONFIG_CEPH_FSCACHE checks (but not all).
>
>> static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>> ...
>> +#ifdef CONFIG_CEPH_FSCACHE
>> + spin_lock(&ci->i_ceph_lock);
>> + ceph_fscache_register_inode_cookie(fsc, ci);
>> + spin_unlock(&ci->i_ceph_lock);
>> +#endif
>
> Ummm... ceph_fscache_register_inode_cookie() calls fscache_acquire_cookie()
> which allocates with GFP_KERNEL and grabs fscache_addremove_sem, both of
> which may sleep. You can't wrap this call in a spinlock. Do you use lockdep?
I took a look at the NFS code, and it seems like there isn't any kind of
locking in nfs_open around acquiring the cookie. Looking back at the Ceph
code, it seems to lock extensively in the open code. I'm not sure if I
have to worry about open races.
>
> David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-03 19:02 ` Milosz Tanski
@ 2013-07-03 23:52 ` David Howells
2013-07-04 0:03 ` Sage Weil
2013-07-08 14:46 ` Milosz Tanski
1 sibling, 1 reply; 20+ messages in thread
From: David Howells @ 2013-07-03 23:52 UTC (permalink / raw)
To: Milosz Tanski; +Cc: dhowells, ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
Milosz Tanski <milosz@adfin.com> wrote:
> > Looking at your index structure, ceph has per-fsid indices under the top
> > "ceph" index and then per-inode indices under those? Are fsids universally
> > unique - or just for a given server/cell/whatever?
>
> It's my understanding that's a guuid assigned to the cluster.
What's the likelihood of two clusters throwing up the same guuid?
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-03 23:52 ` David Howells
@ 2013-07-04 0:03 ` Sage Weil
0 siblings, 0 replies; 20+ messages in thread
From: Sage Weil @ 2013-07-04 0:03 UTC (permalink / raw)
To: David Howells; +Cc: Milosz Tanski, ceph-devel, Yan, Zheng, linux-cachefs
On Thu, 4 Jul 2013, David Howells wrote:
> Milosz Tanski <milosz@adfin.com> wrote:
>
> > > Looking at your index structure, ceph has per-fsid indices under the top
> > > "ceph" index and then per-inode indices under those? Are fsids universally
> > > unique - or just for a given server/cell/whatever?
> >
> > It's my understanding that's a guuid assigned to the cluster.
>
> What's the likelyhood of two clusters throwing up the same guuid?
1 in 2^128. The uuid comes from /dev/random, courtesy of uuidgen.
sage
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-03 19:02 ` Milosz Tanski
2013-07-03 23:52 ` David Howells
@ 2013-07-08 14:46 ` Milosz Tanski
2013-07-09 10:33 ` David Howells
1 sibling, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-07-08 14:46 UTC (permalink / raw)
To: David Howells; +Cc: ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
David,
It looks like both the cifs and NFS code do not bother with any
locking around cifs_fscache_set_inode_cookie. Is there no concern over
multiple open() calls racing to create the cookie in those
filesystems?
Thanks,
-- Milosz
On Wed, Jul 3, 2013 at 3:02 PM, Milosz Tanski <milosz@adfin.com> wrote:
> David,
>
> I took your suggestions and updated my wip branch (on bitbucket) with
> a handful of fixes except for the locking around registering the
> cookie. I'm not sure what's the correct thing to do there.
>
> On Tue, Jul 2, 2013 at 7:40 PM, David Howells <dhowells@redhat.com> wrote:
>>
>> Okay, my analysis of the patch:
>>
>> Looking at your index structure, ceph has per-fsid indices under the top
>> "ceph" index and then per-inode indices under those? Are fsids universally
>> unique - or just for a given server/cell/whatever?
>
> It's my understanding that's a guuid assigned to the cluster.
>
>>
>>> +#ifdef CONFIG_CEPH_FSCACHE
>>> + if (PageFsCache(page))
>>> + ceph_invalidate_fscache_page(inode, page);
>>> +#endif
>>
>> The PageFsCache() test here should be folded into the header file. You
>> actually have a redundant test:
>>
>> +static inline void ceph_invalidate_fscache_page(struct inode *inode,
>> + struct page *page)
>> +{
>> + if (ceph_inode(inode)->fscache == NULL)
>> + return;
>> +
>> + if (PageFsCache(page))
>> + return __ceph_invalidate_fscache_page(inode, page);
>> +}
>>
>>> +#ifdef CONFIG_CEPH_FSCACHE
>>> + /* Can we release the page from the cache? */
>>> + if (PageFsCache(page) && ceph_release_fscache_page(page, g) == 0)
>>> + return 0;
>>> +#endif
>>
>> The PageFsCache() test here is also redundant as fscache_maybe_release_page()
>> also does it - though I acknowledge it does it after doing the "is this cookie
>> valid" test. The reason I put that test first is that if CONFIG_FSCACHE=n
>> then the "is this cookie valid" test just evaluates immediately to false at
>> compile time.
>>
>>> +void ceph_fscache_inode_get_cookie(struct inode *inode);
>>
>> No such function?
>
> Fixed.
>
>>
>>> +static inline void ceph_fsxache_async_uncache_inode(struct inode* inode)
>>
>> Misspelling?
>
> Fixed.
>
>>
>>> +static inline int ceph_readpage_from_fscache(struct inode *inode,
>>> + struct page *page)
>>> +{
>>> + if (ceph_inode(inode)->fscache == NULL)
>>> + return -ENOBUFS;
>>> +
>>> + return __ceph_readpage_from_fscache(inode, page);
>>> +}
>>
>> Looking at functions like this, if you wrap:
>>
>> ceph_inode(inode)->fscache
>>
>> as, say:
>>
>> struct fscache_cookie *ceph_inode_cookie(struct inode *inode)
>> {
>> #ifdef CONFIG_CEPH_FSCACHE
>> return ceph_inode(inode)->fscache;
>> #else
>> return NULL;
>> #endif
>> }
>>
>> then you can get rid of a lot of cpp conditionals and just rely on gcc's
>> optimiser (see what I did in include/linux/fscache.h). Note that anything in
>> fs/ceph/cache.c wouldn't need to use this.
>
> Per your suggestion I implemented this. This helped me to get rid of
> some of the CONFIG_CEPH_FSCACHE checks (but not all).
>
>>
>>> static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>>> ...
>>> +#ifdef CONFIG_CEPH_FSCACHE
>>> + spin_lock(&ci->i_ceph_lock);
>>> + ceph_fscache_register_inode_cookie(fsc, ci);
>>> + spin_unlock(&ci->i_ceph_lock);
>>> +#endif
>>
>> Ummm... ceph_fscache_register_inode_cookie() calls fscache_acquire_cookie()
>> which allocates GFP_KERNEL and grabs fscache_addremove_sem. You can't wrap
>> this call in a spinlock. Do you use lockdep?
>
> I took a look at the nfs code it seams like there isn't any kind of
> locking in nfs_open around acquiring of the cookie. Then looking back
> at Ceph code it seams like extensively locks in the open code. I'm not
> sure if I have open worry about open races.
>
>>
>> David
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] ceph: Add FScache support
2013-07-08 14:46 ` Milosz Tanski
@ 2013-07-09 10:33 ` David Howells
2013-07-09 12:26 ` Myklebust, Trond
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: David Howells @ 2013-07-09 10:33 UTC (permalink / raw)
To: Milosz Tanski, Trond.Myklebust
Cc: dhowells, ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
Milosz Tanski <milosz@adfin.com> wrote:
> It looks like both the cifs and NFS code do not bother with any
> locking around cifs_fscache_set_inode_cookie. Is there no concern over
> multiple open() calls racing to create the cookie in those
> filesystems?
Yeah... That's probably wrong. AFS obviates the need for special locking by
doing it in afs_iget().
Hmmm... I think I've just spotted what might be the cause of pages getting
marked PG_fscache whilst belonging to the allocator.
void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
{
if (NFS_FSCACHE(inode)) {
nfs_fscache_inode_lock(inode);
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
nfs_fscache_disable_inode_cookie(inode);
else
nfs_fscache_enable_inode_cookie(inode);
nfs_fscache_inode_unlock(inode);
}
}
can release the cookie whilst reads are in progress on it when an inode being
read suddenly changes to an inode being written. We need some sort of
synchronisation on that there.
David
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] ceph: Add FScache support
2013-07-09 10:33 ` David Howells
@ 2013-07-09 12:26 ` Myklebust, Trond
2013-07-09 12:46 ` Milosz Tanski
2013-07-09 14:07 ` Milosz Tanski
2 siblings, 0 replies; 20+ messages in thread
From: Myklebust, Trond @ 2013-07-09 12:26 UTC (permalink / raw)
To: David Howells
Cc: Milosz Tanski, ceph-devel, Sage Weil, Yan, Zheng,
linux-cachefs@redhat.com
On Tue, 2013-07-09 at 11:33 +0100, David Howells wrote:
> Milosz Tanski <milosz@adfin.com> wrote:
>
> > It looks like both the cifs and NFS code do not bother with any
> > locking around cifs_fscache_set_inode_cookie. Is there no concern over
> > multiple open() calls racing to create the cookie in those
> > filesystems?
>
> Yeah... That's probably wrong. AFS obviates the need for special locking by
> doing it in afs_iget().
>
> Hmmm... I think I've just spotted what might be the cause of pages getting
> marked PG_fscache whilst belonging to the allocator.
>
> void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
> {
> if (NFS_FSCACHE(inode)) {
> nfs_fscache_inode_lock(inode);
> if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
> nfs_fscache_disable_inode_cookie(inode);
> else
> nfs_fscache_enable_inode_cookie(inode);
> nfs_fscache_inode_unlock(inode);
> }
> }
>
> can release the cookie whilst reads are in progress on it when an inode being
> read suddenly changes to an inode being written. We need some sort of
> synchronisation on that there.
Change fscache_uncache_all_inode_pages to always take the page lock,
check the value of page->mapping, and wait for outstanding writes to
finish before trying to release PG_fscache?
--
Trond Myklebust
Linux NFS client maintainer
NetApp
Trond.Myklebust@netapp.com
www.netapp.com
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] ceph: Add FScache support
2013-07-09 10:33 ` David Howells
2013-07-09 12:26 ` Myklebust, Trond
@ 2013-07-09 12:46 ` Milosz Tanski
2013-07-09 13:04 ` Milosz Tanski
2013-07-09 14:07 ` Milosz Tanski
2 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-07-09 12:46 UTC (permalink / raw)
To: David Howells
Cc: Trond.Myklebust, ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
On Tue, Jul 9, 2013 at 6:33 AM, David Howells <dhowells@redhat.com> wrote:
> Milosz Tanski <milosz@adfin.com> wrote:
>
>> It looks like both the cifs and NFS code do not bother with any
>> locking around cifs_fscache_set_inode_cookie. Is there no concern over
>> multiple open() calls racing to create the cookie in those
>> filesystems?
>
> Yeah... That's probably wrong. AFS obviates the need for special locking by
> doing it in afs_iget().
I'm going to create a mutex around the enable cache / disable cache in
the Ceph code, since the spinlock is also wrong right now.
>
> Hmmm... I think I've just spotted what might be the cause of pages getting
> marked PG_fscache whilst belonging to the allocator.
>
> void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
> {
> if (NFS_FSCACHE(inode)) {
> nfs_fscache_inode_lock(inode);
> if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
> nfs_fscache_disable_inode_cookie(inode);
> else
> nfs_fscache_enable_inode_cookie(inode);
> nfs_fscache_inode_unlock(inode);
> }
> }
>
> can release the cookie whilst reads are in progress on it when an inode being
> read suddenly changes to an inode being written. We need some sort of
> synchronisation on that there.
So far my experience has been that synchronization is the trickiest
part of implementing fscache for Ceph. Things work great when
there's a simple shell accessing data and break down when you're doing
an HPC kind of workload.
>
> David
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] ceph: Add FScache support
2013-07-09 12:46 ` Milosz Tanski
@ 2013-07-09 13:04 ` Milosz Tanski
0 siblings, 0 replies; 20+ messages in thread
From: Milosz Tanski @ 2013-07-09 13:04 UTC (permalink / raw)
To: David Howells
Cc: Trond.Myklebust, ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
David,
On a somewhat related note, the header definition of
fscache_maybe_release_page doesn't seem quite correct. The comment
states it should return true if the page can be freed ... yet when
there's no cookie or PG_fscache is not set it returns false. This
doesn't seem right; in fact the NFS code wraps this bit in a second
PageFsCache check to make sure to return true.
661 /**
662 * fscache_maybe_release_page - Consider releasing a page,
cancelling a store
663 * @cookie: The cookie representing the cache object
664 * @page: The netfs page that is being cached.
665 * @gfp: The gfp flags passed to releasepage()
666 *
667 * Consider releasing a page for the vmscan algorithm, on behalf
of the netfs's
668 * releasepage() call. A storage request on the page may cancelled if it is
669 * not currently being processed.
670 *
671 * The function returns true if the page no longer has a storage
request on it,
672 * and false if a storage request is left in place. If true is
returned, the
673 * page will have been passed to fscache_uncache_page(). If false
is returned
674 * the page cannot be freed yet.
675 */
676 static inline
677 bool fscache_maybe_release_page(struct fscache_cookie *cookie,
678 struct page *page,
679 gfp_t gfp)
680 {
681 if (fscache_cookie_valid(cookie) && PageFsCache(page))
682 return __fscache_maybe_release_page(cookie, page, gfp);
683 return false;
684 }
On Tue, Jul 9, 2013 at 8:46 AM, Milosz Tanski <milosz@adfin.com> wrote:
> On Tue, Jul 9, 2013 at 6:33 AM, David Howells <dhowells@redhat.com> wrote:
>> Milosz Tanski <milosz@adfin.com> wrote:
>>
>>> It looks like both the cifs and NFS code do not bother with any
>>> locking around cifs_fscache_set_inode_cookie. Is there no concern over
>>> multiple open() calls racing to create the cookie in those
>>> filesystems?
>>
>> Yeah... That's probably wrong. AFS obviates the need for special locking by
>> doing it in afs_iget().
>
> I'm going to create a mutex around the enable cache / disable cache in
> the Ceph code since the spinlock is also right now.
>
>>
>> Hmmm... I think I've just spotted what might be the cause of pages getting
>> marked PG_fscache whilst belonging to the allocator.
>>
>> void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
>> {
>> if (NFS_FSCACHE(inode)) {
>> nfs_fscache_inode_lock(inode);
>> if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
>> nfs_fscache_disable_inode_cookie(inode);
>> else
>> nfs_fscache_enable_inode_cookie(inode);
>> nfs_fscache_inode_unlock(inode);
>> }
>> }
>>
>> can release the cookie whilst reads are in progress on it when an inode being
>> read suddenly changes to an inode being written. We need some sort of
>> synchronisation on that there.
>
> So far my experience has been that synchronization has been the most
> tricky part of implementing fscache for Ceph. Things work great when
> there's a simple shell accessing data and break down when you're doing
> a HPC kind of workload.
>
>>
>> David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ceph: Add FScache support
2013-07-09 10:33 ` David Howells
2013-07-09 12:26 ` Myklebust, Trond
2013-07-09 12:46 ` Milosz Tanski
@ 2013-07-09 14:07 ` Milosz Tanski
2013-07-09 17:43 ` Milosz Tanski
2 siblings, 1 reply; 20+ messages in thread
From: Milosz Tanski @ 2013-07-09 14:07 UTC (permalink / raw)
To: David Howells
Cc: Trond.Myklebust, ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
David,
I have another one for you... I thought maybe I caused this somehow, so
I spent like 8 hours tracking it down. But now I'm not so sure. I got
this error when running this "stress test":
root@pbnode-58019a39:/mnt/cluster/petabucket/prod# find -type f |
xargs -P 16 cat | pv > /dev/null
^C83GB 0:00:52 [ 0B/s] [
<=>
Any ideas where to look for the issue?
- Milosz
[4123376.919781] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000040
[4123376.919799]
[4123376.919800] CacheFiles: Error: Unexpected object collision
[4123376.919803] object: OBJ35bf
[4123376.919806] objstate=LOOK_UP_OBJECT fl=8 wbusy=2 ev=0[0]
[4123376.919807] ops=0 inp=0 exc=0
[4123376.919808] parent=ffff880ecafec180
[4123376.919809] cookie=ffff880bc9cb5f00 [pr=ffff880ece306000
nd=ffff880eaadeb648 fl=3]
[4123376.919820] key=[16] '1d86000000010000feffffffffffffff'
[4123376.919821] xobject: OBJ35bc
[4123376.919824] xobjstate=WAIT_FOR_CLEARANCE fl=30 wbusy=0 ev=0[10]
[4123376.919826] xops=1 inp=1 exc=0
[4123376.919826] xparent=ffff880ecafec180
[4123376.919828] xcookie=ffff880bc9cb5d80 [pr=ffff880ece306000 nd=
(null) fl=10]
[4123376.919861] IP: [<ffffffffa01aed62>]
__fscache_read_or_alloc_pages+0x122/0x400 [fscache]
[4123376.919882] PGD bc9eaa067 PUD d7a501067 PMD 0
[4123376.919892] Oops: 0000 [#1] SMP
[4123376.919901] Modules linked in: ceph libceph cachefiles
ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul
glue_helper aes_x86_64 microcode auth_rpcgss oid_registry nfsv4 nfs
fscache lockd sunrpc raid10 raid456 async_pq async_xor async_memcpy
async_raid6_recov async_tx raid1 multipath linear btrfs raid6_pq
lzo_compress raid0 xor zlib_deflate libcrc32c
[4123376.919966] CPU: 12 PID: 2523 Comm: cat Not tainted 3.10.0-virtual #22
[4123376.919973] task: ffff880ecbfdae40 ti: ffff880bc2cda000 task.ti:
ffff880bc2cda000
[4123376.919982] RIP: e030:[<ffffffffa01aed62>] [<ffffffffa01aed62>]
__fscache_read_or_alloc_pages+0x122/0x400 [fscache]
[4123376.919998] RSP: e02b:ffff880bc2cdb928 EFLAGS: 00010206
[4123376.920004] RAX: 0000000000000000 RBX: ffff880bc9cb5d80 RCX:
ffff880f1b68c000
[4123376.920012] RDX: ffff880bc9cb5d80 RSI: 0000000000000000 RDI:
ffff880bc9cb5d90
[4123376.920019] RBP: ffff880bc2cdb978 R08: f020000000000000 R09:
0ecac6e278100000
[4123376.920027] R10: f1173cd039a89e04 R11: 0000000000302135 R12:
ffff880bc9c79200
[4123376.920036] R13: ffff880ecac6e240 R14: ffff880bc9c79280 R15:
ffff880bc2cdba64
[4123376.920050] FS: 00007fc8b53bb700(0000) GS:ffff880f1b580000(0000)
knlGS:0000000000000000
[4123376.920059] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
[4123376.920065] CR2: 0000000000000040 CR3: 0000000bc9c61000 CR4:
0000000000002660
[4123376.920073] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[4123376.920081] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[4123376.920090] Stack:
[4123376.920095] ffff880eca660a00 0000000000000000 ffff880bc9cb5d90
ffff880bc2cdbb38
[4123376.920108] ffff880bc2cdb978 ffff880eaadebb20 ffff880bc2cdbb38
ffff880eaadeb9a8
[4123376.920121] ffff880bc2cdba64 0000000000000016 ffff880bc2cdb9c8
ffffffffa02f3a93
[4123376.920133] Call Trace:
[4123376.920147] [<ffffffffa02f3a93>]
__ceph_readpages_from_fscache+0x93/0xc0 [ceph]
[4123376.920160] [<ffffffffa02e1f35>] ceph_readpages+0x45/0x400 [ceph]
[4123376.920173] [<ffffffff8115868a>] ? alloc_pages_current+0xba/0x170
[4123376.920184] [<ffffffff81116547>] ? __page_cache_alloc+0xb7/0xd0
[4123376.920193] [<ffffffff81132939>] ? zone_statistics+0x99/0xc0
[4123376.920204] [<ffffffff811224f2>] __do_page_cache_readahead+0x1b2/0x260
[4123376.920216] [<ffffffff812a9ecb>] ? radix_tree_lookup+0xb/0x10
[4123376.920225] [<ffffffff81122aa9>] ondemand_readahead+0x189/0x230
[4123376.920233] [<ffffffff81122c31>] page_cache_sync_readahead+0x31/0x50
[4123376.920242] [<ffffffff81118188>] generic_file_aio_read+0x4a8/0x720
[4123376.920253] [<ffffffffa02e7d2b>] ? ceph_get_caps+0xcb/0x1e0 [ceph]
[4123376.920265] [<ffffffffa02dfca1>] ceph_aio_read+0x531/0x5c0 [ceph]
[4123376.920275] [<ffffffff81190f28>] ? __mnt_want_write+0x58/0x70
[4123376.920285] [<ffffffff8118bcf7>] ? file_update_time+0xa7/0x100
[4123376.920296] [<ffffffff81171a4a>] do_sync_read+0x7a/0xb0
[4123376.920304] [<ffffffff81172920>] vfs_read+0xb0/0x180
[4123376.920313] [<ffffffff81172ba2>] SyS_read+0x52/0xa0
[4123376.920323] [<ffffffff8155de99>] system_call_fastpath+0x16/0x1b
[4123376.920329] Code: 89 e7 e8 e2 d4 ff ff 85 c0 0f 88 f2 00 00 00 48
8b 7d c0 e8 41 66 3a e1 f0 ff 05 96 59 00 00 49 8b 56 18 49 8b 75 70
48 8b 42 48 <48> 8b 40 40 48 85 c0 74 06 48 8b 7a 58 ff d0 4c 89 e7 48
c7 c1
[4123376.920399] RIP [<ffffffffa01aed62>]
__fscache_read_or_alloc_pages+0x122/0x400 [fscache]
[4123376.920412] RSP <ffff880bc2cdb928>
[4123376.920416] CR2: 0000000000000040
[4123376.920422] ---[ end trace c2ccf1289b7570af ]---
[4123376.920425] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000020
[4123376.920431] IP: [<ffffffffa02d09b5>]
__cachefiles_printk_object+0xe9/0x161 [cachefiles]
[4123376.920436] PGD bca510067 PUD bdea09067 PMD 0
[4123376.920438] Oops: 0000 [#2] SMP
[4123376.920439] Modules linked in: ceph libceph cachefiles
ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul
glue_helper aes_x86_64 microcode auth_rpcgss oid_registry nfsv4 nfs
fscache lockd sunrpc raid10 raid456 async_pq async_xor async_memcpy
async_raid6_recov async_tx raid1 multipath linear btrfs raid6_pq
lzo_compress raid0 xor zlib_deflate libcrc32c
[4123376.920488] CPU: 4 PID: 1027 Comm: kworker/u32:2 Tainted: G
D 3.10.0-virtual #22
[4123376.920500] Workqueue: fscache_object fscache_object_work_func [fscache]
[4123376.920509] task: ffff880eca469720 ti: ffff880ecd21e000 task.ti:
ffff880ecd21e000
[4123376.920517] RIP: e030:[<ffffffffa02d09b5>] [<ffffffffa02d09b5>]
__cachefiles_printk_object+0xe9/0x161 [cachefiles]
[4123376.920531] RSP: e02b:ffff880ecd21fbc8 EFLAGS: 00010286
[4123376.920537] RAX: 0000000000000000 RBX: ffff880bc9cb5d80 RCX:
0000000000002322
[4123376.920546] RDX: 0000000000000200 RSI: ffff880ecaddc000 RDI:
0000000000000000
[4123376.920554] RBP: ffff880ecd21fc08 R08: 00000000430e73cc R09:
ffffffff81c228a4
[4123376.923888] [kworke] unexpected submission OP111e11 [OBJ35c1
LOOK_UP_OBJECT]
[4123376.923890] [kworke] objstate=LOOK_UP_OBJECT [LOOK_UP_OBJECT]
[4123376.923890] [kworke] objflags=a
[4123376.923891] [kworke] objevent=0 [0]
[4123376.923892] [kworke] ops=0 inp=0 exc=0
[4123376.923895] CPU: 1 PID: 99 Comm: kworker/1:1 Tainted: G D
3.10.0-virtual #22
[4123376.923905] Workqueue: ceph-msgr con_work [libceph]
[4123376.923909] ffff880ecc8c0700 ffff880eccccfb28 ffffffff8154f6cc
ffff880eccccfb78
[4123376.923911] ffffffffa01ac6b2 ffff880bc9c78268 ffff880ecff4c9b8
0000000000000000
[4123376.923913] ffff880bc9cb5c00 ffffea002eb8cd80 ffff880bc9c78200
ffff880ecc8c0700
[4123376.923913] Call Trace:
[4123376.923922] [<ffffffff8154f6cc>] dump_stack+0x19/0x1b
[4123376.923930] [<ffffffffa01ac6b2>] fscache_submit_op+0x492/0x500 [fscache]
[4123376.923934] [<ffffffffa01ad8a2>]
__fscache_write_page+0x302/0x4f0 [fscache]
[4123376.923941] [<ffffffffa02f3aee>]
__ceph_readpage_to_fscache+0x2e/0x60 [ceph]
[4123376.923945] [<ffffffffa02e2e5e>] finish_read+0x9e/0x160 [ceph]
[4123376.923950] [<ffffffffa02645fd>] dispatch+0x3fd/0x730 [libceph]
[4123376.923954] [<ffffffffa025d7e4>] con_work+0x1294/0x2150 [libceph]
[4123376.923960] [<ffffffff81555455>] ? _raw_spin_unlock_irq+0x15/0x20
[4123376.923964] [<ffffffff810855e3>] ? finish_task_switch+0x63/0xd0
[4123376.923968] [<ffffffff810736ed>] process_one_work+0x17d/0x490
[4123376.923970] [<ffffffff810747eb>] worker_thread+0x11b/0x370
[4123376.923972] [<ffffffff810746d0>] ? manage_workers.isra.21+0x2e0/0x2e0
[4123376.923975] [<ffffffff8107af88>] kthread+0xd8/0xe0
[4123376.923977] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
[4123376.923980] [<ffffffff8155ddec>] ret_from_fork+0x7c/0xb0
[4123376.923982] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
[4123377.122015] R10: 0000000000000068 R11: 000000000003b4d8 R12:
ffffffffa02d1fbb
[4123377.122024] R13: ffff880bc9c79230 R14: ffff880ecaddc000 R15:
0000000000000000
[4123377.122035] FS: 00007f14a1362700(0000) GS:ffff880f1b480000(0000)
knlGS:0000000000000000
[4123377.122044] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
[4123377.122050] CR2: 0000000000000020 CR3: 0000000bc9e6c000 CR4:
0000000000002660
[4123377.122057] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[4123377.122065] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[4123377.122072] Stack:
[4123377.122076] 0000000000000010 00000010c9c79350 ffff880ecd21fc38
ffff880bc9c79200
[4123377.122090] ffff880bc9c78900 ffff880ecaddc000 ffff880bc9c79200
ffff880bc9c79350
[4123377.122102] ffff880ecd21fc38 ffffffffa02d0a89 0000000000000606
ffff880bc9c78900
[4123377.122114] Call Trace:
[4123377.122128] [<ffffffffa02d0a89>]
cachefiles_printk_object+0x5c/0x5d3 [cachefiles]
[4123377.122140] [<ffffffffa02ccaf2>]
cachefiles_walk_to_object+0xbd2/0xda0 [cachefiles]
[4123377.122153] [<ffffffff810d51fe>] ? irq_get_irq_data+0xe/0x10
[4123377.122163] [<ffffffff81010640>] ? xen_smp_send_reschedule+0x10/0x20
[4123377.122174] [<ffffffff81087760>] ? resched_task+0x60/0x70
[4123377.122185] [<ffffffffa02ca40c>]
cachefiles_lookup_object+0x6c/0x180 [cachefiles]
[4123377.122200] [<ffffffffa01aa4a9>]
fscache_look_up_object+0xe9/0x370 [fscache]
[4123377.122212] [<ffffffffa01aac47>]
fscache_object_work_func+0x107/0x4b0 [fscache]
[4123377.122221] [<ffffffff810736ed>] process_one_work+0x17d/0x490
[4123377.122227] [<ffffffff810747eb>] worker_thread+0x11b/0x370
[4123377.122232] [<ffffffff810746d0>] ? manage_workers.isra.21+0x2e0/0x2e0
[4123377.122238] [<ffffffff8107af88>] kthread+0xd8/0xe0
[4123377.122243] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
[4123377.122251] [<ffffffff8155ddec>] ret_from_fork+0x7c/0xb0
[4123377.122256] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
[4123377.122260] Code: 4c 8b 4b 70 48 89 da 4c 89 e6 4c 8b 43 58 45 31
ff e8 15 b3 27 e1 4d 85 f6 74 2d 48 8b 43 48 ba 00 02 00 00 4c 89 f6
48 8b 7b 58 <ff> 50 20 44 0f b7 f8 eb 14 4c 89 e6 48 c7 c7 92 1f 2d a0
31 c0
[4123377.122307] RIP [<ffffffffa02d09b5>]
__cachefiles_printk_object+0xe9/0x161 [cachefiles]
[4123377.122315] RSP <ffff880ecd21fbc8>
[4123377.122318] CR2: 0000000000000020
[4123377.122349] ---[ end trace c2ccf1289b7570b0 ]---
[4123377.122415] BUG: unable to handle kernel paging request at ffffffffffffffc8
[4123377.122436] IP: [<ffffffff8107afd1>] kthread_data+0x11/0x20
[4123377.122442] PGD 1a0f067 PUD 1a11067 PMD 0
[4123377.122447] Oops: 0000 [#3] SMP
[4123377.122452] Modules linked in: ceph libceph cachefiles
ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul
glue_helper aes_x86_64 microcode auth_rpcgss oid_registry nfsv4 nfs
fscache lockd sunrpc raid10 raid456 async_pq async_xor async_memcpy
async_raid6_recov async_tx raid1 multipath linear btrfs raid6_pq
lzo_compress raid0 xor zlib_deflate libcrc32c
[4123377.122495] CPU: 4 PID: 1027 Comm: kworker/u32:2 Tainted: G
D 3.10.0-virtual #22
[4123377.122511] task: ffff880eca469720 ti: ffff880ecd21e000 task.ti:
ffff880ecd21e000
[4123377.122516] RIP: e030:[<ffffffff8107afd1>] [<ffffffff8107afd1>]
kthread_data+0x11/0x20
[4123377.122523] RSP: e02b:ffff880ecd21f7b0 EFLAGS: 00010046
[4123377.122527] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
ffffffff81c62a80
[4123377.321869] RDX: 0000000000000000 RSI: 0000000000000004 RDI:
ffff880eca469720
[4123377.321876] RBP: ffff880ecd21f7c8 R08: 000000000a046368 R09:
0000000000000000
[4123377.321881] R10: ffffffff81287372 R11: ffff880ecbe72a08 R12:
0000000000000004
[4123377.321888] R13: ffff880eca469ae8 R14: ffff880ed00e8000 R15:
ffff880eca4699f0
[4123377.321899] FS: 00007f14a1362700(0000) GS:ffff880f1b480000(0000)
knlGS:0000000000000000
[4123377.321907] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
[4123377.321913] CR2: 0000000000000028 CR3: 0000000bc9e6c000 CR4:
0000000000002660
[4123377.321921] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[4123377.321929] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[4123377.321937] Stack:
[4123377.321941] ffffffff81074c66 ffff880f1b494200 0000000000000004
ffff880ecd21f848
[4123377.321955] ffffffff81554048 ffff880ecd21f7f8 ffff880ecbde59d8
ffff880eca469720
[4123377.321968] ffff880ecd21ffd8 ffff880ecd21ffd8 ffff880ecd21ffd8
ffff880ecbde59d8
[4123377.321981] Call Trace:
[4123377.321990] [<ffffffff81074c66>] ? wq_worker_sleeping+0x16/0x90
[4123377.322002] [<ffffffff81554048>] __schedule+0x5c8/0x810
[4123377.322010] [<ffffffff81554349>] schedule+0x29/0x70
[4123377.322022] [<ffffffff8105ab20>] do_exit+0x6e0/0xa60
[4123377.322032] [<ffffffff8154bd02>] ? printk+0x4d/0x4f
[4123377.322043] [<ffffffff8100afc2>] ? check_events+0x12/0x20
[4123377.322054] [<ffffffff815566c0>] oops_end+0xb0/0xf0
[4123377.322063] [<ffffffff8154b5c0>] no_context+0x276/0x285
[4123377.322071] [<ffffffff8154b7a2>] __bad_area_nosemaphore+0x1d3/0x1f2
[4123377.322080] [<ffffffff8154b7d4>] bad_area_nosemaphore+0x13/0x15
[4123377.322088] [<ffffffff81559502>] __do_page_fault+0x3b2/0x550
[4123377.322100] [<ffffffff810196b9>] ? sched_clock+0x9/0x10
[4123377.322111] [<ffffffff8108cd8d>] ? sched_clock_cpu+0xbd/0x110
[4123377.322121] [<ffffffff8108ce14>] ? local_clock+0x34/0x40
[4123377.322131] [<ffffffff81555380>] ? _raw_spin_unlock_irqrestore+0x20/0x30
[4123377.322142] [<ffffffff81080336>] ? down_trylock+0x36/0x50
[4123377.322152] [<ffffffff815596ae>] do_page_fault+0xe/0x10
[4123377.322161] [<ffffffff81555b18>] page_fault+0x28/0x30
[4123377.322171] [<ffffffffa02d09b5>] ?
__cachefiles_printk_object+0xe9/0x161 [cachefiles]
[4123377.322183] [<ffffffffa02d09a0>] ?
__cachefiles_printk_object+0xd4/0x161 [cachefiles]
[4123377.322195] [<ffffffffa02d0a89>]
cachefiles_printk_object+0x5c/0x5d3 [cachefiles]
[4123377.322207] [<ffffffffa02ccaf2>]
cachefiles_walk_to_object+0xbd2/0xda0 [cachefiles]
[4123377.322218] [<ffffffff810d51fe>] ? irq_get_irq_data+0xe/0x10
[4123377.322228] [<ffffffff81010640>] ? xen_smp_send_reschedule+0x10/0x20
[4123377.322237] [<ffffffff81087760>] ? resched_task+0x60/0x70
[4123377.322247] [<ffffffffa02ca40c>]
cachefiles_lookup_object+0x6c/0x180 [cachefiles]
[4123377.322261] [<ffffffffa01aa4a9>]
fscache_look_up_object+0xe9/0x370 [fscache]
[4123377.322274] [<ffffffffa01aac47>]
fscache_object_work_func+0x107/0x4b0 [fscache]
[4123377.322285] [<ffffffff810736ed>] process_one_work+0x17d/0x490
[4123377.322293] [<ffffffff810747eb>] worker_thread+0x11b/0x370
[4123377.322303] [<ffffffff810746d0>] ? manage_workers.isra.21+0x2e0/0x2e0
[4123377.322311] [<ffffffff8107af88>] kthread+0xd8/0xe0
[4123377.322320] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
[4123377.322329] [<ffffffff8155ddec>] ret_from_fork+0x7c/0xb0
[4123377.322337] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
[4123377.322344] Code: 48 89 e5 5d 48 8b 40 b8 48 c1 e8 02 83 e0 01 c3
66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 48 8b 87 70 03 00 00 55
48 89 e5 5d <48> 8b 40 c8 c3 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66
90 55
[4123377.322414] RIP [<ffffffff8107afd1>] kthread_data+0x11/0x20
[4123377.322426] RSP <ffff880ecd21f7b0>
[4123377.322431] CR2: ffffffffffffffc8
[4123377.322437] ---[ end trace c2ccf1289b7570b1 ]---
[4123377.322443] Fixing recursive fault but reboot is needed!
On Tue, Jul 9, 2013 at 6:33 AM, David Howells <dhowells@redhat.com> wrote:
> Milosz Tanski <milosz@adfin.com> wrote:
>
>> It looks like both the cifs and NFS code do not bother with any
>> locking around cifs_fscache_set_inode_cookie. Is there no concern over
>> multiple open() calls racing to create the cookie in those
>> filesystems?
>
> Yeah... That's probably wrong. AFS obviates the need for special locking by
> doing it in afs_iget().
>
> Hmmm... I think I've just spotted what might be the cause of pages getting
> marked PG_fscache whilst belonging to the allocator.
>
> void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
> {
> if (NFS_FSCACHE(inode)) {
> nfs_fscache_inode_lock(inode);
> if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
> nfs_fscache_disable_inode_cookie(inode);
> else
> nfs_fscache_enable_inode_cookie(inode);
> nfs_fscache_inode_unlock(inode);
> }
> }
>
> can release the cookie whilst reads are in progress on it when an inode being
> read suddenly changes to an inode being written. We need some sort of
> synchronisation on that there.
>
> David
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] ceph: Add FScache support
2013-07-09 14:07 ` Milosz Tanski
@ 2013-07-09 17:43 ` Milosz Tanski
0 siblings, 0 replies; 20+ messages in thread
From: Milosz Tanski @ 2013-07-09 17:43 UTC (permalink / raw)
To: David Howells
Cc: Trond Myklebust, ceph-devel, Sage Weil, Yan, Zheng, linux-cachefs
It turns out that I did this to myself with the launder_page callback
that I implemented. Originally I thought I would be able to solve the
occasional page left over with the PG_fscache mark.
- Milosz
On Tue, Jul 9, 2013 at 10:07 AM, Milosz Tanski <milosz@adfin.com> wrote:
> David,
>
> I have another one for you... I thought maybe I caused this somehow so
> I spent like 8 hours tracking it down. But now I'm not so sure. I got
> this error when running this "stress test":
>
> root@pbnode-58019a39:/mnt/cluster/petabucket/prod# find -type f |
> xargs -P 16 cat | pv > /dev/null
> ^C83GB 0:00:52 [ 0B/s] [
> <=>
>
> Any ideas where to look for the issue?
>
> - Milosz
>
> [4123376.919781] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000040
> [4123376.919799]
> [4123376.919800] CacheFiles: Error: Unexpected object collision
> [4123376.919803] object: OBJ35bf
> [4123376.919806] objstate=LOOK_UP_OBJECT fl=8 wbusy=2 ev=0[0]
> [4123376.919807] ops=0 inp=0 exc=0
> [4123376.919808] parent=ffff880ecafec180
> [4123376.919809] cookie=ffff880bc9cb5f00 [pr=ffff880ece306000
> nd=ffff880eaadeb648 fl=3]
> [4123376.919820] key=[16] '1d86000000010000feffffffffffffff'
> [4123376.919821] xobject: OBJ35bc
> [4123376.919824] xobjstate=WAIT_FOR_CLEARANCE fl=30 wbusy=0 ev=0[10]
> [4123376.919826] xops=1 inp=1 exc=0
> [4123376.919826] xparent=ffff880ecafec180
> [4123376.919828] xcookie=ffff880bc9cb5d80 [pr=ffff880ece306000 nd=
> (null) fl=10]
> [4123376.919861] IP: [<ffffffffa01aed62>]
> __fscache_read_or_alloc_pages+0x122/0x400 [fscache]
> [4123376.919882] PGD bc9eaa067 PUD d7a501067 PMD 0
> [4123376.919892] Oops: 0000 [#1] SMP
> [4123376.919901] Modules linked in: ceph libceph cachefiles
> ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul
> glue_helper aes_x86_64 microcode auth_rpcgss oid_registry nfsv4 nfs
> fscache lockd sunrpc raid10 raid456 async_pq async_xor async_memcpy
> async_raid6_recov async_tx raid1 multipath linear btrfs raid6_pq
> lzo_compress raid0 xor zlib_deflate libcrc32c
> [4123376.919966] CPU: 12 PID: 2523 Comm: cat Not tainted 3.10.0-virtual #22
> [4123376.919973] task: ffff880ecbfdae40 ti: ffff880bc2cda000 task.ti:
> ffff880bc2cda000
> [4123376.919982] RIP: e030:[<ffffffffa01aed62>] [<ffffffffa01aed62>]
> __fscache_read_or_alloc_pages+0x122/0x400 [fscache]
> [4123376.919998] RSP: e02b:ffff880bc2cdb928 EFLAGS: 00010206
> [4123376.920004] RAX: 0000000000000000 RBX: ffff880bc9cb5d80 RCX:
> ffff880f1b68c000
> [4123376.920012] RDX: ffff880bc9cb5d80 RSI: 0000000000000000 RDI:
> ffff880bc9cb5d90
> [4123376.920019] RBP: ffff880bc2cdb978 R08: f020000000000000 R09:
> 0ecac6e278100000
> [4123376.920027] R10: f1173cd039a89e04 R11: 0000000000302135 R12:
> ffff880bc9c79200
> [4123376.920036] R13: ffff880ecac6e240 R14: ffff880bc9c79280 R15:
> ffff880bc2cdba64
> [4123376.920050] FS: 00007fc8b53bb700(0000) GS:ffff880f1b580000(0000)
> knlGS:0000000000000000
> [4123376.920059] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
> [4123376.920065] CR2: 0000000000000040 CR3: 0000000bc9c61000 CR4:
> 0000000000002660
> [4123376.920073] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [4123376.920081] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
> 0000000000000400
> [4123376.920090] Stack:
> [4123376.920095] ffff880eca660a00 0000000000000000 ffff880bc9cb5d90
> ffff880bc2cdbb38
> [4123376.920108] ffff880bc2cdb978 ffff880eaadebb20 ffff880bc2cdbb38
> ffff880eaadeb9a8
> [4123376.920121] ffff880bc2cdba64 0000000000000016 ffff880bc2cdb9c8
> ffffffffa02f3a93
> [4123376.920133] Call Trace:
> [4123376.920147] [<ffffffffa02f3a93>]
> __ceph_readpages_from_fscache+0x93/0xc0 [ceph]
> [4123376.920160] [<ffffffffa02e1f35>] ceph_readpages+0x45/0x400 [ceph]
> [4123376.920173] [<ffffffff8115868a>] ? alloc_pages_current+0xba/0x170
> [4123376.920184] [<ffffffff81116547>] ? __page_cache_alloc+0xb7/0xd0
> [4123376.920193] [<ffffffff81132939>] ? zone_statistics+0x99/0xc0
> [4123376.920204] [<ffffffff811224f2>] __do_page_cache_readahead+0x1b2/0x260
> [4123376.920216] [<ffffffff812a9ecb>] ? radix_tree_lookup+0xb/0x10
> [4123376.920225] [<ffffffff81122aa9>] ondemand_readahead+0x189/0x230
> [4123376.920233] [<ffffffff81122c31>] page_cache_sync_readahead+0x31/0x50
> [4123376.920242] [<ffffffff81118188>] generic_file_aio_read+0x4a8/0x720
> [4123376.920253] [<ffffffffa02e7d2b>] ? ceph_get_caps+0xcb/0x1e0 [ceph]
> [4123376.920265] [<ffffffffa02dfca1>] ceph_aio_read+0x531/0x5c0 [ceph]
> [4123376.920275] [<ffffffff81190f28>] ? __mnt_want_write+0x58/0x70
> [4123376.920285] [<ffffffff8118bcf7>] ? file_update_time+0xa7/0x100
> [4123376.920296] [<ffffffff81171a4a>] do_sync_read+0x7a/0xb0
> [4123376.920304] [<ffffffff81172920>] vfs_read+0xb0/0x180
> [4123376.920313] [<ffffffff81172ba2>] SyS_read+0x52/0xa0
> [4123376.920323] [<ffffffff8155de99>] system_call_fastpath+0x16/0x1b
> [4123376.920329] Code: 89 e7 e8 e2 d4 ff ff 85 c0 0f 88 f2 00 00 00 48
> 8b 7d c0 e8 41 66 3a e1 f0 ff 05 96 59 00 00 49 8b 56 18 49 8b 75 70
> 48 8b 42 48 <48> 8b 40 40 48 85 c0 74 06 48 8b 7a 58 ff d0 4c 89 e7 48
> c7 c1
> [4123376.920399] RIP [<ffffffffa01aed62>]
> __fscache_read_or_alloc_pages+0x122/0x400 [fscache]
> [4123376.920412] RSP <ffff880bc2cdb928>
> [4123376.920416] CR2: 0000000000000040
> [4123376.920422] ---[ end trace c2ccf1289b7570af ]---
> [4123376.920425] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000020
> [4123376.920431] IP: [<ffffffffa02d09b5>]
> __cachefiles_printk_object+0xe9/0x161 [cachefiles]
> [4123376.920436] PGD bca510067 PUD bdea09067 PMD 0
> [4123376.920438] Oops: 0000 [#2] SMP
> [4123376.920439] Modules linked in: ceph libceph cachefiles
> ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul
> glue_helper aes_x86_64 microcode auth_rpcgss oid_registry nfsv4 nfs
> fscache lockd sunrpc raid10 raid456 async_pq async_xor async_memcpy
> async_raid6_recov async_tx raid1 multipath linear btrfs raid6_pq
> lzo_compress raid0 xor zlib_deflate libcrc32c
> [4123376.920488] CPU: 4 PID: 1027 Comm: kworker/u32:2 Tainted: G
> D 3.10.0-virtual #22
> [4123376.920500] Workqueue: fscache_object fscache_object_work_func [fscache]
> [4123376.920509] task: ffff880eca469720 ti: ffff880ecd21e000 task.ti:
> ffff880ecd21e000
> [4123376.920517] RIP: e030:[<ffffffffa02d09b5>] [<ffffffffa02d09b5>]
> __cachefiles_printk_object+0xe9/0x161 [cachefiles]
> [4123376.920531] RSP: e02b:ffff880ecd21fbc8 EFLAGS: 00010286
> [4123376.920537] RAX: 0000000000000000 RBX: ffff880bc9cb5d80 RCX:
> 0000000000002322
> [4123376.920546] RDX: 0000000000000200 RSI: ffff880ecaddc000 RDI:
> 0000000000000000
> [4123376.920554] RBP: ffff880ecd21fc08 R08: 00000000430e73cc R09:
> ffffffff81c228a4
> [4123376.923888] [kworke] unexpected submission OP111e11 [OBJ35c1
> LOOK_UP_OBJECT]
> [4123376.923890] [kworke] objstate=LOOK_UP_OBJECT [LOOK_UP_OBJECT]
> [4123376.923890] [kworke] objflags=a
> [4123376.923891] [kworke] objevent=0 [0]
> [4123376.923892] [kworke] ops=0 inp=0 exc=0
> [4123376.923895] CPU: 1 PID: 99 Comm: kworker/1:1 Tainted: G D
> 3.10.0-virtual #22
> [4123376.923905] Workqueue: ceph-msgr con_work [libceph]
> [4123376.923909] ffff880ecc8c0700 ffff880eccccfb28 ffffffff8154f6cc
> ffff880eccccfb78
> [4123376.923911] ffffffffa01ac6b2 ffff880bc9c78268 ffff880ecff4c9b8
> 0000000000000000
> [4123376.923913] ffff880bc9cb5c00 ffffea002eb8cd80 ffff880bc9c78200
> ffff880ecc8c0700
> [4123376.923913] Call Trace:
> [4123376.923922] [<ffffffff8154f6cc>] dump_stack+0x19/0x1b
> [4123376.923930] [<ffffffffa01ac6b2>] fscache_submit_op+0x492/0x500 [fscache]
> [4123376.923934] [<ffffffffa01ad8a2>]
> __fscache_write_page+0x302/0x4f0 [fscache]
> [4123376.923941] [<ffffffffa02f3aee>]
> __ceph_readpage_to_fscache+0x2e/0x60 [ceph]
> [4123376.923945] [<ffffffffa02e2e5e>] finish_read+0x9e/0x160 [ceph]
> [4123376.923950] [<ffffffffa02645fd>] dispatch+0x3fd/0x730 [libceph]
> [4123376.923954] [<ffffffffa025d7e4>] con_work+0x1294/0x2150 [libceph]
> [4123376.923960] [<ffffffff81555455>] ? _raw_spin_unlock_irq+0x15/0x20
> [4123376.923964] [<ffffffff810855e3>] ? finish_task_switch+0x63/0xd0
> [4123376.923968] [<ffffffff810736ed>] process_one_work+0x17d/0x490
> [4123376.923970] [<ffffffff810747eb>] worker_thread+0x11b/0x370
> [4123376.923972] [<ffffffff810746d0>] ? manage_workers.isra.21+0x2e0/0x2e0
> [4123376.923975] [<ffffffff8107af88>] kthread+0xd8/0xe0
> [4123376.923977] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
> [4123376.923980] [<ffffffff8155ddec>] ret_from_fork+0x7c/0xb0
> [4123376.923982] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
> [4123377.122015] R10: 0000000000000068 R11: 000000000003b4d8 R12:
> ffffffffa02d1fbb
> [4123377.122024] R13: ffff880bc9c79230 R14: ffff880ecaddc000 R15:
> 0000000000000000
> [4123377.122035] FS: 00007f14a1362700(0000) GS:ffff880f1b480000(0000)
> knlGS:0000000000000000
> [4123377.122044] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
> [4123377.122050] CR2: 0000000000000020 CR3: 0000000bc9e6c000 CR4:
> 0000000000002660
> [4123377.122057] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [4123377.122065] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
> 0000000000000400
> [4123377.122072] Stack:
> [4123377.122076] 0000000000000010 00000010c9c79350 ffff880ecd21fc38
> ffff880bc9c79200
> [4123377.122090] ffff880bc9c78900 ffff880ecaddc000 ffff880bc9c79200
> ffff880bc9c79350
> [4123377.122102] ffff880ecd21fc38 ffffffffa02d0a89 0000000000000606
> ffff880bc9c78900
> [4123377.122114] Call Trace:
> [4123377.122128] [<ffffffffa02d0a89>]
> cachefiles_printk_object+0x5c/0x5d3 [cachefiles]
> [4123377.122140] [<ffffffffa02ccaf2>]
> cachefiles_walk_to_object+0xbd2/0xda0 [cachefiles]
> [4123377.122153] [<ffffffff810d51fe>] ? irq_get_irq_data+0xe/0x10
> [4123377.122163] [<ffffffff81010640>] ? xen_smp_send_reschedule+0x10/0x20
> [4123377.122174] [<ffffffff81087760>] ? resched_task+0x60/0x70
> [4123377.122185] [<ffffffffa02ca40c>]
> cachefiles_lookup_object+0x6c/0x180 [cachefiles]
> [4123377.122200] [<ffffffffa01aa4a9>]
> fscache_look_up_object+0xe9/0x370 [fscache]
> [4123377.122212] [<ffffffffa01aac47>]
> fscache_object_work_func+0x107/0x4b0 [fscache]
> [4123377.122221] [<ffffffff810736ed>] process_one_work+0x17d/0x490
> [4123377.122227] [<ffffffff810747eb>] worker_thread+0x11b/0x370
> [4123377.122232] [<ffffffff810746d0>] ? manage_workers.isra.21+0x2e0/0x2e0
> [4123377.122238] [<ffffffff8107af88>] kthread+0xd8/0xe0
> [4123377.122243] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
> [4123377.122251] [<ffffffff8155ddec>] ret_from_fork+0x7c/0xb0
> [4123377.122256] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
> [4123377.122260] Code: 4c 8b 4b 70 48 89 da 4c 89 e6 4c 8b 43 58 45 31
> ff e8 15 b3 27 e1 4d 85 f6 74 2d 48 8b 43 48 ba 00 02 00 00 4c 89 f6
> 48 8b 7b 58 <ff> 50 20 44 0f b7 f8 eb 14 4c 89 e6 48 c7 c7 92 1f 2d a0
> 31 c0
> [4123377.122307] RIP [<ffffffffa02d09b5>]
> __cachefiles_printk_object+0xe9/0x161 [cachefiles]
> [4123377.122315] RSP <ffff880ecd21fbc8>
> [4123377.122318] CR2: 0000000000000020
> [4123377.122349] ---[ end trace c2ccf1289b7570b0 ]---
> [4123377.122415] BUG: unable to handle kernel paging request at ffffffffffffffc8
> [4123377.122436] IP: [<ffffffff8107afd1>] kthread_data+0x11/0x20
> [4123377.122442] PGD 1a0f067 PUD 1a11067 PMD 0
> [4123377.122447] Oops: 0000 [#3] SMP
> [4123377.122452] Modules linked in: ceph libceph cachefiles
> ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul
> glue_helper aes_x86_64 microcode auth_rpcgss oid_registry nfsv4 nfs
> fscache lockd sunrpc raid10 raid456 async_pq async_xor async_memcpy
> async_raid6_recov async_tx raid1 multipath linear btrfs raid6_pq
> lzo_compress raid0 xor zlib_deflate libcrc32c
> [4123377.122495] CPU: 4 PID: 1027 Comm: kworker/u32:2 Tainted: G
> D 3.10.0-virtual #22
> [4123377.122511] task: ffff880eca469720 ti: ffff880ecd21e000 task.ti:
> ffff880ecd21e000
> [4123377.122516] RIP: e030:[<ffffffff8107afd1>] [<ffffffff8107afd1>]
> kthread_data+0x11/0x20
> [4123377.122523] RSP: e02b:ffff880ecd21f7b0 EFLAGS: 00010046
> [4123377.122527] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
> ffffffff81c62a80
> [4123377.321869] RDX: 0000000000000000 RSI: 0000000000000004 RDI:
> ffff880eca469720
> [4123377.321876] RBP: ffff880ecd21f7c8 R08: 000000000a046368 R09:
> 0000000000000000
> [4123377.321881] R10: ffffffff81287372 R11: ffff880ecbe72a08 R12:
> 0000000000000004
> [4123377.321888] R13: ffff880eca469ae8 R14: ffff880ed00e8000 R15:
> ffff880eca4699f0
> [4123377.321899] FS: 00007f14a1362700(0000) GS:ffff880f1b480000(0000)
> knlGS:0000000000000000
> [4123377.321907] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
> [4123377.321913] CR2: 0000000000000028 CR3: 0000000bc9e6c000 CR4:
> 0000000000002660
> [4123377.321921] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [4123377.321929] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
> 0000000000000400
> [4123377.321937] Stack:
> [4123377.321941] ffffffff81074c66 ffff880f1b494200 0000000000000004
> ffff880ecd21f848
> [4123377.321955] ffffffff81554048 ffff880ecd21f7f8 ffff880ecbde59d8
> ffff880eca469720
> [4123377.321968] ffff880ecd21ffd8 ffff880ecd21ffd8 ffff880ecd21ffd8
> ffff880ecbde59d8
> [4123377.321981] Call Trace:
> [4123377.321990] [<ffffffff81074c66>] ? wq_worker_sleeping+0x16/0x90
> [4123377.322002] [<ffffffff81554048>] __schedule+0x5c8/0x810
> [4123377.322010] [<ffffffff81554349>] schedule+0x29/0x70
> [4123377.322022] [<ffffffff8105ab20>] do_exit+0x6e0/0xa60
> [4123377.322032] [<ffffffff8154bd02>] ? printk+0x4d/0x4f
> [4123377.322043] [<ffffffff8100afc2>] ? check_events+0x12/0x20
> [4123377.322054] [<ffffffff815566c0>] oops_end+0xb0/0xf0
> [4123377.322063] [<ffffffff8154b5c0>] no_context+0x276/0x285
> [4123377.322071] [<ffffffff8154b7a2>] __bad_area_nosemaphore+0x1d3/0x1f2
> [4123377.322080] [<ffffffff8154b7d4>] bad_area_nosemaphore+0x13/0x15
> [4123377.322088] [<ffffffff81559502>] __do_page_fault+0x3b2/0x550
> [4123377.322100] [<ffffffff810196b9>] ? sched_clock+0x9/0x10
> [4123377.322111] [<ffffffff8108cd8d>] ? sched_clock_cpu+0xbd/0x110
> [4123377.322121] [<ffffffff8108ce14>] ? local_clock+0x34/0x40
> [4123377.322131] [<ffffffff81555380>] ? _raw_spin_unlock_irqrestore+0x20/0x30
> [4123377.322142] [<ffffffff81080336>] ? down_trylock+0x36/0x50
> [4123377.322152] [<ffffffff815596ae>] do_page_fault+0xe/0x10
> [4123377.322161] [<ffffffff81555b18>] page_fault+0x28/0x30
> [4123377.322171] [<ffffffffa02d09b5>] ?
> __cachefiles_printk_object+0xe9/0x161 [cachefiles]
> [4123377.322183] [<ffffffffa02d09a0>] ?
> __cachefiles_printk_object+0xd4/0x161 [cachefiles]
> [4123377.322195] [<ffffffffa02d0a89>]
> cachefiles_printk_object+0x5c/0x5d3 [cachefiles]
> [4123377.322207] [<ffffffffa02ccaf2>]
> cachefiles_walk_to_object+0xbd2/0xda0 [cachefiles]
> [4123377.322218] [<ffffffff810d51fe>] ? irq_get_irq_data+0xe/0x10
> [4123377.322228] [<ffffffff81010640>] ? xen_smp_send_reschedule+0x10/0x20
> [4123377.322237] [<ffffffff81087760>] ? resched_task+0x60/0x70
> [4123377.322247] [<ffffffffa02ca40c>]
> cachefiles_lookup_object+0x6c/0x180 [cachefiles]
> [4123377.322261] [<ffffffffa01aa4a9>]
> fscache_look_up_object+0xe9/0x370 [fscache]
> [4123377.322274] [<ffffffffa01aac47>]
> fscache_object_work_func+0x107/0x4b0 [fscache]
> [4123377.322285] [<ffffffff810736ed>] process_one_work+0x17d/0x490
> [4123377.322293] [<ffffffff810747eb>] worker_thread+0x11b/0x370
> [4123377.322303] [<ffffffff810746d0>] ? manage_workers.isra.21+0x2e0/0x2e0
> [4123377.322311] [<ffffffff8107af88>] kthread+0xd8/0xe0
> [4123377.322320] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
> [4123377.322329] [<ffffffff8155ddec>] ret_from_fork+0x7c/0xb0
> [4123377.322337] [<ffffffff8107aeb0>] ? flush_kthread_worker+0xe0/0xe0
> [4123377.322344] Code: 48 89 e5 5d 48 8b 40 b8 48 c1 e8 02 83 e0 01 c3
> 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 48 8b 87 70 03 00 00 55
> 48 89 e5 5d <48> 8b 40 c8 c3 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66
> 90 55
> [4123377.322414] RIP [<ffffffff8107afd1>] kthread_data+0x11/0x20
> [4123377.322426] RSP <ffff880ecd21f7b0>
> [4123377.322431] CR2: ffffffffffffffc8
> [4123377.322437] ---[ end trace c2ccf1289b7570b1 ]---
> [4123377.322443] Fixing recursive fault but reboot is needed!
>
> On Tue, Jul 9, 2013 at 6:33 AM, David Howells <dhowells@redhat.com> wrote:
>> Milosz Tanski <milosz@adfin.com> wrote:
>>
>>> It looks like both the cifs and NFS code do not bother with any
>>> locking around cifs_fscache_set_inode_cookie. Is there no concern over
>>> multiple open() calls racing to create the cookie in those
>>> filesystems?
>>
>> Yeah... That's probably wrong. AFS obviates the need for special locking by
>> doing it in afs_iget().
>>
>> Hmmm... I think I've just spotted what might be the cause of pages getting
>> marked PG_fscache whilst belonging to the allocator.
>>
>> void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
>> {
>> if (NFS_FSCACHE(inode)) {
>> nfs_fscache_inode_lock(inode);
>> if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
>> nfs_fscache_disable_inode_cookie(inode);
>> else
>> nfs_fscache_enable_inode_cookie(inode);
>> nfs_fscache_inode_unlock(inode);
>> }
>> }
>>
>> can release the cookie whilst reads are in progress on it when an inode being
>> read suddenly changes to an inode being written. We need some sort of
>> synchronisation on that there.
>>
>> David
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2013-07-09 17:43 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-06-29 3:52 [PATCH] ceph: Add FScache support Milosz Tanski
2013-06-29 3:58 ` Milosz Tanski
2013-07-01 15:55 ` Milosz Tanski
2013-07-02 19:14 ` David Howells
2013-07-02 19:20 ` Milosz Tanski
2013-07-02 19:39 ` David Howells
2013-07-02 19:56 ` Milosz Tanski
2013-07-02 20:49 ` David Howells
2013-07-02 21:14 ` Milosz Tanski
2013-07-02 23:40 ` David Howells
2013-07-03 19:02 ` Milosz Tanski
2013-07-03 23:52 ` David Howells
2013-07-04 0:03 ` Sage Weil
2013-07-08 14:46 ` Milosz Tanski
2013-07-09 10:33 ` David Howells
2013-07-09 12:26 ` Myklebust, Trond
2013-07-09 12:46 ` Milosz Tanski
2013-07-09 13:04 ` Milosz Tanski
2013-07-09 14:07 ` Milosz Tanski
2013-07-09 17:43 ` Milosz Tanski
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.