From: Pekka Enberg <penberg@cs.helsinki.fi>
To: Zach Brown <zab@zabbo.net>
Cc: Mark Fasheh <mark.fasheh@oracle.com>,
Christoph Hellwig <hch@infradead.org>,
David Teigland <teigland@redhat.com>,
Pekka Enberg <penberg@gmail.com>,
akpm@osdl.org, linux-kernel@vger.kernel.org,
linux-cluster@redhat.com, Andreas Dilger <adilger@clusterfs.com>
Subject: Re: GFS
Date: Thu, 11 Aug 2005 19:44:50 +0300 [thread overview]
Message-ID: <1123778691.24181.8.camel@localhost> (raw)
In-Reply-To: <42FB7DE5.2080506@zabbo.net>
On Thu, 2005-08-11 at 09:33 -0700, Zach Brown wrote:
> I don't think this patch is the way to go at all. It imposes an
> allocation and vma walking overhead for the vast majority of IOs that
> aren't interested. It doesn't look like it will get a consistent
> ordering when multiple file systems are concerned. It doesn't record
> the ranges of the mappings involved so Lustre can't properly use its
> range locks. And finally, it doesn't prohibit mapping operations for
> the duration of the IO -- the whole reason we ended up in this thread in
> the first place :)
Hmm. So how do you propose we get rid of the mandatory vma walk? I was
thinking of making iolock a config option so when you don't have any
filesystems that need it, it can go away. I have also optimized the
extra allocation away when there are no mmap'd files that require
locking.
As for the rest of your comments, I heartily agree with them and
hopefully some interested party will take care of them :-).
Pekka
Index: 2.6-mm/fs/iolock.c
===================================================================
--- /dev/null
+++ 2.6-mm/fs/iolock.c
@@ -0,0 +1,183 @@
+/*
+ * I/O locking for memory regions. Used by filesystems that need special
+ * locking for mmap'd files.
+ */
+
+#include <linux/iolock.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+/*
+ * TODO:
+ *
+ * - Deadlock when two nodes acquire iolocks in reverse order for two
+ * different filesystems. Solution: use rbtree in iolock_chain so we
+ * can walk iolocks in order. XXX: what order is stable for two nodes
+ * that don't know about each other?
+ */
+
+/*
+ * I/O lock contains all files that participate in locking a memory region
+ * in an address_space.
+ */
+struct iolock {
+	struct address_space *mapping;	/* mapping all ->files belong to */
+	unsigned long nr_files;		/* entries used in ->files */
+	struct file **files;		/* files mmap'd from ->mapping in the region */
+	struct list_head chain;		/* link in iolock_chain->list */
+};
+
+/* Head of the list of per-mapping iolocks collected for one I/O region. */
+struct iolock_chain {
+	struct list_head list;		/* struct iolock entries, linked via ->chain */
+};
+
+/*
+ * Allocate one iolock with room for up to @max_files file pointers.
+ * Returns NULL on allocation failure.
+ */
+static struct iolock *iolock_new(unsigned long max_files)
+{
+	struct iolock *iolock;
+
+	iolock = kzalloc(sizeof(*iolock), GFP_KERNEL);
+	if (!iolock)
+		return NULL;
+
+	iolock->files = kcalloc(max_files, sizeof(struct file *), GFP_KERNEL);
+	if (!iolock->files) {
+		kfree(iolock);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&iolock->chain);
+	return iolock;
+}
+
+/* Allocate an empty iolock chain; NULL on allocation failure. */
+static struct iolock_chain *iolock_chain_new(void)
+{
+	struct iolock_chain *chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+
+	if (!chain)
+		return NULL;
+	INIT_LIST_HEAD(&chain->list);
+	return chain;
+}
+
+/*
+ * Call ->iolock_acquire on every mapping in the chain.  Stops at the
+ * first failure and returns its error; 0 when all acquires succeed.
+ * On failure the caller is expected to unwind via iolock_release().
+ */
+static int iolock_chain_acquire(struct iolock_chain *chain)
+{
+	struct iolock *iolock;
+	int err = 0;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping->a_ops->iolock_acquire) {
+			err = iolock->mapping->a_ops->iolock_acquire(
+					iolock->files, iolock->nr_files);
+			/* Bail out on FAILURE; the original tested !err,
+			   aborting on success and looping on error. */
+			if (err)
+				break;
+		}
+	}
+	return err;
+}
+
+/*
+ * Find the iolock already collected for @mapping, or NULL if this
+ * mapping has not been seen yet.
+ */
+static struct iolock *iolock_lookup(struct iolock_chain *chain,
+				struct address_space *mapping)
+{
+	struct iolock *iolock;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping == mapping)
+			return iolock;
+	}
+	return NULL;
+}
+
+/**
+ * iolock_region - Lock memory region for file I/O.
+ * @buf: the buffer we want to lock.
+ * @size: size of the buffer.
+ *
+ * Returns a pointer to the iolock_chain or NULL to denote an empty chain;
+ * otherwise returns ERR_PTR().
+ */
+/**
+ * iolock_region - Lock memory region for file I/O.
+ * @buf: the buffer we want to lock.
+ * @size: size of the buffer.
+ *
+ * Walks the vmas covering [buf, buf+size) under mmap_sem, collects every
+ * mmap'd file whose mapping implements the iolock hooks, then acquires
+ * the locks.
+ *
+ * Returns a pointer to the iolock_chain or NULL to denote an empty chain;
+ * otherwise returns ERR_PTR().
+ */
+struct iolock_chain *iolock_region(const char __user *buf, size_t size)
+{
+	struct iolock_chain *ret = NULL;
+	int err = -ENOMEM;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start = (unsigned long)buf;
+	unsigned long end = start + size;
+	int max_files;
+
+	down_read(&mm->mmap_sem);
+	max_files = mm->map_count;
+
+	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
+		struct file *file;
+		struct address_space *mapping;
+		struct iolock *iolock;
+
+		if (end <= vma->vm_start)
+			break;
+
+		file = vma->vm_file;
+		if (!file)
+			continue;
+
+		mapping = file->f_mapping;
+		if (!mapping->a_ops->iolock_acquire ||
+				!mapping->a_ops->iolock_release)
+			continue;
+
+		/* Allocate chain lazily to avoid initialization overhead
+		   when we don't have any files that require iolock. */
+		if (!ret) {
+			ret = iolock_chain_new();
+			if (!ret)
+				goto error;
+		}
+
+		iolock = iolock_lookup(ret, mapping);
+		if (!iolock) {
+			iolock = iolock_new(max_files);
+			if (!iolock)
+				goto error;
+			iolock->mapping = mapping;
+			/* Link only a freshly allocated iolock; re-adding
+			   one found by iolock_lookup would corrupt the
+			   chain list. */
+			list_add(&iolock->chain, &ret->list);
+		}
+
+		iolock->files[iolock->nr_files++] = file;
+	}
+	/* ret == NULL means no iolock-aware mappings: nothing to acquire.
+	   Also note we jump to error on FAILURE only; the original tested
+	   !err, which released every lock and returned ERR_PTR(0) (== NULL)
+	   on the success path. */
+	if (ret) {
+		err = iolock_chain_acquire(ret);
+		if (err)
+			goto error;
+	}
+
+out:
+	up_read(&mm->mmap_sem);
+	return ret;
+
+error:
+	iolock_release(ret);
+	ret = ERR_PTR(err);
+	goto out;
+}
+
+/**
+ * iolock_release - Release file I/O locks for a memory region.
+ * @chain: The I/O lock chain to release. Passing NULL means no-op.
+ */
+/**
+ * iolock_release - Release file I/O locks for a memory region.
+ * @chain: The I/O lock chain to release. Passing NULL means no-op.
+ *
+ * Calls ->iolock_release on every mapping in the chain and frees all
+ * chain storage.
+ */
+void iolock_release(struct iolock_chain *chain)
+{
+	struct iolock *iolock;
+	struct iolock *next;
+
+	if (!chain)
+		return;
+
+	/* _safe variant: we kfree() the current entry while walking, so the
+	   plain list_for_each_entry would read freed memory to advance. */
+	list_for_each_entry_safe(iolock, next, &chain->list, chain) {
+		struct address_space *mapping = iolock->mapping;
+		if (mapping && mapping->a_ops->iolock_release)
+			mapping->a_ops->iolock_release(iolock->files,
+						       iolock->nr_files);
+		kfree(iolock->files);
+		kfree(iolock);
+	}
+	kfree(chain);
+}
Index: 2.6-mm/fs/read_write.c
===================================================================
--- 2.6-mm.orig/fs/read_write.c
+++ 2.6-mm/fs/read_write.c
@@ -14,6 +14,7 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/iolock.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -247,14 +248,21 @@ ssize_t vfs_read(struct file *file, char
if (!ret) {
ret = security_file_permission (file, MAY_READ);
if (!ret) {
+ struct iolock_chain * lock = iolock_region(buf, count);
+ if (IS_ERR(lock)) {
+ ret = PTR_ERR(lock);
+ goto out;
+ }
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else
ret = do_sync_read(file, buf, count, pos);
+ iolock_release(lock);
if (ret > 0) {
fsnotify_access(file->f_dentry);
current->rchar += ret;
}
+ out:
current->syscr++;
}
}
@@ -298,14 +306,21 @@ ssize_t vfs_write(struct file *file, con
if (!ret) {
ret = security_file_permission (file, MAY_WRITE);
if (!ret) {
+ struct iolock_chain * lock = iolock_region(buf, count);
+ if (IS_ERR(lock)) {
+ ret = PTR_ERR(lock);
+ goto out;
+ }
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
+ iolock_release(lock);
if (ret > 0) {
fsnotify_modify(file->f_dentry);
current->wchar += ret;
}
+ out:
current->syscw++;
}
}
Index: 2.6-mm/include/linux/iolock.h
===================================================================
--- /dev/null
+++ 2.6-mm/include/linux/iolock.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_IOLOCK_H
+#define __LINUX_IOLOCK_H
+
+#include <linux/kernel.h>
+
+struct iolock_chain;
+
+/* Lock every iolock-aware mapping covering the user buffer.
+   Returns NULL for an empty chain, or ERR_PTR() on failure. */
+extern struct iolock_chain *iolock_region(const char __user *, size_t);
+/* Release and free a chain obtained from iolock_region(); NULL is a no-op. */
+extern void iolock_release(struct iolock_chain *);
+
+#endif
Index: 2.6-mm/fs/Makefile
===================================================================
--- 2.6-mm.orig/fs/Makefile
+++ 2.6-mm/fs/Makefile
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
- ioprio.o
+ ioprio.o iolock.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_EPOLL) += eventpoll.o
Index: 2.6-mm/include/linux/fs.h
===================================================================
--- 2.6-mm.orig/include/linux/fs.h
+++ 2.6-mm/include/linux/fs.h
@@ -334,6 +334,8 @@ struct address_space_operations {
loff_t offset, unsigned long nr_segs);
struct page* (*get_xip_page)(struct address_space *, sector_t,
int);
+ int (*iolock_acquire)(struct file **, unsigned long);
+ void (*iolock_release)(struct file **, unsigned long);
};
struct backing_dev_info;
next prev parent reply other threads:[~2005-08-11 16:45 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-08-11 7:10 GFS Pekka J Enberg
2005-08-11 16:33 ` GFS Zach Brown
2005-08-11 16:35 ` GFS Christoph Hellwig
2005-08-11 16:39 ` GFS Zach Brown
2005-08-11 16:44 ` Pekka Enberg [this message]
-- strict thread matches above, loose matches on Subject: below --
2005-08-02 7:18 [PATCH 00/14] GFS David Teigland
2005-08-02 10:16 ` Pekka Enberg
2005-08-03 6:36 ` David Teigland
2005-08-08 14:14 ` GFS Pekka J Enberg
2005-08-08 18:32 ` GFS Zach Brown
2005-08-09 14:49 ` GFS Pekka Enberg
2005-08-09 17:17 ` GFS Zach Brown
2005-08-09 18:35 ` GFS Pekka J Enberg
2005-08-10 4:48 ` GFS Pekka J Enberg
2005-08-10 7:21 ` GFS Christoph Hellwig
2005-08-10 7:31 ` GFS Pekka J Enberg
2005-08-10 16:26 ` GFS Mark Fasheh
2005-08-10 16:57 ` GFS Pekka J Enberg
2005-08-10 18:21 ` GFS Mark Fasheh
2005-08-10 20:18 ` GFS Pekka J Enberg
2005-08-10 22:07 ` GFS Mark Fasheh
2005-08-11 4:41 ` GFS Pekka J Enberg
2005-08-10 5:59 ` GFS David Teigland
2005-08-10 6:06 ` GFS Pekka J Enberg
2005-08-03 6:44 ` [PATCH 00/14] GFS Pekka Enberg
2005-08-08 9:57 ` David Teigland
2005-08-08 10:00 ` GFS Pekka J Enberg
2005-08-08 10:18 ` GFS Pekka J Enberg
2005-08-08 10:56 ` GFS David Teigland
2005-08-08 10:57 ` GFS Pekka J Enberg
2005-08-08 11:39 ` GFS David Teigland
2005-08-08 10:34 ` GFS Pekka J Enberg
2005-08-09 14:55 ` GFS Pekka J Enberg
2005-08-10 7:40 ` GFS Pekka J Enberg
2005-08-10 7:43 ` GFS Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1123778691.24181.8.camel@localhost \
--to=penberg@cs.helsinki.fi \
--cc=adilger@clusterfs.com \
--cc=akpm@osdl.org \
--cc=hch@infradead.org \
--cc=linux-cluster@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mark.fasheh@oracle.com \
--cc=penberg@gmail.com \
--cc=teigland@redhat.com \
--cc=zab@zabbo.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox