All of lore.kernel.org
 help / color / mirror / Atom feed
From: Pekka Enberg <penberg@cs.helsinki.fi>
To: Zach Brown <zab@zabbo.net>
Cc: Mark Fasheh <mark.fasheh@oracle.com>,
	Christoph Hellwig <hch@infradead.org>,
	David Teigland <teigland@redhat.com>,
	Pekka Enberg <penberg@gmail.com>,
	akpm@osdl.org, linux-kernel@vger.kernel.org,
	linux-cluster@redhat.com, Andreas Dilger <adilger@clusterfs.com>
Subject: Re: GFS
Date: Thu, 11 Aug 2005 19:44:50 +0300	[thread overview]
Message-ID: <1123778691.24181.8.camel@localhost> (raw)
In-Reply-To: <42FB7DE5.2080506@zabbo.net>

On Thu, 2005-08-11 at 09:33 -0700, Zach Brown wrote:
> I don't think this patch is the way to go at all.  It imposes an
> allocation and vma walking overhead for the vast majority of IOs that
> aren't interested.  It doesn't look like it will get a consistent
> ordering when multiple file systems are concerned.  It doesn't record
> the ranges of the mappings involved so Lustre can't properly use its
> range locks.  And finally, it doesn't prohibit mapping operations for
> the duration of the IO -- the whole reason we ended up in this thread in
> the first place :)

Hmm. So how do you propose we get rid of the mandatory vma walk? I was
thinking of making iolock a config option so when you don't have any
filesystems that need it, it can go away. I have also optimized the
extra allocation away when there are none mmap'd files that require
locking.

As for the rest of your comments, I heartly agree with them and
hopefully some interested party will take care of them :-).

			Pekka

Index: 2.6-mm/fs/iolock.c
===================================================================
--- /dev/null
+++ 2.6-mm/fs/iolock.c
@@ -0,0 +1,183 @@
+/*
+ * I/O locking for memory regions. Used by filesystems that need special
+ * locking for mmap'd files.
+ */
+
+#include <linux/iolock.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+/*
+ * TODO:
+ *
+ *  - Deadlock when two nodes acquire iolocks in reverse order for two
+ *    different filesystems. Solution: use rbtree in iolock_chain so we
+ *    can walk iolocks in order. XXX: what order is stable for two nodes
+ *    that don't know about each other?
+ */
+
+/*
+ * I/O lock contains all files that participate in locking a memory region
+ * in an address_space.
+ */
+struct iolock {
+	struct address_space	*mapping;
+	unsigned long		nr_files;
+	struct file		**files;
+	struct list_head	chain;
+};
+
+struct iolock_chain {
+	struct list_head	list;
+};
+
+static struct iolock *iolock_new(unsigned long max_files)
+{
+	struct iolock *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		goto out;
+	ret->files = kcalloc(max_files, sizeof(struct file *), GFP_KERNEL);
+	if (!ret->files) {
+		kfree(ret);
+		ret = NULL;
+		goto out;
+	}
+	INIT_LIST_HEAD(&ret->chain);
+out:
+	return ret;
+}
+
+static struct iolock_chain *iolock_chain_new(void)
+{
+	struct iolock_chain * ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (ret) {
+		INIT_LIST_HEAD(&ret->list);
+	}
+	return ret;
+}
+
+static int iolock_chain_acquire(struct iolock_chain *chain)
+{
+	struct iolock * iolock;
+	int err = 0;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping->a_ops->iolock_acquire) {
+			err = iolock->mapping->a_ops->iolock_acquire(
+					iolock->files, iolock->nr_files);
+			if (!err)
+				goto error;
+		}
+	}
+error:
+	return err;
+}
+
+static struct iolock *iolock_lookup(struct iolock_chain *chain,
+				    struct address_space *mapping)
+{
+	struct iolock *ret = NULL;
+	struct iolock *iolock;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping == mapping) {
+			ret = iolock;
+			break;
+		}
+	}
+	return ret;
+}
+
+/**
+ * iolock_region - Lock memory region for file I/O.
+ * @buf: the buffer we want to lock.
+ * @size: size of the buffer.
+ *
+ * Returns a pointer to the iolock_chain or NULL to denote an empty chain;
+ * otherwise returns ERR_PTR().
+ */
+struct iolock_chain *iolock_region(const char __user *buf, size_t size)
+{
+	struct iolock_chain *ret = NULL;
+	int err = -ENOMEM;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start = (unsigned long)buf;
+	unsigned long end = start + size;
+	int max_files;
+
+	down_read(&mm->mmap_sem);
+	max_files = mm->map_count;
+
+	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
+		struct file *file;
+		struct address_space *mapping;
+		struct iolock *iolock;
+
+		if (end <= vma->vm_start)
+			break;
+
+		file = vma->vm_file;
+		if (!file)
+			continue;
+
+		mapping = file->f_mapping;
+		if (!mapping->a_ops->iolock_acquire ||
+		    !mapping->a_ops->iolock_release)
+			continue;
+
+		/* Allocate chain lazily to avoid initialization overhead
+		   when we don't have any files that require iolock.  */
+		if (!ret) {
+			ret = iolock_chain_new();
+			if (!ret)
+				goto error;
+		}
+
+		iolock = iolock_lookup(ret, mapping);
+		if (!iolock) {
+			iolock = iolock_new(max_files);
+			if (!iolock)
+				goto error;
+			iolock->mapping = mapping;
+		}
+
+		iolock->files[iolock->nr_files++] = file;
+		list_add(&iolock->chain, &ret->list);
+	}
+	err = iolock_chain_acquire(ret);
+	if (!err)
+		goto error;
+
+out:
+	up_read(&mm->mmap_sem);
+	return ret;
+
+error:
+	iolock_release(ret);
+	ret = ERR_PTR(err);
+	goto out;
+}
+
+/**
+ * iolock_release - Release file I/O locks for a memory region.
+ * @chain: The I/O lock chain to release. Passing NULL means no-op.
+ */
+void iolock_release(struct iolock_chain *chain)
+{
+	struct iolock *iolock;
+
+	if (!chain)
+		return;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		struct address_space *mapping = iolock->mapping;
+		if (mapping && mapping->a_ops->iolock_release)
+			mapping->a_ops->iolock_release(iolock->files, iolock->nr_files);
+		kfree(iolock->files);
+		kfree(iolock);
+	}
+	kfree(chain);
+}
Index: 2.6-mm/fs/read_write.c
===================================================================
--- 2.6-mm.orig/fs/read_write.c
+++ 2.6-mm/fs/read_write.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/iolock.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -247,14 +248,21 @@ ssize_t vfs_read(struct file *file, char
 	if (!ret) {
 		ret = security_file_permission (file, MAY_READ);
 		if (!ret) {
+			struct iolock_chain * lock = iolock_region(buf, count);
+			if (IS_ERR(lock)) {
+				ret = PTR_ERR(lock);
+				goto out;
+			}
 			if (file->f_op->read)
 				ret = file->f_op->read(file, buf, count, pos);
 			else
 				ret = do_sync_read(file, buf, count, pos);
+			iolock_release(lock);
 			if (ret > 0) {
 				fsnotify_access(file->f_dentry);
 				current->rchar += ret;
 			}
+  out:
 			current->syscr++;
 		}
 	}
@@ -298,14 +306,21 @@ ssize_t vfs_write(struct file *file, con
 	if (!ret) {
 		ret = security_file_permission (file, MAY_WRITE);
 		if (!ret) {
+			struct iolock_chain * lock = iolock_region(buf, count);
+			if (IS_ERR(lock)) {
+				ret = PTR_ERR(lock);
+				goto out;
+			}
 			if (file->f_op->write)
 				ret = file->f_op->write(file, buf, count, pos);
 			else
 				ret = do_sync_write(file, buf, count, pos);
+			iolock_release(lock);
 			if (ret > 0) {
 				fsnotify_modify(file->f_dentry);
 				current->wchar += ret;
 			}
+  out:
 			current->syscw++;
 		}
 	}
Index: 2.6-mm/include/linux/iolock.h
===================================================================
--- /dev/null
+++ 2.6-mm/include/linux/iolock.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_IOLOCK_H
+#define __LINUX_IOLOCK_H
+
+#include <linux/kernel.h>
+
+struct iolock_chain;
+
+extern struct iolock_chain *iolock_region(const char __user *, size_t);
+extern void iolock_release(struct iolock_chain *);
+
+#endif
Index: 2.6-mm/fs/Makefile
===================================================================
--- 2.6-mm.orig/fs/Makefile
+++ 2.6-mm/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o
+		ioprio.o iolock.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
Index: 2.6-mm/include/linux/fs.h
===================================================================
--- 2.6-mm.orig/include/linux/fs.h
+++ 2.6-mm/include/linux/fs.h
@@ -334,6 +334,8 @@ struct address_space_operations {
 			loff_t offset, unsigned long nr_segs);
 	struct page* (*get_xip_page)(struct address_space *, sector_t,
 			int);
+	int (*iolock_acquire)(struct file **, unsigned long);
+	void (*iolock_release)(struct file **, unsigned long);
 };
 
 struct backing_dev_info;



  parent reply	other threads:[~2005-08-11 16:45 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-08-11  7:10 GFS Pekka J Enberg
2005-08-11 16:33 ` GFS Zach Brown
2005-08-11 16:35   ` GFS Christoph Hellwig
2005-08-11 16:39     ` GFS Zach Brown
2005-08-11 16:44   ` Pekka Enberg [this message]
  -- strict thread matches above, loose matches on Subject: below --
2005-08-02  7:18 [PATCH 00/14] GFS David Teigland
2005-08-02 10:16 ` Pekka Enberg
2005-08-03  6:36   ` David Teigland
2005-08-08 14:14     ` GFS Pekka J Enberg
2005-08-08 18:32       ` GFS Zach Brown
2005-08-09 14:49         ` GFS Pekka Enberg
2005-08-09 17:17           ` GFS Zach Brown
2005-08-09 18:35             ` GFS Pekka J Enberg
2005-08-10  4:48             ` GFS Pekka J Enberg
2005-08-10  7:21           ` GFS Christoph Hellwig
2005-08-10  7:31             ` GFS Pekka J Enberg
2005-08-10 16:26               ` GFS Mark Fasheh
2005-08-10 16:57                 ` GFS Pekka J Enberg
2005-08-10 18:21                   ` GFS Mark Fasheh
2005-08-10 20:18                     ` GFS Pekka J Enberg
2005-08-10 22:07                       ` GFS Mark Fasheh
2005-08-11  4:41                         ` GFS Pekka J Enberg
2005-08-10  5:59       ` GFS David Teigland
2005-08-10  6:06         ` GFS Pekka J Enberg
2005-08-03  6:44 ` [PATCH 00/14] GFS Pekka Enberg
2005-08-08  9:57   ` David Teigland
2005-08-08 10:00     ` GFS Pekka J Enberg
2005-08-08 10:18     ` GFS Pekka J Enberg
2005-08-08 10:56       ` GFS David Teigland
2005-08-08 10:57         ` GFS Pekka J Enberg
2005-08-08 11:39           ` GFS David Teigland
2005-08-08 10:34     ` GFS Pekka J Enberg
2005-08-09 14:55     ` GFS Pekka J Enberg
2005-08-10  7:40     ` GFS Pekka J Enberg
2005-08-10  7:43       ` GFS Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1123778691.24181.8.camel@localhost \
    --to=penberg@cs.helsinki.fi \
    --cc=adilger@clusterfs.com \
    --cc=akpm@osdl.org \
    --cc=hch@infradead.org \
    --cc=linux-cluster@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mark.fasheh@oracle.com \
    --cc=penberg@gmail.com \
    --cc=teigland@redhat.com \
    --cc=zab@zabbo.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.