* Re: GFS
@ 2005-08-11 7:10 Pekka J Enberg
2005-08-11 16:33 ` GFS Zach Brown
0 siblings, 1 reply; 30+ messages in thread
From: Pekka J Enberg @ 2005-08-11 7:10 UTC (permalink / raw)
To: Mark Fasheh
Cc: Christoph Hellwig, Zach Brown, David Teigland, Pekka Enberg, akpm,
linux-kernel, linux-cluster
Hi Mark,
On Thu, 11 Aug 2005, Pekka J Enberg wrote:
> Reading and writing from other filesystems to a GFS2 mmap'd file
> does not walk the vmas. Therefore, data consistency guarantees
> are different:
What I meant was that, if a filesystem requires vma walks, we need to do
it VFS level with something like the following patch. With this, your
filesystem would implement a_ops->iolock_acquire that sorts the locks
and takes them all. In case of GFS2, this would replace walk_vm().
Thoughts?
Pekka
[PATCH] vfs: iolock
This patch introduces iolock which can be used by filesystems that require
special locking when accessing an mmap'd region.
Unfinished and untested.
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
fs/Makefile | 2 -
fs/iolock.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++
fs/read_write.c | 15 ++++++++
include/linux/fs.h | 2 +
include/linux/iolock.h | 11 ++++++
5 files changed, 117 insertions(+), 1 deletion(-)
Index: 2.6-mm/fs/iolock.c
===================================================================
--- /dev/null
+++ 2.6-mm/fs/iolock.c
@@ -0,0 +1,88 @@
+/*
+ * fs/iolock.c
+ *
+ * Derived from GFS2.
+ */
+
+#include <linux/iolock.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+/*
+ * I/O lock contains all files that participate in locking a memory region.
+ * It is used for filesystems that require special locks to access mmap'd
+ * memory.
+ */
+struct iolock {
+ struct address_space *mapping;
+ unsigned long nr_files;
+ struct file **files;
+};
+
+struct iolock *iolock_region(const char __user *buf, size_t size)
+{
+ int err = -ENOMEM;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long start = (unsigned long)buf;
+ unsigned long end = start + size;
+ struct iolock *ret;
+
+ ret = kcalloc(1, sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ down_read(&mm->mmap_sem);
+
+ ret->files = kcalloc(mm->map_count, sizeof(struct file*), GFP_KERNEL);
+ if (!ret->files)
+ goto error;
+
+ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
+ struct file *file;
+ struct address_space *mapping;
+
+ if (end <= vma->vm_start)
+ break;
+
+ file = vma->vm_file;
+ if (!file)
+ continue;
+
+ mapping = file->f_mapping;
+ if (!mapping->a_ops->iolock_acquire ||
+ !mapping->a_ops->iolock_release)
+ continue;
+
+ /* FIXME: This only works when one address_space participates
+ in the iolock. */
+ ret->mapping = mapping;
+ ret->files[ret->nr_files++] = file;
+ }
+out:
+ up_read(&mm->mmap_sem);
+
+ if (ret->mapping->a_ops->iolock_acquire) {
+ err = ret->mapping->a_ops->iolock_acquire(ret->files, ret->nr_files);
+ if (!err)
+ goto error;
+ }
+
+ return ret;
+
+error:
+ iolock_release(ret);
+ ret = ERR_PTR(err);
+ goto out;
+}
+
+void iolock_release(struct iolock *iolock)
+{
+ struct address_space *mapping = iolock->mapping;
+ if (mapping && mapping->a_ops->iolock_release)
+ mapping->a_ops->iolock_release(iolock->files, iolock->nr_files);
+ kfree(iolock->files);
+ kfree(iolock);
+}
Index: 2.6-mm/fs/read_write.c
===================================================================
--- 2.6-mm.orig/fs/read_write.c
+++ 2.6-mm/fs/read_write.c
@@ -14,6 +14,7 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/iolock.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -247,14 +248,21 @@ ssize_t vfs_read(struct file *file, char
if (!ret) {
ret = security_file_permission (file, MAY_READ);
if (!ret) {
+ struct iolock * iolock = iolock_region(buf, count);
+ if (IS_ERR(iolock)) {
+ ret = PTR_ERR(iolock);
+ goto out;
+ }
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else
ret = do_sync_read(file, buf, count, pos);
+ iolock_release(iolock);
if (ret > 0) {
fsnotify_access(file->f_dentry);
current->rchar += ret;
}
+ out:
current->syscr++;
}
}
@@ -298,14 +306,21 @@ ssize_t vfs_write(struct file *file, con
if (!ret) {
ret = security_file_permission (file, MAY_WRITE);
if (!ret) {
+ struct iolock * iolock = iolock_region(buf, count);
+ if (IS_ERR(iolock)) {
+ ret = PTR_ERR(iolock);
+ goto out;
+ }
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
+ iolock_release(iolock);
if (ret > 0) {
fsnotify_modify(file->f_dentry);
current->wchar += ret;
}
+ out:
current->syscw++;
}
}
Index: 2.6-mm/include/linux/iolock.h
===================================================================
--- /dev/null
+++ 2.6-mm/include/linux/iolock.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_IOLOCK_H
+#define __LINUX_IOLOCK_H
+
+#include <linux/kernel.h>
+
+struct iolock;
+
+struct iolock *iolock_region(const char __user *buf, size_t count);
+void iolock_release(struct iolock *lock);
+
+#endif
Index: 2.6-mm/fs/Makefile
===================================================================
--- 2.6-mm.orig/fs/Makefile
+++ 2.6-mm/fs/Makefile
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
- ioprio.o
+ ioprio.o iolock.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_EPOLL) += eventpoll.o
Index: 2.6-mm/include/linux/fs.h
===================================================================
--- 2.6-mm.orig/include/linux/fs.h
+++ 2.6-mm/include/linux/fs.h
@@ -334,6 +334,8 @@ struct address_space_operations {
loff_t offset, unsigned long nr_segs);
struct page* (*get_xip_page)(struct address_space *, sector_t,
int);
+ int (*iolock_acquire)(struct file **, unsigned long);
+ void (*iolock_release)(struct file **, unsigned long);
};
struct backing_dev_info;
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: GFS 2005-08-11 7:10 GFS Pekka J Enberg @ 2005-08-11 16:33 ` Zach Brown 2005-08-11 16:35 ` GFS Christoph Hellwig 2005-08-11 16:44 ` GFS Pekka Enberg 0 siblings, 2 replies; 30+ messages in thread From: Zach Brown @ 2005-08-11 16:33 UTC (permalink / raw) To: Pekka J Enberg Cc: Mark Fasheh, Christoph Hellwig, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, Andreas Dilger > What I meant was that, if a filesystem requires vma walks, we need to do > it VFS level with something like the following patch. I don't think this patch is the way to go at all. It imposes an allocation and vma walking overhead for the vast majority of IOs that aren't interested. It doesn't look like it will get a consistent ordering when multiple file systems are concerned. It doesn't record the ranges of the mappings involved so Lustre can't properly use its range locks. And finally, it doesn't prohibit mapping operations for the duration of the IO -- the whole reason we ended up in this thread in the first place :) Christoph, would you be interested in looking at a more thorough patch if I threw one together? - z ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-11 16:33 ` GFS Zach Brown @ 2005-08-11 16:35 ` Christoph Hellwig 2005-08-11 16:39 ` GFS Zach Brown 2005-08-11 16:44 ` GFS Pekka Enberg 1 sibling, 1 reply; 30+ messages in thread From: Christoph Hellwig @ 2005-08-11 16:35 UTC (permalink / raw) To: Zach Brown Cc: Pekka J Enberg, Mark Fasheh, Christoph Hellwig, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, Andreas Dilger On Thu, Aug 11, 2005 at 09:33:41AM -0700, Zach Brown wrote: > ordering when multiple file systems are concerned. It doesn't record > the ranges of the mappings involved so Lustre can't properly use its > range locks. That doesn't matter. Please don't put in any effort for lustre special cases - they are unwilling to cooperate and they'll get what they deserve. > And finally, it doesn't prohibit mapping operations for > the duration of the IO -- the whole reason we ended up in this thread in > the first place :) > > Christoph, would you be interested in looking at a more thorough patch > if I threw one together? Sure, I'm not sure that'll happen in a timely fashion, though. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-11 16:35 ` GFS Christoph Hellwig @ 2005-08-11 16:39 ` Zach Brown 0 siblings, 0 replies; 30+ messages in thread From: Zach Brown @ 2005-08-11 16:39 UTC (permalink / raw) To: Christoph Hellwig Cc: Pekka J Enberg, Mark Fasheh, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, Andreas Dilger > That doesn't matter. Please don't put in any effort for lustre special > cases - they are unwilling to cooperate and they'll get what they deserve. Sure, we can add that extra functional layer in another pass. I thought I'd still bring it up, though, as OCFS2 is slated to care at some point in the not too distant future. > Sure, I'm not sure that'll happen in a timely fashion, though. Roger. - z ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-11 16:33 ` GFS Zach Brown 2005-08-11 16:35 ` GFS Christoph Hellwig @ 2005-08-11 16:44 ` Pekka Enberg 1 sibling, 0 replies; 30+ messages in thread From: Pekka Enberg @ 2005-08-11 16:44 UTC (permalink / raw) To: Zach Brown Cc: Mark Fasheh, Christoph Hellwig, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, Andreas Dilger On Thu, 2005-08-11 at 09:33 -0700, Zach Brown wrote: > I don't think this patch is the way to go at all. It imposes an > allocation and vma walking overhead for the vast majority of IOs that > aren't interested. It doesn't look like it will get a consistent > ordering when multiple file systems are concerned. It doesn't record > the ranges of the mappings involved so Lustre can't properly use its > range locks. And finally, it doesn't prohibit mapping operations for > the duration of the IO -- the whole reason we ended up in this thread in > the first place :) Hmm. So how do you propose we get rid of the mandatory vma walk? I was thinking of making iolock a config option so when you don't have any filesystems that need it, it can go away. I have also optimized the extra allocation away when there are none mmap'd files that require locking. As for the rest of your comments, I heartly agree with them and hopefully some interested party will take care of them :-). Pekka Index: 2.6-mm/fs/iolock.c =================================================================== --- /dev/null +++ 2.6-mm/fs/iolock.c @@ -0,0 +1,183 @@ +/* + * I/O locking for memory regions. Used by filesystems that need special + * locking for mmap'd files. + */ + +#include <linux/iolock.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/slab.h> + +/* + * TODO: + * + * - Deadlock when two nodes acquire iolocks in reverse order for two + * different filesystems. Solution: use rbtree in iolock_chain so we + * can walk iolocks in order. XXX: what order is stable for two nodes + * that don't know about each other? + */ + +/* + * I/O lock contains all files that participate in locking a memory region + * in an address_space. + */ +struct iolock { + struct address_space *mapping; + unsigned long nr_files; + struct file **files; + struct list_head chain; +}; + +struct iolock_chain { + struct list_head list; +}; + +static struct iolock *iolock_new(unsigned long max_files) +{ + struct iolock *ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + goto out; + ret->files = kcalloc(max_files, sizeof(struct file *), GFP_KERNEL); + if (!ret->files) { + kfree(ret); + ret = NULL; + goto out; + } + INIT_LIST_HEAD(&ret->chain); +out: + return ret; +} + +static struct iolock_chain *iolock_chain_new(void) +{ + struct iolock_chain * ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (ret) { + INIT_LIST_HEAD(&ret->list); + } + return ret; +} + +static int iolock_chain_acquire(struct iolock_chain *chain) +{ + struct iolock * iolock; + int err = 0; + + list_for_each_entry(iolock, &chain->list, chain) { + if (iolock->mapping->a_ops->iolock_acquire) { + err = iolock->mapping->a_ops->iolock_acquire( + iolock->files, iolock->nr_files); + if (!err) + goto error; + } + } +error: + return err; +} + +static struct iolock *iolock_lookup(struct iolock_chain *chain, + struct address_space *mapping) +{ + struct iolock *ret = NULL; + struct iolock *iolock; + + list_for_each_entry(iolock, &chain->list, chain) { + if (iolock->mapping == mapping) { + ret = iolock; + break; + } + } + return ret; +} + +/** + * iolock_region - Lock memory region for file I/O. + * @buf: the buffer we want to lock. + * @size: size of the buffer. + * + * Returns a pointer to the iolock_chain or NULL to denote an empty chain; + * otherwise returns ERR_PTR(). + */ +struct iolock_chain *iolock_region(const char __user *buf, size_t size) +{ + struct iolock_chain *ret = NULL; + int err = -ENOMEM; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start = (unsigned long)buf; + unsigned long end = start + size; + int max_files; + + down_read(&mm->mmap_sem); + max_files = mm->map_count; + + for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { + struct file *file; + struct address_space *mapping; + struct iolock *iolock; + + if (end <= vma->vm_start) + break; + + file = vma->vm_file; + if (!file) + continue; + + mapping = file->f_mapping; + if (!mapping->a_ops->iolock_acquire || + !mapping->a_ops->iolock_release) + continue; + + /* Allocate chain lazily to avoid initialization overhead + when we don't have any files that require iolock. */ + if (!ret) { + ret = iolock_chain_new(); + if (!ret) + goto error; + } + + iolock = iolock_lookup(ret, mapping); + if (!iolock) { + iolock = iolock_new(max_files); + if (!iolock) + goto error; + iolock->mapping = mapping; + } + + iolock->files[iolock->nr_files++] = file; + list_add(&iolock->chain, &ret->list); + } + err = iolock_chain_acquire(ret); + if (!err) + goto error; + +out: + up_read(&mm->mmap_sem); + return ret; + +error: + iolock_release(ret); + ret = ERR_PTR(err); + goto out; +} + +/** + * iolock_release - Release file I/O locks for a memory region. + * @chain: The I/O lock chain to release. Passing NULL means no-op. + */ +void iolock_release(struct iolock_chain *chain) +{ + struct iolock *iolock; + + if (!chain) + return; + + list_for_each_entry(iolock, &chain->list, chain) { + struct address_space *mapping = iolock->mapping; + if (mapping && mapping->a_ops->iolock_release) + mapping->a_ops->iolock_release(iolock->files, iolock->nr_files); + kfree(iolock->files); + kfree(iolock); + } + kfree(chain); +} Index: 2.6-mm/fs/read_write.c =================================================================== --- 2.6-mm.orig/fs/read_write.c +++ 2.6-mm/fs/read_write.c @@ -14,6 +14,7 @@ #include <linux/security.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/iolock.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -247,14 +248,21 @@ ssize_t vfs_read(struct file *file, char if (!ret) { ret = security_file_permission (file, MAY_READ); if (!ret) { + struct iolock_chain * lock = iolock_region(buf, count); + if (IS_ERR(lock)) { + ret = PTR_ERR(lock); + goto out; + } if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else ret = do_sync_read(file, buf, count, pos); + iolock_release(lock); if (ret > 0) { fsnotify_access(file->f_dentry); current->rchar += ret; } + out: current->syscr++; } } @@ -298,14 +306,21 @@ ssize_t vfs_write(struct file *file, con if (!ret) { ret = security_file_permission (file, MAY_WRITE); if (!ret) { + struct iolock_chain * lock = iolock_region(buf, count); + if (IS_ERR(lock)) { + ret = PTR_ERR(lock); + goto out; + } if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else ret = do_sync_write(file, buf, count, pos); + iolock_release(lock); if (ret > 0) { fsnotify_modify(file->f_dentry); current->wchar += ret; } + out: current->syscw++; } } Index: 2.6-mm/include/linux/iolock.h =================================================================== --- /dev/null +++ 2.6-mm/include/linux/iolock.h @@ -0,0 +1,11 @@ +#ifndef __LINUX_IOLOCK_H +#define __LINUX_IOLOCK_H + +#include <linux/kernel.h> + +struct iolock_chain; + +extern struct iolock_chain *iolock_region(const char __user *, size_t); +extern void iolock_release(struct iolock_chain *); + +#endif Index: 2.6-mm/fs/Makefile =================================================================== --- 2.6-mm.orig/fs/Makefile +++ 2.6-mm/fs/Makefile @@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table. ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ - ioprio.o + ioprio.o iolock.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_EPOLL) += eventpoll.o Index: 2.6-mm/include/linux/fs.h =================================================================== --- 2.6-mm.orig/include/linux/fs.h +++ 2.6-mm/include/linux/fs.h @@ -334,6 +334,8 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); struct page* (*get_xip_page)(struct address_space *, sector_t, int); + int (*iolock_acquire)(struct file **, unsigned long); + void (*iolock_release)(struct file **, unsigned long); }; struct backing_dev_info; ^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH 00/14] GFS @ 2005-08-02 7:18 David Teigland 2005-08-02 10:16 ` Pekka Enberg 2005-08-03 6:44 ` [PATCH 00/14] GFS Pekka Enberg 0 siblings, 2 replies; 30+ messages in thread From: David Teigland @ 2005-08-02 7:18 UTC (permalink / raw) To: akpm; +Cc: linux-kernel, linux-cluster Hi, GFS (Global File System) is a cluster file system that we'd like to see added to the kernel. The 14 patches total about 900K so I won't send them to the list unless that's requested. Comments and suggestions are welcome. Thanks http://redhat.com/~teigland/gfs2/20050801/gfs2-full.patch http://redhat.com/~teigland/gfs2/20050801/broken-out/ Dave ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 00/14] GFS 2005-08-02 7:18 [PATCH 00/14] GFS David Teigland @ 2005-08-02 10:16 ` Pekka Enberg 2005-08-03 6:36 ` David Teigland 2005-08-03 6:44 ` [PATCH 00/14] GFS Pekka Enberg 1 sibling, 1 reply; 30+ messages in thread From: Pekka Enberg @ 2005-08-02 10:16 UTC (permalink / raw) To: David Teigland; +Cc: akpm, linux-kernel, linux-cluster, Pekka Enberg Hi David, On 8/2/05, David Teigland <teigland@redhat.com> wrote: > Hi, GFS (Global File System) is a cluster file system that we'd like to > see added to the kernel. The 14 patches total about 900K so I won't send > them to the list unless that's requested. Comments and suggestions are > welcome. Thanks > +#define kmalloc_nofail(size, flags) \ > + gmalloc_nofail((size), (flags), __FILE__, __LINE__) [snip] > +void *gmalloc_nofail_real(unsigned int size, int flags, char *file, > + unsigned int line) > +{ > + void *x; > + for (;;) { > + x = kmalloc(size, flags); > + if (x) > + return x; > + if (time_after_eq(jiffies, gfs2_malloc_warning + 5 * HZ)) { > + printk("GFS2: out of memory: %s, %u\n", > + __FILE__, __LINE__); > + gfs2_malloc_warning = jiffies; > + } > + yield(); This does not belong in a filesystem. It also seems like a very bad idea. What are you trying to do here? If you absolutely must not fail, use __GFP_NOFAIL instead. > + } > +} > + > +#if defined(GFS2_MEMORY_SIMPLE) > + > +atomic_t gfs2_memory_count; > + > +void gfs2_memory_add_i(void *data, char *file, unsigned int line) > +{ > + atomic_inc(&gfs2_memory_count); > +} > + > +void gfs2_memory_rm_i(void *data, char *file, unsigned int line) > +{ > + if (data) > + atomic_dec(&gfs2_memory_count); > +} > + > +void *gmalloc(unsigned int size, int flags, char *file, unsigned int line) > +{ > + void *data = kmalloc(size, flags); > + if (data) > + atomic_inc(&gfs2_memory_count); > + return data; > +} > + > +void *gmalloc_nofail(unsigned int size, int flags, char *file, > + unsigned int line) > +{ > + atomic_inc(&gfs2_memory_count); > + return gmalloc_nofail_real(size, flags, file, line); > +} > + > +void gfree(void *data, char *file, unsigned int line) > +{ > + if (data) { > + atomic_dec(&gfs2_memory_count); > + kfree(data); > + } > +} -mm has memory leak detection patches and there are others floating around. Please do not introduce yet another subsystem-specific debug allocator. Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 00/14] GFS 2005-08-02 10:16 ` Pekka Enberg @ 2005-08-03 6:36 ` David Teigland 2005-08-08 14:14 ` GFS Pekka J Enberg 0 siblings, 1 reply; 30+ messages in thread From: David Teigland @ 2005-08-03 6:36 UTC (permalink / raw) To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-cluster, Pekka Enberg On Tue, Aug 02, 2005 at 01:16:53PM +0300, Pekka Enberg wrote: > > +void *gmalloc_nofail_real(unsigned int size, int flags, char *file, > > + unsigned int line) > > +{ > > + void *x; > > + for (;;) { > > + x = kmalloc(size, flags); > > + if (x) > > + return x; > > + if (time_after_eq(jiffies, gfs2_malloc_warning + 5 * HZ)) { > > + printk("GFS2: out of memory: %s, %u\n", > > + __FILE__, __LINE__); > > + gfs2_malloc_warning = jiffies; > > + } > > + yield(); > > This does not belong in a filesystem. It also seems like a very bad > idea. What are you trying to do here? If you absolutely must not fail, > use __GFP_NOFAIL instead. will do, carried over from before NOFAIL existed > -mm has memory leak detection patches and there are others floating > around. Please do not introduce yet another subsystem-specific debug > allocator. ok, thanks Dave ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-03 6:36 ` David Teigland @ 2005-08-08 14:14 ` Pekka J Enberg 2005-08-08 18:32 ` GFS Zach Brown 2005-08-10 5:59 ` GFS David Teigland 0 siblings, 2 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-08 14:14 UTC (permalink / raw) To: David Teigland; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster David Teigland writes: > +static ssize_t walk_vm_hard(struct file *file, char *buf, size_t size, > + loff_t *offset, do_rw_t operation) > +{ > + struct gfs2_holder *ghs; > + unsigned int num_gh = 0; > + ssize_t count; > + > + { Can we please get rid of the extra braces everywhere? [snip] David Teigland writes: > + > + for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { > + if (end <= vma->vm_start) > + break; > + if (vma->vm_file && > + vma->vm_file->f_dentry->d_inode->i_sb == sb) { > + num_gh++; > + } > + } > + > + ghs = kmalloc((num_gh + 1) * sizeof(struct gfs2_holder), > + GFP_KERNEL); > + if (!ghs) { > + if (!dumping) > + up_read(&mm->mmap_sem); > + return -ENOMEM; > + } > + > + for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { Sorry if this is an obvious question but what prevents another thread from doing mmap() before we do the second walk and messing up num_gh? > + if (end <= vma->vm_start) > + break; > + if (vma->vm_file) { > + struct inode *inode; > + inode = vma->vm_file->f_dentry->d_inode; > + if (inode->i_sb == sb) > + gfs2_holder_init(get_v2ip(inode)->i_gl, > + vma2state(vma), > + 0, &ghs[x++]); > + } > + } Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 14:14 ` GFS Pekka J Enberg @ 2005-08-08 18:32 ` Zach Brown 2005-08-09 14:49 ` GFS Pekka Enberg 2005-08-10 5:59 ` GFS David Teigland 1 sibling, 1 reply; 30+ messages in thread From: Zach Brown @ 2005-08-08 18:32 UTC (permalink / raw) To: Pekka J Enberg Cc: David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, mark.fasheh Pekka J Enberg wrote: > Sorry if this is an obvious question but what prevents another thread > from doing mmap() before we do the second walk and messing up num_gh? Nothing, I suspect. OCFS2 has a problem like this, too. It wants a way for a file system to serialize mmap/munmap/mremap during file IO. Well, more specifically, it wants to make sure that the locks it acquired at the start of the IO really cover the buf regions that might fault during the IO.. mapping activity during the IO can wreck that. - z ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 18:32 ` GFS Zach Brown @ 2005-08-09 14:49 ` Pekka Enberg 2005-08-09 17:17 ` GFS Zach Brown 2005-08-10 7:21 ` GFS Christoph Hellwig 0 siblings, 2 replies; 30+ messages in thread From: Pekka Enberg @ 2005-08-09 14:49 UTC (permalink / raw) To: Zach Brown Cc: David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, mark.fasheh On Mon, 2005-08-08 at 11:32 -0700, Zach Brown wrote: > > Sorry if this is an obvious question but what prevents another thread > > from doing mmap() before we do the second walk and messing up num_gh? > > Nothing, I suspect. OCFS2 has a problem like this, too. It wants a way > for a file system to serialize mmap/munmap/mremap during file IO. Well, > more specifically, it wants to make sure that the locks it acquired at > the start of the IO really cover the buf regions that might fault during > the IO.. mapping activity during the IO can wreck that. In addition, the vma walk will become an unmaintainable mess as soon as someone introduces another mmap() capable fs that needs similar locking. I am not an expert so could someone please explain why this cannot be done with a_ops->prepare_write and friends? Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-09 14:49 ` GFS Pekka Enberg @ 2005-08-09 17:17 ` Zach Brown 2005-08-09 18:35 ` GFS Pekka J Enberg 2005-08-10 4:48 ` GFS Pekka J Enberg 2005-08-10 7:21 ` GFS Christoph Hellwig 1 sibling, 2 replies; 30+ messages in thread From: Zach Brown @ 2005-08-09 17:17 UTC (permalink / raw) To: Pekka Enberg Cc: David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, mark.fasheh Pekka Enberg wrote: > In addition, the vma walk will become an unmaintainable mess as soon > as someone introduces another mmap() capable fs that needs similar > locking. Yup, I suspect that if the core kernel ends up caring about this problem then the VFS will be involved in helping file systems sort the locks they'll acquire around IO. > I am not an expert so could someone please explain why this cannot be > done with a_ops->prepare_write and friends? I'll try, briefly. Usually clustered file systems in Linux maintain data consistency for normal posix IO by holding DLM locks for the duration of their file->{read,write} methods. A task on a node won't be able to read until all tasks on other nodes have finished any conflicting writes they might have been performing, etc, nothing surprising here. Now say we want to extend consistency guarantees to mmap(). This boils down to protecting mappings with DLM locks. Say a page is mapped for reading, the continued presence of that mapping is protected by holding a DLM lock. If another node goes to write to that page, the read lock is revoked and the mapping is torn down. These locks are acquired in a_ops->nopage as the task faults and tries to bring up the mapping. And that's the problem. Because they're acquired in ->nopage they can be acquired during a fault that is servicing the 'buf' argument to an outer file->{read,write} operation which has grabbed a lock for the target file. Acquiring multiple locks introduces the risk of ABBA deadlocks. It's trivial to construct examples of mmap(), read(), and write() on 2 nodes with 2 files that deadlock. So clustered file systems in Linux (GFS, Lustre, OCFS2, (GPFS?)) all walk vmas in their file->{read,write} to discover mappings that belong to their files so that they can preemptively sort and acquire the locks that will be needed to cover the mappings that might be established in ->nopage. As you point out, this both relies on the mappings not changing and gets very exciting when you mix files and mappings between file systems that are each sorting and acquiring their own DLM locks. I brought this up with some people at the kernel summit but no one, including myself, considers it a high priority. It wouldn't be too hard to construct a patch if people want to take a look. - z ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-09 17:17 ` GFS Zach Brown @ 2005-08-09 18:35 ` Pekka J Enberg 2005-08-10 4:48 ` GFS Pekka J Enberg 1 sibling, 0 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-09 18:35 UTC (permalink / raw) To: Zach Brown Cc: David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, mark.fasheh Hi Zach, Zach Brown writes: > I'll try, briefly. Thanks for the excellent explanation. Zach Brown writes: > And that's the problem. Because they're acquired in ->nopage they can > be acquired during a fault that is servicing the 'buf' argument to an > outer file->{read,write} operation which has grabbed a lock for the > target file. Acquiring multiple locks introduces the risk of ABBA > deadlocks. It's trivial to construct examples of mmap(), read(), and > write() on 2 nodes with 2 files that deadlock. But couldn't we use make_pages_present() to figure which locks we need, sort them, and then grab them? Zach Brown writes: > I brought this up with some people at the kernel summit but no one, > including myself, considers it a high priority. It wouldn't be too hard > to construct a patch if people want to take a look. I guess it's not a problem as long as the kernel has zero or one cluster filesystems that support mmap(). After we have two or more, we have a problem. The GFS2 vma walk needs fixing anyway, I think, as it can lead to buffer overflow (if we have more locks during the second walk). Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-09 17:17 ` GFS Zach Brown 2005-08-09 18:35 ` GFS Pekka J Enberg @ 2005-08-10 4:48 ` Pekka J Enberg 1 sibling, 0 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-10 4:48 UTC (permalink / raw) To: Pekka J Enberg Cc: Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, mark.fasheh Zach Brown writes: > But couldn't we use make_pages_present() to figure which locks we need, > sort them, and then grab them? Doh, obviously we can't as nopage() needs to bring the page in. Sorry about that. I also thought of another failure case for the vma walk. When a thread uses userspace memcpy() between two clusterfs mmap'd regions instead of write() or read(). Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-09 14:49 ` GFS Pekka Enberg 2005-08-09 17:17 ` GFS Zach Brown @ 2005-08-10 7:21 ` Christoph Hellwig 2005-08-10 7:31 ` GFS Pekka J Enberg 1 sibling, 1 reply; 30+ messages in thread From: Christoph Hellwig @ 2005-08-10 7:21 UTC (permalink / raw) To: Pekka Enberg Cc: Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, mark.fasheh On Tue, Aug 09, 2005 at 05:49:43PM +0300, Pekka Enberg wrote: > On Mon, 2005-08-08 at 11:32 -0700, Zach Brown wrote: > > > Sorry if this is an obvious question but what prevents another thread > > > from doing mmap() before we do the second walk and messing up num_gh? > > > > Nothing, I suspect. OCFS2 has a problem like this, too. It wants a way > > for a file system to serialize mmap/munmap/mremap during file IO. Well, > > more specifically, it wants to make sure that the locks it acquired at > > the start of the IO really cover the buf regions that might fault during > > the IO.. mapping activity during the IO can wreck that. > > In addition, the vma walk will become an unmaintainable mess as soon as > someone introduces another mmap() capable fs that needs similar locking. We already have OCFS2 in -mm that does similar things. I think we need to solve this in common code before either of them can be merged. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 7:21 ` GFS Christoph Hellwig @ 2005-08-10 7:31 ` Pekka J Enberg 2005-08-10 16:26 ` GFS Mark Fasheh 0 siblings, 1 reply; 30+ messages in thread From: Pekka J Enberg @ 2005-08-10 7:31 UTC (permalink / raw) To: Christoph Hellwig Cc: Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster, mark.fasheh On Tue, Aug 09, 2005 at 05:49:43PM +0300, Pekka Enberg wrote: > > In addition, the vma walk will become an unmaintainable mess as soon as > > someone introduces another mmap() capable fs that needs similar locking. Christoph Hellwig writes: > We already have OCFS2 in -mm that does similar things. I think we need > to solve this in common code before either of them can be merged. It seems to me that the distributed locks must be acquired in ->nopage anyway to solve the problem with memcpy() between two mmap'd regions. One possible solution would be for the lock manager to detect deadlocks and break some locks accordingly. Don't know how well that would mix with ->nopage though... Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 7:31 ` GFS Pekka J Enberg @ 2005-08-10 16:26 ` Mark Fasheh 2005-08-10 16:57 ` GFS Pekka J Enberg 0 siblings, 1 reply; 30+ messages in thread From: Mark Fasheh @ 2005-08-10 16:26 UTC (permalink / raw) To: Pekka J Enberg Cc: Christoph Hellwig, Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster On Wed, Aug 10, 2005 at 10:31:04AM +0300, Pekka J Enberg wrote: > It seems to me that the distributed locks must be acquired in ->nopage > anyway to solve the problem with memcpy() between two mmap'd regions. One > possible solution would be for the lock manager to detect deadlocks and > break some locks accordingly. Don't know how well that would mix with > ->nopage though... Yeah, my experience with ->nopage so far has indicated to me that we are to avoid erroring out if at all possible which I believe is what we'd have to do if a deadlock is found. Also, I'm not sure how multiple dlms would coordinate deadlock detection in that case. This may sound naive, but so far OCFS2 has avoided the nead for deadlock detection... I'd hate to have to add it now -- better to try avoiding them in the first place. --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@oracle.com ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 16:26 ` GFS Mark Fasheh @ 2005-08-10 16:57 ` Pekka J Enberg 2005-08-10 18:21 ` GFS Mark Fasheh 0 siblings, 1 reply; 30+ messages in thread From: Pekka J Enberg @ 2005-08-10 16:57 UTC (permalink / raw) To: Mark Fasheh Cc: Christoph Hellwig, Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster Hi Mark, Mark Fasheh writes: > This may sound naive, but so far OCFS2 has avoided the nead for deadlock > detection... I'd hate to have to add it now -- better to try avoiding them > in the first place. Surely avoiding them is preferred but how do you do that when you have to mmap'd regions where userspace does memcpy()? The kernel won't much saying in it until ->nopage. We cannot grab all the required locks in proper order here because we don't know what size the buffer is. That's why I think lock sorting won't work of all the cases and thus the problem needs to be taken care of by the dlm. Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 16:57 ` GFS Pekka J Enberg @ 2005-08-10 18:21 ` Mark Fasheh 2005-08-10 20:18 ` GFS Pekka J Enberg 0 siblings, 1 reply; 30+ messages in thread From: Mark Fasheh @ 2005-08-10 18:21 UTC (permalink / raw) To: Pekka J Enberg Cc: Christoph Hellwig, Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster On Wed, Aug 10, 2005 at 07:57:43PM +0300, Pekka J Enberg wrote: > Surely avoiding them is preferred but how do you do that when you have to > mmap'd regions where userspace does memcpy()? The kernel won't much saying > in it until ->nopage. We cannot grab all the required locks in proper order > here because we don't know what size the buffer is. That's why I think lock > sorting won't work of all the cases and thus the problem needs to be taken > care of by the dlm. Hmm, well today in OCFS2 if you're not coming from read or write, the lock is held only for the duration of ->nopage so I don't think we could get into any deadlocks for that usage. --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@oracle.com ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 18:21 ` GFS Mark Fasheh @ 2005-08-10 20:18 ` Pekka J Enberg 2005-08-10 22:07 ` GFS Mark Fasheh 0 siblings, 1 reply; 30+ messages in thread From: Pekka J Enberg @ 2005-08-10 20:18 UTC (permalink / raw) To: Mark Fasheh Cc: Christoph Hellwig, Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster Mark Fasheh writes: > Hmm, well today in OCFS2 if you're not coming from read or write, the lock > is held only for the duration of ->nopage so I don't think we could get into > any deadlocks for that usage. Aah, I see GFS2 does that too so no deadlocks here. Thanks. You, however, don't maintain the same level of data consistency when reads and writes are from other filesystems as they use ->nopage. Fixing this requires a generic vma walk in every write() and read(), no? That doesn't seem such an hot idea which brings us back to using ->nopage for taking the locks (but now the deadlocks are back). Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 20:18 ` GFS Pekka J Enberg @ 2005-08-10 22:07 ` Mark Fasheh 2005-08-11 4:41 ` GFS Pekka J Enberg 0 siblings, 1 reply; 30+ messages in thread From: Mark Fasheh @ 2005-08-10 22:07 UTC (permalink / raw) To: Pekka J Enberg Cc: Christoph Hellwig, Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster On Wed, Aug 10, 2005 at 11:18:48PM +0300, Pekka J Enberg wrote: > Aah, I see GFS2 does that too so no deadlocks here. Thanks. Yep, no problem :) > You, however, don't maintain the same level of data consistency when reads > and writes are from other filesystems as they use ->nopage. I'm not sure what you mean here... > Fixing this requires a generic vma walk in every write() and read(), no? > That doesn't seem such an hot idea which brings us back to using ->nopage > for taking the locks (but now the deadlocks are back). Yeah if you look through mmap.c in ocfs2_fill_ctxt_from_buf() we do this... Or am I misunderstanding what you mean? --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@oracle.com ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 22:07 ` GFS Mark Fasheh @ 2005-08-11 4:41 ` Pekka J Enberg 0 siblings, 0 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-11 4:41 UTC (permalink / raw) To: Mark Fasheh Cc: Christoph Hellwig, Zach Brown, David Teigland, Pekka Enberg, akpm, linux-kernel, linux-cluster Hi, On Wed, Aug 10, 2005 at 11:18:48PM +0300, Pekka J Enberg wrote: > > You, however, don't maintain the same level of data consistency when reads > > and writes are from other filesystems as they use ->nopage. Mark Fasheh writes: > I'm not sure what you mean here... Reading and writing from other filesystems to a GFS2 mmap'd file does not walk the vmas. Therefore, data consistency guarantees are different: - A GFS2 filesystem does a read that writes to a GFS2 mmap'd file -> we take all locks for the mmap'd buffer in order and release them after read() is done. - A ext3 filesystem, for example, does a read that writes to a GFS2 mmap'd file -> we now take locks one page at the time releasing them before we exit ->nopage(). Other nodes are now free to write to the same GFS2 mmap'd file. Or am I missing something here? On Wed, Aug 10, 2005 at 11:18:48PM +0300, Pekka J Enberg wrote: > > Fixing this requires a generic vma walk in every write() and read(), no? > > That doesn't seem such an hot idea which brings us back to using ->nopage > > for taking the locks (but now the deadlocks are back). Mark Fasheh writes: > Yeah if you look through mmap.c in ocfs2_fill_ctxt_from_buf() we do this... > Or am I misunderstanding what you mean? If are doing write() or read() from some other filesystem, we don't walk the vmas but instead rely on ->nopage for locking, right? Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 14:14 ` GFS Pekka J Enberg 2005-08-08 18:32 ` GFS Zach Brown @ 2005-08-10 5:59 ` David Teigland 2005-08-10 6:06 ` GFS Pekka J Enberg 1 sibling, 1 reply; 30+ messages in thread From: David Teigland @ 2005-08-10 5:59 UTC (permalink / raw) To: Pekka J Enberg; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster On Mon, Aug 08, 2005 at 05:14:45PM +0300, Pekka J Enberg wrote: if (!dumping) down_read(&mm->mmap_sem); > >+ > >+ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { > >+ if (end <= vma->vm_start) > >+ break; > >+ if (vma->vm_file && > >+ vma->vm_file->f_dentry->d_inode->i_sb == sb) { > >+ num_gh++; > >+ } > >+ } > >+ > >+ ghs = kmalloc((num_gh + 1) * sizeof(struct gfs2_holder), > >+ GFP_KERNEL); > >+ if (!ghs) { > >+ if (!dumping) > >+ up_read(&mm->mmap_sem); > >+ return -ENOMEM; > >+ } > >+ > >+ for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { > > Sorry if this is an obvious question but what prevents another thread from > doing mmap() before we do the second walk and messing up num_gh? mm->mmap_sem ? ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 5:59 ` GFS David Teigland @ 2005-08-10 6:06 ` Pekka J Enberg 0 siblings, 0 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-10 6:06 UTC (permalink / raw) To: David Teigland; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster David Teigland writes: > > if (!dumping) > down_read(&mm->mmap_sem); > > > + > > > + for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { > > > + if (end <= vma->vm_start) > > > + break; > > > + if (vma->vm_file && > > > + vma->vm_file->f_dentry->d_inode->i_sb == sb) { > > > + num_gh++; > > > + } > > > + } > > > + > > > + ghs = kmalloc((num_gh + 1) * sizeof(struct gfs2_holder), > > > + GFP_KERNEL); > > > + if (!ghs) { > > > + if (!dumping) > > > + up_read(&mm->mmap_sem); > > > + return -ENOMEM; > > > + } > > > + > > > + for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { > > > > Sorry if this is an obvious question but what prevents another thread from > > doing mmap() before we do the second walk and messing up num_gh? > > mm->mmap_sem ? Aah, I read that !dumping expression the other way around. Sorry and thanks. Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 00/14] GFS 2005-08-02 7:18 [PATCH 00/14] GFS David Teigland 2005-08-02 10:16 ` Pekka Enberg @ 2005-08-03 6:44 ` Pekka Enberg 2005-08-08 9:57 ` David Teigland 1 sibling, 1 reply; 30+ messages in thread From: Pekka Enberg @ 2005-08-03 6:44 UTC (permalink / raw) To: David Teigland; +Cc: akpm, linux-kernel, linux-cluster, Pekka Enberg Hi David, Some more comments below. Pekka On 8/2/05, David Teigland <teigland@redhat.com> wrote: > +/** > + * inode_create - create a struct gfs2_inode > + * @i_gl: The glock covering the inode > + * @inum: The inode number > + * @io_gl: the iopen glock to acquire/hold (using holder in new gfs2_inode) > + * @io_state: the state the iopen glock should be acquired in > + * @ipp: pointer to put the returned inode in > + * > + * Returns: errno > + */ > + > +static int inode_create(struct gfs2_glock *i_gl, struct gfs2_inum *inum, > + struct gfs2_glock *io_gl, unsigned int io_state, > + struct gfs2_inode **ipp) > +{ > + struct gfs2_sbd *sdp = i_gl->gl_sbd; > + struct gfs2_inode *ip; > + int error = 0; > + > + RETRY_MALLOC(ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL), ip); Why do you want to do this? The callers can handle ENOMEM just fine. > +/** > + * gfs2_random - Generate a random 32-bit number > + * > + * Generate a semi-crappy 32-bit pseudo-random number without using > + * floating point. > + * > + * The PRNG is from "Numerical Recipes in C" (second edition), page 284. > + * > + * Returns: a 32-bit random number > + */ > + > +uint32_t gfs2_random(void) > +{ > + gfs2_random_number = 0x0019660D * gfs2_random_number + 0x3C6EF35F; > + return gfs2_random_number; > +} Please consider moving this into lib/random.c. This one already appears in drivers/net/hamradio/dmascc.c. > +/** > + * gfs2_hash - hash an array of data > + * @data: the data to be hashed > + * @len: the length of data to be hashed > + * > + * Take some data and convert it to a 32-bit hash. > + * > + * This is the 32-bit FNV-1a hash from: > + * http://www.isthe.com/chongo/tech/comp/fnv/ > + * > + * Returns: the hash > + */ > + > +uint32_t gfs2_hash(const void *data, unsigned int len) > +{ > + uint32_t h = 0x811C9DC5; > + h = hash_more_internal(data, len, h); > + return h; > +} Is there a reason why you cannot use <linux/hash.h> or <linux/jhash.h>? > +void gfs2_sort(void *base, unsigned int num_elem, unsigned int size, > + int (*compar) (const void *, const void *)) > +{ > + register char *pbase = (char *)base; > + int i, j, k, h; > + static int cols[16] = {1391376, 463792, 198768, 86961, > + 33936, 13776, 4592, 1968, > + 861, 336, 112, 48, > + 21, 7, 3, 1}; > + > + for (k = 0; k < 16; k++) { > + h = cols[k]; > + for (i = h; i < num_elem; i++) { > + j = i; > + while (j >= h && > + (*compar)((void *)(pbase + size * (j - h)), > + (void *)(pbase + size * j)) > 0) { > + SWAP(pbase + size * j, > + pbase + size * (j - h), > + size); > + j = j - h; > + } > + } > + } > +} Please use sort() from lib/sort.c. > +/** > + * gfs2_io_error_inode_i - Flag an inode I/O error and withdraw > + * @ip: > + * @function: > + * @file: > + * @line: Please drop empty kerneldoc tags. (Appears in various other places as well.) > +#define RETRY_MALLOC(do_this, until_this) \ > +for (;;) { \ > + { do_this; } \ > + if (until_this) \ > + break; \ > + if (time_after_eq(jiffies, gfs2_malloc_warning + 5 * HZ)) { \ > + printk("GFS2: out of memory: %s, %u\n", __FILE__, __LINE__); \ > + gfs2_malloc_warning = jiffies; \ > + } \ > + yield(); \ > +} Please drop this. > +int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) > +{ > + struct gfs2_sbd *sdp = dip->i_sbd; > + struct posix_acl *acl = NULL; > + struct gfs2_ea_request er; > + mode_t mode = ip->i_di.di_mode; > + int error; > + > + if (!sdp->sd_args.ar_posix_acl) > + return 0; > + if (S_ISLNK(ip->i_di.di_mode)) > + return 0; > + > + memset(&er, 0, sizeof(struct gfs2_ea_request)); > + er.er_type = GFS2_EATYPE_SYS; > + > + error = acl_get(dip, ACL_DEFAULT, &acl, NULL, > + &er.er_data, &er.er_data_len); > + if (error) > + return error; > + if (!acl) { > + mode &= ~current->fs->umask; > + if (mode != ip->i_di.di_mode) > + error = munge_mode(ip, mode); > + return error; > + } > + > + { > + struct posix_acl *clone = posix_acl_clone(acl, GFP_KERNEL); > + error = -ENOMEM; > + if (!clone) > + goto out; > + gfs2_memory_add(clone); > + gfs2_memory_rm(acl); > + posix_acl_release(acl); > + acl = clone; > + } Please make this a real function. It is duplicated below. > + if (error > 0) { > + er.er_name = GFS2_POSIX_ACL_ACCESS; > + er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; > + posix_acl_to_xattr(acl, er.er_data, er.er_data_len); > + er.er_mode = mode; > + er.er_flags = GFS2_ERF_MODE; > + error = gfs2_system_eaops.eo_set(ip, &er); > + if (error) > + goto out; > + } else > + munge_mode(ip, mode); > + > + out: > + gfs2_memory_rm(acl); > + posix_acl_release(acl); > + kfree(er.er_data); > + > + return error; Whitespace damage. > +int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) > +{ > + struct posix_acl *acl = NULL; > + struct gfs2_ea_location el; > + char *data; > + unsigned int len; > + int error; > + > + error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); > + if (error) > + return error; > + if (!acl) > + return gfs2_setattr_simple(ip, attr); > + > + { > + struct posix_acl *clone = posix_acl_clone(acl, GFP_KERNEL); > + error = -ENOMEM; > + if (!clone) > + goto out; > + gfs2_memory_add(clone); > + gfs2_memory_rm(acl); > + posix_acl_release(acl); > + acl = clone; > + } Duplicated above. > +static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) > +{ > + struct buffer_head *bh; > + int error; > + > + error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, > + DIO_START | DIO_WAIT, &bh); > + if (error) > + return error; > + > + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) > + error = ea_foreach_i(ip, bh, ea_call, data); goto out here so you can drop the else branch below. > + else { > + struct buffer_head *eabh; > + uint64_t *eablk, *end; > + > + if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_IN)) { > + error = -EIO; > + goto out; > + } > + > + eablk = (uint64_t *)(bh->b_data + > + sizeof(struct gfs2_meta_header)); > + end = eablk + ip->i_sbd->sd_inptrs; > + > +static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, > + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, > + void *private) > +{ > + struct ea_find *ef = (struct ea_find *)private; > + struct gfs2_ea_request *er = ef->ef_er; > + > + if (ea->ea_type == GFS2_EATYPE_UNUSED) > + return 0; > + > + if (ea->ea_type == er->er_type) { > + if (ea->ea_name_len == er->er_name_len && > + !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { > + struct gfs2_ea_location *el = ef->ef_el; > + get_bh(bh); > + el->el_bh = bh; > + el->el_ea = ea; > + el->el_prev = prev; > + return 1; > + } > + } > + > +#if 0 > + else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) && > + er->er_type == GFS2_EATYPE_SYS) > + return 1; > +#endif Please drop commented out code. > +static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, > + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, > + void *private) > +{ > + struct ea_list *ei = (struct ea_list *)private; Please drop redundant cast. > +static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, > + struct gfs2_ea_location *el) > +{ > + { > + struct ea_set es; > + int error; > + > + memset(&es, 0, sizeof(struct ea_set)); > + es.es_er = er; > + es.es_el = el; > + > + error = ea_foreach(ip, ea_set_simple, &es); > + if (error > 0) > + return 0; > + if (error) > + return error; > + } > + { > + unsigned int blks = 2; > + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) > + blks++; > + if (GFS2_EAREQ_SIZE_STUFFED(er) > ip->i_sbd->sd_jbsize) > + blks += DIV_RU(er->er_data_len, > + ip->i_sbd->sd_jbsize); > + > + return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); > + } Please drop the extra braces. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 00/14] GFS 2005-08-03 6:44 ` [PATCH 00/14] GFS Pekka Enberg @ 2005-08-08 9:57 ` David Teigland 2005-08-08 10:00 ` GFS Pekka J Enberg ` (4 more replies) 0 siblings, 5 replies; 30+ messages in thread From: David Teigland @ 2005-08-08 9:57 UTC (permalink / raw) To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-cluster, Pekka Enberg On Wed, Aug 03, 2005 at 09:44:06AM +0300, Pekka Enberg wrote: > > +uint32_t gfs2_hash(const void *data, unsigned int len) > > +{ > > + uint32_t h = 0x811C9DC5; > > + h = hash_more_internal(data, len, h); > > + return h; > > +} > > Is there a reason why you cannot use <linux/hash.h> or <linux/jhash.h>? See gfs2_hash_more() and comment; we hash discontiguous regions. > > +#define RETRY_MALLOC(do_this, until_this) \ > > +for (;;) { \ > > + { do_this; } \ > > + if (until_this) \ > > + break; \ > > + if (time_after_eq(jiffies, gfs2_malloc_warning + 5 * HZ)) { \ > > + printk("GFS2: out of memory: %s, %u\n", __FILE__, __LINE__); \ > > + gfs2_malloc_warning = jiffies; \ > > + } \ > > + yield(); \ > > +} > > Please drop this. Done in the spot that could deal with an error, but there are three other places that still need it. > > +static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, > > + struct gfs2_ea_location *el) > > +{ > > + { > > + struct ea_set es; > > + int error; > > + > > + memset(&es, 0, sizeof(struct ea_set)); > > + es.es_er = er; > > + es.es_el = el; > > + > > + error = ea_foreach(ip, ea_set_simple, &es); > > + if (error > 0) > > + return 0; > > + if (error) > > + return error; > > + } > > + { > > + unsigned int blks = 2; > > + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) > > + blks++; > > + if (GFS2_EAREQ_SIZE_STUFFED(er) > ip->i_sbd->sd_jbsize) > > + blks += DIV_RU(er->er_data_len, > > + ip->i_sbd->sd_jbsize); > > + > > + return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); > > + } > > Please drop the extra braces. Here and elsewhere we try to keep unused stuff off the stack. Are you suggesting that we're being overly cautious, or do you just dislike the way it looks? Thanks, Dave ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 9:57 ` David Teigland @ 2005-08-08 10:00 ` Pekka J Enberg 2005-08-08 10:18 ` GFS Pekka J Enberg ` (3 subsequent siblings) 4 siblings, 0 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-08 10:00 UTC (permalink / raw) To: David Teigland; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster David Teigland writes: > > > +static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, > > > + struct gfs2_ea_location *el) > > > +{ > > > + { > > > + struct ea_set es; > > > + int error; > > > + > > > + memset(&es, 0, sizeof(struct ea_set)); > > > + es.es_er = er; > > > + es.es_el = el; > > > + > > > + error = ea_foreach(ip, ea_set_simple, &es); > > > + if (error > 0) > > > + return 0; > > > + if (error) > > > + return error; > > > + } > > > + { > > > + unsigned int blks = 2; > > > + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) > > > + blks++; > > > + if (GFS2_EAREQ_SIZE_STUFFED(er) > ip->i_sbd->sd_jbsize) > > > + blks += DIV_RU(er->er_data_len, > > > + ip->i_sbd->sd_jbsize); > > > + > > > + return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); > > > + } > > > > Please drop the extra braces. > > Here and elsewhere we try to keep unused stuff off the stack. Are you > suggesting that we're being overly cautious, or do you just dislike the > way it looks? The extra braces hurt readability. Please drop them or make them proper functions instead. And yes, I think you're hiding potential stack usage problems here. Small unused stuff on the stack don't matter and large ones should probably be kmalloc() anyway. Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 9:57 ` David Teigland 2005-08-08 10:00 ` GFS Pekka J Enberg @ 2005-08-08 10:18 ` Pekka J Enberg 2005-08-08 10:56 ` GFS David Teigland 2005-08-08 10:34 ` GFS Pekka J Enberg ` (2 subsequent siblings) 4 siblings, 1 reply; 30+ messages in thread From: Pekka J Enberg @ 2005-08-08 10:18 UTC (permalink / raw) To: David Teigland; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster David Teigland writes: > > > +#define RETRY_MALLOC(do_this, until_this) \ > > > +for (;;) { \ > > > + { do_this; } \ > > > + if (until_this) \ > > > + break; \ > > > + if (time_after_eq(jiffies, gfs2_malloc_warning + 5 * HZ)) { \ > > > + printk("GFS2: out of memory: %s, %u\n", __FILE__, __LINE__); \ > > > + gfs2_malloc_warning = jiffies; \ > > > + } \ > > > + yield(); \ > > > +} > > > > Please drop this. > > Done in the spot that could deal with an error, but there are three other > places that still need it. Which places are those? I only see these: gfs2-02.patch:+ RETRY_MALLOC(ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL), ip); gfs2-02.patch-+ gfs2_memory_add(ip); gfs2-02.patch-+ memset(ip, 0, sizeof(struct gfs2_inode)); gfs2-02.patch-+ gfs2-02.patch-+ ip->i_num = *inum; gfs2-02.patch-+ -> GFP_NOFAIL. gfs2-02.patch:+ RETRY_MALLOC(page = grab_cache_page(aspace->i_mapping, index), gfs2-02.patch-+ page); gfs2-02.patch-+ } else { gfs2-02.patch-+ page = find_lock_page(aspace->i_mapping, index); gfs2-02.patch-+ if (!page) gfs2-02.patch-+ return NULL; I think you can set aspace->flags to GFP_NOFAIL but why can't you return NULL here on failure like you do for find_lock_page()? gfs2-02.patch:+ RETRY_MALLOC(bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_KERNEL), gfs2-02.patch-+ bd); gfs2-02.patch-+ gfs2_memory_add(bd); gfs2-02.patch-+ atomic_inc(&gl->gl_sbd->sd_bufdata_count); gfs2-02.patch-+ gfs2-02.patch-+ memset(bd, 0, sizeof(struct gfs2_bufdata)); -> GFP_NOFAIL gfs2-08.patch:+ RETRY_MALLOC(gm = kmalloc(sizeof(struct gfs2_memory), GFP_KERNEL), gm); gfs2-08.patch-+ gm->gm_data = data; gfs2-08.patch-+ gm->gm_file = file; gfs2-08.patch-+ gm->gm_line = line; gfs2-08.patch-+ gfs2-08.patch-+ spin_lock(&memory_lock); -> GFP_NOFAIL gfs2-10.patch:+ RETRY_MALLOC(new_gh = gfs2_holder_get(gl, state, gfs2-10.patch-+ LM_FLAG_TRY | gfs2-10.patch-+ GL_NEVER_RECURSE), gfs2-10.patch-+ new_gh); gfs2-10.patch-+ set_bit(HIF_DEMOTE, &new_gh->gh_iflags); gfs2-10.patch-+ set_bit(HIF_DEALLOC, &new_gh->gh_iflags); gfs2_holder_get uses kmalloc which can use GFP_NOFAIL. Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 10:18 ` GFS Pekka J Enberg @ 2005-08-08 10:56 ` David Teigland 2005-08-08 10:57 ` GFS Pekka J Enberg 0 siblings, 1 reply; 30+ messages in thread From: David Teigland @ 2005-08-08 10:56 UTC (permalink / raw) To: Pekka J Enberg; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster On Mon, Aug 08, 2005 at 01:18:45PM +0300, Pekka J Enberg wrote: > gfs2-02.patch:+ RETRY_MALLOC(ip = kmem_cache_alloc(gfs2_inode_cachep, > -> GFP_NOFAIL. Already gone, inode_create() can return an error. if (create) { RETRY_MALLOC(page = grab_cache_page(aspace->i_mapping, index), page); } else { page = find_lock_page(aspace->i_mapping, index); if (!page) return NULL; } > I think you can set aspace->flags to GFP_NOFAIL will try that > but why can't you return NULL here on failure like you do for > find_lock_page()? because create is set > gfs2-02.patch:+ RETRY_MALLOC(bd = kmem_cache_alloc(gfs2_bufdata_cachep, > GFP_KERNEL), > -> GFP_NOFAIL It looks to me like NOFAIL does nothing for kmem_cache_alloc(). Am I seeing that wrong? > gfs2-10.patch:+ RETRY_MALLOC(new_gh = gfs2_holder_get(gl, state, > gfs2_holder_get uses kmalloc which can use GFP_NOFAIL. Which means adding a new gfp_flags parameter... fine. Dave ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 10:56 ` GFS David Teigland @ 2005-08-08 10:57 ` Pekka J Enberg 2005-08-08 11:39 ` GFS David Teigland 0 siblings, 1 reply; 30+ messages in thread From: Pekka J Enberg @ 2005-08-08 10:57 UTC (permalink / raw) To: David Teigland; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster David Teigland writes: > > but why can't you return NULL here on failure like you do for > > find_lock_page()? > > because create is set Yes, but looking at (some of the) top-level callers, there's no real reason why create must not fail. Am I missing something here? > > gfs2-02.patch:+ RETRY_MALLOC(bd = kmem_cache_alloc(gfs2_bufdata_cachep, > > GFP_KERNEL), > > -> GFP_NOFAIL > > It looks to me like NOFAIL does nothing for kmem_cache_alloc(). > Am I seeing that wrong? It is passed to the page allocator just like with kmalloc() which uses __cache_alloc() too. Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 10:57 ` GFS Pekka J Enberg @ 2005-08-08 11:39 ` David Teigland 0 siblings, 0 replies; 30+ messages in thread From: David Teigland @ 2005-08-08 11:39 UTC (permalink / raw) To: Pekka J Enberg; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster On Mon, Aug 08, 2005 at 01:57:55PM +0300, Pekka J Enberg wrote: > David Teigland writes: > >> but why can't you return NULL here on failure like you do for > >> find_lock_page()? > > > >because create is set > > Yes, but looking at (some of the) top-level callers, there's no real reason > why create must not fail. Am I missing something here? I'll trace the callers back farther and see about dealing with errors. > >> gfs2-02.patch:+ RETRY_MALLOC(bd = kmem_cache_alloc(gfs2_bufdata_cachep, > > It is passed to the page allocator just like with kmalloc() which uses > __cache_alloc() too. Yes, I read it wrongly, looks like NOFAIL should work fine. I think we can get rid of the RETRY macro entirely. Thanks, Dave ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 9:57 ` David Teigland 2005-08-08 10:00 ` GFS Pekka J Enberg 2005-08-08 10:18 ` GFS Pekka J Enberg @ 2005-08-08 10:34 ` Pekka J Enberg 2005-08-09 14:55 ` GFS Pekka J Enberg 2005-08-10 7:40 ` GFS Pekka J Enberg 4 siblings, 0 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-08 10:34 UTC (permalink / raw) To: David Teigland; +Cc: Pekka Enberg, akpm, linux-kernel, linux-cluster David Teigland writes: > > Is there a reason why you cannot use <linux/hash.h> or <linux/jhash.h>? > > See gfs2_hash_more() and comment; we hash discontiguous regions. jhash() takes an initial value. Isn't that sufficient? Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 9:57 ` David Teigland ` (2 preceding siblings ...) 2005-08-08 10:34 ` GFS Pekka J Enberg @ 2005-08-09 14:55 ` Pekka J Enberg 2005-08-10 7:40 ` GFS Pekka J Enberg 4 siblings, 0 replies; 30+ messages in thread From: Pekka J Enberg @ 2005-08-09 14:55 UTC (permalink / raw) To: David Teigland; +Cc: akpm, linux-kernel, linux-cluster Hi David, Here are some more comments. Pekka +/************************************************************************** **** > +******************************************************************************* > +** > +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. > +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. > +** > +** This copyrighted material is made available to anyone wishing to use, > +** modify, copy, or redistribute it subject to the terms and conditions > +** of the GNU General Public License v.2. > +** > +******************************************************************************* > +******************************************************************************/ Do you really need this verbose banner? > +#define NO_CREATE 0 > +#define CREATE 1 > + > +#define NO_WAIT 0 > +#define WAIT 1 > + > +#define NO_FORCE 0 > +#define FORCE 1 The code seems to interchangeably use FORCE and NO_FORCE together with TRUE and FALSE. Perhaps they could be dropped? > +#define GLF_PLUG 0 > +#define GLF_LOCK 1 > +#define GLF_STICKY 2 > +#define GLF_PREFETCH 3 > +#define GLF_SYNC 4 > +#define GLF_DIRTY 5 > +#define GLF_SKIP_WAITERS2 6 > +#define GLF_GREEDY 7 Would be nice if these were either enums or had a comment linking them to the struct member they are used for. > +#define GIF_MIN_INIT 0 > +#define GIF_QD_LOCKED 1 > +#define GIF_PAGED 2 > +#define GIF_SW_PAGED 3 Same here and in few other places too. > +#define LO_BEFORE_COMMIT(sdp) \ > +do { \ > + int __lops_x; \ > + for (__lops_x = 0; gfs2_log_ops[__lops_x]; __lops_x++) \ > + if (gfs2_log_ops[__lops_x]->lo_before_commit) \ > + gfs2_log_ops[__lops_x]->lo_before_commit((sdp)); \ > +} while (0) > + > +#define LO_AFTER_COMMIT(sdp, ai) \ > +do { \ > + int __lops_x; \ > + for (__lops_x = 0; gfs2_log_ops[__lops_x]; __lops_x++) \ > + if (gfs2_log_ops[__lops_x]->lo_after_commit) \ > + gfs2_log_ops[__lops_x]->lo_after_commit((sdp), (ai)); \ > +} while (0) > + > +#define LO_BEFORE_SCAN(jd, head, pass) \ > +do \ > +{ \ > + int __lops_x; \ > + for (__lops_x = 0; gfs2_log_ops[__lops_x]; __lops_x++) \ > + if (gfs2_log_ops[__lops_x]->lo_before_scan) \ > + gfs2_log_ops[__lops_x]->lo_before_scan((jd), (head), (pass)); \ > +} \ > +while (0) static inline functions, please. > +static inline int LO_SCAN_ELEMENTS(struct gfs2_jdesc *jd, unsigned int start, > + struct gfs2_log_descriptor *ld, > + unsigned int pass) Lower case name, please. > +{ > + unsigned int x; > + int error; > + > + for (x = 0; gfs2_log_ops[x]; x++) > + if (gfs2_log_ops[x]->lo_scan_elements) { > + error = gfs2_log_ops[x]->lo_scan_elements(jd, start, > + ld, pass); > + if (error) > + return error; > + } > + > + return 0; > +} > + > +#define LO_AFTER_SCAN(jd, error, pass) \ > +do \ > +{ \ > + int __lops_x; \ > + for (__lops_x = 0; gfs2_log_ops[__lops_x]; __lops_x++) \ > + if (gfs2_log_ops[__lops_x]->lo_before_scan) \ > + gfs2_log_ops[__lops_x]->lo_after_scan((jd), (error), (pass)); \ > +} \ > +while (0) static inline function, please. > + > +#include <linux/sched.h> > +#include <linux/slab.h> > +#include <linux/smp_lock.h> > +#include <linux/spinlock.h> > +#include <asm/semaphore.h> > +#include <linux/completion.h> > +#include <linux/buffer_head.h> > +#include <asm/uaccess.h> > +#include <linux/pagemap.h> > +#include <linux/uio.h> > +#include <linux/blkdev.h> > +#include <linux/mm.h> > +#include <asm/uaccess.h> > +#include <linux/gfs2_ioctl.h> Preferred order is to include linux/ first and asm/ after that. > +#define vma2state(vma) \ > +((((vma)->vm_flags & (VM_MAYWRITE | VM_MAYSHARE)) == \ > + (VM_MAYWRITE | VM_MAYSHARE)) ? \ > + LM_ST_EXCLUSIVE : LM_ST_SHARED) \ static inline function, please. The above is completely unreadable. > +struct inode *gfs2_ip2v(struct gfs2_inode *ip, int create) > +{ > + struct inode *inode = NULL, *tmp; > + > + gfs2_assert_warn(ip->i_sbd, > + test_bit(GIF_MIN_INIT, &ip->i_flags)); > + > + spin_lock(&ip->i_spin); > + if (ip->i_vnode) > + inode = igrab(ip->i_vnode); > + spin_unlock(&ip->i_spin); Suggestion: make the above a separate function __gfs2_lookup_inode(), use it here and where you pass NO_CREATE to get rid of the create parameter. > + > + if (inode || !create) > + return inode; > + > + tmp = new_inode(ip->i_sbd->sd_vfs); > + if (!tmp) > + return NULL; [snip] > + entries = gfs2_tune_get(sdp, gt_entries_per_readdir); > + size = sizeof(struct filldir_bad) + > + entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE); > + > + fdb = kmalloc(size, GFP_KERNEL); > + if (!fdb) > + return -ENOMEM; > + memset(fdb, 0, size); kzalloc(), which is in 2.6.13-rc6-mm5 please. Appears in other places as well. > + if (error) { > + printk("GFS2: fsid=%s: can't make FS RW: %d\n", > + sdp->sd_fsname, error); > + goto fail_proc; > + } > + } > + > + gfs2_glock_dq_uninit(&mount_gh); > + > + return 0; > + > + fail_proc: > + gfs2_proc_fs_del(sdp); > + init_threads(sdp, UNDO); Please provide a release_threads instead and make it deal with partial initialization. The above is very confusing. > + parent, > + strlen(system_utsname.nodename)); > + else if (gfs2_filecmp(&dentry->d_name, "@mach", 5)) > + new = lookup_one_len(system_utsname.machine, > + parent, > + strlen(system_utsname.machine)); > + else if (gfs2_filecmp(&dentry->d_name, "@os", 3)) > + new = lookup_one_len(system_utsname.sysname, > + parent, > + strlen(system_utsname.sysname)); > + else if (gfs2_filecmp(&dentry->d_name, "@uid", 4)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%u", current->fsuid)); > + else if (gfs2_filecmp(&dentry->d_name, "@gid", 4)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%u", current->fsgid)); > + else if (gfs2_filecmp(&dentry->d_name, "@sys", 4)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%s_%s", > + system_utsname.machine, > + system_utsname.sysname)); > + else if (gfs2_filecmp(&dentry->d_name, "@jid", 4)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%u", > + sdp->sd_jdesc->jd_jid)); Smells like policy in the kernel. Why can't this be done in the userspace? > + parent, > + strlen(system_utsname.nodename)); > + else if (gfs2_filecmp(&dentry->d_name, "{mach}", 6)) > + new = lookup_one_len(system_utsname.machine, > + parent, > + strlen(system_utsname.machine)); > + else if (gfs2_filecmp(&dentry->d_name, "{os}", 4)) > + new = lookup_one_len(system_utsname.sysname, > + parent, > + strlen(system_utsname.sysname)); > + else if (gfs2_filecmp(&dentry->d_name, "{uid}", 5)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%u", current->fsuid)); > + else if (gfs2_filecmp(&dentry->d_name, "{gid}", 5)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%u", current->fsgid)); > + else if (gfs2_filecmp(&dentry->d_name, "{sys}", 5)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%s_%s", > + system_utsname.machine, > + system_utsname.sysname)); > + else if (gfs2_filecmp(&dentry->d_name, "{jid}", 5)) > + new = lookup_one_len(buf, > + parent, > + sprintf(buf, "%u", > + sdp->sd_jdesc->jd_jid)); Ditto. > +int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc) > +{ > + struct gfs2_holder ri_gh; > + struct gfs2_rgrpd *rgd_next; > + struct gfs2_holder *gha, *gh; > + unsigned int slots = 64; > + unsigned int x; > + int done; > + int error = 0, err; > + > + memset(sc, 0, sizeof(struct gfs2_statfs_change)); > + gha = kmalloc(slots * sizeof(struct gfs2_holder), GFP_KERNEL); > + if (!gha) > + return -ENOMEM; > + memset(gha, 0, slots * sizeof(struct gfs2_holder)); kcalloc, please > + line = kmalloc(256, GFP_KERNEL); > + if (!line) > + return -ENOMEM; > + > + len = snprintf(line, 256, "GFS2: fsid=%s: quota %s for %s %u\r\n", > + sdp->sd_fsname, type, > + (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", > + qd->qd_id); Please use constant instead of magic number 256. > +struct lm_lockops gdlm_ops = { > + lm_proto_name:"lock_dlm", > + lm_mount:gdlm_mount, > + lm_others_may_mount:gdlm_others_may_mount, > + lm_unmount:gdlm_unmount, > + lm_withdraw:gdlm_withdraw, > + lm_get_lock:gdlm_get_lock, > + lm_put_lock:gdlm_put_lock, > + lm_lock:gdlm_lock, > + lm_unlock:gdlm_unlock, > + lm_plock:gdlm_plock, > + lm_punlock:gdlm_punlock, > + lm_plock_get:gdlm_plock_get, > + lm_cancel:gdlm_cancel, > + lm_hold_lvb:gdlm_hold_lvb, > + lm_unhold_lvb:gdlm_unhold_lvb, > + lm_sync_lvb:gdlm_sync_lvb, > + lm_recovery_done:gdlm_recovery_done, > + lm_owner:THIS_MODULE, > +}; C99 initializers, please. ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-08 9:57 ` David Teigland ` (3 preceding siblings ...) 2005-08-09 14:55 ` GFS Pekka J Enberg @ 2005-08-10 7:40 ` Pekka J Enberg 2005-08-10 7:43 ` GFS Christoph Hellwig 4 siblings, 1 reply; 30+ messages in thread From: Pekka J Enberg @ 2005-08-10 7:40 UTC (permalink / raw) To: David Teigland; +Cc: akpm, linux-kernel, linux-cluster Hi David, > + return -EINVAL; > + if (!access_ok(VERIFY_WRITE, buf, size)) > + return -EFAULT; > + > + if (!(file->f_flags & O_LARGEFILE)) { > + if (*offset >= 0x7FFFFFFFull) > + return -EFBIG; > + if (*offset + size > 0x7FFFFFFFull) > + size = 0x7FFFFFFFull - *offset; Please use a constant instead for 0x7FFFFFFFull. (Appears in various other places as well.) Pekka ^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: GFS 2005-08-10 7:40 ` GFS Pekka J Enberg @ 2005-08-10 7:43 ` Christoph Hellwig 0 siblings, 0 replies; 30+ messages in thread From: Christoph Hellwig @ 2005-08-10 7:43 UTC (permalink / raw) To: Pekka J Enberg; +Cc: David Teigland, akpm, linux-kernel, linux-cluster On Wed, Aug 10, 2005 at 10:40:37AM +0300, Pekka J Enberg wrote: > Hi David, > > >+ return -EINVAL; > >+ if (!access_ok(VERIFY_WRITE, buf, size)) > >+ return -EFAULT; > >+ > >+ if (!(file->f_flags & O_LARGEFILE)) { > >+ if (*offset >= 0x7FFFFFFFull) > >+ return -EFBIG; > >+ if (*offset + size > 0x7FFFFFFFull) > >+ size = 0x7FFFFFFFull - *offset; > > Please use a constant instead for 0x7FFFFFFFull. (Appears in various other > places as well.) In fact this very much looks like it's duplicating generic_write_checks(). Folks, please use common code. ^ permalink raw reply [flat|nested] 30+ messages in thread
end of thread, other threads:[~2005-08-11 16:45 UTC | newest] Thread overview: 30+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2005-08-11 7:10 GFS Pekka J Enberg 2005-08-11 16:33 ` GFS Zach Brown 2005-08-11 16:35 ` GFS Christoph Hellwig 2005-08-11 16:39 ` GFS Zach Brown 2005-08-11 16:44 ` GFS Pekka Enberg -- strict thread matches above, loose matches on Subject: below -- 2005-08-02 7:18 [PATCH 00/14] GFS David Teigland 2005-08-02 10:16 ` Pekka Enberg 2005-08-03 6:36 ` David Teigland 2005-08-08 14:14 ` GFS Pekka J Enberg 2005-08-08 18:32 ` GFS Zach Brown 2005-08-09 14:49 ` GFS Pekka Enberg 2005-08-09 17:17 ` GFS Zach Brown 2005-08-09 18:35 ` GFS Pekka J Enberg 2005-08-10 4:48 ` GFS Pekka J Enberg 2005-08-10 7:21 ` GFS Christoph Hellwig 2005-08-10 7:31 ` GFS Pekka J Enberg 2005-08-10 16:26 ` GFS Mark Fasheh 2005-08-10 16:57 ` GFS Pekka J Enberg 2005-08-10 18:21 ` GFS Mark Fasheh 2005-08-10 20:18 ` GFS Pekka J Enberg 2005-08-10 22:07 ` GFS Mark Fasheh 2005-08-11 4:41 ` GFS Pekka J Enberg 2005-08-10 5:59 ` GFS David Teigland 2005-08-10 6:06 ` GFS Pekka J Enberg 2005-08-03 6:44 ` [PATCH 00/14] GFS Pekka Enberg 2005-08-08 9:57 ` David Teigland 2005-08-08 10:00 ` GFS Pekka J Enberg 2005-08-08 10:18 ` GFS Pekka J Enberg 2005-08-08 10:56 ` GFS David Teigland 2005-08-08 10:57 ` GFS Pekka J Enberg 2005-08-08 11:39 ` GFS David Teigland 2005-08-08 10:34 ` GFS Pekka J Enberg 2005-08-09 14:55 ` GFS Pekka J Enberg 2005-08-10 7:40 ` GFS Pekka J Enberg 2005-08-10 7:43 ` GFS Christoph Hellwig
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox