* [rfc][patch 1/5] ecryptfs new aops
2007-11-12 7:12 [rfc][patches] remove ->prepare_write Nick Piggin
@ 2007-11-12 7:13 ` Nick Piggin
2007-11-12 7:14 ` [rfc][patch 2/5] cifs: " Nick Piggin
` (4 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-12 7:13 UTC (permalink / raw)
To: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench,
dhowells
Convert ecryptfs to new aops.
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/fs/ecryptfs/mmap.c
===================================================================
--- linux-2.6.orig/fs/ecryptfs/mmap.c
+++ linux-2.6/fs/ecryptfs/mmap.c
@@ -263,31 +263,38 @@ out:
return 0;
}
-static int ecryptfs_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ecryptfs_write_begin(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
int rc = 0;
- if (from == 0 && to == PAGE_CACHE_SIZE)
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+
+ if (flags & AOP_FLAG_UNINTERRUPTIBLE && len == PAGE_CACHE_SIZE)
goto out; /* If we are writing a full page, it will be
up to date. */
if (!PageUptodate(page)) {
- rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
+ rc = ecryptfs_read_lower_page_segment(page, index, 0,
PAGE_CACHE_SIZE,
- page->mapping->host);
+ mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attemping to read lower "
"page segment; rc = [%d]\n", __FUNCTION__, rc);
- ClearPageUptodate(page);
goto out;
} else
SetPageUptodate(page);
}
- if (page->index != 0) {
+ if (index != 0) {
loff_t end_of_prev_pg_pos =
- (((loff_t)page->index << PAGE_CACHE_SHIFT) - 1);
+ (((loff_t)index << PAGE_CACHE_SHIFT) - 1);
- if (end_of_prev_pg_pos > i_size_read(page->mapping->host)) {
+ if (end_of_prev_pg_pos > i_size_read(mapping->host)) {
rc = ecryptfs_truncate(file->f_path.dentry,
end_of_prev_pg_pos);
if (rc) {
@@ -297,7 +304,7 @@ static int ecryptfs_prepare_write(struct
goto out;
}
}
- if (end_of_prev_pg_pos + 1 > i_size_read(page->mapping->host))
+ if (end_of_prev_pg_pos + 1 > i_size_read(mapping->host))
zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
}
out:
@@ -391,21 +398,25 @@ int ecryptfs_write_inode_size_to_metadat
}
/**
- * ecryptfs_commit_write
+ * ecryptfs_write_end
* @file: The eCryptfs file object
+ * @mapping: The eCryptfs address_space (page cache mapping)
+ * @pos, len: Ignored (we rotate the page IV on each write)
* @page: The eCryptfs page
- * @from: Ignored (we rotate the page IV on each write)
- * @to: Ignored
*
* This is where we encrypt the data and pass the encrypted data to
* the lower filesystem. In OpenPGP-compatible mode, we operate on
* entire underlying packets.
*/
-static int ecryptfs_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- loff_t pos;
- struct inode *ecryptfs_inode = page->mapping->host;
+static int ecryptfs_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + copied;
+ struct inode *ecryptfs_inode = mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
int rc;
@@ -417,25 +428,22 @@ static int ecryptfs_commit_write(struct
} else
ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
- "(page w/ index = [0x%.16x], to = [%d])\n", page->index,
- to);
+ "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
/* Fills in zeros if 'to' goes beyond inode size */
rc = fill_zeros_to_end_of_page(page, to);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
- "zeros in page with index = [0x%.16x]\n",
- page->index);
+ "zeros in page with index = [0x%.16x]\n", index);
goto out;
}
rc = ecryptfs_encrypt_page(page);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
- "index [0x%.16x])\n", page->index);
+ "index [0x%.16x])\n", index);
goto out;
}
- pos = (((loff_t)page->index) << PAGE_CACHE_SHIFT) + to;
- if (pos > i_size_read(ecryptfs_inode)) {
- i_size_write(ecryptfs_inode, pos);
+ if (pos + copied > i_size_read(ecryptfs_inode)) {
+ i_size_write(ecryptfs_inode, pos + copied);
ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
"[0x%.16x]\n", i_size_read(ecryptfs_inode));
}
@@ -443,7 +451,11 @@ static int ecryptfs_commit_write(struct
if (rc)
printk(KERN_ERR "Error writing inode size to metadata; "
"rc = [%d]\n", rc);
+
+ rc = copied;
out:
+ unlock_page(page);
+ page_cache_release(page);
return rc;
}
@@ -464,7 +476,7 @@ static sector_t ecryptfs_bmap(struct add
struct address_space_operations ecryptfs_aops = {
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
- .prepare_write = ecryptfs_prepare_write,
- .commit_write = ecryptfs_commit_write,
+ .write_begin = ecryptfs_write_begin,
+ .write_end = ecryptfs_write_end,
.bmap = ecryptfs_bmap,
};
^ permalink raw reply [flat|nested] 18+ messages in thread* [rfc][patch 2/5] cifs: new aops
2007-11-12 7:12 [rfc][patches] remove ->prepare_write Nick Piggin
2007-11-12 7:13 ` [rfc][patch 1/5] ecryptfs new aops Nick Piggin
@ 2007-11-12 7:14 ` Nick Piggin
2007-11-12 7:14 ` [rfc][patch 3/5] afs: " Nick Piggin
` (3 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-12 7:14 UTC (permalink / raw)
To: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench,
dhowells
Convert cifs to new aops.
This is another relatively naive conversion. Always do the read upfront when
the page is not uptodate (unless we're in the writethrough path).
Fix an uninitialized data exposure where SetPageUptodate was called before
the page was uptodate.
SetPageUptodate and switch to writeback mode in the case that the full page
was dirtied.
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/fs/cifs/file.c
===================================================================
--- linux-2.6.orig/fs/cifs/file.c
+++ linux-2.6/fs/cifs/file.c
@@ -103,7 +103,7 @@ static inline int cifs_open_inode_helper
/* want handles we can use to read with first
in the list so we do not have to walk the
- list to search for one in prepare_write */
+ list to search for one in write_begin */
if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
list_add_tail(&pCifsFile->flist,
&pCifsInode->openFileList);
@@ -1411,49 +1411,53 @@ static int cifs_writepage(struct page *p
return rc;
}
-static int cifs_commit_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
+static int cifs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- int xid;
- int rc = 0;
- struct inode *inode = page->mapping->host;
- loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
- char *page_data;
+ int rc;
+ struct inode *inode = mapping->host;
+ loff_t position;
+
+ cFYI(1, ("commit write for page %p from position %lld with %d bytes",
+ page, pos, copied));
+
+ if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+ SetPageUptodate(page);
- xid = GetXid();
- cFYI(1, ("commit write for page %p up to position %lld for %d",
- page, position, to));
- spin_lock(&inode->i_lock);
- if (position > inode->i_size) {
- i_size_write(inode, position);
- }
- spin_unlock(&inode->i_lock);
if (!PageUptodate(page)) {
- position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset;
- /* can not rely on (or let) writepage write this data */
- if (to < offset) {
- cFYI(1, ("Illegal offsets, can not copy from %d to %d",
- offset, to));
- FreeXid(xid);
- return rc;
- }
+ char *page_data;
+ unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ int xid;
+
+ xid = GetXid();
/* this is probably better than directly calling
partialpage_write since in this function the file handle is
known which we might as well leverage */
/* BB check if anything else missing out of ppw
such as updating last write time */
page_data = kmap(page);
- rc = cifs_write(file, page_data + offset, to-offset,
- &position);
- if (rc > 0)
- rc = 0;
- /* else if (rc < 0) should we set writebehind rc? */
+ rc = cifs_write(file, page_data + offset, copied, &position);
+ /* if (rc < 0) should we set writebehind rc? */
kunmap(page);
+
+ FreeXid(xid);
} else {
+ rc = copied;
set_page_dirty(page);
}
- FreeXid(xid);
+ if (rc > 0) {
+ position = pos + rc;
+ spin_lock(&inode->i_lock);
+ if (position > inode->i_size)
+ i_size_write(inode, position);
+ spin_unlock(&inode->i_lock);
+ }
+
+ unlock_page(page);
+ page_cache_release(page);
+
return rc;
}
@@ -1999,49 +2003,45 @@ int is_size_safe_to_change(struct cifsIn
return 1;
}
-static int cifs_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int cifs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
- int rc = 0;
- loff_t i_size;
- loff_t offset;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ struct page *page;
+
+ cFYI(1, ("write begin from %lld len %d", (long long)pos, len));
+
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
- cFYI(1, ("prepare write for page %p from %d to %d", page, from, to));
if (PageUptodate(page))
return 0;
/* If we are writing a full page it will be up to date,
no need to read from the server */
- if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
- SetPageUptodate(page);
+ if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
return 0;
- }
- offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
- i_size = i_size_read(page->mapping->host);
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ int rc;
- if ((offset >= i_size) ||
- ((from == 0) && (offset + to) >= i_size)) {
- /*
- * We don't need to read data beyond the end of the file.
- * zero it, and set the page uptodate
- */
- simple_prepare_write(file, page, from, to);
- SetPageUptodate(page);
- } else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
/* might as well read a page, it is fast enough */
rc = cifs_readpage_worker(file, page, &offset);
+
+ /* we do not need to pass errors back
+ e.g. if we do not have read access to the file
+ because cifs_write_end will attempt synchronous writes
+ -- shaggy */
} else {
/* we could try using another file handle if there is one -
but how would we lock it to prevent close of that handle
racing with this read? In any case
- this will be written out by commit_write so is fine */
+ this will be written out by write_end so is fine */
}
- /* we do not need to pass errors back
- e.g. if we do not have read access to the file
- because cifs_commit_write will do the right thing. -- shaggy */
-
return 0;
}
@@ -2050,8 +2050,8 @@ const struct address_space_operations ci
.readpages = cifs_readpages,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
- .prepare_write = cifs_prepare_write,
- .commit_write = cifs_commit_write,
+ .write_begin = cifs_write_begin,
+ .write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
/* .sync_page = cifs_sync_page, */
/* .direct_IO = */
@@ -2066,8 +2066,8 @@ const struct address_space_operations ci
.readpage = cifs_readpage,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
- .prepare_write = cifs_prepare_write,
- .commit_write = cifs_commit_write,
+ .write_begin = cifs_write_begin,
+ .write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
/* .sync_page = cifs_sync_page, */
/* .direct_IO = */
^ permalink raw reply [flat|nested] 18+ messages in thread* [rfc][patch 3/5] afs: new aops
2007-11-12 7:12 [rfc][patches] remove ->prepare_write Nick Piggin
2007-11-12 7:13 ` [rfc][patch 1/5] ecryptfs new aops Nick Piggin
2007-11-12 7:14 ` [rfc][patch 2/5] cifs: " Nick Piggin
@ 2007-11-12 7:14 ` Nick Piggin
2007-11-12 7:20 ` [rfc][patch 4/5] rd: rewrite rd Nick Piggin
` (2 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-12 7:14 UTC (permalink / raw)
To: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench,
dhowells
Convert afs to new aops.
Cannot assume writes will fully complete, so this conversion goes the easy
way and always brings the page uptodate before the write.
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/fs/afs/file.c
===================================================================
--- linux-2.6.orig/fs/afs/file.c
+++ linux-2.6/fs/afs/file.c
@@ -50,8 +50,8 @@ const struct address_space_operations af
.launder_page = afs_launder_page,
.releasepage = afs_releasepage,
.invalidatepage = afs_invalidatepage,
- .prepare_write = afs_prepare_write,
- .commit_write = afs_commit_write,
+ .write_begin = afs_write_begin,
+ .write_end = afs_write_end,
.writepage = afs_writepage,
.writepages = afs_writepages,
};
Index: linux-2.6/fs/afs/internal.h
===================================================================
--- linux-2.6.orig/fs/afs/internal.h
+++ linux-2.6/fs/afs/internal.h
@@ -731,8 +731,12 @@ extern int afs_volume_release_fileserver
*/
extern int afs_set_page_dirty(struct page *);
extern void afs_put_writeback(struct afs_writeback *);
-extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned);
-extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int afs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+extern int afs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern int afs_write_inode(struct inode *, int);
Index: linux-2.6/fs/afs/write.c
===================================================================
--- linux-2.6.orig/fs/afs/write.c
+++ linux-2.6/fs/afs/write.c
@@ -84,15 +84,23 @@ void afs_put_writeback(struct afs_writeb
* partly or wholly fill a page that's under preparation for writing
*/
static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
- unsigned start, unsigned len, struct page *page)
+ loff_t pos, unsigned len, struct page *page)
{
+ loff_t i_size;
+ unsigned eof;
int ret;
- _enter(",,%u,%u", start, len);
+ _enter(",,%llu,%u", (unsigned long long)pos, len);
- ASSERTCMP(start + len, <=, PAGE_SIZE);
+ ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
- ret = afs_vnode_fetch_data(vnode, key, start, len, page);
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (pos + len > i_size)
+ eof = i_size;
+ else
+ eof = PAGE_CACHE_SIZE;
+
+ ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
if (ret < 0) {
if (ret == -ENOENT) {
_debug("got NOENT from server"
@@ -107,109 +115,55 @@ static int afs_fill_page(struct afs_vnod
}
/*
- * prepare a page for being written to
- */
-static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
- struct key *key, unsigned offset, unsigned to)
-{
- unsigned eof, tail, start, stop, len;
- loff_t i_size, pos;
- void *p;
- int ret;
-
- _enter("");
-
- if (offset == 0 && to == PAGE_SIZE)
- return 0;
-
- p = kmap_atomic(page, KM_USER0);
-
- i_size = i_size_read(&vnode->vfs_inode);
- pos = (loff_t) page->index << PAGE_SHIFT;
- if (pos >= i_size) {
- /* partial write, page beyond EOF */
- _debug("beyond");
- if (offset > 0)
- memset(p, 0, offset);
- if (to < PAGE_SIZE)
- memset(p + to, 0, PAGE_SIZE - to);
- kunmap_atomic(p, KM_USER0);
- return 0;
- }
-
- if (i_size - pos >= PAGE_SIZE) {
- /* partial write, page entirely before EOF */
- _debug("before");
- tail = eof = PAGE_SIZE;
- } else {
- /* partial write, page overlaps EOF */
- eof = i_size - pos;
- _debug("overlap %u", eof);
- tail = max(eof, to);
- if (tail < PAGE_SIZE)
- memset(p + tail, 0, PAGE_SIZE - tail);
- if (offset > eof)
- memset(p + eof, 0, PAGE_SIZE - eof);
- }
-
- kunmap_atomic(p, KM_USER0);
-
- ret = 0;
- if (offset > 0 || eof > to) {
- /* need to fill one or two bits that aren't going to be written
- * (cover both fillers in one read if there are two) */
- start = (offset > 0) ? 0 : to;
- stop = (eof > to) ? eof : offset;
- len = stop - start;
- _debug("wr=%u-%u av=0-%u rd=%u@%u",
- offset, to, eof, start, len);
- ret = afs_fill_page(vnode, key, start, len, page);
- }
-
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* prepare to perform part of a write to a page
- * - the caller holds the page locked, preventing it from being written out or
- * modified by anyone else
*/
-int afs_prepare_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
+int afs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
+ struct page *page;
struct afs_writeback *candidate, *wb;
struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
struct key *key = file->private_data;
- pgoff_t index;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + len;
int ret;
_enter("{%x:%u},{%lx},%u,%u",
- vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+ vnode->fid.vid, vnode->fid.vnode, index, from, to);
candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
if (!candidate)
return -ENOMEM;
candidate->vnode = vnode;
- candidate->first = candidate->last = page->index;
- candidate->offset_first = offset;
+ candidate->first = candidate->last = index;
+ candidate->offset_first = from;
candidate->to_last = to;
candidate->usage = 1;
candidate->state = AFS_WBACK_PENDING;
init_waitqueue_head(&candidate->waitq);
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ kfree(candidate);
+ return -ENOMEM;
+ }
+ *pagep = page;
+ /* page won't leak in error case: it eventually gets cleaned off LRU */
+
if (!PageUptodate(page)) {
_debug("not up to date");
- ret = afs_prepare_page(vnode, page, key, offset, to);
+ ret = afs_fill_page(vnode, key, pos, len, page);
if (ret < 0) {
kfree(candidate);
_leave(" = %d [prep]", ret);
return ret;
}
+ SetPageUptodate(page);
}
try_again:
- index = page->index;
spin_lock(&vnode->writeback_lock);
/* see if this page is already pending a writeback under a suitable key
@@ -242,8 +196,8 @@ try_again:
subsume_in_current_wb:
_debug("subsume");
ASSERTRANGE(wb->first, <=, index, <=, wb->last);
- if (index == wb->first && offset < wb->offset_first)
- wb->offset_first = offset;
+ if (index == wb->first && from < wb->offset_first)
+ wb->offset_first = from;
if (index == wb->last && to > wb->to_last)
wb->to_last = to;
spin_unlock(&vnode->writeback_lock);
@@ -289,17 +243,17 @@ flush_conflicting_wb:
/*
* finalise part of a write to a page
*/
-int afs_commit_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
+int afs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
loff_t i_size, maybe_i_size;
- _enter("{%x:%u},{%lx},%u,%u",
- vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+ _enter("{%x:%u},{%lx}",
+ vnode->fid.vid, vnode->fid.vnode, page->index);
- maybe_i_size = (loff_t) page->index << PAGE_SHIFT;
- maybe_i_size += to;
+ maybe_i_size = pos + copied;
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size) {
@@ -310,12 +264,13 @@ int afs_commit_write(struct file *file,
spin_unlock(&vnode->writeback_lock);
}
- SetPageUptodate(page);
set_page_dirty(page);
if (PageDirty(page))
_debug("dirtied");
+ unlock_page(page);
+ page_cache_release(page);
- return 0;
+ return copied;
}
/*
^ permalink raw reply [flat|nested] 18+ messages in thread* [rfc][patch 4/5] rd: rewrite rd
2007-11-12 7:12 [rfc][patches] remove ->prepare_write Nick Piggin
` (2 preceding siblings ...)
2007-11-12 7:14 ` [rfc][patch 3/5] afs: " Nick Piggin
@ 2007-11-12 7:20 ` Nick Piggin
2007-11-12 7:23 ` [rfc][patch 5/5] remove prepare_write Nick Piggin
2007-11-12 15:29 ` [rfc][patch 3/5] afs: new aops David Howells
5 siblings, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-12 7:20 UTC (permalink / raw)
To: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench,
dhowells
This is a rewrite of the ramdisk block device driver.
The old one is really difficult because it effectively implements a block
device which serves data out of its own buffer cache. It relies on the dirty
bit being set, to pin its backing store in cache, however there are non
trivial paths which can clear the dirty bit (eg. try_to_free_buffers()),
which had recently led to data corruption. And in general it is completely
wrong for a block device driver to do this.
The new one is more like a regular block device driver. It has no idea
about vm/vfs stuff. Its backing store is similar to the buffer cache
(a simple radix-tree of pages), but it doesn't know anything about page
cache (the pages in the radix tree are not pagecache pages).
There is one slight downside -- direct block device access and filesystem
metadata access goes through an extra copy and gets stored in RAM twice.
However, this downside is only slight, because the real buffercache of the
device is now reclaimable (because we're not playing crazy games with it),
so under memory intensive situations, footprint should effectively be the
same -- maybe even a slight advantage to the new driver because it can also
reclaim buffer heads.
The fact that it now goes through all the regular vm/fs paths makes it
much more useful for testing, too.
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/drivers/block/Kconfig
===================================================================
--- linux-2.6.orig/drivers/block/Kconfig
+++ linux-2.6/drivers/block/Kconfig
@@ -311,7 +311,7 @@ config BLK_DEV_UB
If unsure, say N.
config BLK_DEV_RAM
- tristate "RAM disk support"
+ tristate "RAM block device support"
---help---
Saying Y here will allow you to use a portion of your RAM memory as
a block device, so that you can make file systems on it, read and
@@ -346,16 +346,6 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
-config BLK_DEV_RAM_BLOCKSIZE
- int "Default RAM disk block size (bytes)"
- depends on BLK_DEV_RAM
- default "1024"
- help
- The default value is 1024 bytes. PAGE_SIZE is a much more
- efficient choice however. The default is kept to ensure initrd
- setups function - apparently needed by the rd_load_image routine
- that supposes the filesystem in the image uses a 1024 blocksize.
-
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media"
depends on !UML
Index: linux-2.6/drivers/block/Makefile
===================================================================
--- linux-2.6.orig/drivers/block/Makefile
+++ linux-2.6/drivers/block/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o
obj-$(CONFIG_PS3_DISK) += ps3disk.o
obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
-obj-$(CONFIG_BLK_DEV_RAM) += rd.o
+obj-$(CONFIG_BLK_DEV_RD) += brd.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
obj-$(CONFIG_BLK_DEV_PS2) += ps2esdi.o
obj-$(CONFIG_BLK_DEV_XD) += xd.o
Index: linux-2.6/drivers/block/brd.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/block/brd.c
@@ -0,0 +1,477 @@
+/*
+ * Ram backed block device driver.
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ *
+ * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
+ * of their respective owners.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/radix-tree.h>
+#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
+
+#include <asm/uaccess.h>
+
+#define PAGE_SECTORS (1 << (PAGE_SHIFT - 9))
+
+struct brd_device {
+ int brd_number;
+ int brd_refcnt;
+ loff_t brd_offset;
+ loff_t brd_sizelimit;
+ unsigned brd_blocksize;
+
+ struct request_queue *brd_queue;
+ struct gendisk *brd_disk;
+
+ spinlock_t brd_lock;
+ struct radix_tree_root brd_pages;
+
+ struct list_head brd_list;
+};
+
+static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
+{
+ unsigned long idx;
+ struct page *page;
+
+ rcu_read_lock();
+ idx = sector >> (PAGE_SHIFT - 9);
+ page = radix_tree_lookup(&brd->brd_pages, idx);
+ rcu_read_unlock();
+
+ BUG_ON(page && page->index != idx);
+
+ return page;
+}
+
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
+{
+ unsigned long idx;
+ struct page *page;
+
+ page = brd_lookup_page(brd, sector);
+ if (page)
+ return page;
+
+ page = alloc_page(GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO);
+ if (!page)
+ return NULL;
+
+ if (radix_tree_preload(GFP_NOIO)) {
+ __free_page(page);
+ return NULL;
+ }
+
+ spin_lock(&brd->brd_lock);
+ idx = sector >> (PAGE_SHIFT - 9);
+ if (radix_tree_insert(&brd->brd_pages, idx, page)) {
+ __free_page(page);
+ page = radix_tree_lookup(&brd->brd_pages, idx);
+ BUG_ON(!page);
+ BUG_ON(page->index != idx);
+ } else
+ page->index = idx;
+ spin_unlock(&brd->brd_lock);
+
+ radix_tree_preload_end();
+
+ return page;
+}
+
+#define FREE_BATCH 16
+static void brd_free_pages(struct brd_device *brd)
+{
+ unsigned long pos = 0;
+ struct page *pages[FREE_BATCH];
+ int nr_pages;
+
+ do {
+ int i;
+
+ nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
+ (void **)pages, pos, FREE_BATCH);
+
+ for (i = 0; i < nr_pages; i++) {
+ void *ret;
+
+ BUG_ON(pages[i]->index < pos);
+ pos = pages[i]->index;
+ ret = radix_tree_delete(&brd->brd_pages, pos);
+ BUG_ON(!ret || ret != pages[i]);
+ __free_page(pages[i]);
+ }
+
+ pos++;
+
+ } while (nr_pages == FREE_BATCH);
+}
+
+static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
+{
+ unsigned int offset = (sector & (PAGE_SECTORS-1)) << 9;
+ size_t copy;
+
+ copy = min((unsigned long)n, PAGE_SIZE - offset);
+ if (!brd_insert_page(brd, sector))
+ return -ENOMEM;
+ if (copy < n) {
+ if (!brd_insert_page(brd, sector))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void copy_to_brd(struct brd_device *brd, const void *src, sector_t sector, size_t n)
+{
+ struct page *page;
+ void *dst;
+ unsigned int offset = (sector & (PAGE_SECTORS-1)) << 9;
+ size_t copy;
+
+ copy = min((unsigned long)n, PAGE_SIZE - offset);
+ page = brd_lookup_page(brd, sector);
+ BUG_ON(!page);
+
+ dst = kmap_atomic(page, KM_USER1);
+ memcpy(dst + offset, src, copy);
+ kunmap_atomic(dst, KM_USER1);
+
+ if (copy < n) {
+ copy = n - copy;
+ page = brd_lookup_page(brd, sector);
+ BUG_ON(!page);
+
+ dst = kmap_atomic(page, KM_USER1);
+ memcpy(dst, src, copy);
+ kunmap_atomic(dst, KM_USER1);
+ }
+}
+
+static void copy_from_brd(void *dst, struct brd_device *brd, sector_t sector, size_t n)
+{
+ struct page *page;
+ const void *src;
+ unsigned int offset = (sector & (PAGE_SECTORS-1)) << 9;
+ size_t copy;
+
+ copy = min((unsigned long)n, PAGE_SIZE - offset);
+ page = brd_lookup_page(brd, sector);
+ if (page) {
+ src = kmap_atomic(page, KM_USER1);
+ memcpy(dst, src + offset, copy);
+ kunmap_atomic(src, KM_USER1);
+ } else
+ memset(dst, 0, copy);
+
+ if (copy < n) {
+ copy = n - copy;
+ page = brd_lookup_page(brd, sector);
+ if (page) {
+ src = kmap_atomic(page, KM_USER1);
+ memcpy(dst, src, copy);
+ kunmap_atomic(src, KM_USER1);
+ } else
+ memset(dst, 0, copy);
+ }
+}
+
+static int brd_do_bvec(struct brd_device *brd, struct page *page, unsigned int len, unsigned int off, int rw, sector_t sector)
+{
+ void *mem;
+ int err = 0;
+
+ if (rw != READ) {
+ err = copy_to_brd_setup(brd, sector, len);
+ if (err)
+ goto out;
+ }
+
+ mem = kmap_atomic(page, KM_USER0);
+ if (rw == READ) {
+ copy_from_brd(mem + off, brd, sector, len);
+ flush_dcache_page(page);
+ } else
+ copy_to_brd(brd, mem + off, sector, len);
+ kunmap_atomic(mem, KM_USER0);
+
+out:
+ return err;
+}
+
+static int brd_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct brd_device *brd = bdev->bd_disk->private_data;
+ int rw;
+ struct bio_vec *bvec;
+ sector_t sector;
+ int i;
+ int err = -EIO;
+
+ sector = bio->bi_sector;
+ if (sector + (bio->bi_size >> 9) > get_capacity(bdev->bd_disk))
+ goto out;
+
+ rw = bio_rw(bio);
+ if (rw == READA)
+ rw = READ;
+
+ bio_for_each_segment(bvec, bio, i) {
+ unsigned int len = bvec->bv_len;
+ err = brd_do_bvec(brd, bvec->bv_page, len, bvec->bv_offset, rw, sector);
+ if (err)
+ break;
+ sector += len >> 9;
+ }
+
+out:
+ bio_endio(bio, err);
+
+ return 0;
+}
+
+static int brd_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ int error;
+ struct block_device *bdev = inode->i_bdev;
+ struct brd_device *brd = bdev->bd_disk->private_data;
+
+ if (cmd != BLKFLSBUF)
+ return -ENOTTY;
+
+ /*
+ * ram device BLKFLSBUF has special semantics, we want to actually
+ * release and destroy the ramdisk data.
+ */
+ mutex_lock(&bdev->bd_mutex);
+ error = -EBUSY;
+ if (bdev->bd_openers <= 1) {
+ brd_free_pages(brd);
+ invalidate_bh_lrus();
+ truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+ error = 0;
+ }
+ mutex_unlock(&bdev->bd_mutex);
+
+ return error;
+}
+
+static struct block_device_operations brd_fops = {
+ .owner = THIS_MODULE,
+ .ioctl = brd_ioctl,
+};
+
+/*
+ * And now the modules code and kernel interface.
+ */
+static int rd_nr;
+static int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
+module_param(rd_nr, int, 0);
+MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+module_param(rd_size, int, 0);
+MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
+
+/* options - nonmodular */
+#ifndef MODULE
+static int __init ramdisk_size(char *str)
+{
+ rd_size = simple_strtol(str,NULL,0);
+ return 1;
+}
+static int __init ramdisk_size2(char *str)
+{
+ return ramdisk_size(str);
+}
+static int __init ramdisk_nr(char *str)
+{
+ rd_nr = simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("ramdisk=", ramdisk_size);
+__setup("ramdisk_size=", ramdisk_size2);
+__setup("ramdisk_nr=", ramdisk_nr);
+#endif
+
+
+static LIST_HEAD(brd_devices);
+static DEFINE_MUTEX(brd_devices_mutex);
+
+static struct brd_device *brd_alloc(int i)
+{
+ struct brd_device *brd;
+ struct gendisk *disk;
+
+ brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+ if (!brd)
+ goto out;
+ brd->brd_number = i;
+ spin_lock_init(&brd->brd_lock);
+ INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+
+ brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!brd->brd_queue)
+ goto out_free_dev;
+ blk_queue_make_request(brd->brd_queue, brd_make_request);
+ blk_queue_max_sectors(brd->brd_queue, 1024);
+ blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+
+ disk = brd->brd_disk = alloc_disk(1);
+ if (!disk)
+ goto out_free_queue;
+ disk->major = RAMDISK_MAJOR;
+ disk->first_minor = i;
+ disk->fops = &brd_fops;
+ disk->private_data = brd;
+ disk->queue = brd->brd_queue;
+ sprintf(disk->disk_name, "ram%d", i);
+ set_capacity(disk, rd_size * 2);
+
+ return brd;
+
+out_free_queue:
+ blk_cleanup_queue(brd->brd_queue);
+out_free_dev:
+ kfree(brd);
+out:
+ return NULL;
+}
+
+static void brd_free(struct brd_device *brd)
+{
+ put_disk(brd->brd_disk);
+ blk_cleanup_queue(brd->brd_queue);
+ brd_free_pages(brd);
+ kfree(brd);
+}
+
+static struct brd_device *brd_init_one(int i)
+{
+ struct brd_device *brd;
+
+ list_for_each_entry(brd, &brd_devices, brd_list) {
+ if (brd->brd_number == i)
+ goto out;
+ }
+
+ brd = brd_alloc(i);
+ if (brd) {
+ add_disk(brd->brd_disk);
+ list_add_tail(&brd->brd_list, &brd_devices);
+ }
+out:
+ return brd;
+}
+
+static void brd_del_one(struct brd_device *brd)
+{
+ list_del(&brd->brd_list);
+ del_gendisk(brd->brd_disk);
+ brd_free(brd);
+}
+
+static struct kobject *brd_probe(dev_t dev, int *part, void *data)
+{
+ struct brd_device *brd;
+ struct kobject *kobj;
+
+ mutex_lock(&brd_devices_mutex);
+ brd = brd_init_one(dev & MINORMASK);
+ kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
+ mutex_unlock(&brd_devices_mutex);
+
+ *part = 0;
+ return kobj;
+}
+
+static int __init brd_init(void)
+{
+ int i, nr;
+ unsigned long range;
+ struct brd_device *brd, *next;
+
+ /*
+ * brd module now has a feature to instantiate underlying device
+ * structure on-demand, provided that there is an access dev node.
+ * However, this will not work well with user space tools that don't
+ * know about such "feature". In order to not break any existing
+ * tool, we do the following:
+ *
+ * (1) if rd_nr is specified, create that many upfront, and this
+ * also becomes a hard limit.
+ * (2) if rd_nr is not specified, create 1 rd device on module
+ * load, user can further extend brd device by create dev node
+ * themselves and have kernel automatically instantiate actual
+ * device on-demand.
+ */
+ if (rd_nr > 1UL << MINORBITS)
+ return -EINVAL;
+
+ if (rd_nr) {
+ nr = rd_nr;
+ range = rd_nr;
+ } else {
+ nr = CONFIG_BLK_DEV_RAM_COUNT;
+ range = 1UL << MINORBITS;
+ }
+
+ if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
+ return -EIO;
+
+ for (i = 0; i < nr; i++) {
+ brd = brd_alloc(i);
+ if (!brd)
+ goto out_free;
+ list_add_tail(&brd->brd_list, &brd_devices);
+ }
+
+ /* point of no return */
+
+ list_for_each_entry(brd, &brd_devices, brd_list)
+ add_disk(brd->brd_disk);
+
+ blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+ THIS_MODULE, brd_probe, NULL, NULL);
+
+ printk(KERN_INFO "brd: module loaded\n");
+ return 0;
+
+out_free:
+ list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
+ list_del(&brd->brd_list);
+ brd_free(brd);
+ }
+
+ unregister_blkdev(RAMDISK_MAJOR, "brd");
+ return -ENOMEM;
+}
+
+static void __exit brd_exit(void)
+{
+ unsigned long range;
+ struct brd_device *brd, *next;
+
+ range = rd_nr ? rd_nr : 1UL << MINORBITS;
+
+ list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
+ brd_del_one(brd);
+
+ blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
+ unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+}
+
+module_init(brd_init);
+module_exit(brd_exit);
+
Index: linux-2.6/drivers/block/rd.c
===================================================================
--- linux-2.6.orig/drivers/block/rd.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- * ramdisk.c - Multiple RAM disk driver - gzip-loading version - v. 0.8 beta.
- *
- * (C) Chad Page, Theodore Ts'o, et. al, 1995.
- *
- * This RAM disk is designed to have filesystems created on it and mounted
- * just like a regular floppy disk.
- *
- * It also does something suggested by Linus: use the buffer cache as the
- * RAM disk data. This makes it possible to dynamically allocate the RAM disk
- * buffer - with some consequences I have to deal with as I write this.
- *
- * This code is based on the original ramdisk.c, written mostly by
- * Theodore Ts'o (TYT) in 1991. The code was largely rewritten by
- * Chad Page to use the buffer cache to store the RAM disk data in
- * 1995; Theodore then took over the driver again, and cleaned it up
- * for inclusion in the mainline kernel.
- *
- * The original CRAMDISK code was written by Richard Lyons, and
- * adapted by Chad Page to use the new RAM disk interface. Theodore
- * Ts'o rewrote it so that both the compressed RAM disk loader and the
- * kernel decompressor uses the same inflate.c codebase. The RAM disk
- * loader now also loads into a dynamic (buffer cache based) RAM disk,
- * not the old static RAM disk. Support for the old static RAM disk has
- * been completely removed.
- *
- * Loadable module support added by Tom Dyas.
- *
- * Further cleanups by Chad Page (page0588@sundance.sjsu.edu):
- * Cosmetic changes in #ifdef MODULE, code movement, etc.
- * When the RAM disk module is removed, free the protected buffers
- * Default RAM disk size changed to 2.88 MB
- *
- * Added initrd: Werner Almesberger & Hans Lermen, Feb '96
- *
- * 4/25/96 : Made RAM disk size a parameter (default is now 4 MB)
- * - Chad Page
- *
- * Add support for fs images split across >1 disk, Paul Gortmaker, Mar '98
- *
- * Make block size and block size shift for RAM disks a global macro
- * and set blk_size for -ENOSPC, Werner Fink <werner@suse.de>, Apr '99
- */
-
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <asm/atomic.h>
-#include <linux/bio.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/buffer_head.h> /* for invalidate_bdev() */
-#include <linux/backing-dev.h>
-#include <linux/blkpg.h>
-#include <linux/writeback.h>
-
-#include <asm/uaccess.h>
-
-/* Various static variables go here. Most are used only in the RAM disk code.
- */
-
-static struct gendisk *rd_disks[CONFIG_BLK_DEV_RAM_COUNT];
-static struct block_device *rd_bdev[CONFIG_BLK_DEV_RAM_COUNT];/* Protected device data */
-static struct request_queue *rd_queue[CONFIG_BLK_DEV_RAM_COUNT];
-
-/*
- * Parameters for the boot-loading of the RAM disk. These are set by
- * init/main.c (from arguments to the kernel command line) or from the
- * architecture-specific setup routine (from the stored boot sector
- * information).
- */
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE; /* Size of the RAM disks */
-/*
- * It would be very desirable to have a soft-blocksize (that in the case
- * of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because
- * doing that we'll achieve a far better MM footprint. Using a rd_blocksize of
- * BLOCK_SIZE in the worst case we'll make PAGE_SIZE/BLOCK_SIZE buffer-pages
- * unfreeable. With a rd_blocksize of PAGE_SIZE instead we are sure that only
- * 1 page will be protected. Depending on the size of the ramdisk you
- * may want to change the ramdisk blocksize to achieve a better or worse MM
- * behaviour. The default is still BLOCK_SIZE (needed by rd_load_image that
- * supposes the filesystem in the image uses a BLOCK_SIZE blocksize).
- */
-static int rd_blocksize = CONFIG_BLK_DEV_RAM_BLOCKSIZE;
-
-/*
- * Copyright (C) 2000 Linus Torvalds.
- * 2000 Transmeta Corp.
- * aops copied from ramfs.
- */
-
-/*
- * If a ramdisk page has buffers, some may be uptodate and some may be not.
- * To bring the page uptodate we zero out the non-uptodate buffers. The
- * page must be locked.
- */
-static void make_page_uptodate(struct page *page)
-{
- if (page_has_buffers(page)) {
- struct buffer_head *bh = page_buffers(page);
- struct buffer_head *head = bh;
-
- do {
- if (!buffer_uptodate(bh)) {
- memset(bh->b_data, 0, bh->b_size);
- /*
- * akpm: I'm totally undecided about this. The
- * buffer has just been magically brought "up to
- * date", but nobody should want to be reading
- * it anyway, because it hasn't been used for
- * anything yet. It is still in a "not read
- * from disk yet" state.
- *
- * But non-uptodate buffers against an uptodate
- * page are against the rules. So do it anyway.
- */
- set_buffer_uptodate(bh);
- }
- } while ((bh = bh->b_this_page) != head);
- } else {
- memset(page_address(page), 0, PAGE_CACHE_SIZE);
- }
- flush_dcache_page(page);
- SetPageUptodate(page);
-}
-
-static int ramdisk_readpage(struct file *file, struct page *page)
-{
- if (!PageUptodate(page))
- make_page_uptodate(page);
- unlock_page(page);
- return 0;
-}
-
-static int ramdisk_prepare_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
-{
- if (!PageUptodate(page))
- make_page_uptodate(page);
- return 0;
-}
-
-static int ramdisk_commit_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
-{
- set_page_dirty(page);
- return 0;
-}
-
-/*
- * ->writepage to the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM
- * won't try to (pointlessly) write the page again for a while.
- *
- * Really, these pages should not be on the LRU at all.
- */
-static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
-{
- if (!PageUptodate(page))
- make_page_uptodate(page);
- SetPageDirty(page);
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE;
- unlock_page(page);
- return 0;
-}
-
-/*
- * This is a little speedup thing: short-circuit attempts to write back the
- * ramdisk blockdev inode to its non-existent backing store.
- */
-static int ramdisk_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return 0;
-}
-
-/*
- * ramdisk blockdev pages have their own ->set_page_dirty() because we don't
- * want them to contribute to dirty memory accounting.
- */
-static int ramdisk_set_page_dirty(struct page *page)
-{
- if (!TestSetPageDirty(page))
- return 1;
- return 0;
-}
-
-static const struct address_space_operations ramdisk_aops = {
- .readpage = ramdisk_readpage,
- .prepare_write = ramdisk_prepare_write,
- .commit_write = ramdisk_commit_write,
- .writepage = ramdisk_writepage,
- .set_page_dirty = ramdisk_set_page_dirty,
- .writepages = ramdisk_writepages,
-};
-
-static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
- struct address_space *mapping)
-{
- pgoff_t index = sector >> (PAGE_CACHE_SHIFT - 9);
- unsigned int vec_offset = vec->bv_offset;
- int offset = (sector << 9) & ~PAGE_CACHE_MASK;
- int size = vec->bv_len;
- int err = 0;
-
- do {
- int count;
- struct page *page;
- char *src;
- char *dst;
-
- count = PAGE_CACHE_SIZE - offset;
- if (count > size)
- count = size;
- size -= count;
-
- page = grab_cache_page(mapping, index);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
-
- if (!PageUptodate(page))
- make_page_uptodate(page);
-
- index++;
-
- if (rw == READ) {
- src = kmap_atomic(page, KM_USER0) + offset;
- dst = kmap_atomic(vec->bv_page, KM_USER1) + vec_offset;
- } else {
- src = kmap_atomic(vec->bv_page, KM_USER0) + vec_offset;
- dst = kmap_atomic(page, KM_USER1) + offset;
- }
- offset = 0;
- vec_offset += count;
-
- memcpy(dst, src, count);
-
- kunmap_atomic(src, KM_USER0);
- kunmap_atomic(dst, KM_USER1);
-
- if (rw == READ)
- flush_dcache_page(vec->bv_page);
- else
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- } while (size);
-
- out:
- return err;
-}
-
-/*
- * Basically, my strategy here is to set up a buffer-head which can't be
- * deleted, and make that my Ramdisk. If the request is outside of the
- * allocated size, we must get rid of it...
- *
- * 19-JAN-1998 Richard Gooch <rgooch@atnf.csiro.au> Added devfs support
- *
- */
-static int rd_make_request(struct request_queue *q, struct bio *bio)
-{
- struct block_device *bdev = bio->bi_bdev;
- struct address_space * mapping = bdev->bd_inode->i_mapping;
- sector_t sector = bio->bi_sector;
- unsigned long len = bio->bi_size >> 9;
- int rw = bio_data_dir(bio);
- struct bio_vec *bvec;
- int ret = 0, i;
-
- if (sector + len > get_capacity(bdev->bd_disk))
- goto fail;
-
- if (rw==READA)
- rw=READ;
-
- bio_for_each_segment(bvec, bio, i) {
- ret |= rd_blkdev_pagecache_IO(rw, bvec, sector, mapping);
- sector += bvec->bv_len >> 9;
- }
- if (ret)
- goto fail;
-
- bio_endio(bio, 0);
- return 0;
-fail:
- bio_io_error(bio);
- return 0;
-}
-
-static int rd_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- int error;
- struct block_device *bdev = inode->i_bdev;
-
- if (cmd != BLKFLSBUF)
- return -ENOTTY;
-
- /*
- * special: we want to release the ramdisk memory, it's not like with
- * the other blockdevices where this ioctl only flushes away the buffer
- * cache
- */
- error = -EBUSY;
- mutex_lock(&bdev->bd_mutex);
- if (bdev->bd_openers <= 2) {
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
- error = 0;
- }
- mutex_unlock(&bdev->bd_mutex);
- return error;
-}
-
-/*
- * This is the backing_dev_info for the blockdev inode itself. It doesn't need
- * writeback and it does not contribute to dirty memory accounting.
- */
-static struct backing_dev_info rd_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
- .unplug_io_fn = default_unplug_io_fn,
-};
-
-/*
- * This is the backing_dev_info for the files which live atop the ramdisk
- * "device". These files do need writeback and they do contribute to dirty
- * memory accounting.
- */
-static struct backing_dev_info rd_file_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_MAP_COPY, /* Does contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
-};
-
-static int rd_open(struct inode *inode, struct file *filp)
-{
- unsigned unit = iminor(inode);
-
- if (rd_bdev[unit] == NULL) {
- struct block_device *bdev = inode->i_bdev;
- struct address_space *mapping;
- unsigned bsize;
- gfp_t gfp_mask;
-
- inode = igrab(bdev->bd_inode);
- rd_bdev[unit] = bdev;
- bdev->bd_openers++;
- bsize = bdev_hardsect_size(bdev);
- bdev->bd_block_size = bsize;
- inode->i_blkbits = blksize_bits(bsize);
- inode->i_size = get_capacity(bdev->bd_disk)<<9;
-
- mapping = inode->i_mapping;
- mapping->a_ops = &ramdisk_aops;
- mapping->backing_dev_info = &rd_backing_dev_info;
- bdev->bd_inode_backing_dev_info = &rd_file_backing_dev_info;
-
- /*
- * Deep badness. rd_blkdev_pagecache_IO() needs to allocate
- * pagecache pages within a request_fn. We cannot recur back
- * into the filesystem which is mounted atop the ramdisk, because
- * that would deadlock on fs locks. And we really don't want
- * to reenter rd_blkdev_pagecache_IO when we're already within
- * that function.
- *
- * So we turn off __GFP_FS and __GFP_IO.
- *
- * And to give this thing a hope of working, turn on __GFP_HIGH.
- * Hopefully, there's enough regular memory allocation going on
- * for the page allocator emergency pools to keep the ramdisk
- * driver happy.
- */
- gfp_mask = mapping_gfp_mask(mapping);
- gfp_mask &= ~(__GFP_FS|__GFP_IO);
- gfp_mask |= __GFP_HIGH;
- mapping_set_gfp_mask(mapping, gfp_mask);
- }
-
- return 0;
-}
-
-static struct block_device_operations rd_bd_op = {
- .owner = THIS_MODULE,
- .open = rd_open,
- .ioctl = rd_ioctl,
-};
-
-/*
- * Before freeing the module, invalidate all of the protected buffers!
- */
-static void __exit rd_cleanup(void)
-{
- int i;
-
- for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
- struct block_device *bdev = rd_bdev[i];
- rd_bdev[i] = NULL;
- if (bdev) {
- invalidate_bdev(bdev);
- blkdev_put(bdev);
- }
- del_gendisk(rd_disks[i]);
- put_disk(rd_disks[i]);
- blk_cleanup_queue(rd_queue[i]);
- }
- unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
-
- bdi_destroy(&rd_file_backing_dev_info);
- bdi_destroy(&rd_backing_dev_info);
-}
-
-/*
- * This is the registration and initialization section of the RAM disk driver
- */
-static int __init rd_init(void)
-{
- int i;
- int err;
-
- err = bdi_init(&rd_backing_dev_info);
- if (err)
- goto out2;
-
- err = bdi_init(&rd_file_backing_dev_info);
- if (err) {
- bdi_destroy(&rd_backing_dev_info);
- goto out2;
- }
-
- err = -ENOMEM;
-
- if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
- (rd_blocksize & (rd_blocksize-1))) {
- printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
- rd_blocksize);
- rd_blocksize = BLOCK_SIZE;
- }
-
- for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
- rd_disks[i] = alloc_disk(1);
- if (!rd_disks[i])
- goto out;
-
- rd_queue[i] = blk_alloc_queue(GFP_KERNEL);
- if (!rd_queue[i]) {
- put_disk(rd_disks[i]);
- goto out;
- }
- }
-
- if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) {
- err = -EIO;
- goto out;
- }
-
- for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
- struct gendisk *disk = rd_disks[i];
-
- blk_queue_make_request(rd_queue[i], &rd_make_request);
- blk_queue_hardsect_size(rd_queue[i], rd_blocksize);
-
- /* rd_size is given in kB */
- disk->major = RAMDISK_MAJOR;
- disk->first_minor = i;
- disk->fops = &rd_bd_op;
- disk->queue = rd_queue[i];
- disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
- sprintf(disk->disk_name, "ram%d", i);
- set_capacity(disk, rd_size * 2);
- add_disk(rd_disks[i]);
- }
-
- /* rd_size is given in kB */
- printk("RAMDISK driver initialized: "
- "%d RAM disks of %dK size %d blocksize\n",
- CONFIG_BLK_DEV_RAM_COUNT, rd_size, rd_blocksize);
-
- return 0;
-out:
- while (i--) {
- put_disk(rd_disks[i]);
- blk_cleanup_queue(rd_queue[i]);
- }
- bdi_destroy(&rd_backing_dev_info);
- bdi_destroy(&rd_file_backing_dev_info);
-out2:
- return err;
-}
-
-module_init(rd_init);
-module_exit(rd_cleanup);
-
-/* options - nonmodular */
-#ifndef MODULE
-static int __init ramdisk_size(char *str)
-{
- rd_size = simple_strtol(str,NULL,0);
- return 1;
-}
-static int __init ramdisk_blocksize(char *str)
-{
- rd_blocksize = simple_strtol(str,NULL,0);
- return 1;
-}
-__setup("ramdisk_size=", ramdisk_size);
-__setup("ramdisk_blocksize=", ramdisk_blocksize);
-#endif
-
-/* options - modular */
-module_param(rd_size, int, 0);
-MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
-module_param(rd_blocksize, int, 0);
-MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
-MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
-
-MODULE_LICENSE("GPL");
Index: linux-2.6/MAINTAINERS
===================================================================
--- linux-2.6.orig/MAINTAINERS
+++ linux-2.6/MAINTAINERS
@@ -3158,6 +3158,11 @@ W: http://rt2x00.serialmonkey.com/
S: Maintained
F: drivers/net/wireless/rt2x00/
+RAMDISK RAM BLOCK DEVICE DRIVER
+P: Nick Piggin
+M: npiggin@suse.de
+S: Maintained
+
RANDOM NUMBER DRIVER
P: Matt Mackall
M: mpm@selenic.com
^ permalink raw reply [flat|nested] 18+ messages in thread

* [rfc][patch 5/5] remove prepare_write
2007-11-12 7:12 [rfc][patches] remove ->prepare_write Nick Piggin
` (3 preceding siblings ...)
2007-11-12 7:20 ` [rfc][patch 4/5] rd: rewrite rd Nick Piggin
@ 2007-11-12 7:23 ` Nick Piggin
2007-11-12 15:29 ` [rfc][patch 3/5] afs: new aops David Howells
5 siblings, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-12 7:23 UTC (permalink / raw)
To: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench,
dhowells
Index: linux-2.6/drivers/block/loop.c
===================================================================
--- linux-2.6.orig/drivers/block/loop.c
+++ linux-2.6/drivers/block/loop.c
@@ -40,8 +40,7 @@
* Heinz Mauelshagen <mge@sistina.com>, Feb 2002
*
* Support for falling back on the write file operation when the address space
- * operations prepare_write and/or commit_write are not available on the
- * backing filesystem.
+ * operations write_begin is not available on the backing filesystem.
* Anton Altaparmakov, 16 Feb 2005
*
* Still To Fix:
@@ -761,7 +760,7 @@ static int loop_set_fd(struct loop_devic
*/
if (!file->f_op->splice_read)
goto out_putf;
- if (aops->prepare_write || aops->write_begin)
+ if (aops->write_begin)
lo_flags |= LO_FLAGS_USE_AOPS;
if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
lo_flags |= LO_FLAGS_READ_ONLY;
Index: linux-2.6/fs/fat/inode.c
===================================================================
--- linux-2.6.orig/fs/fat/inode.c
+++ linux-2.6/fs/fat/inode.c
@@ -175,7 +175,7 @@ static ssize_t fat_direct_IO(int rw, str
if (rw == WRITE) {
/*
- * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+ * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
* so we need to update the ->mmu_private to block boundary.
*
* But we must fill the remaining area or hole by nul for
Index: linux-2.6/fs/ocfs2/file.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/file.c
+++ linux-2.6/fs/ocfs2/file.c
@@ -790,8 +790,7 @@ leave:
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
- * ->commit_write(). */
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode,
u64 size)
{
Index: linux-2.6/fs/splice.c
===================================================================
--- linux-2.6.orig/fs/splice.c
+++ linux-2.6/fs/splice.c
@@ -734,8 +734,8 @@ ssize_t splice_from_pipe(struct pipe_ino
};
/*
- * The actor worker might be calling ->prepare_write and
- * ->commit_write. Most of the time, these expect i_mutex to
+ * The actor worker might be calling ->write_begin and
+ * ->write_end. Most of the time, these expect i_mutex to
* be held. Since this may result in an ABBA deadlock with
* pipe->inode, we have to order lock acquiry here.
*/
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -453,13 +453,6 @@ struct address_space_operations {
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
- /*
- * ext3 requires that a successful prepare_write() call be followed
- * by a commit_write() call - they must be balanced
- */
- int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
- int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
-
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -1863,48 +1863,8 @@ int pagecache_write_begin(struct file *f
{
const struct address_space_operations *aops = mapping->a_ops;
- if (aops->write_begin) {
- return aops->write_begin(file, mapping, pos, len, flags,
+ return aops->write_begin(file, mapping, pos, len, flags,
pagep, fsdata);
- } else {
- int ret;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
- struct inode *inode = mapping->host;
- struct page *page;
-again:
- page = __grab_cache_page(mapping, index);
- *pagep = page;
- if (!page)
- return -ENOMEM;
-
- if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
- /*
- * There is no way to resolve a short write situation
- * for a !Uptodate page (except by double copying in
- * the caller done by generic_perform_write_2copy).
- *
- * Instead, we have to bring it uptodate here.
- */
- ret = aops->readpage(file, page);
- page_cache_release(page);
- if (ret) {
- if (ret == AOP_TRUNCATED_PAGE)
- goto again;
- return ret;
- }
- goto again;
- }
-
- ret = aops->prepare_write(file, page, offset, offset+len);
- if (ret) {
- unlock_page(page);
- page_cache_release(page);
- if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
- }
- return ret;
- }
}
EXPORT_SYMBOL(pagecache_write_begin);
@@ -1915,28 +1875,8 @@ int pagecache_write_end(struct file *fil
const struct address_space_operations *aops = mapping->a_ops;
int ret;
- if (aops->write_end) {
- mark_page_accessed(page);
- ret = aops->write_end(file, mapping, pos, len, copied,
- page, fsdata);
- } else {
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
- struct inode *inode = mapping->host;
-
- flush_dcache_page(page);
- ret = aops->commit_write(file, page, offset, offset+len);
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
-
- if (ret < 0) {
- if (pos + len > inode->i_size)
- vmtruncate(inode, inode->i_size);
- } else if (ret > 0)
- ret = min_t(size_t, copied, ret);
- else
- ret = copied;
- }
+ mark_page_accessed(page);
+ ret = aops->write_end(file, mapping, pos, len, copied, page, fsdata);
return ret;
}
@@ -2008,174 +1948,6 @@ repeat:
}
EXPORT_SYMBOL(__grab_cache_page);
-static ssize_t generic_perform_write_2copy(struct file *file,
- struct iov_iter *i, loff_t pos)
-{
- struct address_space *mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- ssize_t written = 0;
-
- do {
- struct page *src_page;
- struct page *page;
- pgoff_t index; /* Pagecache index for current page */
- unsigned long offset; /* Offset into pagecache page */
- unsigned long bytes; /* Bytes to write to page */
- size_t copied; /* Bytes copied from user */
-
- offset = (pos & (PAGE_CACHE_SIZE - 1));
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
- iov_iter_count(i));
-
- /*
- * a non-NULL src_page indicates that we're doing the
- * copy via get_user_pages and kmap.
- */
- src_page = NULL;
-
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
- status = -EFAULT;
- break;
- }
-
- page = __grab_cache_page(mapping, index);
- if (!page) {
- status = -ENOMEM;
- break;
- }
-
- /*
- * non-uptodate pages cannot cope with short copies, and we
- * cannot take a pagefault with the destination page locked.
- * So pin the source page to copy it.
- */
- if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
- unlock_page(page);
-
- src_page = alloc_page(GFP_KERNEL);
- if (!src_page) {
- page_cache_release(page);
- status = -ENOMEM;
- break;
- }
-
- /*
- * Cannot get_user_pages with a page locked for the
- * same reason as we can't take a page fault with a
- * page locked (as explained below).
- */
- copied = iov_iter_copy_from_user(src_page, i,
- offset, bytes);
- if (unlikely(copied == 0)) {
- status = -EFAULT;
- page_cache_release(page);
- page_cache_release(src_page);
- break;
- }
- bytes = copied;
-
- lock_page(page);
- /*
- * Can't handle the page going uptodate here, because
- * that means we would use non-atomic usercopies, which
- * zero out the tail of the page, which can cause
- * zeroes to become transiently visible. We could just
- * use a non-zeroing copy, but the APIs aren't too
- * consistent.
- */
- if (unlikely(!page->mapping || PageUptodate(page))) {
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(src_page);
- continue;
- }
- }
-
- status = a_ops->prepare_write(file, page, offset, offset+bytes);
- if (unlikely(status))
- goto fs_write_aop_error;
-
- if (!src_page) {
- /*
- * Must not enter the pagefault handler here, because
- * we hold the page lock, so we might recursively
- * deadlock on the same lock, or get an ABBA deadlock
- * against a different lock, or against the mmap_sem
- * (which nests outside the page lock). So increment
- * preempt count, and use _atomic usercopies.
- *
- * The page is uptodate so we are OK to encounter a
- * short copy: if unmodified parts of the page are
- * marked dirty and written out to disk, it doesn't
- * really matter.
- */
- pagefault_disable();
- copied = iov_iter_copy_from_user_atomic(page, i,
- offset, bytes);
- pagefault_enable();
- } else {
- void *src, *dst;
- src = kmap_atomic(src_page, KM_USER0);
- dst = kmap_atomic(page, KM_USER1);
- memcpy(dst + offset, src + offset, bytes);
- kunmap_atomic(dst, KM_USER1);
- kunmap_atomic(src, KM_USER0);
- copied = bytes;
- }
- flush_dcache_page(page);
-
- status = a_ops->commit_write(file, page, offset, offset+bytes);
- if (unlikely(status < 0))
- goto fs_write_aop_error;
- if (unlikely(status > 0)) /* filesystem did partial write */
- copied = min_t(size_t, copied, status);
-
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
- if (src_page)
- page_cache_release(src_page);
-
- iov_iter_advance(i, copied);
- pos += copied;
- written += copied;
-
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- continue;
-
-fs_write_aop_error:
- unlock_page(page);
- page_cache_release(page);
- if (src_page)
- page_cache_release(src_page);
-
- /*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
- */
- if (pos + bytes > inode->i_size)
- vmtruncate(inode, inode->i_size);
- break;
- } while (iov_iter_count(i));
-
- return written ? written : status;
-}
-
static ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
@@ -2276,10 +2048,7 @@ generic_file_buffered_write(struct kiocb
struct iov_iter i;
iov_iter_init(&i, iov, nr_segs, count, written);
- if (a_ops->write_begin)
- status = generic_perform_write(file, &i, pos);
- else
- status = generic_perform_write_2copy(file, &i, pos);
+ status = generic_perform_write(file, &i, pos);
if (likely(status >= 0)) {
written += status;
Index: linux-2.6/Documentation/filesystems/Locking
===================================================================
--- linux-2.6.orig/Documentation/filesystems/Locking
+++ linux-2.6/Documentation/filesystems/Locking
@@ -166,8 +166,12 @@ prototypes:
int (*set_page_dirty)(struct page *page);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
- int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
- int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
@@ -185,8 +189,6 @@ sync_page: no maybe
writepages: no
set_page_dirty no no
readpages: no
-prepare_write: no yes yes
-commit_write: no yes yes
write_begin: no locks the page yes
write_end: no yes, unlocks yes
perform_write: no n/a yes
@@ -196,7 +198,7 @@ releasepage: no yes
direct_IO: no
launder_page: no yes
- ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
+ ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
Index: linux-2.6/Documentation/filesystems/vfs.txt
===================================================================
--- linux-2.6.orig/Documentation/filesystems/vfs.txt
+++ linux-2.6/Documentation/filesystems/vfs.txt
@@ -506,7 +506,7 @@ written-back to storage typically in who
address_space has finer control of write sizes.
The read process essentially only requires 'readpage'. The write
-process is more complicated and uses prepare_write/commit_write or
+process is more complicated and uses write_begin/write_end or
set_page_dirty to write data into the address_space, and writepage,
sync_page, and writepages to writeback data to storage.
@@ -535,8 +535,6 @@ struct address_space_operations {
int (*set_page_dirty)(struct page *page);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
- int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
- int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
@@ -612,37 +610,7 @@ struct address_space_operations {
readpages is only used for read-ahead, so read errors are
ignored. If anything goes wrong, feel free to give up.
- prepare_write: called by the generic write path in VM to set up a write
- request for a page. This indicates to the address space that
- the given range of bytes is about to be written. The
- address_space should check that the write will be able to
- complete, by allocating space if necessary and doing any other
- internal housekeeping. If the write will update parts of
- any basic-blocks on storage, then those blocks should be
- pre-read (if they haven't been read already) so that the
- updated blocks can be written out properly.
- The page will be locked.
-
- Note: the page _must not_ be marked uptodate in this function
- (or anywhere else) unless it actually is uptodate right now. As
- soon as a page is marked uptodate, it is possible for a concurrent
- read(2) to copy it to userspace.
-
- commit_write: If prepare_write succeeds, new data will be copied
- into the page and then commit_write will be called. It will
- typically update the size of the file (if appropriate) and
- mark the inode as dirty, and do any other related housekeeping
- operations. It should avoid returning an error if possible -
- errors should have been handled by prepare_write.
-
- write_begin: This is intended as a replacement for prepare_write. The
- key differences being that:
- - it returns a locked page (in *pagep) rather than being
- given a pre locked page;
- - it must be able to cope with short writes (where the
- length passed to write_begin is greater than the number
- of bytes copied into the page).
-
+ write_begin:
Called by the generic buffered write code to ask the filesystem to
prepare to write len bytes at the given offset in the file. The
address_space should check that the write will be able to complete,
@@ -654,6 +622,9 @@ struct address_space_operations {
The filesystem must return the locked pagecache page for the specified
offset, in *pagep, for the caller to write into.
+ It must be able to cope with short writes (where the length passed to
+ write_begin is greater than the number of bytes copied into the page).
+
flags is a field for AOP_FLAG_xxx flags, described in
include/linux/fs.h.
^ permalink raw reply [flat|nested] 18+ messages in thread

* Re: [rfc][patch 3/5] afs: new aops
2007-11-12 7:12 [rfc][patches] remove ->prepare_write Nick Piggin
` (4 preceding siblings ...)
2007-11-12 7:23 ` [rfc][patch 5/5] remove prepare_write Nick Piggin
@ 2007-11-12 15:29 ` David Howells
2007-11-13 0:15 ` Nick Piggin
2007-11-13 0:30 ` David Howells
5 siblings, 2 replies; 18+ messages in thread
From: David Howells @ 2007-11-12 15:29 UTC (permalink / raw)
To: Nick Piggin
Cc: dhowells, Andrew Morton, linux-fsdevel, mhalcrow, phillip,
sfrench
Nick Piggin <npiggin@suse.de> wrote:
> - ASSERTCMP(start + len, <=, PAGE_SIZE);
> + ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
Do you guarantee this will work if PAGE_CACHE_SIZE != PAGE_SIZE? If not, you
can't make this particular change.
Do we ever intend to have PAGE_CACHE_SIZE != PAGE_SIZE? If not, then surely
the former is redundant and should be scrapped to avoid confusion?
> + i_size = i_size_read(&vnode->vfs_inode);
> + if (pos + len > i_size)
> + eof = i_size;
> + else
> + eof = PAGE_CACHE_SIZE;
> +
> + ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
That can't be right, surely. Either 'eof' is the size of the file or it's the
length of the data to be read. It can't be both. The first case needs eof
masking off. Also, 'eof' isn't a good choice of name. 'len' would be better
were it not already taken:-/
I notice you removed the stuff that clears holes in the page to be written. Is
this now done by the caller?
I notice also that you use afs_fill_page() in place of afs_prepare_page() to
prepare a page. You can't do this if the region to be filled currently lies
beyond the server's idea of EOF.
I'll try and get a look at fixing this patch tomorrow.
David
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [rfc][patch 3/5] afs: new aops
2007-11-12 15:29 ` [rfc][patch 3/5] afs: new aops David Howells
@ 2007-11-13 0:15 ` Nick Piggin
2007-11-13 0:30 ` David Howells
1 sibling, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-13 0:15 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench
On Mon, Nov 12, 2007 at 03:29:14PM +0000, David Howells wrote:
> Nick Piggin <npiggin@suse.de> wrote:
>
> > - ASSERTCMP(start + len, <=, PAGE_SIZE);
> > + ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
>
> Do you guarantee this will work if PAGE_CACHE_SIZE != PAGE_SIZE? If not, you
> can't make this particular change.
PAGE_CACHE_SIZE should be used to address the pagecache.
> Do we ever intend to have PAGE_CACHE_SIZE != PAGE_SIZE? If not, then surely
> the former is redundant and should be scrapped to avoid confusion?
Maybe.
> > + i_size = i_size_read(&vnode->vfs_inode);
> > + if (pos + len > i_size)
> > + eof = i_size;
> > + else
> > + eof = PAGE_CACHE_SIZE;
> > +
> > + ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
>
> That can't be right, surely. Either 'eof' is the size of the file or it's the
> length of the data to be read. It can't be both. The first case needs eof
> masking off. Also, 'eof' isn't a good choice of name. 'len' would be better
> were it not already taken:-/
Yeah, just missed the mask.
> I notice you removed the stuff that clears holes in the page to be written. Is
> this now done by the caller?
It is supposed to bring the page uptodate first. So, no need to clear
AFAIKS?
> I notice also that you use afs_fill_page() in place of afs_prepare_page() to
> prepare a page. You can't do this if the region to be filled currently lies
> beyond the server's idea of EOF.
>
> I'll try and get a look at fixing this patch tomorrow.
No rush, it won't get into 2.6.24 obviously. But that would be nice, thanks.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-12 15:29 ` [rfc][patch 3/5] afs: new aops David Howells
2007-11-13 0:15 ` Nick Piggin
@ 2007-11-13 0:30 ` David Howells
2007-11-13 0:44 ` Nick Piggin
2007-11-13 10:56 ` David Howells
1 sibling, 2 replies; 18+ messages in thread
From: David Howells @ 2007-11-13 0:30 UTC (permalink / raw)
To: Nick Piggin
Cc: dhowells, Andrew Morton, linux-fsdevel, mhalcrow, phillip,
sfrench
Nick Piggin <npiggin@suse.de> wrote:
> PAGE_CACHE_SIZE should be used to address the pagecache.
Perhaps, but the function being called from there takes pages not page cache
slots. If I have to allow for PAGE_CACHE_SIZE > PAGE_SIZE then I need to
modify my code, if not then the assertion needs to remain what it is.
> > I notice you removed the stuff that clears holes in the page to be
> > written. Is this now done by the caller?
>
> It is supposed to bring the page uptodate first. So, no need to clear
> AFAIKS?
Hmmm... I suppose. However, it is wasteful in the common case as it is then
bringing the page up to date by filling/clearing the whole of it and not just
the bits that are not going to be written.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-13 0:30 ` David Howells
@ 2007-11-13 0:44 ` Nick Piggin
2007-11-13 10:56 ` David Howells
1 sibling, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-13 0:44 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench
On Tue, Nov 13, 2007 at 12:30:05AM +0000, David Howells wrote:
> Nick Piggin <npiggin@suse.de> wrote:
>
> > PAGE_CACHE_SIZE should be used to address the pagecache.
>
> Perhaps, but the function being called from there takes pages not page cache
> slots. If I have to allow for PAGE_CACHE_SIZE > PAGE_SIZE then I need to
> modify my code, if not then the assertion needs to remain what it is.
It takes a pagecache page, yes. If you follow convention, you use
PAGE_CACHE_SIZE for that guy. You don't have to allow PAGE_CACHE_SIZE !=
PAGE_SIZE, and if all the rest of your code is in units of PAGE_SIZE, then
obviously my changing of just the one unit is even more confusing than
the current arrangement ;)
> > > I notice you removed the stuff that clears holes in the page to be
> > > written. Is this is now done by the caller?
> >
> > It is supposed to bring the page uptodate first. So, no need to clear
> > AFAIKS?
>
> Hmmm... I suppose. However, it is wasteful in the common case as it is then
> bringing the page up to date by filling/clearing the whole of it and not just
> the bits that are not going to be written.
Yes, that's where you come in. You are free (and encouraged) to optimise
this.
Let's see, for a network filesystem this is what you could do:
- if the page is not uptodate at write_begin time, do not bring it uptodate
(at least, not the region that is going to be written to)
- if the page is not uptodate at write_end time, but the copy was fully
completed, just mark it uptodate (provided you brought the regions outside
the copy uptodate).
- if the page is not uptodate and you have a short copy, simply do not
mark the page uptodate or dirty, and return 0 from write_end, indicating
that you have committed 0 bytes. The generic code should DTRT.
Or you could:
Pass back a temporary (not pagecache) page in *pagep, and copy that yourself
into the _real_ pagecache page at write_end time, so you know exactly how
big the copy will be (this is basically what the 2copy method does now...
it is probably not as good as the first method I described, but for a
high latency filesystem it may be preferable to always bringing the page
uptodate).
Or: keep track of sub-page dirty / uptodate status eg. with a light weight
buffer_head like structure, so you can actually have partially dirty pages
that are not completely uptodate.
Or... if you can think of another way...
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-13 0:30 ` David Howells
2007-11-13 0:44 ` Nick Piggin
@ 2007-11-13 10:56 ` David Howells
2007-11-14 4:24 ` Nick Piggin
2007-11-14 12:18 ` David Howells
1 sibling, 2 replies; 18+ messages in thread
From: David Howells @ 2007-11-13 10:56 UTC (permalink / raw)
To: Nick Piggin
Cc: dhowells, Andrew Morton, linux-fsdevel, mhalcrow, phillip,
sfrench
Nick Piggin <npiggin@suse.de> wrote:
> It takes a pagecache page, yes. If you follow convention, you use
> PAGE_CACHE_SIZE for that guy. You don't have to allow PAGE_CACHE_SIZE !=
> PAGE_SIZE, and if all the rest of your code is in units of PAGE_SIZE, then
> obviously my changing of just the one unit is even more confusing than
> the current arrangement ;)
The problem is that the code called assumes that the struct page * argument
points to a single page, not an array of pages as would presumably be the case
if PAGE_CACHE_SIZE > PAGE_SIZE. If I should allow for an array of pages then
the lower functions (specifically afs_deliver_fs_fetch_data()) need to change,
and until that time occurs, the assertion *must* remain as it is now. It
defends the lower functions against being asked to do something they weren't
designed to do.
So: you may not change the assertion unless you also fix the lower functions.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-13 10:56 ` David Howells
@ 2007-11-14 4:24 ` Nick Piggin
2007-11-14 12:18 ` David Howells
1 sibling, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-14 4:24 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench
On Tue, Nov 13, 2007 at 10:56:25AM +0000, David Howells wrote:
> Nick Piggin <npiggin@suse.de> wrote:
>
> > It takes a pagecache page, yes. If you follow convention, you use
> > PAGE_CACHE_SIZE for that guy. You don't have to allow PAGE_CACHE_SIZE !=
> > PAGE_SIZE, and if all the rest of your code is in units of PAGE_SIZE, then
> > obviously my changing of just the one unit is even more confusing than
> > the current arrangement ;)
>
> The problem is that the code called assumes that the struct page * argument
> points to a single page, not an array of pages as would presumably be the case
> if PAGE_CACHE_SIZE > PAGE_SIZE.
Incorrect. Christoph's patch for example does this by using compound pages.
Now I personally don't like the patch or see the point in PAGE_CACHE_SIZE /
PAGE_SIZE distinction, but I'm just telling you what the convention is. There
is no point you arguing against it, that's simply how it is.
> If I should allow for an array of pages then
> the lower functions (specifically afs_deliver_fs_fetch_data()) need to change,
> and until that time occurs, the assertion *must* remain as it is now. It
> defends the lower functions against being asked to do something they weren't
> designed to do.
>
> So: you may not change the assertion unless you also fix the lower functions.
I won't change the assertion, because you haven't been following said
convention, so just changing it in one place is stupider than not changing
it at all, but not for the reason cited.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-13 10:56 ` David Howells
2007-11-14 4:24 ` Nick Piggin
@ 2007-11-14 12:18 ` David Howells
2007-11-14 15:18 ` Nick Piggin
2007-11-14 15:57 ` David Howells
1 sibling, 2 replies; 18+ messages in thread
From: David Howells @ 2007-11-14 12:18 UTC (permalink / raw)
To: Nick Piggin
Cc: dhowells, Andrew Morton, linux-fsdevel, mhalcrow, phillip,
sfrench
Nick Piggin <npiggin@suse.de> wrote:
> > The problem is that the code called assumes that the struct page *
> > argument points to a single page, not an array of pages as would
> > presumably be the case if PAGE_CACHE_SIZE > PAGE_SIZE.
>
> Incorrect. Christoph's patch for example does this by using compound pages.
> Now I personally don't like the patch or see the point in PAGE_CACHE_SIZE /
> PAGE_SIZE distinction, but I'm just telling you what the convention is. There
> is no point you arguing against it, that's simply how it is.
No! You are wrong. I wrote the AFS code. I *know* it can only deal with
single pages. It has no knowledge of compound pages and does not handle page
arrays. This may be a flaw in my code, but it's there nonetheless. The
assertion is a guard against that. *That* was the point of my statement.
Perhaps I should've said 'my code' rather than 'the code'.
If Christoph has a patch to deal with that, it's either not upstream yet or it
hasn't altered AFS.
> > So: you may not change the assertion unless you also fix the lower
> > functions.
>
> I won't change the assertion, because you haven't been following said
> convention, so just changing it in one place is stupider than not changing
> it at all, but not for the reason cited.
The convention is not precisely clear. Just grep for PAGE_CACHE_SIZE in
Documentation/. It's only mentioned twice, and in neither case does it give
any information about what PAGE_CACHE_SIZE is, what it represents, or where it
applies. Therefore it's an ill-defined concept.
If you look in Documentation/filesystems/vfs.txt, you'll see that it almost
always talks about 'pages'. It only mentions 'pagecache pages' once - in the
description of write_begin(), but it's not clear whether that means anything.
However, I've now noted that I need to fix my code, so just keep the assertion
for now and I'll fix my code to handle multipage blocks.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-14 12:18 ` David Howells
@ 2007-11-14 15:18 ` Nick Piggin
2007-11-14 15:57 ` David Howells
1 sibling, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-14 15:18 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench
On Wed, Nov 14, 2007 at 12:18:43PM +0000, David Howells wrote:
> Nick Piggin <npiggin@suse.de> wrote:
>
> > > The problem is that the code called assumes that the struct page *
> > > argument points to a single page, not an array of pages as would
> > > presumably be the case if PAGE_CACHE_SIZE > PAGE_SIZE.
> >
> > Incorrect. Christoph's patch for example does this by using compound pages.
> > Now I personally don't like the patch or see the point in PAGE_CACHE_SIZE /
> > PAGE_SIZE distinction, but I'm just telling you what the convention is. There
> > is no point you arguing against it, that's simply how it is.
>
> No! You are wrong. I wrote the AFS code. I *know* it can only deal with
No I'm talking about core code. In core code, the PAGE_CACHE_SIZE is
for page cache struct pages. Single struct pages (not page arrays).
Take a look at generic mapping read or something.
There is nothing to deal with page arrays there either, but that's simply
the convention.
> > > So: you may not change the assertion unless you also fix the lower
> > > functions.
> >
> > I won't change the assertion, because you haven't been following said
> > convention, so just changing it in one place is stupider than not changing
> > it at all, but not for the reason cited.
>
> The convention is not precisely clear. Just grep for PAGE_CACHE_SIZE in
> Documentation/. It's only mentioned twice, and in neither case does it give
> any information about what PAGE_CACHE_SIZE is, what it represents, or where it
> applies. Therefore it's an ill-defined concept.
>
> If you look in Documentation/filesystems/vfs.txt, you'll see that it almost
> always talks about 'pages'. It only mentions 'pagecache pages' once - in the
> description of write_begin(), but it's not clear whether that means anything.
Documentation is the opposite of convention ;) Look in mm/.
> However, I've now noted that I need to fix my code, so just keep the assertion
> for now and I'll fix my code to handle multipage blocks.
I'm not saying you need to do that. Leave it at PAGE_SIZE, really it
doesn't matter that much at present. This has just blown out of proportion.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-14 12:18 ` David Howells
2007-11-14 15:18 ` Nick Piggin
@ 2007-11-14 15:57 ` David Howells
2007-11-14 21:32 ` Nick Piggin
2007-11-15 12:15 ` David Howells
1 sibling, 2 replies; 18+ messages in thread
From: David Howells @ 2007-11-14 15:57 UTC (permalink / raw)
To: Nick Piggin
Cc: dhowells, Andrew Morton, linux-fsdevel, mhalcrow, phillip,
sfrench
Nick Piggin <npiggin@suse.de> wrote:
> In core code, the PAGE_CACHE_SIZE is for page cache struct pages. Single
> struct pages (not page arrays). Take a look at generic mapping read or
> something.
So you're saying a struct page controls an area of PAGE_CACHE_SIZE, not an
area of PAGE_SIZE? Is a struct page then a purely pagecache concept?
> Documentation is the opposite of convention ;)
If it's not Documented, then it's irrelevant.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-14 15:57 ` David Howells
@ 2007-11-14 21:32 ` Nick Piggin
2007-11-15 12:15 ` David Howells
1 sibling, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-14 21:32 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench
On Wed, Nov 14, 2007 at 03:57:46PM +0000, David Howells wrote:
> Nick Piggin <npiggin@suse.de> wrote:
>
> > In core code, the PAGE_CACHE_SIZE is for page cache struct pages. Single
> > struct pages (not page arrays). Take a look at generic mapping read or
> > something.
>
> So you're saying a struct page controls an area of PAGE_CACHE_SIZE, not an
> area of PAGE_SIZE?
No, a pagecache page is PAGE_CACHE_SIZE. And not all struct pages control
the same amount of data anyway, with compound pages.
> Is a struct page then a purely pagecache concept?
>
> > Documentation is the opposite of convention ;)
>
> If it's not Documented, then it's irrelevant.
But you can't just decide yourself that it is irrelevant because you don't
grep hard enough ;)
include/linux/mm.h:
* A page may belong to an inode's memory mapping. In this case, page->mapping
* is the pointer to the inode, and page->index is the file offset of the page,
* in units of PAGE_CACHE_SIZE.
include/linux/mm_types.h
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units, *not* PAGE_CACHE_SIZE */
Looks like documentation to me.
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [rfc][patch 3/5] afs: new aops
2007-11-14 15:57 ` David Howells
2007-11-14 21:32 ` Nick Piggin
@ 2007-11-15 12:15 ` David Howells
2007-11-15 21:37 ` Nick Piggin
1 sibling, 1 reply; 18+ messages in thread
From: David Howells @ 2007-11-15 12:15 UTC (permalink / raw)
To: Nick Piggin
Cc: dhowells, Andrew Morton, linux-fsdevel, mhalcrow, phillip,
sfrench
Nick Piggin <npiggin@suse.de> wrote:
> > So you're saying a struct page controls an area of PAGE_CACHE_SIZE, not an
> > area of PAGE_SIZE?
>
> No, a pagecache page is PAGE_CACHE_SIZE.
That doesn't answer my question. I didn't ask about 'pagecache pages' per se.
Are you saying then that a page struct always represents an area of PAGE_SIZE
to, say, the page allocator and PAGE_CACHE_SIZE to a filesystem's address
operations?
How about I state it this way: Please define what the coverage of a
(non-compound) struct page is, and how this relates to PAGE_SIZE and
PAGE_CACHE_SIZE. If it's well-defined then this cannot be hard, right?
> And not all struct pages control the same amount of data anyway, with
> compound pages.
Compound pages are irrelevant to my question. A compound page is actually
regulated by a series of page structs, each of which represents a 'page' of
real memory.
Do you say, then, that all, say, readpage() and readpages() methods must
handle a compound page if that is given to them?
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [rfc][patch 3/5] afs: new aops
2007-11-15 12:15 ` David Howells
@ 2007-11-15 21:37 ` Nick Piggin
0 siblings, 0 replies; 18+ messages in thread
From: Nick Piggin @ 2007-11-15 21:37 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, linux-fsdevel, mhalcrow, phillip, sfrench
On Thu, Nov 15, 2007 at 12:15:41PM +0000, David Howells wrote:
> Nick Piggin <npiggin@suse.de> wrote:
>
> > > So you're saying a struct page controls an area of PAGE_CACHE_SIZE, not an
> > > area of PAGE_SIZE?
> >
> > No, a pagecache page is PAGE_CACHE_SIZE.
>
> That doesn't answer my question. I didn't ask about 'pagecache pages' per se.
>
> Are you saying then that a page struct always represents an area of PAGE_SIZE
> to, say, the page allocator and PAGE_CACHE_SIZE to a filesystem's address
> operations?
Yes.
> How about I state it this way: Please define what the coverage of a
> (non-compound) struct page is, and how this relates to PAGE_SIZE and
> PAGE_CACHE_SIZE. If it's well-defined then this cannot be hard, right?
No it's easy. It's PAGE_SIZE (which also happens to be PAGE_CACHE_SIZE).
An implementation that would (not trivially, but without changing the
basic concepts) allow a larger pagecache size is with compound pages. And
actually hugetlbfs already does this.
> > And not all struct pages control the same amount of data anyway, with
> > compound pages.
>
> Compound pages are irrelevant to my question. A compound page is actually a
> regulated by a series of page structs, each of which represents a 'page' of
> real memory.
And it can also represent a PAGE_CACHE_SIZE page of pagecache.
> Do you say, then, that all, say, readpage() and readpages() methods must
> handle a compound page if that is given to them?
I'm not talking about a specific implementation that allows
PAGE_CACHE_SIZE > PAGE_SIZE. So no, I don't say anything about that. I
say that pagecache pages are PAGE_CACHE_SIZE! Yes it is easy now because
that is the same as PAGE_SIZE. Yes it will be harder if you wanted to
change that. What you would not have to change is the assumption that
pagecache pages are in PAGE_SIZE units.
^ permalink raw reply [flat|nested] 18+ messages in thread