* [Ocfs2-devel] [PATCH] split inode.c
@ 2004-08-13 20:40 Christoph Hellwig
0 siblings, 0 replies; only message in thread
From: Christoph Hellwig @ 2004-08-13 20:40 UTC (permalink / raw)
To: ocfs2-devel
two new files:
- aops.c implementing ocfs_aops
- 24io.c implementing 2.4 specific direct I/O and AIO code
Index: src/Makefile
===================================================================
--- src/Makefile (revision 1355)
+++ src/Makefile (working copy)
@@ -54,7 +54,9 @@
endif
CFILES = \
+ 24io.c \
alloc.c \
+ aops.c \
bitmap.c \
buffer_head_io.c \
dcache.c \
Index: src/inode.c
===================================================================
--- src/inode.c (revision 1355)
+++ src/inode.c (working copy)
@@ -58,17 +58,8 @@
#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_INODE
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-# include <linux/iobuf.h>
-# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
-# define free_kiovec_sz(nr, buf, bh) free_kiovec(nr, buf)
-# define alloc_kiovec_sz(nr, buf, bh) alloc_kiovec(nr, buf)
-# endif
-#endif /* for 2.6 - no more kiovec, kiobuf structures - vfs handles
- * this for us (direct i/o) */
-
-
extern struct semaphore recovery_list_sem;
+extern struct address_space_operations ocfs_aops;
typedef struct _ocfs_find_inode_args
{
@@ -78,47 +69,14 @@
}
ocfs_find_inode_args;
-static int ocfs_readpage (struct file *file, struct page *page);
-static int ocfs_prepare_write (struct file *file, struct page *page, unsigned from, unsigned to);
-static int ocfs_commit_write (struct file *file, struct page *page, unsigned from, unsigned to);
-static int ocfs_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create);
-static int ocfs_symlink_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create);
-
static int ocfs_read_locked_inode(struct inode *inode, ocfs_find_inode_args *args);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static sector_t ocfs_bmap(struct address_space *mapping, sector_t block);
-static int ocfs_writepage (struct page *page, struct writeback_control *wbc);
-static ssize_t ocfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
static int ocfs_init_locked_inode(struct inode * inode, void * opaque);
static int ocfs_find_actor (struct inode *inode, void *opaque);
#else /* 2.4 kernel */
static int ocfs_find_inode (struct inode *inode, unsigned long ino, void *opaque);
-static int ocfs_bmap(struct address_space *mapping, long block);
-static int ocfs_writepage (struct page *page);
-static int ocfs_get_block2 (struct inode *inode, long iblock, long *oblock, int len);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) || defined(SUSE)
-static int ocfs_direct_IO (int rw, struct file *filp, struct kiobuf *iobuf, unsigned long blocknr, int blocksize);
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
-static int ocfs_direct_IO (int rw, struct inode *inode, struct kiobuf *iobuf, unsigned long blocknr, int blocksize);
#endif
-#endif
-static struct address_space_operations ocfs_aops = {
- .readpage = ocfs_readpage,
- .writepage = ocfs_writepage,
- .prepare_write = ocfs_prepare_write,
- .bmap = ocfs_bmap,
- .commit_write = ocfs_commit_write,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
-/*
- * On a 2.4 system, we are only adding this here as a dummy basically,
- * just need open with O_DIRECT to succeed, we still call ocfs_rw_direct().
- * For a 2.6 system, this is the way a filesystem provides direct-io support.
- */
- .direct_IO = ocfs_direct_IO
-#endif
-};
-
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
/*
* ocfs_ilookup()
@@ -813,281 +771,7 @@
return;
} /* ocfs_clear_inode */
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-inline void __mark_dirty(struct buffer_head *bh)
-{
- set_buffer_flushtime(bh);
- refile_buffer(bh);
-}
-
-static int __block_commit_write(struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- unsigned block_start, block_end;
- int partial = 0, need_balance_dirty = 0;
- unsigned blocksize;
- struct buffer_head *bh, *head;
-
- blocksize = 1 << inode->i_blkbits;
-
- for(bh = head = page->buffers, block_start = 0;
- bh != head || !block_start;
- block_start=block_end, bh = bh->b_this_page) {
- block_end = block_start + blocksize;
- if (block_end <= from || block_start >= to) {
- if (!buffer_uptodate(bh))
- partial = 1;
- } else {
- set_bit(BH_Uptodate, &bh->b_state);
- if (!atomic_set_buffer_dirty(bh)) {
- __mark_dirty(bh);
- buffer_insert_inode_data_queue(bh, inode);
- need_balance_dirty = 1;
- }
- }
- }
-
- if (need_balance_dirty)
- balance_dirty();
- /*
- * is this a partial write that happened to make all buffers
- * uptodate then we can optimize away a bogus readpage() for
- * the next read(). Here we 'discover' wether the page went
- * uptodate as a result of this (potentially partial) write.
- */
- if (!partial)
- SetPageUptodate(page);
- return 0;
-}
-
-static int ocfs2_cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct page *new_page;
- unsigned long pgpos;
- long status;
- unsigned zerofrom;
- unsigned blocksize = 1 << inode->i_blkbits;
- char *kaddr;
-
- while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
- status = -ENOMEM;
- new_page = grab_cache_page(mapping, pgpos);
- if (!new_page)
- goto out;
- /* we might sleep */
- if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
- unlock_page(new_page);
- page_cache_release(new_page);
- continue;
- }
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
- if (zerofrom & (blocksize-1)) {
- *bytes |= (blocksize-1);
- (*bytes)++;
- }
- status = block_prepare_write(new_page, zerofrom,
- PAGE_CACHE_SIZE, get_block);
- if (status)
- goto out_unmap;
- kaddr = page_address(new_page);
- memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
- flush_dcache_page(new_page);
- __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
- kunmap(new_page);
- unlock_page(new_page);
- page_cache_release(new_page);
- }
-
- if (page->index < pgpos) {
- /* completely inside the area */
- zerofrom = offset;
- } else {
- /* page covers the boundary, find the boundary offset */
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
-
- /* if we will expand the thing last block will be filled */
- if (to > zerofrom && (zerofrom & (blocksize-1))) {
- *bytes |= (blocksize-1);
- (*bytes)++;
- }
-
- /* starting below the boundary? Nothing to zero out */
- if (offset <= zerofrom)
- zerofrom = offset;
- }
- status = block_prepare_write(page, zerofrom, to, get_block);
- if (status)
- goto out1;
- kaddr = page_address(page);
- if (zerofrom < offset) {
- memset(kaddr+zerofrom, 0, offset-zerofrom);
- flush_dcache_page(page);
- __block_commit_write(inode, page, zerofrom, offset);
- }
- return 0;
-out1:
- ClearPageUptodate(page);
- kunmap(page);
- return status;
-
-out_unmap:
- ClearPageUptodate(new_page);
- kunmap(new_page);
- UnlockPage(new_page);
- page_cache_release(new_page);
-out:
- return status;
-}
-
-/* Mark's favorite hack */
-#undef cont_prepare_write
-#define cont_prepare_write ocfs2_cont_prepare_write
-#endif /* < 2.6.0 */
-
/*
- * ocfs_prepare_write()
- *
- */
-static int ocfs_prepare_write (struct file *file, struct page *page, unsigned from, unsigned to)
-{
- int ret;
- struct inode *inode = page->mapping->host;
-
- LOG_SET_CONTEXT(PREPARE_WRITE);
-
- LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
- if (!inode)
- BUG();
-
- ret = cont_prepare_write(page, from, to, ocfs_get_block,
- &(OCFS_I(page->mapping->host)->ip_mmu_private));
-
- LOG_EXIT_INT (ret);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-} /* ocfs_prepare_write */
-
-/*
- * ocfs_commit_write()
- *
- */
-static int ocfs_commit_write (struct file *file, struct page *page, unsigned from, unsigned to)
-{
- int ret;
-
- LOG_SET_CONTEXT(COMMIT_WRITE);
-
- LOG_ENTRY_ARGS ("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
- ret = generic_commit_write (file, page, from, to);
-
- LOG_EXIT_INT (ret);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-} /* ocfs_commit_write */
-
-/*
- * ocfs_symlink_get_block()
- *
- */
-static int ocfs_symlink_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int err = -EIO;
- int status;
- ocfs2_dinode *fe = NULL;
- struct buffer_head *bh = NULL;
- struct buffer_head *buffer_cache_bh = NULL;
- ocfs_super *osb = OCFS_SB(inode->i_sb);
- void *kaddr;
-
- LOG_ENTRY_ARGS ("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
-
- if (!inode) {
- LOG_ERROR_STR ("bad inode");
- goto bail;
- }
-
- if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
- LOG_ERROR_ARGS ("block offset > PATH_MAX: %llu",
- (unsigned long long)iblock);
- goto bail;
- }
-
- status = ocfs_read_bh(OCFS_SB(inode->i_sb),
- OCFS_I(inode)->ip_blkno << inode->i_sb->s_blocksize_bits,
- &bh,
- OCFS_BH_CACHED, inode);
- if (status < 0) {
- LOG_ERROR_STATUS (status);
- goto bail;
- }
- fe = (ocfs2_dinode *) bh->b_data;
-
- if (!IS_VALID_FILE_ENTRY(fe)) {
- LOG_ERROR_ARGS("Invalid fe at blkno %llu",
- OCFS_I(inode)->ip_blkno);
- goto bail;
- }
-
- if ((u64)iblock >= ocfs_clusters_to_blocks(inode->i_sb,
- fe->i_clusters)) {
- LOG_ERROR_ARGS ("block offset is outside the allocated size: %llu",
- (unsigned long long)iblock);
- goto bail;
- }
-
- /* We don't use the page cache to create symlink data, so if
- * need be, copy it over from the buffer cache. */
- if (!buffer_uptodate(bh_result) && !ocfs_inode_is_new(osb, inode)) {
- buffer_cache_bh = sb_getblk(osb->sb,
- fe->id2.i_list.l_recs[0].e_blkno + iblock);
- if (!buffer_cache_bh) {
- LOG_ERROR_STR("couldn't getblock for symlink!");
- goto bail;
- }
-
- /* we haven't locked out transactions, so a commit
- * could've happened. Since we've got a reference on
- * the bh, even if it commits while we're doing the
- * copy, the data is still good. */
- if (buffer_jbd(buffer_cache_bh)
- && !ocfs_inode_is_new(osb, inode)) {
- kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
- if (!kaddr) {
- LOG_ERROR_ARGS("couldn't kmap!\n");
- goto bail;
- }
- memcpy(kaddr + (bh_result->b_size * iblock),
- buffer_cache_bh->b_data,
- bh_result->b_size);
- kunmap_atomic(kaddr, KM_USER0);
- set_buffer_uptodate(bh_result);
- }
- brelse(buffer_cache_bh);
- }
-
- map_bh(bh_result, inode->i_sb,
- fe->id2.i_list.l_recs[0].e_blkno + iblock);
-
- err = 0;
-
-bail:
- if (bh)
- brelse(bh);
-
- LOG_EXIT_INT (err);
- return err;
-} /* ocfs_symlink_get_block */
-
-
-/*
* TODO: this should probably be merged into ocfs_get_block
*
* However, you now need to pay attention to the cont_prepare_write()
@@ -1194,834 +878,6 @@
}
/*
- * ocfs_get_block()
- *
- */
-static int ocfs_get_block (struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int err = -EIO;
- __s64 vbo = 0;
- __s64 lbo = 0;
- __u32 len;
- int open_direct;
-
- LOG_ENTRY_ARGS ("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
-
- if (!inode) {
- LOG_ERROR_STR ("bad inode");
- goto bail;
- }
-
- if (OCFS_I(inode)->ip_flags & OCFS_INODE_SYSTEM_FILE) {
- printk("get_block on system inode 0x%p (%lu)\n",
- inode, inode->i_ino);
- }
-
- open_direct = OCFS_I(inode)->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO;
-
- if (S_ISLNK (inode->i_mode)) {
- /* this always does I/O for some reason. */
- down_read(&OCFS_I(inode)->ip_io_sem);
- err = ocfs_symlink_get_block (inode, iblock, bh_result,
- create);
- up_read(&OCFS_I(inode)->ip_io_sem);
- goto bail;
- }
-
- vbo = (__s64) iblock << inode->i_sb->s_blocksize_bits;
-
-#if 0
- if (!INODE_JOURNAL(inode) && vbo >= OCFS_I(inode)->ip_alloc_size) {
- int vbo_pad;
-
- vbo_pad = inode->i_sb->s_blocksize;
- vbo_pad -= vbo & (s64)(inode->i_sb->s_blocksize - 1);
-
- LOG_TRACE_STR("Extending allocation");
- LOG_ERROR_ARGS("extending inode %lu in get_block!!\n",
- inode->i_ino);
- down_write(&OCFS_I(inode)->ip_io_sem);
- err = ocfs_extend_file(osb, vbo + vbo_pad,
- NULL, inode, NULL, 0, NULL);
- up_write(&OCFS_I(inode)->ip_io_sem);
- if (err < 0) {
- err = -ENOSPC;
- LOG_ERROR_STATUS (err);
- goto bail;
- }
- }
-#else
- if (vbo >= OCFS_I(inode)->ip_alloc_size) {
- err = -EIO;
- LOG_ERROR_ARGS("Trying to extend in ocfs_get_block() (inode %llu, blkno %llu, vbo %llu, alloc %llu)\n", OCFS_I(inode)->ip_blkno, (u64)iblock, (u64)vbo, OCFS_I(inode)->ip_alloc_size);
- goto bail;
- }
-#endif
-
- len = inode->i_sb->s_blocksize;
- if (!open_direct)
- down_read(&OCFS_I(inode)->ip_extend_sem);
- err = ocfs_lookup_file_allocation(OCFS2_SB(inode->i_sb),
- vbo, &lbo, len, NULL,
- inode, open_direct);
- if (!open_direct)
- up_read(&OCFS_I(inode)->ip_extend_sem);
- if (err < 0) {
- LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo, lbo, len);
- goto bail;
- }
-
- map_bh(bh_result, inode->i_sb, lbo >> inode->i_sb->s_blocksize_bits);
-
- err = 0;
-
- if (bh_result->b_blocknr == 0) {
- err = -EIO;
- LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
- vbo, lbo, len,
- OCFS_I(inode)->ip_blkno);
- }
-
- if (vbo < OCFS_I(inode)->ip_mmu_private)
- goto bail;
- if (!create)
- goto bail;
- if (vbo != OCFS_I(inode)->ip_mmu_private) {
- LOG_ERROR_ARGS("Uh-oh, vbo = %lld, i_size = %llu, mmu = %llu, "
- "inode = %llu\n",
- vbo, inode->i_size,
- OCFS_I(inode)->ip_mmu_private,
- OCFS_I(inode)->ip_blkno);
- BUG();
- err = -EIO;
- goto bail;
- }
-
- bh_result->b_state |= (1UL << BH_New);
- OCFS_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
-
-bail:
- if (err < 0)
- err = -EIO;
-
- LOG_EXIT_INT (err);
- return err;
-} /* ocfs_get_block */
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static sector_t ocfs_bmap(struct address_space *mapping, sector_t block)
-#else
-static int ocfs_bmap(struct address_space *mapping, long block)
-#endif
-{
- int disk_block = 0;
- ocfs_super *osb = OCFS_SB(mapping->host->i_sb);
- __s64 vbo = 0;
- __s64 lbo = 0;
- __u32 len;
- int err = 0, status;
- struct inode *inode = mapping->host;
-
- LOG_SET_CONTEXT(BMAP);
-
- LOG_ENTRY_ARGS("(block = %llu)\n", (unsigned long long)block);
-
- if (!inode) {
- LOG_ERROR_STR ("bmap: bad inode");
- err = -EINVAL;
- LOG_ERROR_STATUS(err);
- goto bail;
- }
-
- if (!INODE_JOURNAL(inode)) {
- LOG_ERROR_STR("bmap is only for journal inodes!");
- err = -EINVAL;
- LOG_ERROR_STATUS(err);
- goto bail;
- }
-
- vbo = (__s64) block << inode->i_sb->s_blocksize_bits;
- len = osb->sb->s_blocksize;
- err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL,
- inode, 1);
- if (err < 0) {
- LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo,
- lbo, len);
- LOG_ERROR_STATUS(err);
- goto bail;
- }
-
- disk_block = lbo >> inode->i_sb->s_blocksize_bits;
-
-bail:
- status = err ? err : disk_block;
- LOG_EXIT_STATUS(status);
-
- LOG_CLEAR_CONTEXT();
- return(status);
-}
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-/*
- * ocfs_get_block2()
- *
- */
-static int ocfs_get_block2 (struct inode *inode, long iblock, long *oblock, int len)
-{
- int err = -EIO;
- ocfs_super *osb;
- __s64 vbo = 0;
- __s64 lbo = 0;
-
- LOG_ENTRY_ARGS ("(0x%p, %ld)\n", inode, iblock);
-
- if (!inode) {
- LOG_ERROR_STR ("bad inode");
- err = -1;
- goto bail;
- }
-
- osb = OCFS_SB(inode->i_sb);
-
- vbo = (__s64) iblock << osb->s_sectsize_bits;
- err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL,
- inode, 1);
- if (err < 0) {
- LOG_ERROR_STATUS (err);
- err = -1;
- goto bail;
- }
-
- err = 0;
-
- *oblock = lbo >> osb->s_sectsize_bits;
- if (*oblock == 0) {
- err = -EIO;
- LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
- vbo, lbo, len,
- OCFS_I(inode)->ip_blkno);
- }
-
-bail:
- if (err < 0)
- err = -EIO;
- LOG_EXIT_INT (err);
- return err;
-} /* ocfs_get_block2 */
-#endif
-
-/*
- * ocfs_readpage()
- *
- */
-static int ocfs_readpage (struct file *file, struct page *page)
-{
- int ret;
-
- LOG_SET_CONTEXT(READPAGE);
-
- LOG_ENTRY_ARGS ("(0x%p, %lu)\n", file, (page ? page->index : 0));
-
- ret = block_read_full_page (page, ocfs_get_block);
- if (ret < 0)
- goto bail;
-
-bail:
- LOG_EXIT_INT (ret);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-} /* ocfs_readpage */
-
-/*
- * ocfs_writepage()
- *
- */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static int ocfs_writepage (struct page *page, struct writeback_control *wbc)
-{
- int ret;
-
- LOG_SET_CONTEXT(WRITEPAGE);
-
- LOG_ENTRY_ARGS ("(0x%p)\n", page);
-
- ret = block_write_full_page (page, ocfs_get_block, wbc);
-
- LOG_EXIT_INT (ret);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-} /* ocfs_writepage */
-#else
-static int ocfs_writepage (struct page *page)
-{
- int ret;
-
- LOG_SET_CONTEXT(WRITEPAGE);
-
- LOG_ENTRY_ARGS ("(0x%p)\n", page);
-
- ret = block_write_full_page (page, ocfs_get_block);
-
- LOG_EXIT_INT (ret);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-} /* ocfs_writepage */
-#endif
-
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * We should probably have this data in the oin for the inode.
- * Otherwise, we might want to look at ocfs_rw_direct,
- * ocfs_lookup_file_allocation and ocfs_get_block
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create)
-{
- int ret = -1;
- int status;
- ocfs_super *osb = NULL;
- __s64 vbo; /* file offset */
- __s64 lbo; /* logical (disk) offset */
- __s64 vbo_max; /* file offset, max_blocks from iblock */
- int set_new = 0; /* flag */
- __u64 new_size; /* In bytes, the size of the contiguous block */
- unsigned char blocksize_bits;
-
- if (!inode || !bh_result) {
- LOG_ERROR_STR("ocfs_direct_IO_get_blocks: inode or bh_result is null");
- return -EIO;
- }
-
- osb = inode->i_sb->s_fs_info;
- blocksize_bits = inode->i_sb->s_blocksize_bits;
- /* make sure we're up to date... */
- if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
- LOG_TRACE_STR ("ocfs_direct_IO_get_blocks: verify oin.");
- status = ocfs_verify_update_inode (osb, inode, 0);
- if (status < 0) {
- LOG_TRACE_STR ("ocfs_verify_update_inode failed");
- ret = -EIO;
- goto bail;
- }
- }
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- vbo = (__s64) iblock << blocksize_bits;
- vbo_max = vbo + ((__s64) max_blocks << blocksize_bits);
-
- /* NOTE: create flag is set when we ?may? have to allocate some
- blocks for the file. */
- if (create && vbo_max > OCFS_I(inode)->ip_alloc_size) {
- /* WARNING: How much do we really want to extend the file? */
- status = ocfs_extend_file(osb, vbo_max,
- NULL, inode, NULL, 0, NULL);
- if (status < 0) {
- status = -ENOSPC;
- LOG_ERROR_STR("ocfs_direct_IO_get_blocks: failed to extend the file!");
- goto bail;
- }
- set_new = 1;
- }
-
- /* This figure out the size of the next contiguous block, and
- * our logical offset */
- /* TODO: Try our damndest to give sizes in multiples of PAGE_SIZE */
- status = ocfs_lookup_file_allocation(osb, vbo, &lbo, max_blocks << blocksize_bits,
- &new_size, inode, 1);
-
- /* Do whatever we need to the buffer_head */
- if (set_new) {
- set_buffer_new(bh_result);
- /* Do we really want to set bh_result->b_blocknr here too? */
- bh_result->b_blocknr = lbo >> blocksize_bits;
- } else {
- clear_buffer_new(bh_result);
- /* is the last argument here correct? */
- map_bh(bh_result, inode->i_sb, lbo >> blocksize_bits);
- }
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (new_size > (__u64)max_blocks << blocksize_bits)
- new_size = (__u64)max_blocks << blocksize_bits;
- bh_result->b_size = new_size;
-
- ret = 0;
-bail:
- return ret;
-}
-
-/*
- * ocfs_direct_IO()
- * used to be:
- * static int ocfs_direct_IO (int rw,
- * struct inode *inode,
- * struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
- *
- * now:
- static int ocfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
- * int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
- * loff_t offset, unsigned long nr_segs);
- */
-static ssize_t ocfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
- int ret;
-
- LOG_SET_CONTEXT(DIRECT_IO);
-
- LOG_ENTRY ();
-
- /* blockdev_direct_IO checks alignment for us, using */
- ret = blockdev_direct_IO (rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs_direct_IO_get_blocks, NULL);
-
- LOG_EXIT_INT (ret);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-} /* ocfs_direct_IO */
-
-#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
-/*
- * ocfs_direct_IO()
- *
- * we are not using this function anymore, in fact
- * we should never get here any more
- * so let's just BUG(), hint from sct@redhat.com
- */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) || defined(SUSE)
-static int ocfs_direct_IO (int rw, struct file *filp, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
-{
- BUG();
- return 0;
-} /* ocfs_direct_IO */
-#else
-static int ocfs_direct_IO (int rw, struct inode *inode, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
-{
- BUG();
- return 0;
-} /* ocfs_direct_IO */
-#endif
-#endif /* version >= 2.4.10 */
-
-#if defined(SUSE) && LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)
-#define OCFS_KIO_BLOCKS(_iobuf) ((_iobuf)->kio_blocks)
-#else
-#define OCFS_KIO_BLOCKS(_iobuf) ((_iobuf)->blocks)
-#endif
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,10)
-#define KERNEL_NO_F_IOBUF 1
-#elif defined(SUSE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20)
-#define KERNEL_NO_F_IOBUF 1
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-/*
- * ocfs_rw_direct()
- *
- */
-ssize_t ocfs_rw_direct (int rw, struct file *filp, char *buf, size_t size, loff_t * offp)
-{
-#ifdef KERNEL_NO_F_IOBUF
- struct kiobuf *iobuf;
-#else
- struct kiobuf *iobuf = filp->f_iobuf;
- int new_iobuf = 0;
-#endif
- int err = 0;
- unsigned long blocknr, blocks, myiosize;
- size_t transferred;
- int iosize, clustersize;
- int i;
- struct inode *inode = filp->f_dentry->d_inode;
- int max_sectors;
- int nbhs;
- int sector_size, sector_bits, sector_mask, sectors_per_page;
- int ret = 0;
- int large_io = 0;
- int inuse = 0;
- unsigned long blocks_end_cluster = 0;
- loff_t saved_off;
- size_t saved_size;
- unsigned long firstlogic;
- long firstphys;
- long nextphys;
- unsigned long nextlogic = 0;
- unsigned long totalioblocks = 0;
-
- saved_off = *offp;
- saved_size = size;
-
- /* FIXME: Need to differentiate between sectors and blocksize */
- sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
- sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
- sector_mask = sector_size - 1;
- sectors_per_page = PAGE_SIZE / sector_size;
- /* max sectors is 1024 in 2.4.9
- * max data is 512kb
- */
-
- err = -EINVAL;
- if (size == 0) {
- printk("direct write of 0 byte\n");
- return 0;
- }
-
- if (rw == READ) {
- if (inode->i_size <= *offp) /* read past end of file */
- return 0;
- if (size > (inode->i_size - *offp))
- size = inode->i_size - *offp;
- }
-
- /* make sure aligned to either PAGE_SIZE or sect_size IO */
-#ifndef LARGEIOS
- if ((*offp & sector_mask) || (size & sector_mask))
- /* if not, then fail, we need either to do dio */
- return err;
-
- max_sectors = KIO_MAX_SECTORS;
- large_io = 0;
-#endif
-#ifdef LARGEIOS
- if ((*offp & ~PAGE_MASK) || (size & ~PAGE_MASK)) {
- /* if it's not PAGE_SIZE, then sect_size */
- if ((*offp & sector_mask) || (size & sector_mask))
- /* if not, then fail, we need either to do dio */
- return err;
- max_sectors = KIO_MAX_SECTORS; /* for 2.4.9 - 1024 */
- } /* ok we 're PAGE_SIZE aligned, lets see if the buffer is */
- else {
- if (!((unsigned long) buf & ~PAGE_MASK)) {
- /* yippie we are .. we can do PAGE_SIZE size io's */
- large_io = 1;
- /* for 2.4.9 */
- max_sectors = KIO_MAX_SECTORS / sectors_per_page;
- } else {
- max_sectors = KIO_MAX_SECTORS;
- large_io = 0;
- }
-
- }
-#endif
- /* find out how far we are to the end of our cluster */
-
- err = 0;
- if (size)
- err = -ENXIO;
-
- /* Split the IO into KIO_MAX_SECTORS chunks, mapping and */
- /* unmapping the single kiobuf as we go to perform each chunk of IO. */
-
- transferred = 0;
- blocknr = *offp >> sector_bits;
- clustersize = inode->i_blksize >> sector_bits;
- myiosize = size >> sector_bits;
- blocks_end_cluster = clustersize - (blocknr % clustersize);
- firstlogic = blocknr;
- totalioblocks = 0;
-
- ret = ocfs_get_block2 (inode, blocknr, &firstphys, sector_size);
- if (ret == -1) {
- err = 0;
- goto out;
- }
- while (myiosize > 0) {
- if (blocks_end_cluster + 1 > myiosize) {
- totalioblocks += myiosize;
- myiosize = 0;
- goto doio;
- } else {
- totalioblocks += blocks_end_cluster;
- myiosize -= blocks_end_cluster;
- nextlogic = firstlogic + blocks_end_cluster;
- }
-again:
- ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
- if (ret == -1) {
- err = 0;
- goto out;
- }
- if (nextphys == (firstphys + totalioblocks)) {
- // merge ok
- blocks_end_cluster = clustersize - (nextlogic % clustersize);
- if (blocks_end_cluster + 1 > myiosize) {
- totalioblocks += myiosize;
- myiosize = 0;
- } else {
- totalioblocks += blocks_end_cluster;
- myiosize -= blocks_end_cluster;
- nextlogic = nextlogic + blocks_end_cluster;
- goto again;
- }
- }
-doio:
- size = totalioblocks << sector_bits;
- if (large_io)
- nbhs = (size >> PAGE_SHIFT);
- else
- nbhs = (size >> sector_bits);
- if (nbhs > max_sectors)
- nbhs = max_sectors;
-
-#ifdef KERNEL_NO_F_IOBUF
- err = alloc_kiovec_sz (1, &iobuf, &nbhs);
- if (err)
- goto out;
-#else
- if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
- /*
- * A parallel read/write is using the preallocated iobuf
- * so just run slow and allocate a new one.
- */
- err = alloc_kiovec_sz (1, &iobuf, &nbhs);
- if (err)
- goto out;
- new_iobuf = 1;
- } else
- new_iobuf = 0;
-
-#endif
- inuse = 1;
-
- totalioblocks = 0;
- while (size > 0) {
- if (large_io) {
- blocks = size >> PAGE_SHIFT;
- if (blocks > max_sectors)
- blocks = max_sectors;
- iosize = blocks << PAGE_SHIFT;
- } else {
- blocks = size >> sector_bits;
- if (blocks > max_sectors)
- blocks = max_sectors;
- iosize = blocks << sector_bits;
- }
- if (!blocks)
- break;
- err = map_user_kiobuf (rw, iobuf, (unsigned long) buf, iosize);
- if (err)
- break;
- /* get the blocknr depending on io size for all blocks */
- /* since we are awlays within the extent we only need to get the first block */
- OCFS_KIO_BLOCKS(iobuf)[0] = firstphys + totalioblocks;
-
- if (large_io) {
- blocknr += sectors_per_page;
- OCFS_KIO_BLOCKS(iobuf)[0] = OCFS_KIO_BLOCKS(iobuf)[0] / sectors_per_page;
- } else {
- blocknr++;
- }
-
- for (i = 1; i < blocks; i++) {
- if (large_io) {
- blocknr += sectors_per_page;
- } else {
- blocknr++;
- }
- OCFS_KIO_BLOCKS(iobuf)[i] = OCFS_KIO_BLOCKS(iobuf)[0] + i;
- }
- err = brw_kiovec (rw, 1, &iobuf, inode->i_dev, OCFS_KIO_BLOCKS(iobuf),
- large_io ? PAGE_SIZE : sector_size);
-#ifdef SUSE
- if (rw == READ && err > 0)
- mark_dirty_kiobuf(iobuf, err);
-#endif
- if (err >= 0) {
- transferred += err;
- size -= err;
- buf += err;
- if (large_io) {
- totalioblocks +=
- (blocks * sectors_per_page);
- } else {
- totalioblocks += blocks;
- }
- } else {
- printk( "ocfs_rw_direct : brw_kiovec() %d\n", err);
- break;
- }
- unmap_kiobuf (iobuf);
- if (err != iosize)
- break;
- }
-#ifdef KERNEL_NO_F_IOBUF
- free_kiovec_sz(1, &iobuf, &nbhs);
-#else
- if (!new_iobuf)
- clear_bit(0, &filp->f_iobuf_lock);
- else
- free_kiovec_sz(1, &iobuf, &nbhs);
-#endif
- inuse = 0;
- totalioblocks = 0;
- firstlogic = nextlogic;
- firstphys = nextphys;
- }
- if (transferred) {
- *offp += transferred;
- err = transferred;
- }
-
-out:
-#ifdef KERNEL_NO_F_IOBUF
- if (inuse)
- free_kiovec_sz (1, &iobuf, &nbhs);
-#else
- if (inuse) {
- if (!new_iobuf)
- clear_bit(0, &filp->f_iobuf_lock);
- else
- free_kiovec_sz(1, &iobuf, &nbhs);
- }
-#endif
- return err;
-} /* ocfs_rw_direct */
-#endif /* 2.4.x kernel */
-
-#ifdef AIO_ENABLED
-static int ocfs_kvec_rw(struct file *filp, int rw, kvec_cb_t cb,
- size_t size, loff_t pos)
-{
- int err = 0;
- int max_sectors = 25000;
- struct inode *inode = filp->f_dentry->d_inode;
- unsigned long blocknr, blocks, iosize,myiosize;
- long firstphys;
- int clustersize;
- unsigned long blocks_end_cluster = 0;
-
- /* FIXME: Need to differentiate betwen sectors and blocksize */
- int sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
- int sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
- int sector_mask = sector_size - 1;
-
- int ret;
- unsigned long firstlogic;
- long nextphys;
- unsigned long nextlogic = 0;
- unsigned long totalioblocks = 0;
-
- if (!size || (pos == inode->i_size)) {
- cb.fn(cb.data, cb.vec, err);
- return err;
- }
-
- err = -ENXIO;
- if (pos >= inode->i_size) {
- return err;
- }
-
- err = -EINVAL;
- if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
- return err;
- }
-
- blocknr = pos >> sector_bits;
-
- blocks = size >> sector_bits;;
- if (blocks > max_sectors)
- blocks = max_sectors;
- if (!blocks) {
- err = -ENXIO;
- return err;;
- }
-
- iosize = blocks << sector_bits;
- clustersize = inode->i_blksize >> sector_bits;
- blocks_end_cluster = clustersize - (blocknr % clustersize);
- myiosize = size >> sector_bits;
- firstlogic = blocknr;
- totalioblocks = 0;
-
- err = ocfs_get_block2(inode, blocknr, &firstphys, sector_size);
- if ( err == -1 ) {
- err = 0;
- return err;
- }
- if (blocks_end_cluster + 1 > myiosize) {
- totalioblocks += myiosize;
- myiosize = 0;
- goto doio;
- } else {
- totalioblocks += blocks_end_cluster;
- myiosize -= blocks_end_cluster;
- nextlogic = firstlogic + blocks_end_cluster;
- }
-again:
- ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
- if (ret == -1) {
- err = 0;
- return err;
- }
- if (nextphys == (firstphys + totalioblocks)) {
- blocks_end_cluster = clustersize - (nextlogic % clustersize);
- if (blocks_end_cluster + 1 > myiosize) {
- totalioblocks += myiosize;
- myiosize = 0;
- } else {
- totalioblocks += blocks_end_cluster;
- myiosize -= blocks_end_cluster;
- nextlogic = nextlogic + blocks_end_cluster;
- goto again;
- }
- }
-doio:
- blocks = totalioblocks;
- err = brw_kvec_async(rw, cb, inode->i_dev, blocks, firstphys, sector_bits);
- return err;
-
-}
-
-int ocfs_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
-{
- int ret;
-
- LOG_SET_CONTEXT(KVEC_READ);
-
- ret = ocfs_kvec_rw(file, READ, cb, size, pos);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-}
-
-int ocfs_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
-{
- int ret;
-
- LOG_SET_CONTEXT(KVEC_WRITE);
-
- ret = ocfs_kvec_rw(file, WRITE, cb, size, pos);
-
- LOG_CLEAR_CONTEXT();
- return ret;
-}
-#endif
-
-/*
* ocfs_inode_revalidate()
*
* In 2.4, this is called only from stat.c always without i_sem before
Index: src/aops.c
===================================================================
--- src/aops.c (revision 0)
+++ src/aops.c (revision 0)
@@ -0,0 +1,677 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Authors: Kurt Hackel, Mark Fasheh, Sunil Mushran, Wim Coekaerts,
+ * Manish Singh, Neeraj Goyal, Suchit Kaura
+ */
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "buffer_head_io.h"
+#include "file.h"
+#include "inode.h"
+#include "ocfs_journal.h"
+
+
+#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_INODE
+
+/*
+ * ocfs_symlink_get_block()
+ *
+ * get_block callback for symlink inodes.  Symlink data lives in the
+ * blocks named by the inode's first extent record, so the mapping is
+ * simply l_recs[0].e_blkno + iblock.  If the result buffer is not
+ * uptodate and jbd may still own the on-disk block, the data is first
+ * copied over from the buffer cache.
+ *
+ * Returns 0 on success, -EIO on any failure.
+ */
+static int ocfs_symlink_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int err = -EIO;
+ int status;
+ ocfs2_dinode *fe = NULL;
+ struct buffer_head *bh = NULL;
+ struct buffer_head *buffer_cache_bh = NULL;
+ ocfs_super *osb = OCFS_SB(inode->i_sb);
+ void *kaddr;
+
+ LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, %d)\n", inode,
+ (unsigned long long)iblock, bh_result, create);
+
+ /* a symlink target is bounded by PATH_MAX, so an offset past that
+ * can only be a bogus request */
+ if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+ LOG_ERROR_ARGS ("block offset > PATH_MAX: %llu",
+ (unsigned long long)iblock);
+ goto bail;
+ }
+
+ /* read the dinode to get at the extent list */
+ status = ocfs_read_bh(OCFS_SB(inode->i_sb),
+ OCFS_I(inode)->ip_blkno <<
+ inode->i_sb->s_blocksize_bits,
+ &bh,
+ OCFS_BH_CACHED, inode);
+ if (status < 0) {
+ LOG_ERROR_STATUS (status);
+ goto bail;
+ }
+ fe = (ocfs2_dinode *) bh->b_data;
+
+ if (!IS_VALID_FILE_ENTRY(fe)) {
+ LOG_ERROR_ARGS("Invalid fe at blkno %llu",
+ OCFS_I(inode)->ip_blkno);
+ goto bail;
+ }
+
+ if ((u64)iblock >= ocfs_clusters_to_blocks(inode->i_sb,
+ fe->i_clusters)) {
+ LOG_ERROR_ARGS ("block offset is outside the allocated size: %llu",
+ (unsigned long long)iblock);
+ goto bail;
+ }
+
+ /* We don't use the page cache to create symlink data, so if
+ * need be, copy it over from the buffer cache. */
+ if (!buffer_uptodate(bh_result) && !ocfs_inode_is_new(osb, inode)) {
+ buffer_cache_bh = sb_getblk(osb->sb,
+ fe->id2.i_list.l_recs[0].e_blkno + iblock);
+ if (!buffer_cache_bh) {
+ LOG_ERROR_STR("couldn't getblock for symlink!");
+ goto bail;
+ }
+
+ /* we haven't locked out transactions, so a commit
+ * could've happened. Since we've got a reference on
+ * the bh, even if it commits while we're doing the
+ * copy, the data is still good. */
+ if (buffer_jbd(buffer_cache_bh)
+ && !ocfs_inode_is_new(osb, inode)) {
+ kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+ if (!kaddr) {
+ LOG_ERROR_ARGS("couldn't kmap!\n");
+ goto bail;
+ }
+ memcpy(kaddr + (bh_result->b_size * iblock),
+ buffer_cache_bh->b_data,
+ bh_result->b_size);
+ kunmap_atomic(kaddr, KM_USER0);
+ set_buffer_uptodate(bh_result);
+ }
+ brelse(buffer_cache_bh);
+ }
+
+ map_bh(bh_result, inode->i_sb,
+ fe->id2.i_list.l_recs[0].e_blkno + iblock);
+
+ err = 0;
+
+bail:
+ if (bh)
+ brelse(bh);
+
+ LOG_EXIT_INT (err);
+ return err;
+}
+
+/*
+ * ocfs_get_block()
+ *
+ * get_block callback for regular files.  Translates logical block
+ * 'iblock' into a disk block via ocfs_lookup_file_allocation().  This
+ * path never allocates: a request past ip_alloc_size is an error (see
+ * the live "#else" branch below); allocation must already have been
+ * done by ocfs_extend_file().  Symlinks are delegated to
+ * ocfs_symlink_get_block().
+ *
+ * Returns 0 on success; all failures are collapsed to -EIO at "bail".
+ */
+static int ocfs_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int err = -EIO;
+ __s64 vbo = 0;
+ __s64 lbo = 0;
+ __u32 len;
+ int open_direct;
+
+ LOG_ENTRY_ARGS("(0x%p, %llu, 0x%p, %d)\n", inode,
+ (unsigned long long)iblock, bh_result, create);
+
+ if (OCFS_I(inode)->ip_flags & OCFS_INODE_SYSTEM_FILE) {
+ printk("get_block on system inode 0x%p (%lu)\n",
+ inode, inode->i_ino);
+ }
+
+ open_direct = OCFS_I(inode)->ip_open_flags & OCFS_OIN_OPEN_FOR_DIRECTIO;
+
+ if (S_ISLNK(inode->i_mode)) {
+ /* this always does I/O for some reason. */
+ down_read(&OCFS_I(inode)->ip_io_sem);
+ err = ocfs_symlink_get_block (inode, iblock, bh_result,
+ create);
+ up_read(&OCFS_I(inode)->ip_io_sem);
+ goto bail;
+ }
+
+ /* vbo: byte offset into the file for this block */
+ vbo = (__s64) iblock << inode->i_sb->s_blocksize_bits;
+
+#if 0
+ if (!INODE_JOURNAL(inode) && vbo >= OCFS_I(inode)->ip_alloc_size) {
+ int vbo_pad;
+
+ vbo_pad = inode->i_sb->s_blocksize;
+ vbo_pad -= vbo & (s64)(inode->i_sb->s_blocksize - 1);
+
+ LOG_TRACE_STR("Extending allocation");
+ LOG_ERROR_ARGS("extending inode %lu in get_block!!\n",
+ inode->i_ino);
+ down_write(&OCFS_I(inode)->ip_io_sem);
+ err = ocfs_extend_file(osb, vbo + vbo_pad,
+ NULL, inode, NULL, 0, NULL);
+ up_write(&OCFS_I(inode)->ip_io_sem);
+ if (err < 0) {
+ err = -ENOSPC;
+ LOG_ERROR_STATUS (err);
+ goto bail;
+ }
+ }
+#else
+ /* extension inside get_block is disabled: reject anything past
+ * the current allocation instead */
+ if (vbo >= OCFS_I(inode)->ip_alloc_size) {
+ err = -EIO;
+ LOG_ERROR_ARGS("Trying to extend in ocfs_get_block() "
+ "(inode %llu, blkno %llu, vbo %llu, alloc %llu)\n",
+ OCFS_I(inode)->ip_blkno, (u64)iblock, (u64)vbo,
+ OCFS_I(inode)->ip_alloc_size);
+ goto bail;
+ }
+#endif
+
+ /* ip_extend_sem keeps a concurrent extend from moving the
+ * extent map under us; direct-io openers skip it */
+ len = inode->i_sb->s_blocksize;
+ if (!open_direct)
+ down_read(&OCFS_I(inode)->ip_extend_sem);
+ err = ocfs_lookup_file_allocation(OCFS2_SB(inode->i_sb),
+ vbo, &lbo, len, NULL,
+ inode, open_direct);
+ if (!open_direct)
+ up_read(&OCFS_I(inode)->ip_extend_sem);
+
+ if (err < 0) {
+ LOG_ERROR_ARGS("vbo=%lld lbo=%lld len=%u", vbo, lbo, len);
+ goto bail;
+ }
+
+ map_bh(bh_result, inode->i_sb, lbo >> inode->i_sb->s_blocksize_bits);
+
+ err = 0;
+
+ /* block 0 can never be valid file data */
+ if (bh_result->b_blocknr == 0) {
+ err = -EIO;
+ LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
+ vbo, lbo, len,
+ OCFS_I(inode)->ip_blkno);
+ }
+
+ /* below: track ip_mmu_private, the high-water mark of blocks
+ * already exposed to the page cache.  Writes must append exactly
+ * one block at a time past it; anything else is a bug. */
+ if (vbo < OCFS_I(inode)->ip_mmu_private)
+ goto bail;
+ if (!create)
+ goto bail;
+ if (vbo != OCFS_I(inode)->ip_mmu_private) {
+ LOG_ERROR_ARGS("Uh-oh, vbo = %lld, i_size = %llu, mmu = %llu, "
+ "inode = %llu\n",
+ vbo, inode->i_size,
+ OCFS_I(inode)->ip_mmu_private,
+ OCFS_I(inode)->ip_blkno);
+ BUG();
+ err = -EIO;
+ goto bail;
+ }
+
+ bh_result->b_state |= (1UL << BH_New);
+ OCFS_I(inode)->ip_mmu_private += inode->i_sb->s_blocksize;
+
+bail:
+ if (err < 0)
+ err = -EIO;
+
+ LOG_EXIT_INT (err);
+ return err;
+}
+
+/*
+ * ocfs_readpage()
+ *
+ * ->readpage: straight delegation to the generic block layer using
+ * ocfs_get_block for the mapping.
+ */
+static int ocfs_readpage(struct file *file, struct page *page)
+{
+ int ret;
+
+ LOG_SET_CONTEXT(READPAGE);
+ LOG_ENTRY_ARGS("(0x%p, %lu)\n", file, (page ? page->index : 0));
+
+ ret = block_read_full_page(page, ocfs_get_block);
+
+ LOG_EXIT_INT(ret);
+ LOG_CLEAR_CONTEXT();
+ return ret;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/*
+ * ocfs_writepage()
+ *
+ * ->writepage: delegate to the generic block layer.  2.6 added the
+ * writeback_control argument, hence the two variants.
+ */
+static int ocfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int ret;
+
+ LOG_SET_CONTEXT(WRITEPAGE);
+ LOG_ENTRY_ARGS("(0x%p)\n", page);
+
+ ret = block_write_full_page(page, ocfs_get_block, wbc);
+
+ LOG_EXIT_INT(ret);
+ LOG_CLEAR_CONTEXT();
+ return ret;
+}
+#else
+/* 2.4 variant: same delegation, no writeback_control */
+static int ocfs_writepage(struct page *page)
+{
+ int ret;
+
+ LOG_SET_CONTEXT(WRITEPAGE);
+ LOG_ENTRY_ARGS("(0x%p)\n", page);
+
+ ret = block_write_full_page(page, ocfs_get_block);
+
+ LOG_EXIT_INT(ret);
+ LOG_CLEAR_CONTEXT();
+ return ret;
+}
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+/* 2.4 helper: mark a buffer for delayed flush and move it to the
+ * appropriate LRU list (copied from the 2.4 buffer-cache internals). */
+inline void __mark_dirty(struct buffer_head *bh)
+{
+ set_buffer_flushtime(bh);
+ refile_buffer(bh);
+}
+
+/*
+ * 2.4-only copy of the generic __block_commit_write(): walk the page's
+ * buffer ring, mark the written range uptodate and dirty, and set the
+ * page uptodate if every buffer ended up uptodate.
+ */
+static int __block_commit_write(struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ unsigned block_start, block_end;
+ int partial = 0, need_balance_dirty = 0;
+ unsigned blocksize;
+ struct buffer_head *bh, *head;
+
+ blocksize = 1 << inode->i_blkbits;
+
+ for(bh = head = page->buffers, block_start = 0;
+ bh != head || !block_start;
+ block_start=block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (!buffer_uptodate(bh))
+ partial = 1;
+ } else {
+ set_bit(BH_Uptodate, &bh->b_state);
+ if (!atomic_set_buffer_dirty(bh)) {
+ __mark_dirty(bh);
+ buffer_insert_inode_data_queue(bh, inode);
+ need_balance_dirty = 1;
+ }
+ }
+ }
+
+ if (need_balance_dirty)
+ balance_dirty();
+ /*
+ * If this is a partial write that happened to make all buffers
+ * uptodate then we can optimize away a bogus readpage() for
+ * the next read(). Here we 'discover' whether the page went
+ * uptodate as a result of this (potentially partial) write.
+ */
+ if (!partial)
+ SetPageUptodate(page);
+ return 0;
+}
+
+/*
+ * 2.4-only copy of the generic cont_prepare_write(): zero-fill any gap
+ * between *bytes (the continuation point, here ip_mmu_private) and the
+ * page being written, a page at a time, then prepare the target page
+ * itself.
+ *
+ * NOTE(review): the kunmap() calls (out1/out_unmap and after the
+ * zero-fill loop) have no matching kmap() here — page_address() is
+ * used instead.  This looks inherited from the 2.4 generic version,
+ * which kmapped the pages; verify against the kernel this targets.
+ */
+static int ocfs2_cont_prepare_write(struct page *page, unsigned offset,
+ unsigned to, get_block_t *get_block, loff_t *bytes)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct page *new_page;
+ unsigned long pgpos;
+ long status;
+ unsigned zerofrom;
+ unsigned blocksize = 1 << inode->i_blkbits;
+ char *kaddr;
+
+ while (page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
+ status = -ENOMEM;
+ new_page = grab_cache_page(mapping, pgpos);
+ if (!new_page)
+ goto out;
+ /* we might sleep */
+ if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
+ unlock_page(new_page);
+ page_cache_release(new_page);
+ continue;
+ }
+ zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ if (zerofrom & (blocksize-1)) {
+ *bytes |= (blocksize-1);
+ (*bytes)++;
+ }
+ status = block_prepare_write(new_page, zerofrom,
+ PAGE_CACHE_SIZE, get_block);
+ if (status)
+ goto out_unmap;
+ kaddr = page_address(new_page);
+ memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
+ flush_dcache_page(new_page);
+ __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
+ kunmap(new_page);
+ unlock_page(new_page);
+ page_cache_release(new_page);
+ }
+
+ if (page->index < pgpos) {
+ /* completely inside the area */
+ zerofrom = offset;
+ } else {
+ /* page covers the boundary, find the boundary offset */
+ zerofrom = *bytes & ~PAGE_CACHE_MASK;
+
+ /* if we will expand the thing last block will be filled */
+ if (to > zerofrom && (zerofrom & (blocksize-1))) {
+ *bytes |= (blocksize-1);
+ (*bytes)++;
+ }
+
+ /* starting below the boundary? Nothing to zero out */
+ if (offset <= zerofrom)
+ zerofrom = offset;
+ }
+ status = block_prepare_write(page, zerofrom, to, get_block);
+ if (status)
+ goto out1;
+ kaddr = page_address(page);
+ if (zerofrom < offset) {
+ memset(kaddr+zerofrom, 0, offset-zerofrom);
+ flush_dcache_page(page);
+ __block_commit_write(inode, page, zerofrom, offset);
+ }
+ return 0;
+out1:
+ ClearPageUptodate(page);
+ kunmap(page);
+ return status;
+
+out_unmap:
+ ClearPageUptodate(new_page);
+ kunmap(new_page);
+ UnlockPage(new_page);
+ page_cache_release(new_page);
+out:
+ return status;
+}
+
+/* Mark's favorite hack */
+#undef cont_prepare_write
+#define cont_prepare_write ocfs2_cont_prepare_write
+#endif /* < 2.6.0 */
+
+/*
+ * ocfs_prepare_write()
+ *
+ * ->prepare_write: uses cont_prepare_write so the region between the
+ * current allocation high-water mark (ip_mmu_private) and the write
+ * target is zero-filled first.  On 2.4 the macro above redirects this
+ * to the local ocfs2_cont_prepare_write().
+ */
+static int ocfs_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ int ret;
+
+ LOG_SET_CONTEXT(PREPARE_WRITE);
+ LOG_ENTRY_ARGS("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+ ret = cont_prepare_write(page, from, to, ocfs_get_block,
+ &(OCFS_I(page->mapping->host)->ip_mmu_private));
+
+ LOG_EXIT_INT(ret);
+ LOG_CLEAR_CONTEXT();
+ return ret;
+}
+
+/*
+ * ocfs_commit_write()
+ *
+ * ->commit_write: straight delegation to generic_commit_write(),
+ * which marks the written buffers dirty and updates i_size.
+ */
+static int ocfs_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ int ret;
+
+ LOG_SET_CONTEXT(COMMIT_WRITE);
+ LOG_ENTRY_ARGS("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+ ret = generic_commit_write(file, page, from, to);
+
+ LOG_EXIT_INT(ret);
+ LOG_CLEAR_CONTEXT();
+ return ret;
+}
+
+/*
+ * ocfs_bmap()
+ *
+ * ->bmap: logical-to-physical block translation.  Deliberately
+ * restricted to journal inodes (jbd uses bmap to find journal blocks);
+ * anything else gets -EINVAL.  Return type differs between 2.4 and 2.6.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+static sector_t ocfs_bmap(struct address_space *mapping, sector_t block)
+#else
+static int ocfs_bmap(struct address_space *mapping, long block)
+#endif
+{
+ int disk_block = 0;
+ ocfs_super *osb = OCFS_SB(mapping->host->i_sb);
+ __s64 vbo = 0;
+ __s64 lbo = 0;
+ __u32 len;
+ int err = 0, status;
+ struct inode *inode = mapping->host;
+
+ LOG_SET_CONTEXT(BMAP);
+ LOG_ENTRY_ARGS("(block = %llu)\n", (unsigned long long)block);
+
+ if (!INODE_JOURNAL(inode)) {
+ LOG_ERROR_STR("bmap is only for journal inodes!");
+ err = -EINVAL;
+ LOG_ERROR_STATUS(err);
+ goto bail;
+ }
+
+ vbo = (__s64) block << inode->i_sb->s_blocksize_bits;
+ len = osb->sb->s_blocksize;
+ err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL,
+ inode, 1);
+ if (err < 0) {
+ LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u", vbo,
+ lbo, len);
+ LOG_ERROR_STATUS(err);
+ goto bail;
+ }
+
+ disk_block = lbo >> inode->i_sb->s_blocksize_bits;
+
+bail:
+ /* on error the (negative) errno is returned in place of a block
+ * number */
+ status = err ? err : disk_block;
+
+ LOG_EXIT_STATUS(status);
+ LOG_CLEAR_CONTEXT();
+ return status;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * We should probably have this data in the oin for the inode.
+ * Otherwise, we might want to look at ocfs_rw_direct,
+ * ocfs_lookup_file_allocation and ocfs_get_block
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ */
+/*
+ * ocfs_direct_IO_get_blocks()
+ *
+ * get_blocks callback for the 2.6 direct-io path (called from
+ * get_more_blocks in fs/direct-io.c).  Maps up to max_blocks blocks
+ * starting at iblock, extending the file first when 'create' is set
+ * and the request runs past the current allocation.  b_size is set to
+ * the length of the contiguous run so the dio code can batch I/O.
+ *
+ * Returns 0 on success, negative errno on failure.
+ */
+static int ocfs_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create)
+{
+ int ret = -1;
+ int status;
+ ocfs_super *osb = NULL;
+ __s64 vbo; /* file offset */
+ __s64 lbo; /* logical (disk) offset */
+ __s64 vbo_max; /* file offset, max_blocks from iblock */
+ int set_new = 0; /* flag */
+ __u64 new_size; /* In bytes, the size of the contiguous block */
+ unsigned char blocksize_bits;
+
+ if (!inode || !bh_result) {
+ LOG_ERROR_STR("ocfs_direct_IO_get_blocks: inode or bh_result is null");
+ return -EIO;
+ }
+
+ osb = inode->i_sb->s_fs_info;
+ blocksize_bits = inode->i_sb->s_blocksize_bits;
+ /* make sure we're up to date... */
+ if (atomic_read(&OCFS_I(inode)->ip_needs_verification)) {
+ LOG_TRACE_STR ("ocfs_direct_IO_get_blocks: verify oin.");
+ status = ocfs_verify_update_inode (osb, inode, 0);
+ if (status < 0) {
+ LOG_TRACE_STR ("ocfs_verify_update_inode failed");
+ ret = -EIO;
+ goto bail;
+ }
+ }
+
+ /* This function won't even be called if the request isn't all
+ * nicely aligned and of the right size, so there's no need
+ * for us to check any of that. */
+
+ vbo = (__s64) iblock << blocksize_bits;
+ vbo_max = vbo + ((__s64) max_blocks << blocksize_bits);
+
+ /* NOTE: create flag is set when we ?may? have to allocate some
+ blocks for the file. */
+ if (create && vbo_max > OCFS_I(inode)->ip_alloc_size) {
+ /* WARNING: How much do we really want to extend the file? */
+ status = ocfs_extend_file(osb, vbo_max,
+ NULL, inode, NULL, 0, NULL);
+ if (status < 0) {
+ status = -ENOSPC;
+ LOG_ERROR_STR("ocfs_direct_IO_get_blocks: failed to extend the file!");
+ goto bail;
+ }
+ set_new = 1;
+ }
+
+ /* This figure out the size of the next contiguous block, and
+ * our logical offset */
+ /* TODO: Try our damndest to give sizes in multiples of PAGE_SIZE */
+ status = ocfs_lookup_file_allocation(osb, vbo, &lbo, max_blocks << blocksize_bits,
+ &new_size, inode, 1);
+ if (status < 0) {
+ /* previously this status was never checked, so a failed
+ * lookup fell through and used lbo/new_size
+ * uninitialized */
+ LOG_ERROR_STATUS(status);
+ ret = status;
+ goto bail;
+ }
+
+ /* Do whatever we need to the buffer_head */
+ if (set_new) {
+ set_buffer_new(bh_result);
+ /* Do we really want to set bh_result->b_blocknr here too? */
+ bh_result->b_blocknr = lbo >> blocksize_bits;
+ } else {
+ clear_buffer_new(bh_result);
+ /* is the last argument here correct? */
+ map_bh(bh_result, inode->i_sb, lbo >> blocksize_bits);
+ }
+
+ /* make sure we don't map more than max_blocks blocks here as
+ that's all the kernel will handle at this point. */
+ if (new_size > (__u64)max_blocks << blocksize_bits)
+ new_size = (__u64)max_blocks << blocksize_bits;
+ bh_result->b_size = new_size;
+
+ ret = 0;
+bail:
+ return ret;
+}
+
+/*
+ * ocfs_direct_IO()
+ * used to be:
+ * static int ocfs_direct_IO (int rw,
+ * struct inode *inode,
+ * struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
+ *
+ * now:
+ static int ocfs_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+ * int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
+ * loff_t offset, unsigned long nr_segs);
+ */
+/* 2.6 ->direct_IO: hand everything to the generic blockdev dio engine,
+ * with ocfs_direct_IO_get_blocks supplying the block mapping.
+ * blockdev_direct_IO performs the alignment checks for us. */
+static ssize_t ocfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+ int ret;
+
+ LOG_SET_CONTEXT(DIRECT_IO);
+
+ LOG_ENTRY ();
+
+ /* blockdev_direct_IO checks alignment for us, using */
+ ret = blockdev_direct_IO (rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs_direct_IO_get_blocks, NULL);
+
+ LOG_EXIT_INT (ret);
+
+ LOG_CLEAR_CONTEXT();
+ return ret;
+} /* ocfs_direct_IO */
+
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
+/*
+ * ocfs_direct_IO()
+ *
+ * we are not using this function anymore, in fact
+ * we should never get here any more
+ * so let's just BUG(), hint from sct@redhat.com
+ *
+ * (On 2.4 the entry exists only so that open(O_DIRECT) succeeds;
+ * the real direct I/O goes through ocfs_rw_direct() in 24io.c.
+ * The prototype changed at 2.4.20/SUSE, hence the two variants.)
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20) || defined(SUSE)
+static int ocfs_direct_IO (int rw, struct file *filp, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
+{
+ BUG();
+ return 0;
+} /* ocfs_direct_IO */
+#else
+static int ocfs_direct_IO (int rw, struct inode *inode, struct kiobuf *iobuf, unsigned long blocknr, int blocksize)
+{
+ BUG();
+ return 0;
+} /* ocfs_direct_IO */
+#endif
+#endif /* version >= 2.4.10 */
+
+
+/* Address-space operations shared by all regular ocfs inodes; declared
+ * extern in inode.c. */
+struct address_space_operations ocfs_aops = {
+ .readpage = ocfs_readpage,
+ .writepage = ocfs_writepage,
+ .prepare_write = ocfs_prepare_write,
+ .commit_write = ocfs_commit_write,
+ .bmap = ocfs_bmap,
+
+ /*
+ * On a 2.4 system, we are only adding this here as a dummy basically.
+ * Just need open with O_DIRECT to succeed, we still call
+ * ocfs_rw_direct().
+ *
+ * For a 2.6 system, this is the way a filesystem provides
+ * direct-io support.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
+ .direct_IO = ocfs_direct_IO
+#endif
+};
Index: src/24io.c
===================================================================
--- src/24io.c (revision 0)
+++ src/24io.c (revision 0)
@@ -0,0 +1,481 @@
+
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+
+#define KERNEL_NO_F_IOBUF
+#include "ocfs_compat.h"
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/iobuf.h>
+
+#include <asm/byteorder.h>
+
+#include "ocfs_log.h"
+#include "ocfs.h"
+
+#include "alloc.h"
+#include "dlm.h"
+#include "extmap.h"
+#include "file.h"
+#include "inode.h"
+#include "lockres.h"
+#include "namei.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "util.h"
+#include "vote.h"
+
+#include "ocfs_journal.h"
+#include "buffer_head_io.h"
+
+#define OCFS_DEBUG_CONTEXT OCFS_DEBUG_CONTEXT_INODE
+
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
+# define free_kiovec_sz(nr, buf, bh) free_kiovec(nr, buf)
+# define alloc_kiovec_sz(nr, buf, bh) alloc_kiovec(nr, buf)
+#endif
+
+#if defined(SUSE) && LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)
+#define OCFS_KIO_BLOCKS(_iobuf) ((_iobuf)->kio_blocks)
+#else
+#define OCFS_KIO_BLOCKS(_iobuf) ((_iobuf)->blocks)
+#endif
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,10)
+#define KERNEL_NO_F_IOBUF 1
+#elif defined(SUSE) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,20)
+#define KERNEL_NO_F_IOBUF 1
+#endif
+
+/*
+ * ocfs_get_block2()
+ *
+ * Sector-granularity block lookup used by the 2.4 direct-I/O paths:
+ * translates logical sector 'iblock' into the disk sector *oblock.
+ *
+ * NOTE(review): despite the internal "err = -1" assignments, the
+ * bail path folds every failure into -EIO, so this function only ever
+ * returns 0 or -EIO — callers that test for a -1 return can never
+ * match it.
+ */
+static int ocfs_get_block2 (struct inode *inode, long iblock, long *oblock, int len)
+{
+ int err = -EIO;
+ ocfs_super *osb;
+ __s64 vbo = 0;
+ __s64 lbo = 0;
+
+ LOG_ENTRY_ARGS ("(0x%p, %ld)\n", inode, iblock);
+
+ if (!inode) {
+ LOG_ERROR_STR ("bad inode");
+ err = -1;
+ goto bail;
+ }
+
+ osb = OCFS_SB(inode->i_sb);
+
+ vbo = (__s64) iblock << osb->s_sectsize_bits;
+ err = ocfs_lookup_file_allocation(osb, vbo, &lbo, len, NULL,
+ inode, 1);
+ if (err < 0) {
+ LOG_ERROR_STATUS (err);
+ err = -1;
+ goto bail;
+ }
+
+ err = 0;
+
+ *oblock = lbo >> osb->s_sectsize_bits;
+ /* sector 0 can never be valid file data */
+ if (*oblock == 0) {
+ err = -EIO;
+ LOG_ERROR_ARGS ("vbo=%lld lbo=%lld len=%u, blkno=(%llu)\n",
+ vbo, lbo, len,
+ OCFS_I(inode)->ip_blkno);
+ }
+
+bail:
+ if (err < 0)
+ err = -EIO;
+ LOG_EXIT_INT (err);
+ return err;
+} /* ocfs_get_block2 */
+
+/*
+ * ocfs_rw_direct()
+ *
+ * 2.4-only O_DIRECT read/write path.  Walks the file cluster by
+ * cluster, merging physically contiguous runs into a single extent of
+ * I/O, then issues each run through a kiobuf with brw_kiovec().
+ * Offset and size must be sector-aligned (or page-aligned when
+ * LARGEIOS is enabled).
+ *
+ * Returns the number of bytes transferred, or a negative errno.
+ */
+ssize_t ocfs_rw_direct (int rw, struct file *filp, char *buf, size_t size, loff_t * offp)
+{
+#ifdef KERNEL_NO_F_IOBUF
+ struct kiobuf *iobuf;
+#else
+ struct kiobuf *iobuf = filp->f_iobuf;
+ int new_iobuf = 0;
+#endif
+ int err = 0;
+ unsigned long blocknr, blocks, myiosize;
+ size_t transferred;
+ int iosize, clustersize;
+ int i;
+ struct inode *inode = filp->f_dentry->d_inode;
+ int max_sectors;
+ int nbhs;
+ int sector_size, sector_bits, sector_mask, sectors_per_page;
+ int ret = 0;
+ int large_io = 0;
+ int inuse = 0;
+ unsigned long blocks_end_cluster = 0;
+ loff_t saved_off;
+ size_t saved_size;
+ unsigned long firstlogic;
+ long firstphys;
+ long nextphys;
+ unsigned long nextlogic = 0;
+ unsigned long totalioblocks = 0;
+
+ saved_off = *offp;
+ saved_size = size;
+
+ /* FIXME: Need to differentiate between sectors and blocksize */
+ sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
+ sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
+ sector_mask = sector_size - 1;
+ sectors_per_page = PAGE_SIZE / sector_size;
+ /* max sectors is 1024 in 2.4.9
+ * max data is 512kb
+ */
+
+ err = -EINVAL;
+ if (size == 0) {
+ printk("direct write of 0 byte\n");
+ return 0;
+ }
+
+ if (rw == READ) {
+ if (inode->i_size <= *offp) /* read past end of file */
+ return 0;
+ if (size > (inode->i_size - *offp))
+ size = inode->i_size - *offp;
+ }
+
+ /* make sure aligned to either PAGE_SIZE or sect_size IO */
+#ifndef LARGEIOS
+ if ((*offp & sector_mask) || (size & sector_mask))
+ /* if not, then fail, we need either to do dio */
+ return err;
+
+ max_sectors = KIO_MAX_SECTORS;
+ large_io = 0;
+#endif
+#ifdef LARGEIOS
+ if ((*offp & ~PAGE_MASK) || (size & ~PAGE_MASK)) {
+ /* if it's not PAGE_SIZE, then sect_size */
+ if ((*offp & sector_mask) || (size & sector_mask))
+ /* if not, then fail, we need either to do dio */
+ return err;
+ max_sectors = KIO_MAX_SECTORS; /* for 2.4.9 - 1024 */
+ } /* ok we 're PAGE_SIZE aligned, lets see if the buffer is */
+ else {
+ if (!((unsigned long) buf & ~PAGE_MASK)) {
+ /* yippie we are .. we can do PAGE_SIZE size io's */
+ large_io = 1;
+ /* for 2.4.9 */
+ max_sectors = KIO_MAX_SECTORS / sectors_per_page;
+ } else {
+ max_sectors = KIO_MAX_SECTORS;
+ large_io = 0;
+ }
+
+ }
+#endif
+ /* find out how far we are to the end of our cluster */
+
+ err = 0;
+ if (size)
+ err = -ENXIO;
+
+ /* Split the IO into KIO_MAX_SECTORS chunks, mapping and */
+ /* unmapping the single kiobuf as we go to perform each chunk of IO. */
+
+ transferred = 0;
+ blocknr = *offp >> sector_bits;
+ clustersize = inode->i_blksize >> sector_bits;
+ myiosize = size >> sector_bits;
+ blocks_end_cluster = clustersize - (blocknr % clustersize);
+ firstlogic = blocknr;
+ totalioblocks = 0;
+
+ ret = ocfs_get_block2 (inode, blocknr, &firstphys, sector_size);
+ if (ret < 0) {
+ /* ocfs_get_block2() folds every failure into -EIO; the old
+ * "== -1" test here could never match, so errors fell
+ * through with firstphys uninitialized.  Propagate. */
+ err = ret;
+ goto out;
+ }
+ while (myiosize > 0) {
+ if (blocks_end_cluster + 1 > myiosize) {
+ totalioblocks += myiosize;
+ myiosize = 0;
+ goto doio;
+ } else {
+ totalioblocks += blocks_end_cluster;
+ myiosize -= blocks_end_cluster;
+ nextlogic = firstlogic + blocks_end_cluster;
+ }
+again:
+ ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
+ if (ret < 0) {
+ /* see comment at the first ocfs_get_block2() call */
+ err = ret;
+ goto out;
+ }
+ if (nextphys == (firstphys + totalioblocks)) {
+ // merge ok
+ blocks_end_cluster = clustersize - (nextlogic % clustersize);
+ if (blocks_end_cluster + 1 > myiosize) {
+ totalioblocks += myiosize;
+ myiosize = 0;
+ } else {
+ totalioblocks += blocks_end_cluster;
+ myiosize -= blocks_end_cluster;
+ nextlogic = nextlogic + blocks_end_cluster;
+ goto again;
+ }
+ }
+doio:
+ size = totalioblocks << sector_bits;
+ if (large_io)
+ nbhs = (size >> PAGE_SHIFT);
+ else
+ nbhs = (size >> sector_bits);
+ if (nbhs > max_sectors)
+ nbhs = max_sectors;
+
+#ifdef KERNEL_NO_F_IOBUF
+ err = alloc_kiovec_sz (1, &iobuf, &nbhs);
+ if (err)
+ goto out;
+#else
+ if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+ /*
+ * A parallel read/write is using the preallocated iobuf
+ * so just run slow and allocate a new one.
+ */
+ err = alloc_kiovec_sz (1, &iobuf, &nbhs);
+ if (err)
+ goto out;
+ new_iobuf = 1;
+ } else
+ new_iobuf = 0;
+
+#endif
+ inuse = 1;
+
+ totalioblocks = 0;
+ while (size > 0) {
+ if (large_io) {
+ blocks = size >> PAGE_SHIFT;
+ if (blocks > max_sectors)
+ blocks = max_sectors;
+ iosize = blocks << PAGE_SHIFT;
+ } else {
+ blocks = size >> sector_bits;
+ if (blocks > max_sectors)
+ blocks = max_sectors;
+ iosize = blocks << sector_bits;
+ }
+ if (!blocks)
+ break;
+ err = map_user_kiobuf (rw, iobuf, (unsigned long) buf, iosize);
+ if (err)
+ break;
+ /* get the blocknr depending on io size for all blocks */
+ /* since we are always within the extent we only need to get the first block */
+ OCFS_KIO_BLOCKS(iobuf)[0] = firstphys + totalioblocks;
+
+ if (large_io) {
+ blocknr += sectors_per_page;
+ OCFS_KIO_BLOCKS(iobuf)[0] = OCFS_KIO_BLOCKS(iobuf)[0] / sectors_per_page;
+ } else {
+ blocknr++;
+ }
+
+ for (i = 1; i < blocks; i++) {
+ if (large_io) {
+ blocknr += sectors_per_page;
+ } else {
+ blocknr++;
+ }
+ OCFS_KIO_BLOCKS(iobuf)[i] = OCFS_KIO_BLOCKS(iobuf)[0] + i;
+ }
+ err = brw_kiovec (rw, 1, &iobuf, inode->i_dev, OCFS_KIO_BLOCKS(iobuf),
+ large_io ? PAGE_SIZE : sector_size);
+#ifdef SUSE
+ if (rw == READ && err > 0)
+ mark_dirty_kiobuf(iobuf, err);
+#endif
+ if (err >= 0) {
+ transferred += err;
+ size -= err;
+ buf += err;
+ if (large_io) {
+ totalioblocks +=
+ (blocks * sectors_per_page);
+ } else {
+ totalioblocks += blocks;
+ }
+ } else {
+ printk( "ocfs_rw_direct : brw_kiovec() %d\n", err);
+ break;
+ }
+ unmap_kiobuf (iobuf);
+ if (err != iosize)
+ break;
+ }
+#ifdef KERNEL_NO_F_IOBUF
+ free_kiovec_sz(1, &iobuf, &nbhs);
+#else
+ if (!new_iobuf)
+ clear_bit(0, &filp->f_iobuf_lock);
+ else
+ free_kiovec_sz(1, &iobuf, &nbhs);
+#endif
+ inuse = 0;
+ totalioblocks = 0;
+ firstlogic = nextlogic;
+ firstphys = nextphys;
+ }
+ if (transferred) {
+ *offp += transferred;
+ err = transferred;
+ }
+
+out:
+#ifdef KERNEL_NO_F_IOBUF
+ if (inuse)
+ free_kiovec_sz (1, &iobuf, &nbhs);
+#else
+ if (inuse) {
+ if (!new_iobuf)
+ clear_bit(0, &filp->f_iobuf_lock);
+ else
+ free_kiovec_sz(1, &iobuf, &nbhs);
+ }
+#endif
+ return err;
+} /* ocfs_rw_direct */
+
+#ifdef AIO_ENABLED
+/*
+ * ocfs_kvec_rw()
+ *
+ * 2.4 AIO worker shared by ocfs_kvec_read/write: computes the longest
+ * physically contiguous run starting at 'pos' (merging cluster by
+ * cluster, same walk as ocfs_rw_direct) and submits it asynchronously
+ * with brw_kvec_async().  pos and size must be sector-aligned.
+ */
+static int ocfs_kvec_rw(struct file *filp, int rw, kvec_cb_t cb,
+ size_t size, loff_t pos)
+{
+ int err = 0;
+ int max_sectors = 25000;
+ struct inode *inode = filp->f_dentry->d_inode;
+ unsigned long blocknr, blocks, iosize,myiosize;
+ long firstphys;
+ int clustersize;
+ unsigned long blocks_end_cluster = 0;
+
+ /* FIXME: Need to differentiate between sectors and blocksize */
+ int sector_bits = OCFS_SB(inode->i_sb)->s_sectsize_bits;
+ int sector_size = 1 << OCFS_SB(inode->i_sb)->s_sectsize_bits;
+ int sector_mask = sector_size - 1;
+
+ int ret;
+ unsigned long firstlogic;
+ long nextphys;
+ unsigned long nextlogic = 0;
+ unsigned long totalioblocks = 0;
+
+ /* nothing to do: complete the callback immediately */
+ if (!size || (pos == inode->i_size)) {
+ cb.fn(cb.data, cb.vec, err);
+ return err;
+ }
+
+ err = -ENXIO;
+ if (pos >= inode->i_size) {
+ return err;
+ }
+
+ err = -EINVAL;
+ if ((pos < 0) || (pos & sector_mask) || (size & sector_mask)) {
+ return err;
+ }
+
+ blocknr = pos >> sector_bits;
+
+ blocks = size >> sector_bits;
+ if (blocks > max_sectors)
+ blocks = max_sectors;
+ if (!blocks) {
+ err = -ENXIO;
+ return err;
+ }
+
+ iosize = blocks << sector_bits;
+ clustersize = inode->i_blksize >> sector_bits;
+ blocks_end_cluster = clustersize - (blocknr % clustersize);
+ myiosize = size >> sector_bits;
+ firstlogic = blocknr;
+ totalioblocks = 0;
+
+ err = ocfs_get_block2(inode, blocknr, &firstphys, sector_size);
+ if (err < 0) {
+ /* ocfs_get_block2() folds every failure into -EIO; the old
+ * "== -1" test here could never match, so errors fell
+ * through with firstphys uninitialized.  Propagate. */
+ return err;
+ }
+ if (blocks_end_cluster + 1 > myiosize) {
+ totalioblocks += myiosize;
+ myiosize = 0;
+ goto doio;
+ } else {
+ totalioblocks += blocks_end_cluster;
+ myiosize -= blocks_end_cluster;
+ nextlogic = firstlogic + blocks_end_cluster;
+ }
+again:
+ ret = ocfs_get_block2 (inode, nextlogic, &nextphys, sector_size);
+ if (ret < 0) {
+ /* see comment at the first ocfs_get_block2() call */
+ return ret;
+ }
+ if (nextphys == (firstphys + totalioblocks)) {
+ /* physically contiguous: extend the run and keep walking */
+ blocks_end_cluster = clustersize - (nextlogic % clustersize);
+ if (blocks_end_cluster + 1 > myiosize) {
+ totalioblocks += myiosize;
+ myiosize = 0;
+ } else {
+ totalioblocks += blocks_end_cluster;
+ myiosize -= blocks_end_cluster;
+ nextlogic = nextlogic + blocks_end_cluster;
+ goto again;
+ }
+ }
+doio:
+ blocks = totalioblocks;
+ err = brw_kvec_async(rw, cb, inode->i_dev, blocks, firstphys, sector_bits);
+ return err;
+
+}
+
+/* AIO read entry point: thin logging wrapper around ocfs_kvec_rw(). */
+int ocfs_kvec_read(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+ int ret;
+
+ LOG_SET_CONTEXT(KVEC_READ);
+
+ ret = ocfs_kvec_rw(file, READ, cb, size, pos);
+
+ LOG_CLEAR_CONTEXT();
+ return ret;
+}
+
+/* AIO write entry point: thin logging wrapper around ocfs_kvec_rw(). */
+int ocfs_kvec_write(struct file *file, kvec_cb_t cb, size_t size, loff_t pos)
+{
+ int ret;
+
+ LOG_SET_CONTEXT(KVEC_WRITE);
+
+ ret = ocfs_kvec_rw(file, WRITE, cb, size, pos);
+
+ LOG_CLEAR_CONTEXT();
+ return ret;
+}
+
+#endif /* aio */
+#endif /* 2.6 */
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2004-08-13 20:40 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-08-13 20:40 [Ocfs2-devel] [PATCH] split inode.c Christoph Hellwig
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.