[PATCH] prototype file data inode inlining

public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] prototype file data inode inlining
@ 2008-03-07  9:34 IWAMOTO Toshihiro
  2008-03-08  0:21 ` David Chinner
  0 siblings, 1 reply; 3+ messages in thread
From: IWAMOTO Toshihiro @ 2008-03-07  9:34 UTC (permalink / raw)
  To: xfs

Hi,

I've done a prototype implementation of file data inlining in inodes a
while ago.  It was originally meant to solve a performance problem
with a large number of small files at some customer site.
Although I measured some performance gains, a different workaround has
been adopted due to the patch quality problem.

As I'm not asking for inclusion, the patch hasn't been ported to the
current kernel version.  This patch might be useful if someone has a
similar performance problem and would like to see if file inlining
helps or not.

Some random notes and the patch itself follows.

Inlined file data are written from xfs_page_state_convert().
The xfs_trans related operations in that function is to get inode
written on disk and isn't for crash consistency.

Small files are made inlined when created.  Non inlined files don't
get inlined when they are truncated.

xfs_bmap_local_to_extents() has been modified to work with file data,
but logging isn't implemented.  A machine crash can cause data
corruption.

O_SYNC may behave incorrectly.

Use of attribute forks isn't considered and likely has issues.

diff -urp linux-2.6.12.5.orig/fs/xfs/linux-2.6/xfs_aops.c linux-2.6.12.5/fs/xfs/linux-2.6/xfs_aops.c
--- linux-2.6.12.5.orig/fs/xfs/linux-2.6/xfs_aops.c	2005-08-15 09:20:18.000000000 +0900
+++ linux-2.6.12.5/fs/xfs/linux-2.6/xfs_aops.c	2008-03-05 18:05:32.383592506 +0900
@@ -49,6 +49,7 @@
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
 #include "xfs_iomap.h"
@@ -567,6 +568,7 @@ xfs_submit_page(
 	if (bh_count) {
 		for (i = 0; i < bh_count; i++) {
 			bh = bh_arr[i];
+			BUG_ON(bh->b_bdev == NULL);
 			mark_buffer_async_write(bh);
 			if (buffer_unwritten(bh))
 				set_buffer_unwritten_io(bh);
@@ -725,6 +727,10 @@ xfs_page_state_convert(
 {
 	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
 	xfs_iomap_t		*iomp, iomap;
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	xfs_inode_t		*ip = XFS_BHVTOI(vp->v_fbhv);
+	xfs_trans_t		*tp;
+	xfs_mount_t		*mp = ip->i_mount;
 	loff_t			offset;
 	unsigned long           p_offset = 0;
 	__uint64_t              end_offset;
@@ -740,16 +746,18 @@ xfs_page_state_convert(
 	offset = i_size_read(inode);
 	end_index = offset >> PAGE_CACHE_SHIFT;
 	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
+	end_offset = min_t(unsigned long long,
+			(loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
 	if (page->index >= end_index) {
 		if ((page->index >= end_index + 1) ||
 		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
+			if (printk_ratelimit())
+				printk("xfs_psc: i_size %d\n", i_size_read(inode));
 			err = -EIO;
 			goto error;
 		}
 	}
 
-	end_offset = min_t(unsigned long long,
-			(loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
 	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
 
 	/*
@@ -765,6 +773,58 @@ xfs_page_state_convert(
 	p_offset = 0;
 	bh = head = page_buffers(page);
 
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+	    end_offset <= XFS_IFORK_DSIZE(ip)) {
+		char *v;
+
+		if (printk_ratelimit())
+			printk("xfs_psc: %llu %d\n", end_offset, XFS_IFORK_DSIZE(ip)); 
+		if (end_offset > ip->i_df.if_bytes)
+			xfs_idata_realloc(ip, end_offset - ip->i_df.if_bytes,
+			    XFS_DATA_FORK);
+		if ((!PageDirty(page)) && printk_ratelimit())
+			printk("xfs_page_state_convert: is clean\n");
+		clear_page_dirty(page);
+		clear_buffer_dirty(bh); /* XXX */
+		v = kmap(page);
+		memcpy(ip->i_df.if_u1.if_data, v, end_offset);
+		kunmap(page);
+		set_buffer_uptodate(bh);
+		SetPageUptodate(page);
+			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
+			if ((err = xfs_trans_reserve(tp, 0,
+					      XFS_SWRITE_LOG_RES(mp),
+					      0, 0, 0))) {
+				/* Transaction reserve failed */
+				xfs_trans_cancel(tp, 0);
+			} else {
+				/* Transaction reserve successful */
+				xfs_ilock(ip, XFS_ILOCK_EXCL);
+				xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(tp, ip);
+				xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+				/* XXX O_SYNC handled by xfs_write?? */
+				/* xfs_trans_set_sync(tp); */
+				err = xfs_trans_commit(tp, 0, NULL);
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			}
+		ASSERT(err == 0);
+		unlock_page(page); /* XXX */
+		return 0;
+	}
+
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		/*
+		 * Data no longer fits in an inode.
+		 * Clear the mapped bit so that xfs_bmap_local_to_extents()
+		 * gets called from xfs_bmapi().
+		 */
+		clear_buffer_mapped(bh);
+		unmapped = 1;
+		if (printk_ratelimit())
+			printk("xfs_psc: clearing LOCAL ino %llu %llu %x %x\n",
+			    ip->i_ino, end_offset, bh->b_state, page->flags);
+	}
 	do {
 		if (offset >= end_offset)
 			break;
@@ -897,9 +957,15 @@ xfs_page_state_convert(
 					startio, unmapped, tlast);
 	}
 
+	BUG_ON(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+	    end_offset > XFS_IFORK_DSIZE(ip));
 	return page_dirty;
 
 error:
+	if (printk_ratelimit())
+		printk("xfs_psc: error\n");
+	BUG_ON(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+	    end_offset > XFS_IFORK_DSIZE(ip));
 	for (i = 0; i < cnt; i++) {
 		unlock_buffer(bh_arr[i]);
 	}
@@ -929,6 +995,7 @@ __linvfs_get_block(
 	bmapi_flags_t		flags)
 {
 	vnode_t			*vp = LINVFS_GET_VP(inode);
+	xfs_inode_t		*ip = XFS_BHVTOI(vp->v_fbhv);
 	xfs_iomap_t		iomap;
 	int			retpbbm = 1;
 	int			error;
@@ -940,6 +1007,29 @@ __linvfs_get_block(
 	else
 		size = 1 << inode->i_blkbits;
 
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		char *v;
+		v = kmap(bh_result->b_page);
+		if (printk_ratelimit())
+			printk("__linvfs_get_block: memcpy ino %llu, %d bytes\n",
+			    ip->i_ino, (int)ip->i_d.di_size);
+		if (ip->i_df.if_u1.if_data == NULL ||
+		    ip->i_d.di_size > ip->i_df.if_bytes)
+			/* seek happened beyond EOF */
+			xfs_idata_realloc(ip,
+			    ip->i_d.di_size - ip->i_df.if_bytes, XFS_DATA_FORK);
+		memcpy(v, ip->i_df.if_u1.if_data, (int)ip->i_d.di_size);
+		memset(v + (int)ip->i_d.di_size, 0,
+		    PAGE_SIZE - ip->i_d.di_size);	/* XXX */
+		kunmap(bh_result->b_page);
+		set_buffer_uptodate(bh_result);
+		/* XXX do_mpage_readpage apparently needs this to be mapped */
+		set_buffer_mapped(bh_result);
+		SetPageUptodate(bh_result->b_page); /* XXX */
+		if (PageDirty(bh_result->b_page) && printk_ratelimit()) /* XXX */
+			printk("__linvfs_get_block: is dirty\n");
+		return 0;
+	}
 	VOP_BMAP(vp, offset, size,
 		create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
 	if (error)
@@ -1143,6 +1233,7 @@ linvfs_writepage(
 	int			need_trans;
 	int			delalloc, unmapped, unwritten;
 	struct inode		*inode = page->mapping->host;
+	xfs_inode_t		*ip;
 
 	xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
 
@@ -1164,14 +1255,28 @@ linvfs_writepage(
 		need_trans = delalloc + unmapped + unwritten;
 	}
 
+	ip = XFS_BHVTOI(LINVFS_GET_VP(inode)->v_fbhv);
+	/* see xfs_page_state_convert */
+	/* XXX dup code */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+	    i_size_read(inode) > XFS_IFORK_DSIZE(ip)) {
+		unmapped = 1;
+		need_trans = 1;
+	}
+
 	/*
 	 * If we need a transaction and the process flags say
 	 * we are already in a transaction, or no IO is allowed
 	 * then mark the page dirty again and leave the page
 	 * as is.
 	 */
-	if (PFLAGS_TEST_FSTRANS() && need_trans)
+	if (PFLAGS_TEST_FSTRANS() && need_trans) {
+		if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+		    printk_ratelimit())
+			printk("linvfs_writepage: out_fail ino %llu\n",
+			    ip->i_ino);
 		goto out_fail;
+	}
 
 	/*
 	 * Delay hooking up buffer heads until we have
diff -urp linux-2.6.12.5.orig/fs/xfs/linux-2.6/xfs_lrw.c linux-2.6.12.5/fs/xfs/linux-2.6/xfs_lrw.c
--- linux-2.6.12.5.orig/fs/xfs/linux-2.6/xfs_lrw.c	2005-08-15 09:20:18.000000000 +0900
+++ linux-2.6.12.5/fs/xfs/linux-2.6/xfs_lrw.c	2008-02-29 17:28:36.170355201 +0900
@@ -411,6 +411,8 @@ xfs_zero_last_block(
 {
 	xfs_fileoff_t	last_fsb;
 	xfs_mount_t	*mp;
+	vnode_t         *vp = LINVFS_GET_VP(ip);
+	xfs_inode_t     *xip = XFS_BHVTOI(vp->v_fbhv);
 	int		nimaps;
 	int		zero_offset;
 	int		zero_len;
@@ -488,6 +490,7 @@ xfs_zero_eof(
 	xfs_fsize_t	end_size)	/* terminal inode size */
 {
 	struct inode	*ip = LINVFS_GET_IP(vp);
+	xfs_inode_t     *xip = XFS_BHVTOI(vp->v_fbhv);
 	xfs_fileoff_t	start_zero_fsb;
 	xfs_fileoff_t	end_zero_fsb;
 	xfs_fileoff_t	prev_zero_fsb;
@@ -507,6 +510,13 @@ xfs_zero_eof(
 
 	mp = io->io_mount;
 
+	if (xip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		if (offset < xip->i_df.if_bytes)
+			memset(xip->i_df.if_u1.if_data + offset, 0,
+			    xip->i_df.if_bytes - offset);
+		return 0;
+	}
+
 	/*
 	 * First handle zeroing the block on which isize resides.
 	 * We only zero a part of that block so it is handled specially.
@@ -664,6 +674,9 @@ xfs_write(
 		break;
 	}
 
+	if (xip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+	    printk_ratelimit()) /* XXX */
+		printk("xfs_write: ino %llu, %lu bytes\n", xip->i_ino, ocount);
 	count = ocount;
 	pos = *offset;
 
@@ -769,6 +782,21 @@ start:
 		inode_update_time(inode, 1);
 	}
 
+	if ((!(mp->m_flags & XFS_MOUNT_NOIFILE)) &&
+	    new_size > isize && /* XXX unneeded ? */
+	    new_size <= XFS_IFORK_DSIZE(xip)) {
+		xfs_fileoff_t last_block;
+
+		/* XXX lock */
+		error = xfs_bmap_last_offset(NULL, xip, &last_block,
+			XFS_DATA_FORK);
+		if (!error && !last_block) {
+			xip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+			xip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+			xip->i_df.if_flags |= XFS_IFINLINE;
+		}
+	}
+
 	/*
 	 * If the offset is beyond the size of the file, we have a couple
 	 * of things to do. First, if there is already space allocated
@@ -855,6 +883,9 @@ retry:
 				*offset, ioflags);
 		ret = generic_file_buffered_write(iocb, iovp, segs,
 				pos, offset, count, ret);
+		if (xip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+		    printk_ratelimit()) /* XXX */
+			printk("xfs_write: generic_file_buffered_write ino %llu, ret %d\n", xip->i_ino, ret);
 	}
 
 	current->backing_dev_info = NULL;
diff -urp linux-2.6.12.5.orig/fs/xfs/xfs_bmap.c linux-2.6.12.5/fs/xfs/xfs_bmap.c
--- linux-2.6.12.5.orig/fs/xfs/xfs_bmap.c	2005-08-15 09:20:18.000000000 +0900
+++ linux-2.6.12.5/fs/xfs/xfs_bmap.c	2008-02-29 17:28:36.207308945 +0900
@@ -3344,13 +3344,23 @@ xfs_bmap_local_to_extents(
 	static char	fname[] = "xfs_bmap_local_to_extents";
 #endif
 	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		ifile = 0;
 
+#if 0
 	/*
 	 * We don't want to deal with the case of keeping inode data inline yet.
 	 * So sending the data fork of a regular inode is invalid.
 	 */
 	ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG &&
 		 whichfork == XFS_DATA_FORK));
+#else
+	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG &&
+		 whichfork == XFS_DATA_FORK) {
+		ifile = 1;
+		if (printk_ratelimit())
+			printk("xfs_bmap_local_to_extents: ino %d\n", ip->i_ino);
+	}
+#endif
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
 	flags = 0;
@@ -3386,10 +3396,12 @@ xfs_bmap_local_to_extents(
 		ASSERT(args.fsbno != NULLFSBLOCK);
 		ASSERT(args.len == 1);
 		*firstblock = args.fsbno;
+		if (!ifile) { /* XXX */
 		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
 		memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data,
 			ifp->if_bytes);
 		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+		}
 		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
 		xfs_iext_realloc(ip, 1, whichfork);
 		ep = ifp->if_u1.if_extents;
@@ -4628,6 +4640,10 @@ xfs_bmapi(
 	nallocs = 0;
 	cur = NULL;
 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		if (!wr) { /* XXX */
+			*nmap = 0;
+			return 0;
+		}
 		ASSERT(wr && tp);
 		if ((error = xfs_bmap_local_to_extents(tp, ip,
 				firstblock, total, &logflags, whichfork)))
diff -urp linux-2.6.12.5.orig/fs/xfs/xfs_clnt.h linux-2.6.12.5/fs/xfs/xfs_clnt.h
--- linux-2.6.12.5.orig/fs/xfs/xfs_clnt.h	2005-08-15 09:20:18.000000000 +0900
+++ linux-2.6.12.5/fs/xfs/xfs_clnt.h	2008-02-29 17:36:25.283560299 +0900
@@ -106,5 +106,6 @@ struct xfs_mount_args {
 #define XFSMNT_IHASHSIZE	0x20000000	/* inode hash table size */
 #define XFSMNT_DIRSYNC		0x40000000	/* sync creat,link,unlink,rename
 						 * symlink,mkdir,rmdir,mknod */
+#define XFSMNT_NOIFILE		0x80000000	/* do not create inlined file */
 
 #endif	/* __XFS_CLNT_H__ */
diff -urp linux-2.6.12.5.orig/fs/xfs/xfs_inode.c linux-2.6.12.5/fs/xfs/xfs_inode.c
--- linux-2.6.12.5.orig/fs/xfs/xfs_inode.c	2005-08-15 09:20:18.000000000 +0900
+++ linux-2.6.12.5/fs/xfs/xfs_inode.c	2008-02-29 17:28:36.260126921 +0900
@@ -283,6 +283,20 @@ xfs_inotobp(
 	return 0;
 }
 
+void
+xfs_inode_buf_dump(xfs_buf_t *bp, u_short len)
+{
+	int pgs = len >> PAGE_SHIFT;
+	char *p;
+	int i, j;
+
+	for(i = 0; i < pgs; i++) {
+		p = xfs_buf_offset(bp, i << PAGE_SHIFT);
+		for(j = PAGE_SIZE; j; j--)
+			printk(" %02x", *p++);
+		printk("\n");
+	}
+}
 
 /*
  * This routine is called to map an inode to the buffer containing
@@ -413,6 +427,7 @@ xfs_itobp(
 				mp->m_ddev_targp,
 				(unsigned long long)imap.im_blkno, i,
 				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
+			xfs_inode_buf_dump(bp, BBTOB(imap.im_len));
 #endif
 			XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
 					     mp, dip);
@@ -506,6 +521,7 @@ xfs_iformat(
 	case S_IFDIR:
 		switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
 		case XFS_DINODE_FMT_LOCAL:
+#if 0
 			/*
 			 * no local regular files yet
 			 */
@@ -518,7 +534,7 @@ xfs_iformat(
 						     ip->i_mount, dip);
 				return XFS_ERROR(EFSCORRUPTED);
 			}
-
+#endif
 			di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
 			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
 				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
@@ -634,6 +650,9 @@ xfs_iformat_local(
 		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
 	ifp->if_flags &= ~XFS_IFEXTENTS;
 	ifp->if_flags |= XFS_IFINLINE;
+	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG && printk_ratelimit())
+		printk("xfs_iformat_local: ino %llu %p, size %d\n",
+		    ip->i_ino, ip, size);
 	return 0;
 }
 
@@ -1684,6 +1703,18 @@ xfs_itruncate_finish(
 		unmap_len = last_block - first_unmap_block + 1;
 	}
 	while (!done) {
+		if (fork == XFS_DATA_FORK &&
+		    ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+			/* XXX realloc */
+			/* XXX real_size */
+			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+#if 0
+			ntp = xfs_trans_dup(*tp);
+			error = xfs_trans_commit(*tp, 0, NULL);
+#endif
+			committed = 0;
+			done = 1;
+		} else {
 		/*
 		 * Free up up to XFS_ITRUNC_MAX_EXTENTS.  xfs_bunmapi()
 		 * will tell us whether it freed the entire range or
@@ -1754,6 +1785,7 @@ xfs_itruncate_finish(
 			}
 			return error;
 		}
+		}
 
 		if (committed) {
 			/*
@@ -3026,6 +3058,8 @@ xfs_iflush_fork(
 			ASSERT(ifp->if_u1.if_data != NULL);
 			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
 			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+			if (printk_ratelimit())
+				printk("xfs_iflush_fork: copying %d\n", ifp->if_bytes);
 		}
 		if (whichfork == XFS_DATA_FORK) {
 			if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
@@ -3414,6 +3448,7 @@ xfs_iflush_int(
 		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
 		    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
 			ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
+		xfs_inode_buf_dump(bp, BBTOB(ip->i_len));
 		goto corrupt_out;
 	}
 	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
@@ -3421,12 +3456,14 @@ xfs_iflush_int(
 		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
 			"xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
 			ip->i_ino, ip, ip->i_d.di_magic);
+		xfs_inode_buf_dump(bp, BBTOB(ip->i_len));
 		goto corrupt_out;
 	}
 	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
-		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
 		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
 			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
 				"xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
diff -urp linux-2.6.12.5.orig/fs/xfs/xfs_mount.h linux-2.6.12.5/fs/xfs/xfs_mount.h
--- linux-2.6.12.5.orig/fs/xfs/xfs_mount.h	2005-08-15 09:20:18.000000000 +0900
+++ linux-2.6.12.5/fs/xfs/xfs_mount.h	2008-02-29 17:28:36.280412258 +0900
@@ -421,6 +421,7 @@ typedef struct xfs_mount {
 						 * allocation */
 #define XFS_MOUNT_IHASHSIZE	0x00100000	/* inode hash table size */
 #define XFS_MOUNT_DIRSYNC	0x00200000	/* synchronous directory ops */
+#define XFS_MOUNT_NOIFILE	0x00400000
 
 /*
  * Default minimum read and write sizes.
diff -urp linux-2.6.12.5.orig/fs/xfs/xfs_vfsops.c linux-2.6.12.5/fs/xfs/xfs_vfsops.c
--- linux-2.6.12.5.orig/fs/xfs/xfs_vfsops.c	2005-08-15 09:20:18.000000000 +0900
+++ linux-2.6.12.5/fs/xfs/xfs_vfsops.c	2008-02-29 17:28:36.303523635 +0900
@@ -307,6 +307,9 @@ xfs_start_flags(
 	if (ap->flags & XFSMNT_DIRSYNC)
 		mp->m_flags |= XFS_MOUNT_DIRSYNC;
 
+	if (ap->flags & XFSMNT_NOIFILE)
+		mp->m_flags |= XFS_MOUNT_NOIFILE;
+
 	/*
 	 * no recovery flag requires a read-only mount
 	 */
@@ -1657,6 +1660,7 @@ xfs_vget(
 #define MNTOPT_64BITINODE   "inode64"	/* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP	"ikeep"		/* do not free empty inode clusters */
 #define MNTOPT_NOIKEEP	"noikeep"	/* free empty inode clusters */
+#define MNTOPT_NOIFILE	"noifile"	/* do not create inlined file */
 
 STATIC unsigned long
 suffix_strtoul(const char *cp, char **endp, unsigned int base)
@@ -1815,6 +1819,8 @@ xfs_parseargs(
 			args->flags &= ~XFSMNT_IDELETE;
 		} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
 			args->flags |= XFSMNT_IDELETE;
+		} else if (!strcmp(this_char, MNTOPT_NOIFILE)) {
+			args->flags |= XFSMNT_NOIFILE;
 		} else if (!strcmp(this_char, "osyncisdsync")) {
 			/* no-op, this is now the default */
 printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
@@ -1886,6 +1892,7 @@ xfs_showargs(
 		{ XFS_MOUNT_OSYNCISOSYNC,	"," MNTOPT_OSYNCISOSYNC },
 		{ XFS_MOUNT_NOLOGFLUSH,		"," MNTOPT_NOLOGFLUSH },
 		{ XFS_MOUNT_IDELETE,		"," MNTOPT_NOIKEEP },
+		{ XFS_MOUNT_NOIFILE,		"," MNTOPT_NOIFILE },
 		{ 0, NULL }
 	};
 	struct proc_xfs_info	*xfs_infop;

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] prototype file data inode inlining
  2008-03-07  9:34 [PATCH] prototype file data inode inlining IWAMOTO Toshihiro
@ 2008-03-08  0:21 ` David Chinner
  2008-03-11  8:30   ` IWAMOTO Toshihiro
  0 siblings, 1 reply; 3+ messages in thread
From: David Chinner @ 2008-03-08  0:21 UTC (permalink / raw)
  To: IWAMOTO Toshihiro; +Cc: xfs

On Fri, Mar 07, 2008 at 06:34:09PM +0900, IWAMOTO Toshihiro wrote:
> Hi,
> 
> I've done a prototype implementation of file data inlining in inodes a
> while ago.  It was originally meant to solve a performance problem
> with a large number of small files at some customer site.
> Although I measured some performance gains, a different workaround has
> been adopted due to the patch quality problem.
> 
> As I'm not asking for inclusion, the patch hasn't been ported to the
> current kernel version.  This patch might be useful if someone has a
> similar performance problem and would like to see if file inlining
> helps or not.

Interesting. I'm not going to comment on the code, just the overall
design and implementation.

Problems:

	- data loss on crash is unacceptable

	- this is an on-disk format change - it needs to be
	  implemented either as a mkfs option with a specific
	  superblock feature bit, or as a mount option with a
	  version 3 inode and a superblock feature bit to indicate
	  inodes with data in them have been created.

	- local -> extent conversion occurs at copy-in time, not writeback
	  time, so using the normal read/write paths through ->get_blocks()
	  will fail here in xfs_bmapi():

   4793         if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
   4794 >>>>>>>>        ASSERT(wr && tp);
   4795                 if ((error = xfs_bmap_local_to_extents(tp, ip,
   4796                                 firstblock, total, &logflags, whichfork)))
   4797                         goto error0;
   4798         }

	  because on a normal read or write (delayed allocation) we
	  are not doing allocation and hence do not have an open
	  transaction the first time we come through here. Just
	  avoiding this conversion and returning zero maps if we are
	  not writing will not help in the delayed allocation case.

I note that you hacked around this by special casing the inline
format ->get_blocks() callout to copy the data into the page and
marking it mapped and uptodate. I think this is the wrong approach
and is not the right layer at which to make this distinction - the special
casing needs to be done at higher layers, not in the block mapping
function.

I think for inline data, we'd do best to special case this as high
up the read/write paths as possible. e.g. for read() type operations
intercept in xfs_read() and just do what we need to do there for
populating the single page cache page. For write, we should let it
go through the normal delayed allocation mechanisms, only converting
to local format during ->writepage() if there's a single block
extent and it fits in the data fork. This also handles the truncate
case nicely.

For mmap operations, we need to handle the inline case separately
to the normal ->readpage case, similar to the xfs_read() case.
->readpages should never occur on an inline data inode.

> Some random notes and the patch itself follows.
> 
> Inlined file data are written from xfs_page_state_convert().
> The xfs_trans related operations in that function is to get inode
> written on disk and isn't for crash consistency.

Which is the exact opposite of what they are supposed to be used for.
Given that the next thing that happens after data write in the writeback path
is ->write_inode(), forcing the inode into the log for pure data changes
is unnecessary. We just need to format the data into the inode during
data writeback.

> Small files are made inlined when created.  Non inlined files don't
> get inlined when they are truncated.

As I inferred above, I think this is the wrong approach. Start the
inodes in extent format just like they currently are, and only
convert in writeback. This means no changes to the write path or
delayed allocation handling. That is, only the disk format should
care if the data is inline or not; everything in memory still treats
data as block based extents. i.e. the only time we do anything w.r.t
local data format is reading the inode off disk and writing it back
to disk.

The only issue here is that extent->local conversion requires a
free transaction, not an allocation transaction, but that should
not be difficult to handle as we can log the inode complete with
inline data in the free transaction to make that conversion atomic.

> xfs_bmap_local_to_extents() has been modified to work with file data,
> but logging isn't implemented.  A machine crash can cause data
> corruption.

There are two ways to do inline->extent safely from a crash recovery
perspective. 

Method 1: Use an Intent/Done transaction pair

The way this needs to be done is via a pair of transactions. The
first allocation transaction remains the same, but needs a different
type - an "allocation intent" rather than an "allocation"
transaction.  On data I/O completion, we then need an " allocation
complete" transaction that signals that the data is on disk and the
allocation intent is now permanent.

That means we can change state in memory and log it to disk before
the data write is done, but it won't get replayed on crash unless
the allocation completion transaction is also in the log after the
data is safely on disk.  Hence we don't overwrite data in the inode
during recovery if there is no copy of it elsewhere.

This needs modifications to the recovery code to understand the
new transaction types correctly.

Method 2: Log the data

We can log any object that is held in a xfs_buf_t. During
conversion, we could simply build an xfs_buf_t that points to the
page that holds the data and log that.  The complexity here is that
the buffer needs to point to the inodes address space, not the
address space of the buftarg where all the metadata resides.  The
xfs_buf_t in this case should only exist for the life of the data
I/O; once the data I/O is complete we can tear it down and go back
to treating the page normally.

> O_SYNC may behave incorrectly.

->write_inode(SYNC) should handle it just fine.

> Use of attribute forks isn't considered and likely has issues.

If you don't change the way the attribute fork handling works, it
should be just fine.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] prototype file data inode inlining
  2008-03-08  0:21 ` David Chinner
@ 2008-03-11  8:30   ` IWAMOTO Toshihiro
  0 siblings, 0 replies; 3+ messages in thread
From: IWAMOTO Toshihiro @ 2008-03-11  8:30 UTC (permalink / raw)
  To: David Chinner; +Cc: IWAMOTO Toshihiro, xfs

At Sat, 8 Mar 2008 11:21:24 +1100,
David Chinner wrote:
> Interesting. I'm not going to comment on the code, just the overall
> design and implementation.

Thanks for comments.

> Problems:

> 	- local -> extent conversion occurs at copy-in time, not writeback
> 	  time, so using the normal read/write paths through ->get_blocks()
> 	  will fail here in xfs_bmapi():
> 
>    4793         if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
>    4794 >>>>>>>>        ASSERT(wr && tp);
>    4795                 if ((error = xfs_bmap_local_to_extents(tp, ip,
>    4796                                 firstblock, total, &logflags, whichfork)))
>    4797                         goto error0;
>    4798         }
> 
> 	  because on a normal read or write (delayed allocation) we
> 	  are not doing allocation and hence do not have an open
> 	  transaction the first time we come through here. Just
> 	  avoiding this conversion and returning zero maps if we are
> 	  not writing will not help in the delayed allocation case.
> 	
> 
> I note that you hacked around this by special casing the inline
> format ->get_blocks() callout to copy the data into the page and
> marking it mapped and uptodate. I think this is the wrong approach
> and is not the right layer at which to make this distinction - the special
> casing needs to be done at higher layers, not in the block mapping
> function.
> 
> I think for inline data, we'd do best to special case this as high
> up the read/write paths as possible. e.g. for read() type operations
> intercept in xfs_read() and just do what we need to do there for
> populating the single page cache page. For write, we should let it
> go through the normal delayed allocation mechanisms, only converting
> to local format during ->writepage() if there's a single block
> extent and it fits in the data fork. This also handles the truncate
> case nicely.

I was vaguely aware of layering violation, but took my dirty&quick
approach as the largest concern at that time was whether file inlining
gives enough performance gain.

With your suggestion, I would be able to implement better.
Unfortunately, I lack time to do so.

> > Some random notes and the patch itself follows.
> > 
> > Inlined file data are written from xfs_page_state_convert().
> > The xfs_trans related operations in that function is to get inode
> > written on disk and isn't for crash consistency.
> 
> Which is the exact opposite of what they are supposed to be used for.
> Given that the next thing that happens after data write in the writeback path
> is ->write_inode(), forcing the inode into the log for pure data changes
> is unnecessary. We just need to format the data into the inode during
> data writeback.

It seemed that setting the XFS_ILOG_DDATA bit in
ip->i_itemp->ili_format.ilf_fields was necessary for xfs_iflush_fork,
and I wasn't aware of other solutions.


> > xfs_bmap_local_to_extents() has been modified to work with file data,
> > but logging isn't implemented.  A machine crash can cause data
> > corruption.
> 
> There are two ways to do inline->extent safely from a crash recovery
> perspective. 
> 
> Method 1: Use an Intent/Done transaction pair

> Method 2: Log the data

Thanks for explanation.  It doesn't sound so complicated as I've
imagined.

--
IWAMOTO Toshihiro

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2008-03-11  8:30 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-07  9:34 [PATCH] prototype file data inode inlining IWAMOTO Toshihiro
2008-03-08  0:21 ` David Chinner
2008-03-11  8:30   ` IWAMOTO Toshihiro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox