[Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics

All of lore.kernel.org
 help / color / mirror / Atom feed

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
@ 2015-09-11  8:19 Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 1/8] ocfs2: add ocfs2_write_type_t type to identify the caller of write Ryan Ding
                   ` (8 more replies)
  0 siblings, 9 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

The idea is to use buffer io(more precisely use the interface
ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
block size. And clear UNWRITTEN flag until direct io data has been written to
disk, which can prevent data corruption when system crashed during direct write.

And we will also archive a better performance:
eg. dd direct write new file with block size 4KB:
before this patch:
2.5 MB/s
after this patch:
66.4 MB/s

----------------------------------------------------------------
Ryan Ding (8):
      ocfs2: add ocfs2_write_type_t type to identify the caller of write
      ocfs2: use c_new to indicate newly allocated extents
      ocfs2: test target page before change it
      ocfs2: do not change i_size in write_end for direct io
      ocfs2: return the physical address in ocfs2_write_cluster
      ocfs2: record UNWRITTEN extents when populate write desc
      ocfs2: fix sparse file & data ordering issue in direct io. 
      ocfs2: code clean up for direct io

 fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
 fs/ocfs2/aops.h        |   11 +-
 fs/ocfs2/file.c        |  138 +---------------------
 fs/ocfs2/inode.c       |    3 +
 fs/ocfs2/inode.h       |    3 + 
 fs/ocfs2/mmap.c        |    4 +-
 fs/ocfs2/ocfs2_trace.h |   16 +-- 
 fs/ocfs2/super.c       |    1 +
 8 files changed, 568 insertions(+), 726 deletions(-)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 1/8] ocfs2: add ocfs2_write_type_t type to identify the caller of write
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 2/8] ocfs2: use c_new to indicate newly allocated extents Ryan Ding
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

Remove unused args filp & flags. Add new arg type. The type is one of
buffer/direct/mmap. Indicate 3 way to perform write. buffer/mmap type has
implemented. direct type will be implemented later.

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/aops.c |   20 ++++++++++++--------
 fs/ocfs2/aops.h |   11 ++++++++---
 fs/ocfs2/mmap.c |    4 ++--
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 64b11d9..b65ebb1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1216,6 +1216,9 @@ struct ocfs2_write_ctxt {
 	/* First cluster allocated in a nonsparse extend */
 	u32				w_first_new_cpos;
 
+	/* Type of caller. Must be one of buffer, mmap, direct.  */
+	ocfs2_write_type_t		w_type;
+
 	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
 	/*
@@ -1311,7 +1314,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
 
 static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 				  struct ocfs2_super *osb, loff_t pos,
-				  unsigned len, struct buffer_head *di_bh)
+				  unsigned len, ocfs2_write_type_t type,
+				  struct buffer_head *di_bh)
 {
 	u32 cend;
 	struct ocfs2_write_ctxt *wc;
@@ -1326,6 +1330,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
 	wc->w_di_bh = di_bh;
+	wc->w_type = type;
 
 	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
 		wc->w_large_pages = 1;
@@ -2069,9 +2074,8 @@ out:
 	return ret;
 }
 
-int ocfs2_write_begin_nolock(struct file *filp,
-			     struct address_space *mapping,
-			     loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+			     loff_t pos, unsigned len, ocfs2_write_type_t type,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
@@ -2088,7 +2092,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
 	int try_free = 1, ret1;
 
 try_again:
-	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -2145,7 +2149,7 @@ try_again:
 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
 			(long long)i_size_read(inode),
 			le32_to_cpu(di->i_clusters),
-			pos, len, flags, mmap_page,
+			pos, len, type, mmap_page,
 			clusters_to_alloc, extents_to_split);
 
 	/*
@@ -2315,8 +2319,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
-				       fsdata, di_bh, NULL);
+	ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+				       pagep, fsdata, di_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_fail;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d..d06b80f 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct page *page, void *fsdata);
 
-int ocfs2_write_begin_nolock(struct file *filp,
-			     struct address_space *mapping,
-			     loff_t pos, unsigned len, unsigned flags,
+typedef enum {
+	OCFS2_WRITE_BUFFER = 0,
+	OCFS2_WRITE_DIRECT,
+	OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+			     loff_t pos, unsigned len, ocfs2_write_type_t type,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page);
 
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d19..a88707a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 	if (page->index == last_index)
 		len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
 
-	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
-				       &fsdata, di_bh, page);
+	ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+				       &locked_page, &fsdata, di_bh, page);
 	if (ret) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 2/8] ocfs2: use c_new to indicate newly allocated extents
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 1/8] ocfs2: add ocfs2_write_type_t type to identify the caller of write Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 3/8] ocfs2: test target page before change it Ryan Ding
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

There is a problem in ocfs2's direct io implement: if system crashed after
extents allocated, and before data return, we will get a extent with dirty data
on disk. This problem violate the journal=order semantics, which means meta
changes take effect after data written to disk.
To resolve this issue, direct write can use the UNWRITTEN flag to describe a
extent during direct data writeback. The direct write procedure should act in
the following order:

phase 1: alloc extent with UNWRITTEN flag
phase 2: submit direct data to disk, add zero page to page cache
phase 3: clear UNWRITTEN flag when data has been written to disk

This patch is to change the 'c_unwritten' member of ocfs2_write_cluster_desc to
'c_clear_unwritten'. Means whether to clear the unwritten flag. It do not care
if a extent is allocated or not. And use 'c_new' to specify a newly allocated
extent. So the direct io procedure can use c_clear_unwritten to control the
UNWRITTEN bit on extent.

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/aops.c |   22 ++++++++++++----------
 1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b65ebb1..0f022dc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1204,7 +1204,7 @@ struct ocfs2_write_cluster_desc {
 	 * filled.
 	 */
 	unsigned	c_new;
-	unsigned	c_unwritten;
+	unsigned	c_clear_unwritten;
 	unsigned	c_needs_zero;
 };
 
@@ -1580,19 +1580,19 @@ out:
  * Prepare a single cluster for write one cluster into the file.
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
-			       u32 phys, unsigned int unwritten,
+			       u32 phys, unsigned int new,
+			       unsigned int clear_unwritten,
 			       unsigned int should_zero,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_write_ctxt *wc, u32 cpos,
 			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new;
+	int ret, i;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 	struct ocfs2_extent_tree et;
 
-	new = phys == 0 ? 1 : 0;
 	if (new) {
 		u32 tmp_pos;
 
@@ -1602,9 +1602,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		 */
 		tmp_pos = cpos;
 		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
-					   &tmp_pos, 1, 0, wc->w_di_bh,
-					   wc->w_handle, data_ac,
-					   meta_ac, NULL);
+					   &tmp_pos, 1, !clear_unwritten,
+					   wc->w_di_bh, wc->w_handle,
+					   data_ac, meta_ac, NULL);
 		/*
 		 * This shouldn't happen because we must have already
 		 * calculated the correct meta data allocation required. The
@@ -1621,7 +1621,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 			mlog_errno(ret);
 			goto out;
 		}
-	} else if (unwritten) {
+	} else if (clear_unwritten) {
 		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 					      wc->w_di_bh);
 		ret = ocfs2_mark_extent_written(inode, &et,
@@ -1704,7 +1704,8 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 			local_len = osb->s_clustersize - cluster_off;
 
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
-					  desc->c_unwritten,
+					  desc->c_new,
+					  desc->c_clear_unwritten,
 					  desc->c_needs_zero,
 					  data_ac, meta_ac,
 					  wc, desc->c_cpos, pos, local_len);
@@ -1849,11 +1850,12 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 		if (phys == 0) {
 			desc->c_new = 1;
 			desc->c_needs_zero = 1;
+			desc->c_clear_unwritten = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
 
 		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
-			desc->c_unwritten = 1;
+			desc->c_clear_unwritten = 1;
 			desc->c_needs_zero = 1;
 		}
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 3/8] ocfs2: test target page before change it
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 1/8] ocfs2: add ocfs2_write_type_t type to identify the caller of write Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 2/8] ocfs2: use c_new to indicate newly allocated extents Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 4/8] ocfs2: do not change i_size in write_end for direct io Ryan Ding
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

Direct io data will not appear in buffer. The w_target_page member will not be
filled by direct io. So avoid to use it when it's NULL.
Unlinke buffer io and mmap, direct io will call write_begin with more than 1
page a time. So the target_index is not sufficient to describe the actual data.
change it to a range start at target_index, end in end_index.

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/aops.c |   32 ++++++++++++++++++++++++++------
 1 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0f022dc..eff5b59 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1398,12 +1398,13 @@ static void ocfs2_write_failure(struct inode *inode,
 		to = user_pos + user_len;
 	struct page *tmppage;
 
-	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+	if (wc->w_target_page)
+		ocfs2_zero_new_buffers(wc->w_target_page, from, to);
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		tmppage = wc->w_pages[i];
 
-		if (page_has_buffers(tmppage)) {
+		if (tmppage && page_has_buffers(tmppage)) {
 			if (ocfs2_should_order_data(inode))
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
 
@@ -1533,11 +1534,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 		wc->w_num_pages = 1;
 		start = target_index;
 	}
+	end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		index = start + i;
 
-		if (index == target_index && mmap_page) {
+		if (index >= target_index && index <= end_index &&
+		    wc->w_type == OCFS2_WRITE_MMAP) {
 			/*
 			 * ocfs2_pagemkwrite() is a little different
 			 * and wants us to directly use the page
@@ -1556,6 +1559,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 			page_cache_get(mmap_page);
 			wc->w_pages[i] = mmap_page;
 			wc->w_target_locked = true;
+		} else if (index >= target_index && index <= end_index &&
+			   wc->w_type == OCFS2_WRITE_DIRECT) {
+			/* Direct write has no mapping page. */
+			wc->w_pages[i] = NULL;
+			continue;
 		} else {
 			wc->w_pages[i] = find_or_create_page(mapping, index,
 							     GFP_NOFS);
@@ -1657,6 +1665,12 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 	for(i = 0; i < wc->w_num_pages; i++) {
 		int tmpret;
 
+		/* This is the direct io target page. */
+		if (wc->w_pages[i] == NULL) {
+			p_blkno++;
+			continue;
+		}
+
 		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
 						      wc->w_pages[i], cpos,
 						      user_pos, user_len,
@@ -2258,7 +2272,8 @@ try_again:
 		ocfs2_free_alloc_context(meta_ac);
 
 success:
-	*pagep = wc->w_target_page;
+	if (pagep)
+		*pagep = wc->w_target_page;
 	*fsdata = wc;
 	return 0;
 out_quota:
@@ -2392,18 +2407,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		goto out_write_size;
 	}
 
-	if (unlikely(copied < len)) {
+	if (unlikely(copied < len) && wc->w_target_page) {
 		if (!PageUptodate(wc->w_target_page))
 			copied = 0;
 
 		ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
 				       start+len);
 	}
-	flush_dcache_page(wc->w_target_page);
+	if (wc->w_target_page)
+		flush_dcache_page(wc->w_target_page);
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		tmppage = wc->w_pages[i];
 
+		/* This is the direct io target page. */
+		if (tmppage == NULL)
+			continue;
+
 		if (tmppage == wc->w_target_page) {
 			from = wc->w_target_from;
 			to = wc->w_target_to;
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 4/8] ocfs2: do not change i_size in write_end for direct io
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
                   ` (2 preceding siblings ...)
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 3/8] ocfs2: test target page before change it Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 5/8] ocfs2: return the physical address in ocfs2_write_cluster Ryan Ding
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

Append direct io do not change i_size in get block phase. It only move to
orphan when starting write. After data is written to disk, it will delete
itself from orphan and update i_size. So skip i_size change section in
write_begin for direct io.
And when there is no extents alloc, no meta data changes needed for direct io
(since write_begin start trans for 2 reason: alloc extents & change i_size. Now
none of them needed). So we can skip start trans procedure.

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/aops.c |   79 ++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index eff5b59..41563aa 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2035,8 +2035,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
 	if (ret)
 		mlog_errno(ret);
 
-	wc->w_first_new_cpos =
-		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+	/* There is no wc if this is call from direct. */
+	if (wc)
+		wc->w_first_new_cpos =
+			ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
 
 	return ret;
 }
@@ -2127,14 +2129,17 @@ try_again:
 		}
 	}
 
-	if (ocfs2_sparse_alloc(osb))
-		ret = ocfs2_zero_tail(inode, di_bh, pos);
-	else
-		ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
-						   wc);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
+	/* Direct io change i_size late, should not zero tail here. */
+	if (type != OCFS2_WRITE_DIRECT) {
+		if (ocfs2_sparse_alloc(osb))
+			ret = ocfs2_zero_tail(inode, di_bh, pos);
+		else
+			ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+							   len, wc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 	}
 
 	ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2195,8 +2200,9 @@ try_again:
 
 		credits = ocfs2_calc_extend_credits(inode->i_sb,
 						    &di->id2.i_list);
-
-	}
+	} else if (type == OCFS2_WRITE_DIRECT)
+		/* direct write needs not to start trans if no extents alloc. */
+		goto success;
 
 	/*
 	 * We have to zero sparse allocated clusters, unwritten extent clusters,
@@ -2394,12 +2400,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 	handle_t *handle = wc->w_handle;
 	struct page *tmppage;
 
-	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
-			OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		copied = ret;
-		mlog_errno(ret);
-		goto out;
+	if (handle) {
+		ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+				wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			copied = ret;
+			mlog_errno(ret);
+			goto out;
+		}
 	}
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2442,25 +2450,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		}
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode))
-				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+			if (handle && ocfs2_should_order_data(inode))
+				ocfs2_jbd2_file_inode(handle, inode);
 			block_commit_write(tmppage, from, to);
 		}
 	}
 
 out_write_size:
-	pos += copied;
-	if (pos > i_size_read(inode)) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-	inode->i_blocks = ocfs2_inode_sector_count(inode);
-	di->i_size = cpu_to_le64((u64)i_size_read(inode));
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-	ocfs2_update_inode_fsync_trans(handle, inode, 1);
-	ocfs2_journal_dirty(handle, wc->w_di_bh);
+	/* Direct io do not update i_size here. */
+	if (wc->w_type != OCFS2_WRITE_DIRECT) {
+		pos += copied;
+		if (pos > i_size_read(inode)) {
+			i_size_write(inode, pos);
+			mark_inode_dirty(inode);
+		}
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+		di->i_size = cpu_to_le64((u64)i_size_read(inode));
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+		ocfs2_update_inode_fsync_trans(handle, inode, 1);
+	}
+	if (handle)
+		ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 out:
 	/* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2470,7 +2482,8 @@ out:
 	 */
 	ocfs2_unlock_pages(wc);
 
-	ocfs2_commit_trans(osb, handle);
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
 
 	ocfs2_run_deallocs(osb, &wc->w_dealloc);
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 5/8] ocfs2: return the physical address in ocfs2_write_cluster
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
                   ` (3 preceding siblings ...)
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 4/8] ocfs2: do not change i_size in write_end for direct io Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 6/8] ocfs2: record UNWRITTEN extents when populate write desc Ryan Ding
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

Direct io needs to get the physical address from write_begin, to map the user
page. This patch is to change the arg 'phys' of ocfs2_write_cluster to a
pointer, so it can be retrieved to write_begin. And we can retrieve it to the
direct io procedure.

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/aops.c |   28 +++++++++++++---------------
 1 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 41563aa..16bba6b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1588,7 +1588,7 @@ out:
  * Prepare a single cluster for write one cluster into the file.
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
-			       u32 phys, unsigned int new,
+			       u32 *phys, unsigned int new,
 			       unsigned int clear_unwritten,
 			       unsigned int should_zero,
 			       struct ocfs2_alloc_context *data_ac,
@@ -1597,9 +1597,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 			       loff_t user_pos, unsigned user_len)
 {
 	int ret, i;
-	u64 v_blkno, p_blkno;
+	u64 p_blkno;
 	struct inode *inode = mapping->host;
 	struct ocfs2_extent_tree et;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
 	if (new) {
 		u32 tmp_pos;
@@ -1633,7 +1634,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 					      wc->w_di_bh);
 		ret = ocfs2_mark_extent_written(inode, &et,
-						wc->w_handle, cpos, 1, phys,
+						wc->w_handle, cpos, 1, *phys,
 						meta_ac, &wc->w_dealloc);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -1641,26 +1642,23 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		}
 	}
 
-	if (should_zero)
-		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
-	else
-		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
 	/*
 	 * The only reason this should fail is due to an inability to
 	 * find the extent added.
 	 */
-	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
-					  NULL);
+	ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
 	if (ret < 0) {
 		mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
-			    "at logical block %llu",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			    (unsigned long long)v_blkno);
+			    "at logical cluster %u",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
 		goto out;
 	}
 
-	BUG_ON(p_blkno == 0);
+	BUG_ON(*phys == 0);
+
+	p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+	if (!should_zero)
+		p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		int tmpret;
@@ -1717,7 +1715,7 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 		if ((cluster_off + local_len) > osb->s_clustersize)
 			local_len = osb->s_clustersize - cluster_off;
 
-		ret = ocfs2_write_cluster(mapping, desc->c_phys,
+		ret = ocfs2_write_cluster(mapping, &desc->c_phys,
 					  desc->c_new,
 					  desc->c_clear_unwritten,
 					  desc->c_needs_zero,
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 6/8] ocfs2: record UNWRITTEN extents when populate write desc
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
                   ` (4 preceding siblings ...)
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 5/8] ocfs2: return the physical address in ocfs2_write_cluster Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 7/8] ocfs2: fix sparse file & data ordering issue in direct io Ryan Ding
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

There is still one issue in the direct write procedure.

phase 1: alloc extent with UNWRITTEN flag
phase 2: submit direct data to disk, add zero page to page cache
phase 3: clear UNWRITTEN flag when data has been written to disk

When there are 2 direct write A(0~3KB),B(4~7KB) writing to the same cluster
0~7KB (cluster size 8KB). Write request A arrive phase 2 first, it will zero
the region (4~7KB). Before request A enter to phase 3, request B arrive phase
2, it will zero region (0~3KB). This is just like request B steps request A.

To resolve this issue, we should let request B knows this cluster is already
under zero, to prevent it from steps the previous write request.

This patch will add function ocfs2_unwritten_check() to do this job. It will
record all clusters that are under direct write(it will be recorded in the
'ip_unwritten_list' member of inode info), and prevent the later direct write
writing to the same cluster to do the zero work again.

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/aops.c  |  104 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/inode.c |    3 ++
 fs/ocfs2/inode.h |    3 ++
 fs/ocfs2/super.c |    1 +
 4 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 16bba6b..b4ec600 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1193,6 +1193,13 @@ next_bh:
 
 #define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
 
+struct ocfs2_unwritten_extent {
+	struct list_head	ue_node;
+	struct list_head	ue_ip_node;
+	u32			ue_cpos;
+	u32			ue_phys;
+};
+
 /*
  * Describe the state of a single cluster to be written to.
  */
@@ -1267,6 +1274,8 @@ struct ocfs2_write_ctxt {
 	struct buffer_head		*w_di_bh;
 
 	struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+	struct list_head		w_unwritten_list;
 };
 
 void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1305,8 +1314,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
 	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 }
 
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+				 struct list_head *head)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_unwritten_extent *dz = NULL, *tmp = NULL;
+
+	list_for_each_entry_safe(dz, tmp, head, ue_node) {
+		list_del(&dz->ue_node);
+		spin_lock(&oi->ip_lock);
+		list_del(&dz->ue_ip_node);
+		spin_unlock(&oi->ip_lock);
+		kfree(dz);
+	}
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc)
 {
+	ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
 	ocfs2_unlock_pages(wc);
 	brelse(wc->w_di_bh);
 	kfree(wc);
@@ -1338,6 +1364,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		wc->w_large_pages = 0;
 
 	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+	INIT_LIST_HEAD(&wc->w_unwritten_list);
 
 	*wcp = wc;
 
@@ -1788,6 +1815,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 }
 
 /*
+ * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
+ * do the zero work. And should not to clear UNWRITTEN since it will be cleared
+ * by the direct io procedure.
+ * If this is a new extent that allocated by direct io, we should mark it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+				 struct ocfs2_write_ctxt *wc,
+				 struct ocfs2_write_cluster_desc *desc)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_unwritten_extent *dz = NULL, *new = NULL;
+	int ret = 0;
+
+	if (!desc->c_needs_zero)
+		return 0;
+
+retry:
+	spin_lock(&oi->ip_lock);
+	/* Needs not to zero no metter buffer or direct. The one who is zero
+	 * the cluster is doing zero. And he will clear unwritten after all
+	 * cluster io finished. */
+	list_for_each_entry(dz, &oi->ip_unwritten_list, ue_ip_node) {
+		if (desc->c_cpos == dz->ue_cpos) {
+			BUG_ON(desc->c_new);
+			desc->c_needs_zero = 0;
+			desc->c_clear_unwritten = 0;
+			goto unlock;
+		}
+	}
+
+	if (wc->w_type != OCFS2_WRITE_DIRECT)
+		goto unlock;
+
+	if (new == NULL) {
+		spin_unlock(&oi->ip_lock);
+		new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+			     GFP_NOFS);
+		if (new == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		goto retry;
+	}
+	/* This direct write will doing zero. */
+	new->ue_cpos = desc->c_cpos;
+	new->ue_phys = desc->c_phys;
+	desc->c_clear_unwritten = 0;
+	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+	new = NULL;
+unlock:
+	spin_unlock(&oi->ip_lock);
+out:
+	if (new)
+		kfree(new);
+	return ret;
+}
+
+/*
  * Populate each single-cluster write descriptor in the write context
  * with information about the i/o to be done.
  *
@@ -1871,6 +1958,12 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 			desc->c_needs_zero = 1;
 		}
 
+		ret = ocfs2_unwritten_check(inode, wc, desc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
 		num_clusters--;
 	}
 
@@ -2207,9 +2300,8 @@ try_again:
 	 * and non-sparse clusters we just extended.  For non-sparse writes,
 	 * we know zeros will only be needed in the first and/or last cluster.
 	 */
-	if (clusters_to_alloc || extents_to_split ||
-	    (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
-			    wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+	if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+			   wc->w_desc[wc->w_clen - 1].c_needs_zero))
 		cluster_of_pages = 1;
 	else
 		cluster_of_pages = 0;
@@ -2288,7 +2380,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
-	ocfs2_free_write_ctxt(wc);
+	ocfs2_free_write_ctxt(inode, wc);
 
 	if (data_ac) {
 		ocfs2_free_alloc_context(data_ac);
@@ -2398,6 +2490,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 	handle_t *handle = wc->w_handle;
 	struct page *tmppage;
 
+	BUG_ON(!list_empty(&wc->w_unwritten_list));
+
 	if (handle) {
 		ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
 				wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8f87e05..0fd9ebd 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1125,6 +1125,9 @@ static void ocfs2_clear_inode(struct inode *inode)
 	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
 			"Clear inode of %llu, inode has io markers\n",
 			(unsigned long long)oi->ip_blkno);
+	mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+			"Clear inode of %llu, inode has unwritten extents\n",
+			(unsigned long long)oi->ip_blkno);
 
 	ocfs2_extent_map_trunc(inode, 0);
 
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431e..b505241 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -57,6 +57,9 @@ struct ocfs2_inode_info
 	u32				ip_flags; /* see below */
 	u32				ip_attr; /* inode attributes */
 
+	/* Record unwritten extents during direct io. */
+	struct list_head		ip_unwritten_list;
+
 	/* protected by recovery_lock. */
 	struct inode			*ip_next_orphan;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a..0b28d58 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1744,6 +1744,7 @@ static void ocfs2_inode_init_once(void *data)
 	spin_lock_init(&oi->ip_lock);
 	ocfs2_extent_map_init(&oi->vfs_inode);
 	INIT_LIST_HEAD(&oi->ip_io_markers);
+	INIT_LIST_HEAD(&oi->ip_unwritten_list);
 	oi->ip_dir_start_lookup = 0;
 	mutex_init(&oi->ip_unaligned_aio);
 	init_rwsem(&oi->ip_alloc_sem);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 7/8] ocfs2: fix sparse file & data ordering issue in direct io.
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
                   ` (5 preceding siblings ...)
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 6/8] ocfs2: record UNWRITTEN extents when populate write desc Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 8/8] ocfs2: code clean up for " Ryan Ding
  2015-09-28 10:20 ` [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Joseph Qi
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

There are mainly 3 issue in the direct io code path after commit 24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
  * Do not support sparse file.
  * Do not support data ordering. eg: when write to a file hole, it will alloc
    extent first. If system crashed before io finished, data will corrupt.
  * Potential risk when doing aio+dio. The -EIOCBQUEUED return value is likely
    to be ignored by ocfs2_direct_IO_write().

To resolve above problems, re-design direct io code with following ideas:
  * Use buffer io to fill in holes. And this will make better performance also.
  * Clear unwritten after direct write finished. So we can make sure meta data
    changes after data write to disk. (Unwritten extent is invisible to user,
    from user's view, meta data is not changed when allocate an unwritten
    extent.)
  * Clear ocfs2_direct_IO_write(). Do all ending work in end_io.

This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.

For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
  + rm /mnt/test.img -f
  + dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
  1048576+0 records in
  1048576+0 records out
  4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
  + rm /mnt/test.img -f
  + dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
  16384+0 records in
  16384+0 records out
  4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s

After this patch:
  + rm /mnt/test.img -f
  + dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
  1048576+0 records in
  1048576+0 records out
  4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
  + rm /mnt/test.img -f
  + dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
  16384+0 records in
  16384+0 records out
  4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/aops.c |  851 ++++++++++++++++++++++---------------------------------
 1 files changed, 342 insertions(+), 509 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b4ec600..4bb9921 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,152 +499,6 @@ bail:
 	return status;
 }
 
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- *  "So what we do is to permit the ->get_blocks function to populate
- *   bh.b_size with the size of IO which is permitted at this offset and
- *   this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * 					fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
-				     struct buffer_head *bh_result, int create)
-{
-	int ret;
-	u32 cpos = 0;
-	int alloc_locked = 0;
-	u64 p_blkno, inode_blocks, contig_blocks;
-	unsigned int ext_flags;
-	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
-	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
-	unsigned long len = bh_result->b_size;
-	unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
-	cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
-	/* This function won't even be called if the request isn't all
-	 * nicely aligned and of the right size, so there's no need
-	 * for us to check any of that. */
-
-	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	/* This figures out the size of the next contiguous block, and
-	 * our logical offset */
-	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-					  &contig_blocks, &ext_flags);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	if (ret) {
-		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
-		     (unsigned long long)iblock);
-		ret = -EIO;
-		goto bail;
-	}
-
-	/* We should already CoW the refcounted extent in case of create. */
-	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
-	/* allocate blocks if no p_blkno is found, and create == 1 */
-	if (!p_blkno && create) {
-		ret = ocfs2_inode_lock(inode, NULL, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto bail;
-		}
-
-		alloc_locked = 1;
-
-		down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
-		/* fill hole, allocate blocks can't be larger than the size
-		 * of the hole */
-		clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
-		contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
-				contig_blocks);
-		if (clusters_to_alloc > contig_clusters)
-			clusters_to_alloc = contig_clusters;
-
-		/* allocate extent and insert them into the extent tree */
-		ret = ocfs2_extend_allocation(inode, cpos,
-				clusters_to_alloc, 0);
-		if (ret < 0) {
-			up_write(&OCFS2_I(inode)->ip_alloc_sem);
-			mlog_errno(ret);
-			goto bail;
-		}
-
-		ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-				&contig_blocks, &ext_flags);
-		if (ret < 0) {
-			up_write(&OCFS2_I(inode)->ip_alloc_sem);
-			mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
-					(unsigned long long)iblock);
-			ret = -EIO;
-			goto bail;
-		}
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-	}
-
-	/*
-	 * get_more_blocks() expects us to describe a hole by clearing
-	 * the mapped bit on bh_result().
-	 *
-	 * Consider an unwritten extent as a hole.
-	 */
-	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
-		map_bh(bh_result, inode->i_sb, p_blkno);
-	else
-		clear_buffer_mapped(bh_result);
-
-	/* make sure we don't map more than max_blocks blocks here as
-	   that's all the kernel will handle at this point. */
-	if (max_blocks < contig_blocks)
-		contig_blocks = max_blocks;
-	bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
-	if (alloc_locked)
-		ocfs2_inode_unlock(inode, 1);
-	return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
- * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
-			     loff_t offset,
-			     ssize_t bytes,
-			     void *private)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-	int level;
-
-	/* this io's submitter should not have unlocked this before we could */
-	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
-	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
-		ocfs2_iocb_clear_unaligned_aio(iocb);
-
-		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
-	}
-
-	/* Let rw unlock to be done later to protect append direct io write */
-	if (offset + bytes <= i_size_read(inode)) {
-		ocfs2_iocb_clear_rw_locked(iocb);
-
-		level = ocfs2_iocb_rw_locked_level(iocb);
-		ocfs2_rw_unlock(inode, level);
-	}
-}
-
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
 {
 	if (!page_has_buffers(page))
@@ -652,361 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
 	return try_to_free_buffers(page);
 }
 
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
-		struct inode *inode, loff_t offset)
-{
-	int ret = 0;
-	u32 v_cpos = 0;
-	u32 p_cpos = 0;
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-
-	v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
-			&num_clusters, &ext_flags);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
-		return 1;
-
-	return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
-		struct inode *inode, loff_t offset,
-		u64 zero_len, int cluster_align)
-{
-	u32 p_cpos = 0;
-	u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-	int ret = 0;
-
-	if (offset <= i_size_read(inode) || cluster_align)
-		return 0;
-
-	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
-			&ext_flags);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-		u64 s = i_size_read(inode);
-		sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
-			(do_div(s, osb->s_clustersize) >> 9);
-
-		ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
-				zero_len >> 9, GFP_NOFS, false);
-		if (ret < 0)
-			mlog_errno(ret);
-	}
-
-	return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
-		struct inode *inode, loff_t offset)
-{
-	u64 zero_start, zero_len, total_zero_len;
-	u32 p_cpos = 0, clusters_to_add;
-	u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-	u32 size_div, offset_div;
-	int ret = 0;
-
-	{
-		u64 o = offset;
-		u64 s = i_size_read(inode);
-
-		offset_div = do_div(o, osb->s_clustersize);
-		size_div = do_div(s, osb->s_clustersize);
-	}
-
-	if (offset <= i_size_read(inode))
-		return 0;
-
-	clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
-		ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
-	total_zero_len = offset - i_size_read(inode);
-	if (clusters_to_add)
-		total_zero_len -= offset_div;
-
-	/* Allocate clusters to fill out holes, and this is only needed
-	 * when we add more than one clusters. Otherwise the cluster will
-	 * be allocated during direct IO */
-	if (clusters_to_add > 1) {
-		ret = ocfs2_extend_allocation(inode,
-				OCFS2_I(inode)->ip_clusters,
-				clusters_to_add - 1, 0);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	while (total_zero_len) {
-		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
-				&ext_flags);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
-			size_div;
-		zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
-			size_div;
-		zero_len = min(total_zero_len, zero_len);
-
-		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-			ret = blkdev_issue_zeroout(osb->sb->s_bdev,
-					zero_start >> 9, zero_len >> 9,
-					GFP_NOFS, false);
-			if (ret < 0) {
-				mlog_errno(ret);
-				goto out;
-			}
-		}
-
-		total_zero_len -= zero_len;
-		v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
-		/* Only at first iteration can be cluster not aligned.
-		 * So set size_div to 0 for the rest */
-		size_div = 0;
-	}
-
-out:
-	return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
-		struct iov_iter *iter,
-		loff_t offset)
-{
-	ssize_t ret = 0;
-	ssize_t written = 0;
-	bool orphaned = false;
-	int is_overwrite = 0;
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file_inode(file)->i_mapping->host;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct buffer_head *di_bh = NULL;
-	size_t count = iter->count;
-	journal_t *journal = osb->journal->j_journal;
-	u64 zero_len_head, zero_len_tail;
-	int cluster_align_head, cluster_align_tail;
-	loff_t final_size = offset + count;
-	int append_write = offset >= i_size_read(inode) ? 1 : 0;
-	unsigned int num_clusters = 0;
-	unsigned int ext_flags = 0;
-
-	{
-		u64 o = offset;
-		u64 s = i_size_read(inode);
-
-		zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
-		cluster_align_head = !zero_len_head;
-
-		zero_len_tail = osb->s_clustersize -
-			do_div(s, osb->s_clustersize);
-		if ((offset - i_size_read(inode)) < zero_len_tail)
-			zero_len_tail = offset - i_size_read(inode);
-		cluster_align_tail = !zero_len_tail;
-	}
-
-	/*
-	 * when final_size > inode->i_size, inode->i_size will be
-	 * updated after direct write, so add the inode to orphan
-	 * dir first.
-	 */
-	if (final_size > i_size_read(inode)) {
-		ret = ocfs2_add_inode_to_orphan(osb, inode);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-		orphaned = true;
-	}
-
-	if (append_write) {
-		ret = ocfs2_inode_lock(inode, NULL, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto clean_orphan;
-		}
-
-		/* zeroing out the previously allocated cluster tail
-		 * that but not zeroed */
-		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-			down_read(&OCFS2_I(inode)->ip_alloc_sem);
-			ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
-					zero_len_tail, cluster_align_tail);
-			up_read(&OCFS2_I(inode)->ip_alloc_sem);
-		} else {
-			down_write(&OCFS2_I(inode)->ip_alloc_sem);
-			ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
-					offset);
-			up_write(&OCFS2_I(inode)->ip_alloc_sem);
-		}
-		if (ret < 0) {
-			mlog_errno(ret);
-			ocfs2_inode_unlock(inode, 1);
-			goto clean_orphan;
-		}
-
-		is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
-		if (is_overwrite < 0) {
-			mlog_errno(is_overwrite);
-			ocfs2_inode_unlock(inode, 1);
-			goto clean_orphan;
-		}
-
-		ocfs2_inode_unlock(inode, 1);
-	}
-
-	written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				       offset, ocfs2_direct_IO_get_blocks,
-				       ocfs2_dio_end_io, NULL, 0);
-	/* overwrite aio may return -EIOCBQUEUED, and it is not an error */
-	if ((written < 0) && (written != -EIOCBQUEUED)) {
-		loff_t i_size = i_size_read(inode);
-
-		if (offset + count > i_size) {
-			ret = ocfs2_inode_lock(inode, &di_bh, 1);
-			if (ret < 0) {
-				mlog_errno(ret);
-				goto clean_orphan;
-			}
-
-			if (i_size == i_size_read(inode)) {
-				ret = ocfs2_truncate_file(inode, di_bh,
-						i_size);
-				if (ret < 0) {
-					if (ret != -ENOSPC)
-						mlog_errno(ret);
-
-					ocfs2_inode_unlock(inode, 1);
-					brelse(di_bh);
-					di_bh = NULL;
-					goto clean_orphan;
-				}
-			}
-
-			ocfs2_inode_unlock(inode, 1);
-			brelse(di_bh);
-			di_bh = NULL;
-
-			ret = jbd2_journal_force_commit(journal);
-			if (ret < 0)
-				mlog_errno(ret);
-		}
-	} else if (written > 0 && append_write && !is_overwrite &&
-			!cluster_align_head) {
-		/* zeroing out the allocated cluster head */
-		u32 p_cpos = 0;
-		u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
-		ret = ocfs2_inode_lock(inode, NULL, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto clean_orphan;
-		}
-
-		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
-				&num_clusters, &ext_flags);
-		if (ret < 0) {
-			mlog_errno(ret);
-			ocfs2_inode_unlock(inode, 0);
-			goto clean_orphan;
-		}
-
-		BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
-		ret = blkdev_issue_zeroout(osb->sb->s_bdev,
-				(u64)p_cpos << (osb->s_clustersize_bits - 9),
-				zero_len_head >> 9, GFP_NOFS, false);
-		if (ret < 0)
-			mlog_errno(ret);
-
-		ocfs2_inode_unlock(inode, 0);
-	}
-
-clean_orphan:
-	if (orphaned) {
-		int tmp_ret;
-		int update_isize = written > 0 ? 1 : 0;
-		loff_t end = update_isize ? offset + written : 0;
-
-		tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
-		if (tmp_ret < 0) {
-			ret = tmp_ret;
-			mlog_errno(ret);
-			goto out;
-		}
-
-		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
-				update_isize, end);
-		if (tmp_ret < 0) {
-			ret = tmp_ret;
-			mlog_errno(ret);
-			brelse(di_bh);
-			goto out;
-		}
-
-		ocfs2_inode_unlock(inode, 1);
-		brelse(di_bh);
-
-		tmp_ret = jbd2_journal_force_commit(journal);
-		if (tmp_ret < 0) {
-			ret = tmp_ret;
-			mlog_errno(tmp_ret);
-		}
-	}
-
-out:
-	if (ret >= 0)
-		ret = written;
-	return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file_inode(file)->i_mapping->host;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	int full_coherency = !(osb->s_mount_opt &
-			OCFS2_MOUNT_COHERENCY_BUFFERED);
-
-	/*
-	 * Fallback to buffered I/O if we see an inode without
-	 * extents.
-	 */
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return 0;
-
-	/* Fallback to buffered I/O if we are appending and
-	 * concurrent O_DIRECT writes are allowed.
-	 */
-	if (i_size_read(inode) <= offset && !full_coherency)
-		return 0;
-
-	if (iov_iter_rw(iter) == READ)
-		return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-					    iter, offset,
-					    ocfs2_direct_IO_get_blocks,
-					    ocfs2_dio_end_io, NULL, 0);
-	else
-		return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
 					    u32 cpos,
 					    unsigned int *start,
@@ -1318,14 +817,14 @@ static void ocfs2_free_unwritten_list(struct inode *inode,
 				 struct list_head *head)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_unwritten_extent *dz = NULL, *tmp = NULL;
+	struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
 
-	list_for_each_entry_safe(dz, tmp, head, ue_node) {
-		list_del(&dz->ue_node);
+	list_for_each_entry_safe(ue, tmp, head, ue_node) {
+		list_del(&ue->ue_node);
 		spin_lock(&oi->ip_lock);
-		list_del(&dz->ue_ip_node);
+		list_del(&ue->ue_ip_node);
 		spin_unlock(&oi->ip_lock);
-		kfree(dz);
+		kfree(ue);
 	}
 }
 
@@ -1826,7 +1325,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
 				 struct ocfs2_write_cluster_desc *desc)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_unwritten_extent *dz = NULL, *new = NULL;
+	struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
 	int ret = 0;
 
 	if (!desc->c_needs_zero)
@@ -1837,8 +1336,8 @@ retry:
 	/* Needs not to zero no metter buffer or direct. The one who is zero
 	 * the cluster is doing zero. And he will clear unwritten after all
 	 * cluster io finished. */
-	list_for_each_entry(dz, &oi->ip_unwritten_list, ue_ip_node) {
-		if (desc->c_cpos == dz->ue_cpos) {
+	list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+		if (desc->c_cpos == ue->ue_cpos) {
 			BUG_ON(desc->c_new);
 			desc->c_needs_zero = 0;
 			desc->c_clear_unwritten = 0;
@@ -2600,6 +2099,340 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
+struct ocfs2_dio_write_ctxt {
+	struct list_head	dw_zero_list;
+	unsigned		dw_zero_count;
+	int			dw_orphaned;
+	pid_t			dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+	struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+	if (bh->b_private)
+		return bh->b_private;
+
+	dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+	if (dwc == NULL)
+		return NULL;
+	INIT_LIST_HEAD(&dwc->dw_zero_list);
+	dwc->dw_zero_count = 0;
+	dwc->dw_orphaned = 0;
+	dwc->dw_writer_pid = task_pid_nr(current);
+	bh->b_private = dwc;
+	*alloc = 1;
+
+	return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+				     struct ocfs2_dio_write_ctxt *dwc)
+{
+	ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+	kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+			       struct buffer_head *bh_result, int create)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_write_ctxt *wc;
+	struct ocfs2_write_cluster_desc *desc = NULL;
+	struct ocfs2_dio_write_ctxt *dwc = NULL;
+	struct buffer_head *di_bh = NULL;
+	u64 p_blkno;
+	loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+	unsigned len, total_len = bh_result->b_size;
+	int ret = 0, first_get_block = 0;
+
+	len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+	len = min(total_len, len);
+
+	mlog(0, "get block of %lu at %llu:%u req %u\n",
+			inode->i_ino, pos, len, total_len);
+
+	/* This is the fast path for re-write. */
+	ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+	if (buffer_mapped(bh_result) &&
+	    !buffer_new(bh_result) &&
+	    ret == 0)
+		goto out;
+
+	/* Clear state set by ocfs2_get_block. */
+	bh_result->b_state = 0;
+
+	dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+	if (unlikely(dwc == NULL)) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+	    ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+	    !dwc->dw_orphaned) {
+		/*
+		 * when we are going to alloc extents beyond file size, add the
+		 * inode to orphan dir, so we can recall those spaces when
+		 * system crashed during write.
+		 */
+		ret = ocfs2_add_inode_to_orphan(osb, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		dwc->dw_orphaned = 1;
+	}
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (first_get_block) {
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+			ret = ocfs2_zero_tail(inode, di_bh, pos);
+		else
+			ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+							   total_len, NULL);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto unlock;
+		}
+	}
+
+	ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+				       OCFS2_WRITE_DIRECT, NULL,
+				       (void **)&wc, di_bh, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto unlock;
+	}
+
+	desc = &wc->w_desc[0];
+
+	p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+	BUG_ON(p_blkno == 0);
+	p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+	bh_result->b_size = len;
+	if (desc->c_needs_zero)
+		set_buffer_new(bh_result);
+
+	/* May sleep in end_io. It should not happen in a irq context. So defer
+	 * it to dio work queue. */
+	set_buffer_defer_completion(bh_result);
+
+	if (!list_empty(&wc->w_unwritten_list)) {
+		struct ocfs2_unwritten_extent *ue = NULL;
+
+		ue = list_first_entry(&wc->w_unwritten_list,
+				      struct ocfs2_unwritten_extent,
+				      ue_node);
+		BUG_ON(ue->ue_cpos != desc->c_cpos);
+		/* The physical address may be 0, fill it. */
+		ue->ue_phys = desc->c_phys;
+
+		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+		dwc->dw_zero_count++;
+	}
+
+	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+	BUG_ON(ret != len);
+	ret = 0;
+unlock:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+out:
+	if (ret < 0)
+		ret = -EIO;
+	return ret;
+}
+
+static void ocfs2_dio_end_io_write(struct inode *inode,
+				   struct ocfs2_dio_write_ctxt *dwc,
+				   loff_t offset,
+				   ssize_t bytes)
+{
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_unwritten_extent *ue = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = NULL;
+	loff_t end = offset + bytes;
+	int ret = 0, credits = 0, locked = 0;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	/* We do clear unwritten, delete orphan, change i_size here. If neither
+	 * of these happen, we can skip all this. */
+	if (list_empty(&dwc->dw_zero_list) &&
+	    end <= i_size_read(inode) &&
+	    !dwc->dw_orphaned)
+		goto out;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
+	 * are in that context. */
+	if (dwc->dw_writer_pid != task_pid_nr(current)) {
+		mutex_lock(&inode->i_mutex);
+		locked = 1;
+	}
+
+	/* Delete orphan before acquire i_mutex. */
+	if (dwc->dw_orphaned) {
+		BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+		end = end > i_size_read(inode) ? end : 0;
+
+		ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+				!!end, end);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	di = (struct ocfs2_dinode *)di_bh;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+				    &data_ac, &meta_ac);
+
+	credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto unlock;
+	}
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto commit;
+	}
+
+	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+		ret = ocfs2_mark_extent_written(inode, &et, handle,
+						ue->ue_cpos, 1,
+						ue->ue_phys,
+						meta_ac, &dealloc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	if (end > i_size_read(inode)) {
+		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+commit:
+	ocfs2_commit_trans(osb, handle);
+unlock:
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+out:
+	ocfs2_run_deallocs(osb, &dealloc);
+	if (locked)
+		mutex_unlock(&inode->i_mutex);
+	ocfs2_dio_free_write_ctx(inode, dwc);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	int level;
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+		ocfs2_iocb_clear_unaligned_aio(iocb);
+
+		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+	}
+
+	if (private)
+		ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+	ocfs2_iocb_clear_rw_locked(iocb);
+
+	level = ocfs2_iocb_rw_locked_level(iocb);
+	ocfs2_rw_unlock(inode, level);
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			       loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t end = offset + iter->count;
+	get_block_t *get_block;
+
+	/*
+	 * Fallback to buffered I/O if we see an inode without
+	 * extents.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	/* Fallback to buffered I/O if we do not support append dio. */
+	if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+		return 0;
+
+	if (iov_iter_rw(iter) == READ)
+		get_block = ocfs2_get_block;
+	else
+		get_block = ocfs2_dio_get_block;
+
+	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+				    iter, offset, get_block,
+				    ocfs2_dio_end_io, NULL, 0);
+}
+
 const struct address_space_operations ocfs2_aops = {
 	.readpage		= ocfs2_readpage,
 	.readpages		= ocfs2_readpages,
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 8/8] ocfs2: code clean up for direct io
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
                   ` (6 preceding siblings ...)
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 7/8] ocfs2: fix sparse file & data ordering issue in direct io Ryan Ding
@ 2015-09-11  8:19 ` Ryan Ding
  2015-09-28 10:20 ` [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Joseph Qi
  8 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-09-11  8:19 UTC (permalink / raw)
  To: ocfs2-devel

Clean up ocfs2_file_write_iter & ocfs2_prepare_inode_for_write:
* remove append dio check: it will be checked in ocfs2_direct_IO()
* remove file hole check: file hole is supported for now
* remove inline data check: it will be checked in ocfs2_direct_IO()
* remove the full_coherence check when append dio: we will get the inode_lock
  in ocfs2_dio_get_block, there is no need to fall back to buffer io to ensure
  the coherence semantics.

Now the drop dio procedure is gone. :)

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
cc: Joseph Qi <joseph.qi@huawei.com>
---
 fs/ocfs2/file.c        |  138 ++---------------------------------------------
 fs/ocfs2/ocfs2_trace.h |   16 ++----
 2 files changed, 10 insertions(+), 144 deletions(-)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b451..05346fb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1373,44 +1373,6 @@ out:
 	return ret;
 }
 
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
-				       size_t count)
-{
-	int ret = 0;
-	unsigned int extent_flags;
-	u32 cpos, clusters, extent_len, phys_cpos;
-	struct super_block *sb = inode->i_sb;
-
-	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
-	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
-	while (clusters) {
-		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
-					 &extent_flags);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
-			ret = 1;
-			break;
-		}
-
-		if (extent_len > clusters)
-			extent_len = clusters;
-
-		clusters -= extent_len;
-		cpos += extent_len;
-	}
-out:
-	return ret;
-}
-
 static int ocfs2_write_remove_suid(struct inode *inode)
 {
 	int ret;
@@ -2121,18 +2083,12 @@ out:
 
 static int ocfs2_prepare_inode_for_write(struct file *file,
 					 loff_t pos,
-					 size_t count,
-					 int appending,
-					 int *direct_io,
-					 int *has_refcount)
+					 size_t count)
 {
 	int ret = 0, meta_level = 0;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
 	loff_t end;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	int full_coherency = !(osb->s_mount_opt &
-		OCFS2_MOUNT_COHERENCY_BUFFERED);
 
 	/*
 	 * We start with a read level meta lock and only jump to an ex
@@ -2181,10 +2137,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 							       pos,
 							       count,
 							       &meta_level);
-			if (has_refcount)
-				*has_refcount = 1;
-			if (direct_io)
-				*direct_io = 0;
 		}
 
 		if (ret < 0) {
@@ -2192,67 +2144,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 			goto out_unlock;
 		}
 
-		/*
-		 * Skip the O_DIRECT checks if we don't need
-		 * them.
-		 */
-		if (!direct_io || !(*direct_io))
-			break;
-
-		/*
-		 * There's no sane way to do direct writes to an inode
-		 * with inline data.
-		 */
-		if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-			*direct_io = 0;
-			break;
-		}
-
-		/*
-		 * Allowing concurrent direct writes means
-		 * i_size changes wouldn't be synchronized, so
-		 * one node could wind up truncating another
-		 * nodes writes.
-		 */
-		if (end > i_size_read(inode) && !full_coherency) {
-			*direct_io = 0;
-			break;
-		}
-
-		/*
-		 * Fallback to old way if the feature bit is not set.
-		 */
-		if (end > i_size_read(inode) &&
-				!ocfs2_supports_append_dio(osb)) {
-			*direct_io = 0;
-			break;
-		}
-
-		/*
-		 * We don't fill holes during direct io, so
-		 * check for them here. If any are found, the
-		 * caller will have to retake some cluster
-		 * locks and initiate the io as buffered.
-		 */
-		ret = ocfs2_check_range_for_holes(inode, pos, count);
-		if (ret == 1) {
-			/*
-			 * Fallback to old way if the feature bit is not set.
-			 * Otherwise try dio first and then complete the rest
-			 * request through buffer io.
-			 */
-			if (!ocfs2_supports_append_dio(osb))
-				*direct_io = 0;
-			ret = 0;
-		} else if (ret < 0)
-			mlog_errno(ret);
 		break;
 	}
 
 out_unlock:
 	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
-					    pos, appending, count,
-					    direct_io, has_refcount);
+					    pos, count);
 
 	if (meta_level >= 0)
 		ocfs2_inode_unlock(inode, meta_level);
@@ -2264,18 +2161,16 @@ out:
 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 				    struct iov_iter *from)
 {
-	int direct_io, appending, rw_level;
-	int can_do_direct, has_refcount = 0;
+	int direct_io, rw_level;
 	ssize_t written = 0;
 	ssize_t ret;
-	size_t count = iov_iter_count(from), orig_count;
+	size_t count = iov_iter_count(from);
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int full_coherency = !(osb->s_mount_opt &
 			       OCFS2_MOUNT_COHERENCY_BUFFERED);
 	int unaligned_dio = 0;
-	int dropped_dio = 0;
 	int append_write = ((iocb->ki_pos + count) >=
 			i_size_read(inode) ? 1 : 0);
 
@@ -2288,12 +2183,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	if (count == 0)
 		return 0;
 
-	appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
 	direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
 
 	mutex_lock(&inode->i_mutex);
 
-relock:
 	/*
 	 * Concurrent O_DIRECT writes are allowed with
 	 * mount_option "coherency=buffered".
@@ -2326,7 +2219,6 @@ relock:
 		ocfs2_inode_unlock(inode, 1);
 	}
 
-	orig_count = iov_iter_count(from);
 	ret = generic_write_checks(iocb, from);
 	if (ret <= 0) {
 		if (ret)
@@ -2335,9 +2227,7 @@ relock:
 	}
 	count = ret;
 
-	can_do_direct = direct_io;
-	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
-					    &can_do_direct, &has_refcount);
+	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -2346,22 +2236,6 @@ relock:
 	if (direct_io && !is_sync_kiocb(iocb))
 		unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
 
-	/*
-	 * We can't complete the direct I/O as requested, fall back to
-	 * buffered I/O.
-	 */
-	if (direct_io && !can_do_direct) {
-		ocfs2_rw_unlock(inode, rw_level);
-
-		rw_level = -1;
-
-		direct_io = 0;
-		iocb->ki_flags &= ~IOCB_DIRECT;
-		iov_iter_reexpand(from, orig_count);
-		dropped_dio = 1;
-		goto relock;
-	}
-
 	if (unaligned_dio) {
 		/*
 		 * Wait on previous unaligned aio to complete before
@@ -2397,7 +2271,7 @@ relock:
 		goto no_sync;
 
 	if (((file->f_flags & O_DSYNC) && !direct_io) ||
-	    IS_SYNC(inode) || dropped_dio) {
+	    IS_SYNC(inode)) {
 		ret = filemap_fdatawrite_range(file->f_mapping,
 					       iocb->ki_pos - written,
 					       iocb->ki_pos - 1);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b..09d0c89 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
 
 TRACE_EVENT(ocfs2_prepare_inode_for_write,
 	TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
-		 int appending, unsigned long count,
-		 int *direct_io, int *has_refcount),
-	TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+		 unsigned long count),
+	TP_ARGS(ino, saved_pos, count),
 	TP_STRUCT__entry(
 		__field(unsigned long long, ino)
 		__field(unsigned long long, saved_pos)
-		__field(int, appending)
 		__field(unsigned long, count)
-		__field(int, direct_io)
-		__field(int, has_refcount)
 	),
 	TP_fast_assign(
 		__entry->ino = ino;
 		__entry->saved_pos = saved_pos;
-		__entry->appending = appending;
 		__entry->count = count;
-		__entry->direct_io = direct_io ? *direct_io : -1;
-		__entry->has_refcount = has_refcount ? *has_refcount : -1;
 	),
-	TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
-		  __entry->saved_pos, __entry->appending, __entry->count,
-		  __entry->direct_io, __entry->has_refcount)
+	TP_printk("%llu %llu %lu", __entry->ino,
+		  __entry->saved_pos, __entry->count)
 );
 
 DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
                   ` (7 preceding siblings ...)
  2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 8/8] ocfs2: code clean up for " Ryan Ding
@ 2015-09-28 10:20 ` Joseph Qi
  2015-10-08  3:12   ` Ryan Ding
  8 siblings, 1 reply; 22+ messages in thread
From: Joseph Qi @ 2015-09-28 10:20 UTC (permalink / raw)
  To: ocfs2-devel

Hi Ryan,
I have gone through this patch set and done a simple performance test
using direct dd, it indeed brings much performance promotion.
         Before      After
bs=4K    1.4 MB/s    5.0 MB/s
bs=256k  40.5 MB/s   56.3 MB/s

My questions are:
1) You solution is still using orphan dir to keep inode and allocation
consistency, am I right? From our test, it is the most complicated part
and has many race cases to be taken consideration. So I wonder if this
can be restructured.
2) Rather than using normal block direct io, you introduce a way to use
write begin/end in buffer io. IMO, if it wants to perform like direct
io, it should be committed to disk by forcing committing journal. But
journal committing will consume much time. Why does it bring performance
promotion instead?
3) Do you have a test in case of lack of memory?

On 2015/9/11 16:19, Ryan Ding wrote:
> The idea is to use buffer io(more precisely use the interface
> ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
> block size. And clear UNWRITTEN flag until direct io data has been written to
> disk, which can prevent data corruption when system crashed during direct write.
> 
> And we will also archive a better performance:
> eg. dd direct write new file with block size 4KB:
> before this patch:
> 2.5 MB/s
> after this patch:
> 66.4 MB/s
> 
> ----------------------------------------------------------------
> Ryan Ding (8):
>       ocfs2: add ocfs2_write_type_t type to identify the caller of write
>       ocfs2: use c_new to indicate newly allocated extents
>       ocfs2: test target page before change it
>       ocfs2: do not change i_size in write_end for direct io
>       ocfs2: return the physical address in ocfs2_write_cluster
>       ocfs2: record UNWRITTEN extents when populate write desc
>       ocfs2: fix sparse file & data ordering issue in direct io. 
>       ocfs2: code clean up for direct io
> 
>  fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
>  fs/ocfs2/aops.h        |   11 +-
>  fs/ocfs2/file.c        |  138 +---------------------
>  fs/ocfs2/inode.c       |    3 +
>  fs/ocfs2/inode.h       |    3 + 
>  fs/ocfs2/mmap.c        |    4 +-
>  fs/ocfs2/ocfs2_trace.h |   16 +-- 
>  fs/ocfs2/super.c       |    1 +
>  8 files changed, 568 insertions(+), 726 deletions(-)
> 
> .
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-09-28 10:20 ` [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Joseph Qi
@ 2015-10-08  3:12   ` Ryan Ding
  2015-10-08  6:13     ` Joseph Qi
  0 siblings, 1 reply; 22+ messages in thread
From: Ryan Ding @ 2015-10-08  3:12 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,

On 09/28/2015 06:20 PM, Joseph Qi wrote:
> Hi Ryan,
> I have gone through this patch set and done a simple performance test
> using direct dd, it indeed brings much performance promotion.
>           Before      After
> bs=4K    1.4 MB/s    5.0 MB/s
> bs=256k  40.5 MB/s   56.3 MB/s
>
> My questions are:
> 1) You solution is still using orphan dir to keep inode and allocation
> consistency, am I right? From our test, it is the most complicated part
> and has many race cases to be taken consideration. So I wonder if this
> can be restructured.
I have not got a better idea to do this. I think the only reason why 
direct io using orphan is to prevent space lost when system crash during 
append direct write. But maybe a 'fsck -f' will do that job. Is it 
necessary to use orphan?
> 2) Rather than using normal block direct io, you introduce a way to use
> write begin/end in buffer io. IMO, if it wants to perform like direct
> io, it should be committed to disk by forcing committing journal. But
> journal committing will consume much time. Why does it bring performance
> promotion instead?
I use buffer io to write only the zero pages. Actual data payload is 
written as direct io. I think there is no need to do a force commit. 
Because direct means "Try to minimize cache effects of the I/O to and 
from this file.", it does not means "write all data & meta data to disk 
before write return".
> 3) Do you have a test in case of lack of memory?
I tested it in a system with 2GB memory. Is that enough?

Thanks,
Ryan
>
> On 2015/9/11 16:19, Ryan Ding wrote:
>> The idea is to use buffer io(more precisely use the interface
>> ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
>> block size. And clear UNWRITTEN flag until direct io data has been written to
>> disk, which can prevent data corruption when system crashed during direct write.
>>
>> And we will also archive a better performance:
>> eg. dd direct write new file with block size 4KB:
>> before this patch:
>> 2.5 MB/s
>> after this patch:
>> 66.4 MB/s
>>
>> ----------------------------------------------------------------
>> Ryan Ding (8):
>>        ocfs2: add ocfs2_write_type_t type to identify the caller of write
>>        ocfs2: use c_new to indicate newly allocated extents
>>        ocfs2: test target page before change it
>>        ocfs2: do not change i_size in write_end for direct io
>>        ocfs2: return the physical address in ocfs2_write_cluster
>>        ocfs2: record UNWRITTEN extents when populate write desc
>>        ocfs2: fix sparse file & data ordering issue in direct io.
>>        ocfs2: code clean up for direct io
>>
>>   fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
>>   fs/ocfs2/aops.h        |   11 +-
>>   fs/ocfs2/file.c        |  138 +---------------------
>>   fs/ocfs2/inode.c       |    3 +
>>   fs/ocfs2/inode.h       |    3 +
>>   fs/ocfs2/mmap.c        |    4 +-
>>   fs/ocfs2/ocfs2_trace.h |   16 +--
>>   fs/ocfs2/super.c       |    1 +
>>   8 files changed, 568 insertions(+), 726 deletions(-)
>>
>> .
>>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-10-08  3:12   ` Ryan Ding
@ 2015-10-08  6:13     ` Joseph Qi
  2015-10-08  7:13       ` Ryan Ding
  2015-10-12  6:34       ` Ryan Ding
  0 siblings, 2 replies; 22+ messages in thread
From: Joseph Qi @ 2015-10-08  6:13 UTC (permalink / raw)
  To: ocfs2-devel

Hi Ryan,

On 2015/10/8 11:12, Ryan Ding wrote:
> Hi Joseph,
> 
> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>> Hi Ryan,
>> I have gone through this patch set and done a simple performance test
>> using direct dd, it indeed brings much performance promotion.
>>           Before      After
>> bs=4K    1.4 MB/s    5.0 MB/s
>> bs=256k  40.5 MB/s   56.3 MB/s
>>
>> My questions are:
>> 1) You solution is still using orphan dir to keep inode and allocation
>> consistency, am I right? From our test, it is the most complicated part
>> and has many race cases to be taken consideration. So I wonder if this
>> can be restructured.
> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
it is much more complicated than ext4.
And fsck can only be used offline, but using orphan is to perform
recovering online. So I don't think fsck can replace it in all cases.

>> 2) Rather than using normal block direct io, you introduce a way to use
>> write begin/end in buffer io. IMO, if it wants to perform like direct
>> io, it should be committed to disk by forcing committing journal. But
>> journal committing will consume much time. Why does it bring performance
>> promotion instead?
> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
So this is protected by "UNWRITTEN" flag, right?

>> 3) Do you have a test in case of lack of memory?
> I tested it in a system with 2GB memory. Is that enough?
What I mean is doing many direct io jobs in case system free memory is
low.

Thanks,
Joesph

> 
> Thanks,
> Ryan
>>
>> On 2015/9/11 16:19, Ryan Ding wrote:
>>> The idea is to use buffer io(more precisely use the interface
>>> ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
>>> block size. And clear UNWRITTEN flag until direct io data has been written to
>>> disk, which can prevent data corruption when system crashed during direct write.
>>>
>>> And we will also archive a better performance:
>>> eg. dd direct write new file with block size 4KB:
>>> before this patch:
>>> 2.5 MB/s
>>> after this patch:
>>> 66.4 MB/s
>>>
>>> ----------------------------------------------------------------
>>> Ryan Ding (8):
>>>        ocfs2: add ocfs2_write_type_t type to identify the caller of write
>>>        ocfs2: use c_new to indicate newly allocated extents
>>>        ocfs2: test target page before change it
>>>        ocfs2: do not change i_size in write_end for direct io
>>>        ocfs2: return the physical address in ocfs2_write_cluster
>>>        ocfs2: record UNWRITTEN extents when populate write desc
>>>        ocfs2: fix sparse file & data ordering issue in direct io.
>>>        ocfs2: code clean up for direct io
>>>
>>>   fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
>>>   fs/ocfs2/aops.h        |   11 +-
>>>   fs/ocfs2/file.c        |  138 +---------------------
>>>   fs/ocfs2/inode.c       |    3 +
>>>   fs/ocfs2/inode.h       |    3 +
>>>   fs/ocfs2/mmap.c        |    4 +-
>>>   fs/ocfs2/ocfs2_trace.h |   16 +--
>>>   fs/ocfs2/super.c       |    1 +
>>>   8 files changed, 568 insertions(+), 726 deletions(-)
>>>
>>> .
>>>
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-10-08  6:13     ` Joseph Qi
@ 2015-10-08  7:13       ` Ryan Ding
  2015-10-12  6:34       ` Ryan Ding
  1 sibling, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-10-08  7:13 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,

On 10/08/2015 02:13 PM, Joseph Qi wrote:
> Hi Ryan,
>
> On 2015/10/8 11:12, Ryan Ding wrote:
>> Hi Joseph,
>>
>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>> Hi Ryan,
>>> I have gone through this patch set and done a simple performance test
>>> using direct dd, it indeed brings much performance promotion.
>>>            Before      After
>>> bs=4K    1.4 MB/s    5.0 MB/s
>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>
>>> My questions are:
>>> 1) You solution is still using orphan dir to keep inode and allocation
>>> consistency, am I right? From our test, it is the most complicated part
>>> and has many race cases to be taken consideration. So I wonder if this
>>> can be restructured.
>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
> it is much more complicated than ext4.
> And fsck can only be used offline, but using orphan is to perform
> recovering online. So I don't think fsck can replace it in all cases.
OK, I agree.
>
>>> 2) Rather than using normal block direct io, you introduce a way to use
>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>> io, it should be committed to disk by forcing committing journal. But
>>> journal committing will consume much time. Why does it bring performance
>>> promotion instead?
>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
> So this is protected by "UNWRITTEN" flag, right?
Yes.
>
>>> 3) Do you have a test in case of lack of memory?
>> I tested it in a system with 2GB memory. Is that enough?
> What I mean is doing many direct io jobs in case system free memory is
> low.
You use dio or aio+dio?

Thanks,
Ryan
>
> Thanks,
> Joesph
>
>> Thanks,
>> Ryan
>>> On 2015/9/11 16:19, Ryan Ding wrote:
>>>> The idea is to use buffer io(more precisely use the interface
>>>> ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
>>>> block size. And clear UNWRITTEN flag until direct io data has been written to
>>>> disk, which can prevent data corruption when system crashed during direct write.
>>>>
>>>> And we will also archive a better performance:
>>>> eg. dd direct write new file with block size 4KB:
>>>> before this patch:
>>>> 2.5 MB/s
>>>> after this patch:
>>>> 66.4 MB/s
>>>>
>>>> ----------------------------------------------------------------
>>>> Ryan Ding (8):
>>>>         ocfs2: add ocfs2_write_type_t type to identify the caller of write
>>>>         ocfs2: use c_new to indicate newly allocated extents
>>>>         ocfs2: test target page before change it
>>>>         ocfs2: do not change i_size in write_end for direct io
>>>>         ocfs2: return the physical address in ocfs2_write_cluster
>>>>         ocfs2: record UNWRITTEN extents when populate write desc
>>>>         ocfs2: fix sparse file & data ordering issue in direct io.
>>>>         ocfs2: code clean up for direct io
>>>>
>>>>    fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
>>>>    fs/ocfs2/aops.h        |   11 +-
>>>>    fs/ocfs2/file.c        |  138 +---------------------
>>>>    fs/ocfs2/inode.c       |    3 +
>>>>    fs/ocfs2/inode.h       |    3 +
>>>>    fs/ocfs2/mmap.c        |    4 +-
>>>>    fs/ocfs2/ocfs2_trace.h |   16 +--
>>>>    fs/ocfs2/super.c       |    1 +
>>>>    8 files changed, 568 insertions(+), 726 deletions(-)
>>>>
>>>> .
>>>>
>>
>> .
>>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-10-08  6:13     ` Joseph Qi
  2015-10-08  7:13       ` Ryan Ding
@ 2015-10-12  6:34       ` Ryan Ding
  2015-12-10  7:54         ` Joseph Qi
  1 sibling, 1 reply; 22+ messages in thread
From: Ryan Ding @ 2015-10-12  6:34 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,

On 10/08/2015 02:13 PM, Joseph Qi wrote:
> Hi Ryan,
>
> On 2015/10/8 11:12, Ryan Ding wrote:
>> Hi Joseph,
>>
>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>> Hi Ryan,
>>> I have gone through this patch set and done a simple performance test
>>> using direct dd, it indeed brings much performance promotion.
>>>            Before      After
>>> bs=4K    1.4 MB/s    5.0 MB/s
>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>
>>> My questions are:
>>> 1) You solution is still using orphan dir to keep inode and allocation
>>> consistency, am I right? From our test, it is the most complicated part
>>> and has many race cases to be taken consideration. So I wonder if this
>>> can be restructured.
>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
> it is much more complicated than ext4.
> And fsck can only be used offline, but using orphan is to perform
> recovering online. So I don't think fsck can replace it in all cases.
>
>>> 2) Rather than using normal block direct io, you introduce a way to use
>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>> io, it should be committed to disk by forcing committing journal. But
>>> journal committing will consume much time. Why does it bring performance
>>> promotion instead?
>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
> So this is protected by "UNWRITTEN" flag, right?
>
>>> 3) Do you have a test in case of lack of memory?
>> I tested it in a system with 2GB memory. Is that enough?
> What I mean is doing many direct io jobs in case system free memory is
> low.
I understand what you mean, but did not find a better way to test it. 
Since if free memory is too low, even the process can not be started. If 
free memory is fairlyenough, the test has no meaning.
So I try to collect the memory usage during io, and do a comparison test 
with buffer io. The result is:
1. start 100 dd to do 4KB direct write:
[root at hnode3 ~]# cat /proc/meminfo | grep -E 
"^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
MemTotal:        2809788 kB
MemFree:           21824 kB
Buffers:           55176 kB
Cached:          2513968 kB
Dirty:               412 kB
Writeback:            36 kB

2. start 100 dd to do 4KB buffer write:
[root at hnode3 ~]# cat /proc/meminfo | grep -E 
"^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
MemTotal:        2809788 kB
MemFree:           22476 kB
Buffers:           15696 kB
Cached:          2544892 kB
Dirty:            320136 kB
Writeback:        146404 kB

You can see from the 'Dirty' and 'Writeback' field that there is not so 
much memory used as buffer io. So I think what you concern is no longer 
exist. :-)

Thanks,
Ryan
>
> Thanks,
> Joesph
>
>> Thanks,
>> Ryan
>>> On 2015/9/11 16:19, Ryan Ding wrote:
>>>> The idea is to use buffer io(more precisely use the interface
>>>> ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
>>>> block size. And clear UNWRITTEN flag until direct io data has been written to
>>>> disk, which can prevent data corruption when system crashed during direct write.
>>>>
>>>> And we will also archive a better performance:
>>>> eg. dd direct write new file with block size 4KB:
>>>> before this patch:
>>>> 2.5 MB/s
>>>> after this patch:
>>>> 66.4 MB/s
>>>>
>>>> ----------------------------------------------------------------
>>>> Ryan Ding (8):
>>>>         ocfs2: add ocfs2_write_type_t type to identify the caller of write
>>>>         ocfs2: use c_new to indicate newly allocated extents
>>>>         ocfs2: test target page before change it
>>>>         ocfs2: do not change i_size in write_end for direct io
>>>>         ocfs2: return the physical address in ocfs2_write_cluster
>>>>         ocfs2: record UNWRITTEN extents when populate write desc
>>>>         ocfs2: fix sparse file & data ordering issue in direct io.
>>>>         ocfs2: code clean up for direct io
>>>>
>>>>    fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
>>>>    fs/ocfs2/aops.h        |   11 +-
>>>>    fs/ocfs2/file.c        |  138 +---------------------
>>>>    fs/ocfs2/inode.c       |    3 +
>>>>    fs/ocfs2/inode.h       |    3 +
>>>>    fs/ocfs2/mmap.c        |    4 +-
>>>>    fs/ocfs2/ocfs2_trace.h |   16 +--
>>>>    fs/ocfs2/super.c       |    1 +
>>>>    8 files changed, 568 insertions(+), 726 deletions(-)
>>>>
>>>> .
>>>>
>>
>> .
>>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-10-12  6:34       ` Ryan Ding
@ 2015-12-10  7:54         ` Joseph Qi
  2015-12-10  8:48           ` Ryan Ding
  0 siblings, 1 reply; 22+ messages in thread
From: Joseph Qi @ 2015-12-10  7:54 UTC (permalink / raw)
  To: ocfs2-devel

Hi Ryan,

On 2015/10/12 14:34, Ryan Ding wrote:
> Hi Joseph,
> 
> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>> Hi Ryan,
>>
>> On 2015/10/8 11:12, Ryan Ding wrote:
>>> Hi Joseph,
>>>
>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>> Hi Ryan,
>>>> I have gone through this patch set and done a simple performance test
>>>> using direct dd, it indeed brings much performance promotion.
>>>>            Before      After
>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>
>>>> My questions are:
>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>> consistency, am I right? From our test, it is the most complicated part
>>>> and has many race cases to be taken consideration. So I wonder if this
>>>> can be restructured.
>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>> it is much more complicated than ext4.
>> And fsck can only be used offline, but using orphan is to perform
>> recovering online. So I don't think fsck can replace it in all cases.
>>
>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>> io, it should be committed to disk by forcing committing journal. But
>>>> journal committing will consume much time. Why does it bring performance
>>>> promotion instead?
>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
I think we cannot mix zero pages with direct io here, which will lead
to direct io data to be overwritten by zero pages.
For example, a ocfs2 volume with block size 4K and cluster size 4K.
Firstly I create a file with size of 5K and it will be allocated 2
clusters (8K) and the last 3K without zeroed (no need at this time).
Then I seek to offset 9K and do direct write 1K, then back to 4K and do
direct write 5K. Here we have to zero allocated space to avoid dirty
data. But since direct write data goes to disk directly and zero pages
depends on journal commit, so direct write data will be overwritten and
file corrupts.

>> So this is protected by "UNWRITTEN" flag, right?
>>
>>>> 3) Do you have a test in case of lack of memory?
>>> I tested it in a system with 2GB memory. Is that enough?
>> What I mean is doing many direct io jobs in case system free memory is
>> low.
> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
> 1. start 100 dd to do 4KB direct write:
> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
> MemTotal:        2809788 kB
> MemFree:           21824 kB
> Buffers:           55176 kB
> Cached:          2513968 kB
> Dirty:               412 kB
> Writeback:            36 kB
> 
> 2. start 100 dd to do 4KB buffer write:
> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
> MemTotal:        2809788 kB
> MemFree:           22476 kB
> Buffers:           15696 kB
> Cached:          2544892 kB
> Dirty:            320136 kB
> Writeback:        146404 kB
> 
> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
> 
> Thanks,
> Ryan
>>
>> Thanks,
>> Joesph
>>
>>> Thanks,
>>> Ryan
>>>> On 2015/9/11 16:19, Ryan Ding wrote:
>>>>> The idea is to use buffer io(more precisely use the interface
>>>>> ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
>>>>> block size. And clear UNWRITTEN flag until direct io data has been written to
>>>>> disk, which can prevent data corruption when system crashed during direct write.
>>>>>
>>>>> And we will also archive a better performance:
>>>>> eg. dd direct write new file with block size 4KB:
>>>>> before this patch:
>>>>> 2.5 MB/s
>>>>> after this patch:
>>>>> 66.4 MB/s
>>>>>
>>>>> ----------------------------------------------------------------
>>>>> Ryan Ding (8):
>>>>>         ocfs2: add ocfs2_write_type_t type to identify the caller of write
>>>>>         ocfs2: use c_new to indicate newly allocated extents
>>>>>         ocfs2: test target page before change it
>>>>>         ocfs2: do not change i_size in write_end for direct io
>>>>>         ocfs2: return the physical address in ocfs2_write_cluster
>>>>>         ocfs2: record UNWRITTEN extents when populate write desc
>>>>>         ocfs2: fix sparse file & data ordering issue in direct io.
>>>>>         ocfs2: code clean up for direct io
>>>>>
>>>>>    fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
>>>>>    fs/ocfs2/aops.h        |   11 +-
>>>>>    fs/ocfs2/file.c        |  138 +---------------------
>>>>>    fs/ocfs2/inode.c       |    3 +
>>>>>    fs/ocfs2/inode.h       |    3 +
>>>>>    fs/ocfs2/mmap.c        |    4 +-
>>>>>    fs/ocfs2/ocfs2_trace.h |   16 +--
>>>>>    fs/ocfs2/super.c       |    1 +
>>>>>    8 files changed, 568 insertions(+), 726 deletions(-)
>>>>>
>>>>> .
>>>>>
>>>
>>> .
>>>
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-12-10  7:54         ` Joseph Qi
@ 2015-12-10  8:48           ` Ryan Ding
  2015-12-10 10:36             ` Joseph Qi
  0 siblings, 1 reply; 22+ messages in thread
From: Ryan Ding @ 2015-12-10  8:48 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,

Thanks for your comments, please see my reply:

On 12/10/2015 03:54 PM, Joseph Qi wrote:
> Hi Ryan,
>
> On 2015/10/12 14:34, Ryan Ding wrote:
>> Hi Joseph,
>>
>> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>>> Hi Ryan,
>>>
>>> On 2015/10/8 11:12, Ryan Ding wrote:
>>>> Hi Joseph,
>>>>
>>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>>> Hi Ryan,
>>>>> I have gone through this patch set and done a simple performance test
>>>>> using direct dd, it indeed brings much performance promotion.
>>>>>             Before      After
>>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>>
>>>>> My questions are:
>>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>>> consistency, am I right? From our test, it is the most complicated part
>>>>> and has many race cases to be taken consideration. So I wonder if this
>>>>> can be restructured.
>>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>>> it is much more complicated than ext4.
>>> And fsck can only be used offline, but using orphan is to perform
>>> recovering online. So I don't think fsck can replace it in all cases.
>>>
>>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>>> io, it should be committed to disk by forcing committing journal. But
>>>>> journal committing will consume much time. Why does it bring performance
>>>>> promotion instead?
>>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
> I think we cannot mix zero pages with direct io here, which will lead
> to direct io data to be overwritten by zero pages.
> For example, a ocfs2 volume with block size 4K and cluster size 4K.
> Firstly I create a file with size of 5K and it will be allocated 2
> clusters (8K) and the last 3K without zeroed (no need at this time).
I think the last 3K will be zeroed no matter you use direct io or buffer 
io to create the a file with 5K.
> Then I seek to offset 9K and do direct write 1K, then back to 4K and do
> direct write 5K. Here we have to zero allocated space to avoid dirty
> data. But since direct write data goes to disk directly and zero pages
> depends on journal commit, so direct write data will be overwritten and
> file corrupts.
do_blockdev_direct_IO() will zero unwritten area within block size(in 
this case, 6K~8K), when get_block callback return a map with buffer_new 
flag. This zero operation is also using direct io.
So the buffer io zero operation in my design will not work at all in 
this case.It only works to zero the area beyond block size, but within 
cluster size. For example, when block size 4KB cluster size 1MB, a 4KB 
direct write will trigger a zero buffer page of size 1MB-4KB=1020KB.
I think your question is this zero buffer page will conflict with the 
later direct io writing to the same area. The truth is conflict will not 
exist, because before direct write, all conflict buffer page will be 
flushed to disk first (in __generic_file_write_iter()).
BTW, there is a lot testcases to test the operations like buffer write, 
direct write, lseek.. (it's a mix of these operations) in ltp (Linux 
Test Project). This patch set has passed all of them. :)
>
>>> So this is protected by "UNWRITTEN" flag, right?
>>>
>>>>> 3) Do you have a test in case of lack of memory?
>>>> I tested it in a system with 2GB memory. Is that enough?
>>> What I mean is doing many direct io jobs in case system free memory is
>>> low.
>> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
>> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
>> 1. start 100 dd to do 4KB direct write:
>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>> MemTotal:        2809788 kB
>> MemFree:           21824 kB
>> Buffers:           55176 kB
>> Cached:          2513968 kB
>> Dirty:               412 kB
>> Writeback:            36 kB
>>
>> 2. start 100 dd to do 4KB buffer write:
>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>> MemTotal:        2809788 kB
>> MemFree:           22476 kB
>> Buffers:           15696 kB
>> Cached:          2544892 kB
>> Dirty:            320136 kB
>> Writeback:        146404 kB
>>
>> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
>>
>> Thanks,
>> Ryan
>>> Thanks,
>>> Joesph
>>>
>>>> Thanks,
>>>> Ryan
>>>>> On 2015/9/11 16:19, Ryan Ding wrote:
>>>>>> The idea is to use buffer io(more precisely use the interface
>>>>>> ocfs2_write_begin_nolock & ocfs2_write_end_nolock) to do the zero work beyond
>>>>>> block size. And clear UNWRITTEN flag until direct io data has been written to
>>>>>> disk, which can prevent data corruption when system crashed during direct write.
>>>>>>
>>>>>> And we will also archive a better performance:
>>>>>> eg. dd direct write new file with block size 4KB:
>>>>>> before this patch:
>>>>>> 2.5 MB/s
>>>>>> after this patch:
>>>>>> 66.4 MB/s
>>>>>>
>>>>>> ----------------------------------------------------------------
>>>>>> Ryan Ding (8):
>>>>>>          ocfs2: add ocfs2_write_type_t type to identify the caller of write
>>>>>>          ocfs2: use c_new to indicate newly allocated extents
>>>>>>          ocfs2: test target page before change it
>>>>>>          ocfs2: do not change i_size in write_end for direct io
>>>>>>          ocfs2: return the physical address in ocfs2_write_cluster
>>>>>>          ocfs2: record UNWRITTEN extents when populate write desc
>>>>>>          ocfs2: fix sparse file & data ordering issue in direct io.
>>>>>>          ocfs2: code clean up for direct io
>>>>>>
>>>>>>     fs/ocfs2/aops.c        | 1118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------------------------
>>>>>>     fs/ocfs2/aops.h        |   11 +-
>>>>>>     fs/ocfs2/file.c        |  138 +---------------------
>>>>>>     fs/ocfs2/inode.c       |    3 +
>>>>>>     fs/ocfs2/inode.h       |    3 +
>>>>>>     fs/ocfs2/mmap.c        |    4 +-
>>>>>>     fs/ocfs2/ocfs2_trace.h |   16 +--
>>>>>>     fs/ocfs2/super.c       |    1 +
>>>>>>     8 files changed, 568 insertions(+), 726 deletions(-)
>>>>>>
>>>>>> .
>>>>>>
>>>> .
>>>>
>>
>> .
>>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-12-10  8:48           ` Ryan Ding
@ 2015-12-10 10:36             ` Joseph Qi
  2015-12-14  5:31               ` Ryan Ding
  0 siblings, 1 reply; 22+ messages in thread
From: Joseph Qi @ 2015-12-10 10:36 UTC (permalink / raw)
  To: ocfs2-devel

Hi Ryan,

On 2015/12/10 16:48, Ryan Ding wrote:
> Hi Joseph,
> 
> Thanks for your comments, please see my reply:
> 
> On 12/10/2015 03:54 PM, Joseph Qi wrote:
>> Hi Ryan,
>>
>> On 2015/10/12 14:34, Ryan Ding wrote:
>>> Hi Joseph,
>>>
>>> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>>>> Hi Ryan,
>>>>
>>>> On 2015/10/8 11:12, Ryan Ding wrote:
>>>>> Hi Joseph,
>>>>>
>>>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>>>> Hi Ryan,
>>>>>> I have gone through this patch set and done a simple performance test
>>>>>> using direct dd, it indeed brings much performance promotion.
>>>>>>             Before      After
>>>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>>>
>>>>>> My questions are:
>>>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>>>> consistency, am I right? From our test, it is the most complicated part
>>>>>> and has many race cases to be taken consideration. So I wonder if this
>>>>>> can be restructured.
>>>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>>>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>>>> it is much more complicated than ext4.
>>>> And fsck can only be used offline, but using orphan is to perform
>>>> recovering online. So I don't think fsck can replace it in all cases.
>>>>
>>>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>>>> io, it should be committed to disk by forcing committing journal. But
>>>>>> journal committing will consume much time. Why does it bring performance
>>>>>> promotion instead?
>>>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
>> I think we cannot mix zero pages with direct io here, which will lead
>> to direct io data to be overwritten by zero pages.
>> For example, a ocfs2 volume with block size 4K and cluster size 4K.
>> Firstly I create a file with size of 5K and it will be allocated 2
>> clusters (8K) and the last 3K without zeroed (no need at this time).
> I think the last 3K will be zeroed no matter you use direct io or buffer io to create the a file with 5K.
>> Then I seek to offset 9K and do direct write 1K, then back to 4K and do
>> direct write 5K. Here we have to zero allocated space to avoid dirty
>> data. But since direct write data goes to disk directly and zero pages
>> depends on journal commit, so direct write data will be overwritten and
>> file corrupts.
> do_blockdev_direct_IO() will zero unwritten area within block size(in this case, 6K~8K), when get_block callback return a map with buffer_new flag. This zero operation is also using direct io.
> So the buffer io zero operation in my design will not work at all in this case.It only works to zero the area beyond block size, but within cluster size. For example, when block size 4KB cluster size 1MB, a 4KB direct write will trigger a zero buffer page of size 1MB-4KB=1020KB.
> I think your question is this zero buffer page will conflict with the later direct io writing to the same area. The truth is conflict will not exist, because before direct write, all conflict buffer page will be flushed to disk first (in __generic_file_write_iter()).
How can it make sure the zero pages to be flushed to disk first? In
ocfs2_direct_IO, it calls ocfs2_dio_get_block which uses write_begin
and write_end, and then __blockdev_direct_IO.
I've backported your patch set to kernel 3.0 and tested with vhd-util,
and the result fails. The test case is below.
1) create a 1G dynamic vhd file, the actual size is 5K.
# vhd-util create -n test.vhd -s 1024
2) resize it to 4G, the actual size becomes to 11K
# vhd-util resize -n test.vhd -s 4096 -j test.log
3) hexdump the data, say hexdump1
4) umount to commit journal and mount again, and hexdump the data again,
say hexdump2, which is not equal to hexdump1.
I am not sure if there is any relations with kernel version, which
indeed has many differences due to refactoring.

Thanks,
Joseph

> BTW, there is a lot testcases to test the operations like buffer write, direct write, lseek.. (it's a mix of these operations) in ltp (Linux Test Project). This patch set has passed all of them. :)
>>
>>>> So this is protected by "UNWRITTEN" flag, right?
>>>>
>>>>>> 3) Do you have a test in case of lack of memory?
>>>>> I tested it in a system with 2GB memory. Is that enough?
>>>> What I mean is doing many direct io jobs in case system free memory is
>>>> low.
>>> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
>>> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
>>> 1. start 100 dd to do 4KB direct write:
>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>> MemTotal:        2809788 kB
>>> MemFree:           21824 kB
>>> Buffers:           55176 kB
>>> Cached:          2513968 kB
>>> Dirty:               412 kB
>>> Writeback:            36 kB
>>>
>>> 2. start 100 dd to do 4KB buffer write:
>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>> MemTotal:        2809788 kB
>>> MemFree:           22476 kB
>>> Buffers:           15696 kB
>>> Cached:          2544892 kB
>>> Dirty:            320136 kB
>>> Writeback:        146404 kB
>>>
>>> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
>>>
>>> Thanks,
>>> Ryan
>>>> Thanks,
>>>> Joesph
>>>>
>>>>> Thanks,
>>>>> Ryan

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-12-10 10:36             ` Joseph Qi
@ 2015-12-14  5:31               ` Ryan Ding
  2015-12-14 10:36                 ` Joseph Qi
  0 siblings, 1 reply; 22+ messages in thread
From: Ryan Ding @ 2015-12-14  5:31 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,

On 12/10/2015 06:36 PM, Joseph Qi wrote:
> Hi Ryan,
>
> On 2015/12/10 16:48, Ryan Ding wrote:
>> Hi Joseph,
>>
>> Thanks for your comments, please see my reply:
>>
>> On 12/10/2015 03:54 PM, Joseph Qi wrote:
>>> Hi Ryan,
>>>
>>> On 2015/10/12 14:34, Ryan Ding wrote:
>>>> Hi Joseph,
>>>>
>>>> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>>>>> Hi Ryan,
>>>>>
>>>>> On 2015/10/8 11:12, Ryan Ding wrote:
>>>>>> Hi Joseph,
>>>>>>
>>>>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>>>>> Hi Ryan,
>>>>>>> I have gone through this patch set and done a simple performance test
>>>>>>> using direct dd, it indeed brings much performance promotion.
>>>>>>>              Before      After
>>>>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>>>>
>>>>>>> My questions are:
>>>>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>>>>> consistency, am I right? From our test, it is the most complicated part
>>>>>>> and has many race cases to be taken consideration. So I wonder if this
>>>>>>> can be restructured.
>>>>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>>>>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>>>>> it is much more complicated than ext4.
>>>>> And fsck can only be used offline, but using orphan is to perform
>>>>> recovering online. So I don't think fsck can replace it in all cases.
>>>>>
>>>>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>>>>> io, it should be committed to disk by forcing committing journal. But
>>>>>>> journal committing will consume much time. Why does it bring performance
>>>>>>> promotion instead?
>>>>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
>>> I think we cannot mix zero pages with direct io here, which will lead
>>> to direct io data to be overwritten by zero pages.
>>> For example, a ocfs2 volume with block size 4K and cluster size 4K.
>>> Firstly I create a file with size of 5K and it will be allocated 2
>>> clusters (8K) and the last 3K without zeroed (no need at this time).
>> I think the last 3K will be zeroed no matter you use direct io or buffer io to create the a file with 5K.
>>> Then I seek to offset 9K and do direct write 1K, then back to 4K and do
>>> direct write 5K. Here we have to zero allocated space to avoid dirty
>>> data. But since direct write data goes to disk directly and zero pages
>>> depends on journal commit, so direct write data will be overwritten and
>>> file corrupts.
>> do_blockdev_direct_IO() will zero unwritten area within block size(in this case, 6K~8K), when get_block callback return a map with buffer_new flag. This zero operation is also using direct io.
>> So the buffer io zero operation in my design will not work at all in this case.It only works to zero the area beyond block size, but within cluster size. For example, when block size 4KB cluster size 1MB, a 4KB direct write will trigger a zero buffer page of size 1MB-4KB=1020KB.
>> I think your question is this zero buffer page will conflict with the later direct io writing to the same area. The truth is conflict will not exist, because before direct write, all conflict buffer page will be flushed to disk first (in __generic_file_write_iter()).
> How can it make sure the zero pages to be flushed to disk first? In
> ocfs2_direct_IO, it calls ocfs2_dio_get_block which uses write_begin
> and write_end, and then __blockdev_direct_IO.
> I've backported your patch set to kernel 3.0 and tested with vhd-util,
> and the result fails. The test case is below.
> 1) create a 1G dynamic vhd file, the actual size is 5K.
> # vhd-util create -n test.vhd -s 1024
> 2) resize it to 4G, the actual size becomes to 11K
> # vhd-util resize -n test.vhd -s 4096 -j test.log
> 3) hexdump the data, say hexdump1
> 4) umount to commit journal and mount again, and hexdump the data again,
> say hexdump2, which is not equal to hexdump1.
> I am not sure if there is any relations with kernel version, which
> indeed has many differences due to refactoring.
I have backported it to kernel 3.8, and run the scripts below (I think 
it's the same as your test):

     mount /dev/dm-1 /mnt
     pushd /mnt/
     rm test.vhd -f
     vhd-util create -n test.vhd -s 1024
     vhd-util resize -n test.vhd -s 4096 -j test.log
     hexdump test.vhd > ~/test.hex.1
     popd
     umount /mnt/
     mount /dev/dm-1 /mnt/
     hexdump /mnt/test.vhd > ~/test.hex.2
     umount /mnt

block size & cluster size are all 4K.
It shows there is no difference between test.hex.1 and test.hex.2. I 
think this issue is related to specified kernel version, so which 
version is your kernel? Please provide the backport patches if you wish :)

Thanks,
Ryan
>
> Thanks,
> Joseph
>
>> BTW, there is a lot testcases to test the operations like buffer write, direct write, lseek.. (it's a mix of these operations) in ltp (Linux Test Project). This patch set has passed all of them. :)
>>>>> So this is protected by "UNWRITTEN" flag, right?
>>>>>
>>>>>>> 3) Do you have a test in case of lack of memory?
>>>>>> I tested it in a system with 2GB memory. Is that enough?
>>>>> What I mean is doing many direct io jobs in case system free memory is
>>>>> low.
>>>> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
>>>> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
>>>> 1. start 100 dd to do 4KB direct write:
>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>> MemTotal:        2809788 kB
>>>> MemFree:           21824 kB
>>>> Buffers:           55176 kB
>>>> Cached:          2513968 kB
>>>> Dirty:               412 kB
>>>> Writeback:            36 kB
>>>>
>>>> 2. start 100 dd to do 4KB buffer write:
>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>> MemTotal:        2809788 kB
>>>> MemFree:           22476 kB
>>>> Buffers:           15696 kB
>>>> Cached:          2544892 kB
>>>> Dirty:            320136 kB
>>>> Writeback:        146404 kB
>>>>
>>>> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
>>>>
>>>> Thanks,
>>>> Ryan
>>>>> Thanks,
>>>>> Joesph
>>>>>
>>>>>> Thanks,
>>>>>> Ryan
>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-12-14  5:31               ` Ryan Ding
@ 2015-12-14 10:36                 ` Joseph Qi
  2015-12-16  1:39                   ` Ryan Ding
  0 siblings, 1 reply; 22+ messages in thread
From: Joseph Qi @ 2015-12-14 10:36 UTC (permalink / raw)
  To: ocfs2-devel

Hi Ryan,

On 2015/12/14 13:31, Ryan Ding wrote:
> Hi Joseph,
> 
> On 12/10/2015 06:36 PM, Joseph Qi wrote:
>> Hi Ryan,
>>
>> On 2015/12/10 16:48, Ryan Ding wrote:
>>> Hi Joseph,
>>>
>>> Thanks for your comments, please see my reply:
>>>
>>> On 12/10/2015 03:54 PM, Joseph Qi wrote:
>>>> Hi Ryan,
>>>>
>>>> On 2015/10/12 14:34, Ryan Ding wrote:
>>>>> Hi Joseph,
>>>>>
>>>>> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>>>>>> Hi Ryan,
>>>>>>
>>>>>> On 2015/10/8 11:12, Ryan Ding wrote:
>>>>>>> Hi Joseph,
>>>>>>>
>>>>>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>>>>>> Hi Ryan,
>>>>>>>> I have gone through this patch set and done a simple performance test
>>>>>>>> using direct dd, it indeed brings much performance promotion.
>>>>>>>>              Before      After
>>>>>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>>>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>>>>>
>>>>>>>> My questions are:
>>>>>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>>>>>> consistency, am I right? From our test, it is the most complicated part
>>>>>>>> and has many race cases to be taken consideration. So I wonder if this
>>>>>>>> can be restructured.
>>>>>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>>>>>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>>>>>> it is much more complicated than ext4.
>>>>>> And fsck can only be used offline, but using orphan is to perform
>>>>>> recovering online. So I don't think fsck can replace it in all cases.
>>>>>>
>>>>>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>>>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>>>>>> io, it should be committed to disk by forcing committing journal. But
>>>>>>>> journal committing will consume much time. Why does it bring performance
>>>>>>>> promotion instead?
>>>>>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
>>>> I think we cannot mix zero pages with direct io here, which will lead
>>>> to direct io data to be overwritten by zero pages.
>>>> For example, a ocfs2 volume with block size 4K and cluster size 4K.
>>>> Firstly I create a file with size of 5K and it will be allocated 2
>>>> clusters (8K) and the last 3K without zeroed (no need at this time).
>>> I think the last 3K will be zeroed no matter you use direct io or buffer io to create the a file with 5K.
>>>> Then I seek to offset 9K and do direct write 1K, then back to 4K and do
>>>> direct write 5K. Here we have to zero allocated space to avoid dirty
>>>> data. But since direct write data goes to disk directly and zero pages
>>>> depends on journal commit, so direct write data will be overwritten and
>>>> file corrupts.
>>> do_blockdev_direct_IO() will zero unwritten area within block size(in this case, 6K~8K), when get_block callback return a map with buffer_new flag. This zero operation is also using direct io.
>>> So the buffer io zero operation in my design will not work at all in this case.It only works to zero the area beyond block size, but within cluster size. For example, when block size 4KB cluster size 1MB, a 4KB direct write will trigger a zero buffer page of size 1MB-4KB=1020KB.
>>> I think your question is this zero buffer page will conflict with the later direct io writing to the same area. The truth is conflict will not exist, because before direct write, all conflict buffer page will be flushed to disk first (in __generic_file_write_iter()).
>> How can it make sure the zero pages to be flushed to disk first? In
>> ocfs2_direct_IO, it calls ocfs2_dio_get_block which uses write_begin
>> and write_end, and then __blockdev_direct_IO.
>> I've backported your patch set to kernel 3.0 and tested with vhd-util,
>> and the result fails. The test case is below.
>> 1) create a 1G dynamic vhd file, the actual size is 5K.
>> # vhd-util create -n test.vhd -s 1024
>> 2) resize it to 4G, the actual size becomes to 11K
>> # vhd-util resize -n test.vhd -s 4096 -j test.log
>> 3) hexdump the data, say hexdump1
>> 4) umount to commit journal and mount again, and hexdump the data again,
>> say hexdump2, which is not equal to hexdump1.
>> I am not sure if there is any relations with kernel version, which
>> indeed has many differences due to refactoring.
> I have backported it to kernel 3.8, and run the scripts below (I think it's the same as your test):
> 
>     mount /dev/dm-1 /mnt
>     pushd /mnt/
>     rm test.vhd -f
>     vhd-util create -n test.vhd -s 1024
>     vhd-util resize -n test.vhd -s 4096 -j test.log
>     hexdump test.vhd > ~/test.hex.1
>     popd
>     umount /mnt/
>     mount /dev/dm-1 /mnt/
>     hexdump /mnt/test.vhd > ~/test.hex.2
>     umount /mnt
> 
> block size & cluster size are all 4K.
> It shows there is no difference between test.hex.1 and test.hex.2. I think this issue is related to specified kernel version, so which version is your kernel? Please provide the backport patches if you wish :)
I am using kernel 3.0.93. But I think it have no relations with kernel.
In one direct io, use buffer to zero first and then do direct write, you
cannot make sure the order. In other words, direct io may goes to disk
first and then zero buffers. That's why I am using blkdev_issue_zeroout
to do this in my patches.
And I am using jbd2_journal_force_commit to get metadata go to disk at
the same time, which will make performance poorer than yours. It can be
removed if direct io's semantics does not require.

> 
> Thanks,
> Ryan
>>
>> Thanks,
>> Joseph
>>
>>> BTW, there is a lot testcases to test the operations like buffer write, direct write, lseek.. (it's a mix of these operations) in ltp (Linux Test Project). This patch set has passed all of them. :)
>>>>>> So this is protected by "UNWRITTEN" flag, right?
>>>>>>
>>>>>>>> 3) Do you have a test in case of lack of memory?
>>>>>>> I tested it in a system with 2GB memory. Is that enough?
>>>>>> What I mean is doing many direct io jobs in case system free memory is
>>>>>> low.
>>>>> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
>>>>> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
>>>>> 1. start 100 dd to do 4KB direct write:
>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>> MemTotal:        2809788 kB
>>>>> MemFree:           21824 kB
>>>>> Buffers:           55176 kB
>>>>> Cached:          2513968 kB
>>>>> Dirty:               412 kB
>>>>> Writeback:            36 kB
>>>>>
>>>>> 2. start 100 dd to do 4KB buffer write:
>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>> MemTotal:        2809788 kB
>>>>> MemFree:           22476 kB
>>>>> Buffers:           15696 kB
>>>>> Cached:          2544892 kB
>>>>> Dirty:            320136 kB
>>>>> Writeback:        146404 kB
>>>>>
>>>>> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
>>>>>
>>>>> Thanks,
>>>>> Ryan
>>>>>> Thanks,
>>>>>> Joesph
>>>>>>
>>>>>>> Thanks,
>>>>>>> Ryan
>>
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-12-14 10:36                 ` Joseph Qi
@ 2015-12-16  1:39                   ` Ryan Ding
  2015-12-16  2:26                     ` Joseph Qi
  0 siblings, 1 reply; 22+ messages in thread
From: Ryan Ding @ 2015-12-16  1:39 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,

On 12/14/2015 06:36 PM, Joseph Qi wrote:
> Hi Ryan,
>
> On 2015/12/14 13:31, Ryan Ding wrote:
>> Hi Joseph,
>>
>> On 12/10/2015 06:36 PM, Joseph Qi wrote:
>>> Hi Ryan,
>>>
>>> On 2015/12/10 16:48, Ryan Ding wrote:
>>>> Hi Joseph,
>>>>
>>>> Thanks for your comments, please see my reply:
>>>>
>>>> On 12/10/2015 03:54 PM, Joseph Qi wrote:
>>>>> Hi Ryan,
>>>>>
>>>>> On 2015/10/12 14:34, Ryan Ding wrote:
>>>>>> Hi Joseph,
>>>>>>
>>>>>> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>>>>>>> Hi Ryan,
>>>>>>>
>>>>>>> On 2015/10/8 11:12, Ryan Ding wrote:
>>>>>>>> Hi Joseph,
>>>>>>>>
>>>>>>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>>>>>>> Hi Ryan,
>>>>>>>>> I have gone through this patch set and done a simple performance test
>>>>>>>>> using direct dd, it indeed brings much performance promotion.
>>>>>>>>>               Before      After
>>>>>>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>>>>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>>>>>>
>>>>>>>>> My questions are:
>>>>>>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>>>>>>> consistency, am I right? From our test, it is the most complicated part
>>>>>>>>> and has many race cases to be taken consideration. So I wonder if this
>>>>>>>>> can be restructured.
>>>>>>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>>>>>>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>>>>>>> it is much more complicated than ext4.
>>>>>>> And fsck can only be used offline, but using orphan is to perform
>>>>>>> recovering online. So I don't think fsck can replace it in all cases.
>>>>>>>
>>>>>>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>>>>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>>>>>>> io, it should be committed to disk by forcing committing journal. But
>>>>>>>>> journal committing will consume much time. Why does it bring performance
>>>>>>>>> promotion instead?
>>>>>>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
>>>>> I think we cannot mix zero pages with direct io here, which will lead
>>>>> to direct io data to be overwritten by zero pages.
>>>>> For example, a ocfs2 volume with block size 4K and cluster size 4K.
>>>>> Firstly I create a file with size of 5K and it will be allocated 2
>>>>> clusters (8K) and the last 3K without zeroed (no need at this time).
>>>> I think the last 3K will be zeroed no matter you use direct io or buffer io to create the a file with 5K.
>>>>> Then I seek to offset 9K and do direct write 1K, then back to 4K and do
>>>>> direct write 5K. Here we have to zero allocated space to avoid dirty
>>>>> data. But since direct write data goes to disk directly and zero pages
>>>>> depends on journal commit, so direct write data will be overwritten and
>>>>> file corrupts.
>>>> do_blockdev_direct_IO() will zero unwritten area within block size(in this case, 6K~8K), when get_block callback return a map with buffer_new flag. This zero operation is also using direct io.
>>>> So the buffer io zero operation in my design will not work at all in this case.It only works to zero the area beyond block size, but within cluster size. For example, when block size 4KB cluster size 1MB, a 4KB direct write will trigger a zero buffer page of size 1MB-4KB=1020KB.
>>>> I think your question is this zero buffer page will conflict with the later direct io writing to the same area. The truth is conflict will not exist, because before direct write, all conflict buffer page will be flushed to disk first (in __generic_file_write_iter()).
>>> How can it make sure the zero pages to be flushed to disk first? In
>>> ocfs2_direct_IO, it calls ocfs2_dio_get_block which uses write_begin
>>> and write_end, and then __blockdev_direct_IO.
>>> I've backported your patch set to kernel 3.0 and tested with vhd-util,
>>> and the result fails. The test case is below.
>>> 1) create a 1G dynamic vhd file, the actual size is 5K.
>>> # vhd-util create -n test.vhd -s 1024
>>> 2) resize it to 4G, the actual size becomes to 11K
>>> # vhd-util resize -n test.vhd -s 4096 -j test.log
>>> 3) hexdump the data, say hexdump1
>>> 4) umount to commit journal and mount again, and hexdump the data again,
>>> say hexdump2, which is not equal to hexdump1.
>>> I am not sure if there is any relations with kernel version, which
>>> indeed has many differences due to refactoring.
>> I have backported it to kernel 3.8, and run the scripts below (I think it's the same as your test):
>>
>>      mount /dev/dm-1 /mnt
>>      pushd /mnt/
>>      rm test.vhd -f
>>      vhd-util create -n test.vhd -s 1024
>>      vhd-util resize -n test.vhd -s 4096 -j test.log
>>      hexdump test.vhd > ~/test.hex.1
>>      popd
>>      umount /mnt/
>>      mount /dev/dm-1 /mnt/
>>      hexdump /mnt/test.vhd > ~/test.hex.2
>>      umount /mnt
>>
>> block size & cluster size are all 4K.
>> It shows there is no difference between test.hex.1 and test.hex.2. I think this issue is related to specified kernel version, so which version is your kernel? Please provide the backport patches if you wish :)
> I am using kernel 3.0.93. But I think it have no relations with kernel.
> In one direct io, use buffer to zero first and then do direct write, you
> cannot make sure the order. In other words, direct io may goes to disk
> first and then zero buffers. That's why I am using blkdev_issue_zeroout
> to do this in my patches.
> And I am using jbd2_journal_force_commit to get metadata go to disk at
> the same time, which will make performance poorer than yours. It can be
> removed if direct io's semantics does not require.
As you can see, generic_file_direct_write() (it's in ocfs2's direct io 
code path) will make sure buffer page goes first, there is no chance 
that direct io and buffer io write to the same place at parallel.
And I have test it with ltp's diotest & aiodio test on both kernel 3.8 & 
4.2, there is no problem found. I think you have something wrong with 
the backport, will you try your test with the newest -mm tree?

Thanks,
Ryan
>
>> Thanks,
>> Ryan
>>> Thanks,
>>> Joseph
>>>
>>>> BTW, there is a lot testcases to test the operations like buffer write, direct write, lseek.. (it's a mix of these operations) in ltp (Linux Test Project). This patch set has passed all of them. :)
>>>>>>> So this is protected by "UNWRITTEN" flag, right?
>>>>>>>
>>>>>>>>> 3) Do you have a test in case of lack of memory?
>>>>>>>> I tested it in a system with 2GB memory. Is that enough?
>>>>>>> What I mean is doing many direct io jobs in case system free memory is
>>>>>>> low.
>>>>>> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
>>>>>> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
>>>>>> 1. start 100 dd to do 4KB direct write:
>>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>>> MemTotal:        2809788 kB
>>>>>> MemFree:           21824 kB
>>>>>> Buffers:           55176 kB
>>>>>> Cached:          2513968 kB
>>>>>> Dirty:               412 kB
>>>>>> Writeback:            36 kB
>>>>>>
>>>>>> 2. start 100 dd to do 4KB buffer write:
>>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>>> MemTotal:        2809788 kB
>>>>>> MemFree:           22476 kB
>>>>>> Buffers:           15696 kB
>>>>>> Cached:          2544892 kB
>>>>>> Dirty:            320136 kB
>>>>>> Writeback:        146404 kB
>>>>>>
>>>>>> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
>>>>>>
>>>>>> Thanks,
>>>>>> Ryan
>>>>>>> Thanks,
>>>>>>> Joesph
>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Ryan
>>>
>>
>> .
>>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-12-16  1:39                   ` Ryan Ding
@ 2015-12-16  2:26                     ` Joseph Qi
  2015-12-16  3:12                       ` Ryan Ding
  0 siblings, 1 reply; 22+ messages in thread
From: Joseph Qi @ 2015-12-16  2:26 UTC (permalink / raw)
  To: ocfs2-devel

On 2015/12/16 9:39, Ryan Ding wrote:
> Hi Joseph,
> 
> On 12/14/2015 06:36 PM, Joseph Qi wrote:
>> Hi Ryan,
>>
>> On 2015/12/14 13:31, Ryan Ding wrote:
>>> Hi Joseph,
>>>
>>> On 12/10/2015 06:36 PM, Joseph Qi wrote:
>>>> Hi Ryan,
>>>>
>>>> On 2015/12/10 16:48, Ryan Ding wrote:
>>>>> Hi Joseph,
>>>>>
>>>>> Thanks for your comments, please see my reply:
>>>>>
>>>>> On 12/10/2015 03:54 PM, Joseph Qi wrote:
>>>>>> Hi Ryan,
>>>>>>
>>>>>> On 2015/10/12 14:34, Ryan Ding wrote:
>>>>>>> Hi Joseph,
>>>>>>>
>>>>>>> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>>>>>>>> Hi Ryan,
>>>>>>>>
>>>>>>>> On 2015/10/8 11:12, Ryan Ding wrote:
>>>>>>>>> Hi Joseph,
>>>>>>>>>
>>>>>>>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>>>>>>>> Hi Ryan,
>>>>>>>>>> I have gone through this patch set and done a simple performance test
>>>>>>>>>> using direct dd, it indeed brings much performance promotion.
>>>>>>>>>>               Before      After
>>>>>>>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>>>>>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>>>>>>>
>>>>>>>>>> My questions are:
>>>>>>>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>>>>>>>> consistency, am I right? From our test, it is the most complicated part
>>>>>>>>>> and has many race cases to be taken consideration. So I wonder if this
>>>>>>>>>> can be restructured.
>>>>>>>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>>>>>>>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>>>>>>>> it is much more complicated than ext4.
>>>>>>>> And fsck can only be used offline, but using orphan is to perform
>>>>>>>> recovering online. So I don't think fsck can replace it in all cases.
>>>>>>>>
>>>>>>>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>>>>>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>>>>>>>> io, it should be committed to disk by forcing committing journal. But
>>>>>>>>>> journal committing will consume much time. Why does it bring performance
>>>>>>>>>> promotion instead?
>>>>>>>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
>>>>>> I think we cannot mix zero pages with direct io here, which will lead
>>>>>> to direct io data to be overwritten by zero pages.
>>>>>> For example, a ocfs2 volume with block size 4K and cluster size 4K.
>>>>>> Firstly I create a file with size of 5K and it will be allocated 2
>>>>>> clusters (8K) and the last 3K without zeroed (no need at this time).
>>>>> I think the last 3K will be zeroed no matter you use direct io or buffer io to create the a file with 5K.
>>>>>> Then I seek to offset 9K and do direct write 1K, then back to 4K and do
>>>>>> direct write 5K. Here we have to zero allocated space to avoid dirty
>>>>>> data. But since direct write data goes to disk directly and zero pages
>>>>>> depends on journal commit, so direct write data will be overwritten and
>>>>>> file corrupts.
>>>>> do_blockdev_direct_IO() will zero unwritten area within block size(in this case, 6K~8K), when get_block callback return a map with buffer_new flag. This zero operation is also using direct io.
>>>>> So the buffer io zero operation in my design will not work at all in this case.It only works to zero the area beyond block size, but within cluster size. For example, when block size 4KB cluster size 1MB, a 4KB direct write will trigger a zero buffer page of size 1MB-4KB=1020KB.
>>>>> I think your question is this zero buffer page will conflict with the later direct io writing to the same area. The truth is conflict will not exist, because before direct write, all conflict buffer page will be flushed to disk first (in __generic_file_write_iter()).
>>>> How can it make sure the zero pages to be flushed to disk first? In
>>>> ocfs2_direct_IO, it calls ocfs2_dio_get_block which uses write_begin
>>>> and write_end, and then __blockdev_direct_IO.
>>>> I've backported your patch set to kernel 3.0 and tested with vhd-util,
>>>> and the result fails. The test case is below.
>>>> 1) create a 1G dynamic vhd file, the actual size is 5K.
>>>> # vhd-util create -n test.vhd -s 1024
>>>> 2) resize it to 4G, the actual size becomes to 11K
>>>> # vhd-util resize -n test.vhd -s 4096 -j test.log
>>>> 3) hexdump the data, say hexdump1
>>>> 4) umount to commit journal and mount again, and hexdump the data again,
>>>> say hexdump2, which is not equal to hexdump1.
>>>> I am not sure if there is any relations with kernel version, which
>>>> indeed has many differences due to refactoring.
>>> I have backported it to kernel 3.8, and run the scripts below (I think it's the same as your test):
>>>
>>>      mount /dev/dm-1 /mnt
>>>      pushd /mnt/
>>>      rm test.vhd -f
>>>      vhd-util create -n test.vhd -s 1024
>>>      vhd-util resize -n test.vhd -s 4096 -j test.log
>>>      hexdump test.vhd > ~/test.hex.1
>>>      popd
>>>      umount /mnt/
>>>      mount /dev/dm-1 /mnt/
>>>      hexdump /mnt/test.vhd > ~/test.hex.2
>>>      umount /mnt
>>>
>>> block size & cluster size are all 4K.
>>> It shows there is no difference between test.hex.1 and test.hex.2. I think this issue is related to specified kernel version, so which version is your kernel? Please provide the backport patches if you wish :)
>> I am using kernel 3.0.93. But I think it have no relations with kernel.
>> In one direct io, use buffer to zero first and then do direct write, you
>> cannot make sure the order. In other words, direct io may goes to disk
>> first and then zero buffers. That's why I am using blkdev_issue_zeroout
>> to do this in my patches.
>> And I am using jbd2_journal_force_commit to get metadata go to disk at
>> the same time, which will make performance poorer than yours. It can be
>> removed if direct io's semantics does not require.
> As you can see, generic_file_direct_write() (it's in ocfs2's direct io code path) will make sure buffer page goes first, there is no chance that direct io and buffer io write to the same place at parallel.
> And I have test it with ltp's diotest & aiodio test on both kernel 3.8 & 4.2, there is no problem found. I think you have something wrong with the backport, will you try your test with the newest -mm tree?
IMO, generic_file_direct_write can make sure the former buffer data go
first. But what I am talking here is in the same direct io. In other words,
it is in the same mapping->a_ops->direct_IO call. And in ocfs2_direct_IO,
you call ocfs2_dio_get_block first which will use zero buffer and then
__blockdev_direct_IO. Here the order cannot be guaranteed.

Thanks,
Joseph

> 
> Thanks,
> Ryan
>>
>>> Thanks,
>>> Ryan
>>>> Thanks,
>>>> Joseph
>>>>
>>>>> BTW, there is a lot testcases to test the operations like buffer write, direct write, lseek.. (it's a mix of these operations) in ltp (Linux Test Project). This patch set has passed all of them. :)
>>>>>>>> So this is protected by "UNWRITTEN" flag, right?
>>>>>>>>
>>>>>>>>>> 3) Do you have a test in case of lack of memory?
>>>>>>>>> I tested it in a system with 2GB memory. Is that enough?
>>>>>>>> What I mean is doing many direct io jobs in case system free memory is
>>>>>>>> low.
>>>>>>> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
>>>>>>> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
>>>>>>> 1. start 100 dd to do 4KB direct write:
>>>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>>>> MemTotal:        2809788 kB
>>>>>>> MemFree:           21824 kB
>>>>>>> Buffers:           55176 kB
>>>>>>> Cached:          2513968 kB
>>>>>>> Dirty:               412 kB
>>>>>>> Writeback:            36 kB
>>>>>>>
>>>>>>> 2. start 100 dd to do 4KB buffer write:
>>>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>>>> MemTotal:        2809788 kB
>>>>>>> MemFree:           22476 kB
>>>>>>> Buffers:           15696 kB
>>>>>>> Cached:          2544892 kB
>>>>>>> Dirty:            320136 kB
>>>>>>> Writeback:        146404 kB
>>>>>>>
>>>>>>> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Ryan
>>>>>>>> Thanks,
>>>>>>>> Joesph
>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> Ryan
>>>>
>>>
>>> .
>>>
>>
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics
  2015-12-16  2:26                     ` Joseph Qi
@ 2015-12-16  3:12                       ` Ryan Ding
  0 siblings, 0 replies; 22+ messages in thread
From: Ryan Ding @ 2015-12-16  3:12 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,

On 12/16/2015 10:26 AM, Joseph Qi wrote:
> On 2015/12/16 9:39, Ryan Ding wrote:
>> Hi Joseph,
>>
>> On 12/14/2015 06:36 PM, Joseph Qi wrote:
>>> Hi Ryan,
>>>
>>> On 2015/12/14 13:31, Ryan Ding wrote:
>>>> Hi Joseph,
>>>>
>>>> On 12/10/2015 06:36 PM, Joseph Qi wrote:
>>>>> Hi Ryan,
>>>>>
>>>>> On 2015/12/10 16:48, Ryan Ding wrote:
>>>>>> Hi Joseph,
>>>>>>
>>>>>> Thanks for your comments, please see my reply:
>>>>>>
>>>>>> On 12/10/2015 03:54 PM, Joseph Qi wrote:
>>>>>>> Hi Ryan,
>>>>>>>
>>>>>>> On 2015/10/12 14:34, Ryan Ding wrote:
>>>>>>>> Hi Joseph,
>>>>>>>>
>>>>>>>> On 10/08/2015 02:13 PM, Joseph Qi wrote:
>>>>>>>>> Hi Ryan,
>>>>>>>>>
>>>>>>>>> On 2015/10/8 11:12, Ryan Ding wrote:
>>>>>>>>>> Hi Joseph,
>>>>>>>>>>
>>>>>>>>>> On 09/28/2015 06:20 PM, Joseph Qi wrote:
>>>>>>>>>>> Hi Ryan,
>>>>>>>>>>> I have gone through this patch set and done a simple performance test
>>>>>>>>>>> using direct dd, it indeed brings much performance promotion.
>>>>>>>>>>>                Before      After
>>>>>>>>>>> bs=4K    1.4 MB/s    5.0 MB/s
>>>>>>>>>>> bs=256k  40.5 MB/s   56.3 MB/s
>>>>>>>>>>>
>>>>>>>>>>> My questions are:
>>>>>>>>>>> 1) You solution is still using orphan dir to keep inode and allocation
>>>>>>>>>>> consistency, am I right? From our test, it is the most complicated part
>>>>>>>>>>> and has many race cases to be taken consideration. So I wonder if this
>>>>>>>>>>> can be restructured.
>>>>>>>>>> I have not got a better idea to do this. I think the only reason why direct io using orphan is to prevent space lost when system crash during append direct write. But maybe a 'fsck -f' will do that job. Is it necessary to use orphan?
>>>>>>>>> The idea is taken from ext4, but since ocfs2 is cluster filesystem, so
>>>>>>>>> it is much more complicated than ext4.
>>>>>>>>> And fsck can only be used offline, but using orphan is to perform
>>>>>>>>> recovering online. So I don't think fsck can replace it in all cases.
>>>>>>>>>
>>>>>>>>>>> 2) Rather than using normal block direct io, you introduce a way to use
>>>>>>>>>>> write begin/end in buffer io. IMO, if it wants to perform like direct
>>>>>>>>>>> io, it should be committed to disk by forcing committing journal. But
>>>>>>>>>>> journal committing will consume much time. Why does it bring performance
>>>>>>>>>>> promotion instead?
>>>>>>>>>> I use buffer io to write only the zero pages. Actual data payload is written as direct io. I think there is no need to do a force commit. Because direct means "Try to minimize cache effects of the I/O to and from this file.", it does not means "write all data & meta data to disk before write return".
>>>>>>> I think we cannot mix zero pages with direct io here, which will lead
>>>>>>> to direct io data to be overwritten by zero pages.
>>>>>>> For example, a ocfs2 volume with block size 4K and cluster size 4K.
>>>>>>> Firstly I create a file with size of 5K and it will be allocated 2
>>>>>>> clusters (8K) and the last 3K without zeroed (no need at this time).
>>>>>> I think the last 3K will be zeroed no matter you use direct io or buffer io to create the a file with 5K.
>>>>>>> Then I seek to offset 9K and do direct write 1K, then back to 4K and do
>>>>>>> direct write 5K. Here we have to zero allocated space to avoid dirty
>>>>>>> data. But since direct write data goes to disk directly and zero pages
>>>>>>> depends on journal commit, so direct write data will be overwritten and
>>>>>>> file corrupts.
>>>>>> do_blockdev_direct_IO() will zero unwritten area within block size(in this case, 6K~8K), when get_block callback return a map with buffer_new flag. This zero operation is also using direct io.
>>>>>> So the buffer io zero operation in my design will not work at all in this case.It only works to zero the area beyond block size, but within cluster size. For example, when block size 4KB cluster size 1MB, a 4KB direct write will trigger a zero buffer page of size 1MB-4KB=1020KB.
>>>>>> I think your question is this zero buffer page will conflict with the later direct io writing to the same area. The truth is conflict will not exist, because before direct write, all conflict buffer page will be flushed to disk first (in __generic_file_write_iter()).
>>>>> How can it make sure the zero pages to be flushed to disk first? In
>>>>> ocfs2_direct_IO, it calls ocfs2_dio_get_block which uses write_begin
>>>>> and write_end, and then __blockdev_direct_IO.
>>>>> I've backported your patch set to kernel 3.0 and tested with vhd-util,
>>>>> and the result fails. The test case is below.
>>>>> 1) create a 1G dynamic vhd file, the actual size is 5K.
>>>>> # vhd-util create -n test.vhd -s 1024
>>>>> 2) resize it to 4G, the actual size becomes to 11K
>>>>> # vhd-util resize -n test.vhd -s 4096 -j test.log
>>>>> 3) hexdump the data, say hexdump1
>>>>> 4) umount to commit journal and mount again, and hexdump the data again,
>>>>> say hexdump2, which is not equal to hexdump1.
>>>>> I am not sure if there is any relations with kernel version, which
>>>>> indeed has many differences due to refactoring.
>>>> I have backported it to kernel 3.8, and run the scripts below (I think it's the same as your test):
>>>>
>>>>       mount /dev/dm-1 /mnt
>>>>       pushd /mnt/
>>>>       rm test.vhd -f
>>>>       vhd-util create -n test.vhd -s 1024
>>>>       vhd-util resize -n test.vhd -s 4096 -j test.log
>>>>       hexdump test.vhd > ~/test.hex.1
>>>>       popd
>>>>       umount /mnt/
>>>>       mount /dev/dm-1 /mnt/
>>>>       hexdump /mnt/test.vhd > ~/test.hex.2
>>>>       umount /mnt
>>>>
>>>> block size & cluster size are all 4K.
>>>> It shows there is no difference between test.hex.1 and test.hex.2. I think this issue is related to specified kernel version, so which version is your kernel? Please provide the backport patches if you wish :)
>>> I am using kernel 3.0.93. But I think it have no relations with kernel.
>>> In one direct io, use buffer to zero first and then do direct write, you
>>> cannot make sure the order. In other words, direct io may goes to disk
>>> first and then zero buffers. That's why I am using blkdev_issue_zeroout
>>> to do this in my patches.
>>> And I am using jbd2_journal_force_commit to get metadata go to disk at
>>> the same time, which will make performance poorer than yours. It can be
>>> removed if direct io's semantics does not require.
>> As you can see, generic_file_direct_write() (it's in ocfs2's direct io code path) will make sure buffer page goes first, there is no chance that direct io and buffer io write to the same place at parallel.
>> And I have test it with ltp's diotest & aiodio test on both kernel 3.8 & 4.2, there is no problem found. I think you have something wrong with the backport, will you try your test with the newest -mm tree?
> IMO, generic_file_direct_write can make sure the former buffer data go
> first. But what I am talking here is in the same direct io. In other words,
> it is in the same mapping->a_ops->direct_IO call. And in ocfs2_direct_IO,
> you call ocfs2_dio_get_block first which will use zero buffer and then
> __blockdev_direct_IO. Here the order cannot be guaranteed.
ocfs2_dio_get_block will only fill zero pages outside of the direct io 
request. Because the bh_result argument (which is passed from 
__blockdev_direct_IO()) has the total length of the direct io request, 
so we can avoid to fill those area with zero buffer page.
So, there should be no overlap between zero area and direct io area in 
the same direct_IO call.

Thanks,
Ryan
>
> Thanks,
> Joseph
>
>> Thanks,
>> Ryan
>>>> Thanks,
>>>> Ryan
>>>>> Thanks,
>>>>> Joseph
>>>>>
>>>>>> BTW, there is a lot testcases to test the operations like buffer write, direct write, lseek.. (it's a mix of these operations) in ltp (Linux Test Project). This patch set has passed all of them. :)
>>>>>>>>> So this is protected by "UNWRITTEN" flag, right?
>>>>>>>>>
>>>>>>>>>>> 3) Do you have a test in case of lack of memory?
>>>>>>>>>> I tested it in a system with 2GB memory. Is that enough?
>>>>>>>>> What I mean is doing many direct io jobs in case system free memory is
>>>>>>>>> low.
>>>>>>>> I understand what you mean, but did not find a better way to test it. Since if free memory is too low, even the process can not be started. If free memory is fairlyenough, the test has no meaning.
>>>>>>>> So I try to collect the memory usage during io, and do a comparison test with buffer io. The result is:
>>>>>>>> 1. start 100 dd to do 4KB direct write:
>>>>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>>>>> MemTotal:        2809788 kB
>>>>>>>> MemFree:           21824 kB
>>>>>>>> Buffers:           55176 kB
>>>>>>>> Cached:          2513968 kB
>>>>>>>> Dirty:               412 kB
>>>>>>>> Writeback:            36 kB
>>>>>>>>
>>>>>>>> 2. start 100 dd to do 4KB buffer write:
>>>>>>>> [root at hnode3 ~]# cat /proc/meminfo | grep -E "^Cached|^Dirty|^MemFree|^MemTotal|^Buffers|^Writeback:"
>>>>>>>> MemTotal:        2809788 kB
>>>>>>>> MemFree:           22476 kB
>>>>>>>> Buffers:           15696 kB
>>>>>>>> Cached:          2544892 kB
>>>>>>>> Dirty:            320136 kB
>>>>>>>> Writeback:        146404 kB
>>>>>>>>
>>>>>>>> You can see from the 'Dirty' and 'Writeback' field that there is not so much memory used as buffer io. So I think what you concern is no longer exist. :-)
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Ryan
>>>>>>>>> Thanks,
>>>>>>>>> Joesph
>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>> Ryan
>>>> .
>>>>
>>
>> .
>>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2015-12-16  3:12 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-09-11  8:19 [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 1/8] ocfs2: add ocfs2_write_type_t type to identify the caller of write Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 2/8] ocfs2: use c_new to indicate newly allocated extents Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 3/8] ocfs2: test target page before change it Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 4/8] ocfs2: do not change i_size in write_end for direct io Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 5/8] ocfs2: return the physical address in ocfs2_write_cluster Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 6/8] ocfs2: record UNWRITTEN extents when populate write desc Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 7/8] ocfs2: fix sparse file & data ordering issue in direct io Ryan Ding
2015-09-11  8:19 ` [Ocfs2-devel] [PATCH 8/8] ocfs2: code clean up for " Ryan Ding
2015-09-28 10:20 ` [Ocfs2-devel] [PATCH 0/8] ocfs2: fix ocfs2 direct io code patch to support sparse file and data ordering semantics Joseph Qi
2015-10-08  3:12   ` Ryan Ding
2015-10-08  6:13     ` Joseph Qi
2015-10-08  7:13       ` Ryan Ding
2015-10-12  6:34       ` Ryan Ding
2015-12-10  7:54         ` Joseph Qi
2015-12-10  8:48           ` Ryan Ding
2015-12-10 10:36             ` Joseph Qi
2015-12-14  5:31               ` Ryan Ding
2015-12-14 10:36                 ` Joseph Qi
2015-12-16  1:39                   ` Ryan Ding
2015-12-16  2:26                     ` Joseph Qi
2015-12-16  3:12                       ` Ryan Ding

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.