All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] f2fs: support file defragment
@ 2015-10-22 11:59 Chao Yu
  2015-10-22 18:11 ` Jaegeuk Kim
  0 siblings, 1 reply; 5+ messages in thread
From: Chao Yu @ 2015-10-22 11:59 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-f2fs-devel, linux-kernel

This patch introduces a new ioctl, F2FS_IOC_DEFRAGMENT, to support file
defragmentation in a specified range of a regular file.

This ioctl is intended for a very limited workload: if the user expects
high sequential read performance from a randomly written file, this
interface can be used to defragment it, after which the file's blocks are
laid out as contiguously as possible on the device.

Meanwhile, it has a side effect: it will leave holes in the segments where
the blocks were originally located, so it's better to trigger GC afterwards
to eliminate fragmentation in those segments.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
---
 fs/f2fs/data.c |   6 +-
 fs/f2fs/f2fs.h |   8 +++
 fs/f2fs/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 213 insertions(+), 1 deletion(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 972eab7..5bb375a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -566,7 +566,7 @@ out:
  *     b. do not use extent cache for better performance
  *     c. give the block addresses to blockdev
  */
-static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 						int create, int flag)
 {
 	unsigned int maxblocks = map->m_len;
@@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 			available_free_memory(sbi, DIRTY_DENTS))
 		goto skip_write;
 
+	/* skip writing during file defragment */
+	if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
+		goto skip_write;
+
 	/* during POR, we don't need to trigger writepage at all. */
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto skip_write;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9db5500..068813c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -234,6 +234,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
 #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
+#define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
 		_IOR('f', 19, struct f2fs_encryption_policy)
@@ -260,6 +261,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
 #endif
 
+struct f2fs_defragment {
+	u64 start;
+	u64 len;
+};
+
 /*
  * For INODE and NODE manager
  */
@@ -1416,6 +1422,7 @@ enum {
 	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
 	FI_INLINE_DOTS,		/* indicate inline dot dentries */
+	FI_DO_DEFRAG,		/* indicate defragment is running */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1847,6 +1854,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
 struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct f2fs_io_info *);
+int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
 void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
 int f2fs_release_page(struct page *, gfp_t);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index a197215..ad59694 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1646,6 +1646,204 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
 	return 0;
 }
 
+static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
+					struct file *filp,
+					struct f2fs_defragment *range)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_map_blocks map;
+	struct extent_info ei;
+	pgoff_t pg_start, pg_end;
+	unsigned int blk_per_seg = 1 << sbi->log_blocks_per_seg;
+	unsigned int total = 0, sec_num;
+	unsigned int pages_per_sec = sbi->segs_per_sec *
+					(1 << sbi->log_blocks_per_seg);
+	block_t blk_end = 0;
+	bool fragmented = false;
+	int err = 0;
+
+	pg_start = range->start >> PAGE_CACHE_SHIFT;
+	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
+
+	f2fs_balance_fs(sbi);
+
+	mutex_lock(&inode->i_mutex);
+
+	/* writeback all dirty pages in the range */
+	err = filemap_write_and_wait_range(inode->i_mapping, range->start,
+						range->start + range->len);
+	if (err)
+		goto out;
+
+	/*
+	 * lookup mapping info in extent cache, skip defragmenting if physical
+	 * block addresses are continuous.
+	 */
+	if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+		if (ei.fofs + ei.len >= pg_end)
+			goto out;
+	}
+
+	map.m_lblk = pg_start;
+	map.m_len = pg_end - pg_start;
+
+	/*
+	 * lookup mapping info in dnode page cache, skip defragmenting if all
+	 * physical block addresses are continuous even if there are hole(s)
+	 * in logical blocks.
+	 */
+	while (map.m_lblk < pg_end) {
+		map.m_flags = 0;
+		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+		if (err)
+			goto out;
+
+		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+			map.m_lblk++;
+			map.m_len--;
+			continue;
+		}
+
+		if (blk_end && blk_end != map.m_pblk) {
+			fragmented = true;
+			break;
+		}
+		blk_end = map.m_pblk + map.m_len;
+
+		map.m_lblk += map.m_len;
+		map.m_len = pg_end - map.m_lblk;
+	}
+
+	if (!fragmented)
+		goto out;
+
+	map.m_lblk = pg_start;
+	map.m_len = pg_end - pg_start;
+
+	sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+
+	if (has_not_enough_free_secs(sbi, sec_num))
+		goto out;
+
+	while (map.m_lblk < pg_end) {
+		pgoff_t idx;
+		int cnt = 0;
+
+do_map:
+		map.m_flags = 0;
+		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+		if (err)
+			goto out;
+
+		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+			map.m_lblk++;
+			continue;
+		}
+
+		set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+		idx = map.m_lblk;
+		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+			struct address_space *mapping = inode->i_mapping;
+			struct page *page;
+
+			page = find_or_create_page(mapping, idx, GFP_NOFS);
+			if (!page) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			f2fs_wait_on_page_writeback(page, DATA);
+
+			if (!PageUptodate(page)) {
+				err = mapping->a_ops->readpage(filp, page);
+				if (unlikely(err)) {
+					f2fs_put_page(page, 0);
+					goto out;
+				}
+
+				lock_page_killable(page);
+
+				if (!PageUptodate(page)) {
+					f2fs_put_page(page, 1);
+					err = -EIO;
+					goto out;
+				}
+			}
+			set_page_dirty(page);
+			f2fs_put_page(page, 1);
+
+			idx++;
+			cnt++;
+			total++;
+		}
+
+		map.m_lblk = idx;
+		map.m_len = pg_end - idx;
+
+		if (idx < pg_end && cnt < blk_per_seg)
+			goto do_map;
+
+		clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+		err = filemap_fdatawrite(inode->i_mapping);
+		if (err)
+			goto out;
+	}
+out:
+	mutex_unlock(&inode->i_mutex);
+	if (!err)
+		range->len = (u64)total << PAGE_CACHE_SHIFT;
+	return err;
+}
+
+static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_defragment range;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	err = mnt_want_write_file(filp);
+	if (err)
+		return err;
+
+	if (f2fs_readonly(sbi->sb)) {
+		err = -EROFS;
+		goto out;
+	}
+
+	if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
+							sizeof(range))) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	/* verify alignment of offset & size */
+	if (range.start & (F2FS_BLKSIZE - 1) ||
+		range.len & (F2FS_BLKSIZE - 1)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = f2fs_defragment_range(sbi, filp, &range);
+	if (err < 0)
+		goto out;
+
+	if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
+							sizeof(range)))
+		err = -EFAULT;
+out:
+	mnt_drop_write_file(filp);
+	return err;
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1679,6 +1877,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_gc(filp, arg);
 	case F2FS_IOC_WRITE_CHECKPOINT:
 		return f2fs_ioc_write_checkpoint(filp, arg);
+	case F2FS_IOC_DEFRAGMENT:
+		return f2fs_ioc_defragment(filp, arg);
 	default:
 		return -ENOTTY;
 	}
-- 
2.6.1

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] f2fs: support file defragment
  2015-10-22 11:59 [PATCH] f2fs: support file defragment Chao Yu
@ 2015-10-22 18:11 ` Jaegeuk Kim
  2015-10-23  6:57     ` Chao Yu
  0 siblings, 1 reply; 5+ messages in thread
From: Jaegeuk Kim @ 2015-10-22 18:11 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-f2fs-devel, linux-kernel

Hi,

On Thu, Oct 22, 2015 at 07:59:14PM +0800, Chao Yu wrote:
> This patch introduces a new ioctl F2FS_IOC_DEFRAGMENT to support file
> defragment in a specified range of regular file.
> 
> This ioctl can be used in very limited workload: if user expects high
> sequential read performance in randomly written file, this interface
> can be used for defragmentation, after that file can be written as
> continuous as possible in the device.
> 
> Meanwhile, it has side-effect, it will make holes in segments where
> blocks located originally, so it's better to trigger GC to eliminate
> fragment in segments.
> 
> Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> ---
>  fs/f2fs/data.c |   6 +-
>  fs/f2fs/f2fs.h |   8 +++
>  fs/f2fs/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 213 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 972eab7..5bb375a 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -566,7 +566,7 @@ out:
>   *     b. do not use extent cache for better performance
>   *     c. give the block addresses to blockdev
>   */
> -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
>  						int create, int flag)
>  {
>  	unsigned int maxblocks = map->m_len;
> @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
>  			available_free_memory(sbi, DIRTY_DENTS))
>  		goto skip_write;
>  
> +	/* skip writing during file defragment */
> +	if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
> +		goto skip_write;
> +
>  	/* during POR, we don't need to trigger writepage at all. */
>  	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
>  		goto skip_write;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 9db5500..068813c 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -234,6 +234,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
>  #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
>  #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
>  #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
> +#define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
>  
>  #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
>  		_IOR('f', 19, struct f2fs_encryption_policy)
> @@ -260,6 +261,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
>  #define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
>  #endif
>  
> +struct f2fs_defragment {
> +	u64 start;
> +	u64 len;
> +};
> +
>  /*
>   * For INODE and NODE manager
>   */
> @@ -1416,6 +1422,7 @@ enum {
>  	FI_DROP_CACHE,		/* drop dirty page cache */
>  	FI_DATA_EXIST,		/* indicate data exists */
>  	FI_INLINE_DOTS,		/* indicate inline dot dentries */
> +	FI_DO_DEFRAG,		/* indicate defragment is running */
>  };
>  
>  static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
> @@ -1847,6 +1854,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
>  struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
>  struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
>  int do_write_data_page(struct f2fs_io_info *);
> +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
>  int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
>  void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
>  int f2fs_release_page(struct page *, gfp_t);
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index a197215..ad59694 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -1646,6 +1646,204 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
>  	return 0;
>  }
>  
> +static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
> +					struct file *filp,
> +					struct f2fs_defragment *range)
> +{
> +	struct inode *inode = file_inode(filp);
> +	struct f2fs_map_blocks map;
> +	struct extent_info ei;
> +	pgoff_t pg_start, pg_end;
> +	unsigned int blk_per_seg = 1 << sbi->log_blocks_per_seg;
> +	unsigned int total = 0, sec_num;
> +	unsigned int pages_per_sec = sbi->segs_per_sec *
> +					(1 << sbi->log_blocks_per_seg);
> +	block_t blk_end = 0;
> +	bool fragmented = false;
> +	int err = 0;
> +
> +	pg_start = range->start >> PAGE_CACHE_SHIFT;
> +	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
> +
> +	f2fs_balance_fs(sbi);
> +
> +	mutex_lock(&inode->i_mutex);
> +
> +	/* writeback all dirty pages in the range */
> +	err = filemap_write_and_wait_range(inode->i_mapping, range->start,
> +						range->start + range->len);
> +	if (err)
> +		goto out;
> +
> +	/*
> +	 * lookup mapping info in extent cache, skip defragmenting if physical
> +	 * block addresses are continuous.
> +	 */
> +	if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
> +		if (ei.fofs + ei.len >= pg_end)
> +			goto out;
> +	}
> +
> +	map.m_lblk = pg_start;
> +	map.m_len = pg_end - pg_start;
> +
> +	/*
> +	 * lookup mapping info in dnode page cache, skip defragmenting if all
> +	 * physical block addresses are continuous even if there are hole(s)
> +	 * in logical blocks.
> +	 */
> +	while (map.m_lblk < pg_end) {
> +		map.m_flags = 0;
> +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);

How about using f2fs_fiemap to get the extent information?

> +		if (err)
> +			goto out;
> +
> +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> +			map.m_lblk++;
> +			map.m_len--;
> +			continue;
> +		}
> +
> +		if (blk_end && blk_end != map.m_pblk) {
> +			fragmented = true;
> +			break;
> +		}
> +		blk_end = map.m_pblk + map.m_len;
> +
> +		map.m_lblk += map.m_len;
> +		map.m_len = pg_end - map.m_lblk;
> +	}
> +
> +	if (!fragmented)
> +		goto out;
> +
> +	map.m_lblk = pg_start;
> +	map.m_len = pg_end - pg_start;
> +
> +	sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
> +
> +	if (has_not_enough_free_secs(sbi, sec_num))

Later, ->writepage will handle this?

> +		goto out;
> +
> +	while (map.m_lblk < pg_end) {
> +		pgoff_t idx;
> +		int cnt = 0;

What about this?

	for_each_extents(extent_info) {
		page = get_lock_data_page(inode, idx, true);

		set_page_dirty(page);
	}
	filemap_fdatawrite();

Thanks,

> +
> +do_map:
> +		map.m_flags = 0;
> +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> +		if (err)
> +			goto out;
> +
> +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> +			map.m_lblk++;
> +			continue;
> +		}
> +
> +		set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> +
> +		idx = map.m_lblk;
> +		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
> +			struct address_space *mapping = inode->i_mapping;
> +			struct page *page;
> +
> +			page = find_or_create_page(mapping, idx, GFP_NOFS);
> +			if (!page) {
> +				err = -ENOMEM;
> +				goto out;
> +			}
> +
> +			f2fs_wait_on_page_writeback(page, DATA);
> +
> +			if (!PageUptodate(page)) {
> +				err = mapping->a_ops->readpage(filp, page);
> +				if (unlikely(err)) {
> +					f2fs_put_page(page, 0);
> +					goto out;
> +				}
> +
> +				lock_page_killable(page);
> +
> +				if (!PageUptodate(page)) {
> +					f2fs_put_page(page, 1);
> +					err = -EIO;
> +					goto out;
> +				}
> +			}
> +			set_page_dirty(page);
> +			f2fs_put_page(page, 1);
> +
> +			idx++;
> +			cnt++;
> +			total++;
> +		}
> +
> +		map.m_lblk = idx;
> +		map.m_len = pg_end - idx;
> +
> +		if (idx < pg_end && cnt < blk_per_seg)
> +			goto do_map;
> +
> +		clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> +
> +		err = filemap_fdatawrite(inode->i_mapping);
> +		if (err)
> +			goto out;
> +	}
> +out:
> +	mutex_unlock(&inode->i_mutex);
> +	if (!err)
> +		range->len = (u64)total << PAGE_CACHE_SHIFT;
> +	return err;
> +}
> +
> +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
> +{
> +	struct inode *inode = file_inode(filp);
> +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> +	struct f2fs_defragment range;
> +	int err;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	if (!S_ISREG(inode->i_mode))
> +		return -EINVAL;
> +
> +	err = mnt_want_write_file(filp);
> +	if (err)
> +		return err;
> +
> +	if (f2fs_readonly(sbi->sb)) {
> +		err = -EROFS;
> +		goto out;
> +	}
> +
> +	if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
> +							sizeof(range))) {
> +		err = -EFAULT;
> +		goto out;
> +	}
> +
> +	/* verify alignment of offset & size */
> +	if (range.start & (F2FS_BLKSIZE - 1) ||
> +		range.len & (F2FS_BLKSIZE - 1)) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	err = f2fs_defragment_range(sbi, filp, &range);
> +	if (err < 0)
> +		goto out;
> +
> +	if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
> +							sizeof(range)))
> +		err = -EFAULT;
> +out:
> +	mnt_drop_write_file(filp);
> +	return err;
> +}
> +
>  long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  {
>  	switch (cmd) {
> @@ -1679,6 +1877,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  		return f2fs_ioc_gc(filp, arg);
>  	case F2FS_IOC_WRITE_CHECKPOINT:
>  		return f2fs_ioc_write_checkpoint(filp, arg);
> +	case F2FS_IOC_DEFRAGMENT:
> +		return f2fs_ioc_defragment(filp, arg);
>  	default:
>  		return -ENOTTY;
>  	}
> -- 
> 2.6.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] f2fs: support file defragment
  2015-10-22 18:11 ` Jaegeuk Kim
@ 2015-10-23  6:57     ` Chao Yu
  0 siblings, 0 replies; 5+ messages in thread
From: Chao Yu @ 2015-10-23  6:57 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Friday, October 23, 2015 2:12 AM
> To: Chao Yu
> Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] f2fs: support file defragment
> 
> Hi,
> 
> On Thu, Oct 22, 2015 at 07:59:14PM +0800, Chao Yu wrote:
> > This patch introduces a new ioctl F2FS_IOC_DEFRAGMENT to support file
> > defragment in a specified range of regular file.
> >
> > This ioctl can be used in very limited workload: if user expects high
> > sequential read performance in randomly written file, this interface
> > can be used for defragmentation, after that file can be written as
> > continuous as possible in the device.
> >
> > Meanwhile, it has side-effect, it will make holes in segments where
> > blocks located originally, so it's better to trigger GC to eliminate
> > fragment in segments.
> >
> > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > ---
> >  fs/f2fs/data.c |   6 +-
> >  fs/f2fs/f2fs.h |   8 +++
> >  fs/f2fs/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 213 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > index 972eab7..5bb375a 100644
> > --- a/fs/f2fs/data.c
> > +++ b/fs/f2fs/data.c
> > @@ -566,7 +566,7 @@ out:
> >   *     b. do not use extent cache for better performance
> >   *     c. give the block addresses to blockdev
> >   */
> > -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> > +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> >  						int create, int flag)
> >  {
> >  	unsigned int maxblocks = map->m_len;
> > @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
> >  			available_free_memory(sbi, DIRTY_DENTS))
> >  		goto skip_write;
> >
> > +	/* skip writing during file defragment */
> > +	if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
> > +		goto skip_write;
> > +
> >  	/* during POR, we don't need to trigger writepage at all. */
> >  	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
> >  		goto skip_write;
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 9db5500..068813c 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -234,6 +234,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
> int size,
> >  #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
> >  #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
> >  #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
> > +#define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
> >
> >  #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
> >  		_IOR('f', 19, struct f2fs_encryption_policy)
> > @@ -260,6 +261,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
> int size,
> >  #define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
> >  #endif
> >
> > +struct f2fs_defragment {
> > +	u64 start;
> > +	u64 len;
> > +};
> > +
> >  /*
> >   * For INODE and NODE manager
> >   */
> > @@ -1416,6 +1422,7 @@ enum {
> >  	FI_DROP_CACHE,		/* drop dirty page cache */
> >  	FI_DATA_EXIST,		/* indicate data exists */
> >  	FI_INLINE_DOTS,		/* indicate inline dot dentries */
> > +	FI_DO_DEFRAG,		/* indicate defragment is running */
> >  };
> >
> >  static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
> > @@ -1847,6 +1854,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
> >  struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
> >  struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
> >  int do_write_data_page(struct f2fs_io_info *);
> > +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
> >  int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
> >  void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
> >  int f2fs_release_page(struct page *, gfp_t);
> > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > index a197215..ad59694 100644
> > --- a/fs/f2fs/file.c
> > +++ b/fs/f2fs/file.c
> > @@ -1646,6 +1646,204 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long
> arg)
> >  	return 0;
> >  }
> >
> > +static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
> > +					struct file *filp,
> > +					struct f2fs_defragment *range)
> > +{
> > +	struct inode *inode = file_inode(filp);
> > +	struct f2fs_map_blocks map;
> > +	struct extent_info ei;
> > +	pgoff_t pg_start, pg_end;
> > +	unsigned int blk_per_seg = 1 << sbi->log_blocks_per_seg;
> > +	unsigned int total = 0, sec_num;
> > +	unsigned int pages_per_sec = sbi->segs_per_sec *
> > +					(1 << sbi->log_blocks_per_seg);
> > +	block_t blk_end = 0;
> > +	bool fragmented = false;
> > +	int err = 0;
> > +
> > +	pg_start = range->start >> PAGE_CACHE_SHIFT;
> > +	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
> > +
> > +	f2fs_balance_fs(sbi);
> > +
> > +	mutex_lock(&inode->i_mutex);
> > +
> > +	/* writeback all dirty pages in the range */
> > +	err = filemap_write_and_wait_range(inode->i_mapping, range->start,
> > +						range->start + range->len);
> > +	if (err)
> > +		goto out;
> > +
> > +	/*
> > +	 * lookup mapping info in extent cache, skip defragmenting if physical
> > +	 * block addresses are continuous.
> > +	 */
> > +	if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
> > +		if (ei.fofs + ei.len >= pg_end)
> > +			goto out;
> > +	}
> > +
> > +	map.m_lblk = pg_start;
> > +	map.m_len = pg_end - pg_start;
> > +
> > +	/*
> > +	 * lookup mapping info in dnode page cache, skip defragmenting if all
> > +	 * physical block addresses are continuous even if there are hole(s)
> > +	 * in logical blocks.
> > +	 */
> > +	while (map.m_lblk < pg_end) {
> > +		map.m_flags = 0;
> > +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> 
> How about using f2fs_fiemap to get the extent information?

Hmm, if we use f2fs_fiemap, we will incur an unneeded memset & copy_to_user
in fiemap_fill_next_extent. Also, struct fiemap_extent is designed for use in
userspace, and fi_extents_start in struct fiemap_extent_info is a __user
pointer, so we'd better avoid allocating such a type in the kernel, right?
Otherwise it looks very weird. So how about we keep using f2fs_map_blocks,
as its call path is the shortest and it has no copying overhead?

> 
> > +		if (err)
> > +			goto out;
> > +
> > +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> > +			map.m_lblk++;
> > +			map.m_len--;
> > +			continue;
> > +		}
> > +
> > +		if (blk_end && blk_end != map.m_pblk) {
> > +			fragmented = true;
> > +			break;
> > +		}
> > +		blk_end = map.m_pblk + map.m_len;
> > +
> > +		map.m_lblk += map.m_len;
> > +		map.m_len = pg_end - map.m_lblk;
> > +	}
> > +
> > +	if (!fragmented)
> > +		goto out;
> > +
> > +	map.m_lblk = pg_start;
> > +	map.m_len = pg_end - pg_start;
> > +
> > +	sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
> > +
> > +	if (has_not_enough_free_secs(sbi, sec_num))
> 
> Later, ->writepage will handle this?

Right, my intention here is that defragmenting should only be executed on
a partition with low fragmentation, so that the blocks finally end up located
in contiguous segments; this also decreases the chance of writing back
pages in SSR mode.

Moreover, we should check the IPU policy to ensure that defragmentation will
actually work.

> 
> > +		goto out;
> > +
> > +	while (map.m_lblk < pg_end) {
> > +		pgoff_t idx;
> > +		int cnt = 0;
> 
> What about this?
> 
> 	for_each_extents(extent_info) {
> 		page = get_lock_data_page(inode, idx, true);
> 
> 		set_page_dirty(page);
> 	}
> 	filemap_fdatawrite();

Yeah, more neat! I will change this. :)

Thanks,

> 
> Thanks,
> 
> > +
> > +do_map:
> > +		map.m_flags = 0;
> > +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> > +		if (err)
> > +			goto out;
> > +
> > +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> > +			map.m_lblk++;
> > +			continue;
> > +		}
> > +
> > +		set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> > +
> > +		idx = map.m_lblk;
> > +		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
> > +			struct address_space *mapping = inode->i_mapping;
> > +			struct page *page;
> > +
> > +			page = find_or_create_page(mapping, idx, GFP_NOFS);
> > +			if (!page) {
> > +				err = -ENOMEM;
> > +				goto out;
> > +			}
> > +
> > +			f2fs_wait_on_page_writeback(page, DATA);
> > +
> > +			if (!PageUptodate(page)) {
> > +				err = mapping->a_ops->readpage(filp, page);
> > +				if (unlikely(err)) {
> > +					f2fs_put_page(page, 0);
> > +					goto out;
> > +				}
> > +
> > +				lock_page_killable(page);
> > +
> > +				if (!PageUptodate(page)) {
> > +					f2fs_put_page(page, 1);
> > +					err = -EIO;
> > +					goto out;
> > +				}
> > +			}
> > +			set_page_dirty(page);
> > +			f2fs_put_page(page, 1);
> > +
> > +			idx++;
> > +			cnt++;
> > +			total++;
> > +		}
> > +
> > +		map.m_lblk = idx;
> > +		map.m_len = pg_end - idx;
> > +
> > +		if (idx < pg_end && cnt < blk_per_seg)
> > +			goto do_map;
> > +
> > +		clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> > +
> > +		err = filemap_fdatawrite(inode->i_mapping);
> > +		if (err)
> > +			goto out;
> > +	}
> > +out:
> > +	mutex_unlock(&inode->i_mutex);
> > +	if (!err)
> > +		range->len = (u64)total << PAGE_CACHE_SHIFT;
> > +	return err;
> > +}
> > +
> > +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
> > +{
> > +	struct inode *inode = file_inode(filp);
> > +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > +	struct f2fs_defragment range;
> > +	int err;
> > +
> > +	if (!capable(CAP_SYS_ADMIN))
> > +		return -EPERM;
> > +
> > +	if (!S_ISREG(inode->i_mode))
> > +		return -EINVAL;
> > +
> > +	err = mnt_want_write_file(filp);
> > +	if (err)
> > +		return err;
> > +
> > +	if (f2fs_readonly(sbi->sb)) {
> > +		err = -EROFS;
> > +		goto out;
> > +	}
> > +
> > +	if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
> > +							sizeof(range))) {
> > +		err = -EFAULT;
> > +		goto out;
> > +	}
> > +
> > +	/* verify alignment of offset & size */
> > +	if (range.start & (F2FS_BLKSIZE - 1) ||
> > +		range.len & (F2FS_BLKSIZE - 1)) {
> > +		err = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	err = f2fs_defragment_range(sbi, filp, &range);
> > +	if (err < 0)
> > +		goto out;
> > +
> > +	if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
> > +							sizeof(range)))
> > +		err = -EFAULT;
> > +out:
> > +	mnt_drop_write_file(filp);
> > +	return err;
> > +}
> > +
> >  long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> >  {
> >  	switch (cmd) {
> > @@ -1679,6 +1877,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> >  		return f2fs_ioc_gc(filp, arg);
> >  	case F2FS_IOC_WRITE_CHECKPOINT:
> >  		return f2fs_ioc_write_checkpoint(filp, arg);
> > +	case F2FS_IOC_DEFRAGMENT:
> > +		return f2fs_ioc_defragment(filp, arg);
> >  	default:
> >  		return -ENOTTY;
> >  	}
> > --
> > 2.6.1


------------------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] f2fs: support file defragment
@ 2015-10-23  6:57     ` Chao Yu
  0 siblings, 0 replies; 5+ messages in thread
From: Chao Yu @ 2015-10-23  6:57 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Friday, October 23, 2015 2:12 AM
> To: Chao Yu
> Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] f2fs: support file defragment
> 
> Hi,
> 
> On Thu, Oct 22, 2015 at 07:59:14PM +0800, Chao Yu wrote:
> > This patch introduces a new ioctl F2FS_IOC_DEFRAGMENT to support file
> > defragment in a specified range of regular file.
> >
> > This ioctl can be used in very limited workload: if user expects high
> > sequential read performance in randomly written file, this interface
> > can be used for defragmentation, after that file can be written as
> > continuous as possible in the device.
> >
> > Meanwhile, it has side-effect, it will make holes in segments where
> > blocks located originally, so it's better to trigger GC to eliminate
> > fragment in segments.
> >
> > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > ---
> >  fs/f2fs/data.c |   6 +-
> >  fs/f2fs/f2fs.h |   8 +++
> >  fs/f2fs/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 213 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > index 972eab7..5bb375a 100644
> > --- a/fs/f2fs/data.c
> > +++ b/fs/f2fs/data.c
> > @@ -566,7 +566,7 @@ out:
> >   *     b. do not use extent cache for better performance
> >   *     c. give the block addresses to blockdev
> >   */
> > -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> > +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> >  						int create, int flag)
> >  {
> >  	unsigned int maxblocks = map->m_len;
> > @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
> >  			available_free_memory(sbi, DIRTY_DENTS))
> >  		goto skip_write;
> >
> > +	/* skip writing during file defragment */
> > +	if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
> > +		goto skip_write;
> > +
> >  	/* during POR, we don't need to trigger writepage at all. */
> >  	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
> >  		goto skip_write;
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 9db5500..068813c 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -234,6 +234,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
> int size,
> >  #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
> >  #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
> >  #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
> > +#define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
> >
> >  #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
> >  		_IOR('f', 19, struct f2fs_encryption_policy)
> > @@ -260,6 +261,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
> int size,
> >  #define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
> >  #endif
> >
> > +struct f2fs_defragment {
> > +	u64 start;
> > +	u64 len;
> > +};
> > +
> >  /*
> >   * For INODE and NODE manager
> >   */
> > @@ -1416,6 +1422,7 @@ enum {
> >  	FI_DROP_CACHE,		/* drop dirty page cache */
> >  	FI_DATA_EXIST,		/* indicate data exists */
> >  	FI_INLINE_DOTS,		/* indicate inline dot dentries */
> > +	FI_DO_DEFRAG,		/* indicate defragment is running */
> >  };
> >
> >  static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
> > @@ -1847,6 +1854,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
> >  struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
> >  struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
> >  int do_write_data_page(struct f2fs_io_info *);
> > +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
> >  int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
> >  void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
> >  int f2fs_release_page(struct page *, gfp_t);
> > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > index a197215..ad59694 100644
> > --- a/fs/f2fs/file.c
> > +++ b/fs/f2fs/file.c
> > @@ -1646,6 +1646,204 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long
> arg)
> >  	return 0;
> >  }
> >
> > +static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
> > +					struct file *filp,
> > +					struct f2fs_defragment *range)
> > +{
> > +	struct inode *inode = file_inode(filp);
> > +	struct f2fs_map_blocks map;
> > +	struct extent_info ei;
> > +	pgoff_t pg_start, pg_end;
> > +	unsigned int blk_per_seg = 1 << sbi->log_blocks_per_seg;
> > +	unsigned int total = 0, sec_num;
> > +	unsigned int pages_per_sec = sbi->segs_per_sec *
> > +					(1 << sbi->log_blocks_per_seg);
> > +	block_t blk_end = 0;
> > +	bool fragmented = false;
> > +	int err = 0;
> > +
> > +	pg_start = range->start >> PAGE_CACHE_SHIFT;
> > +	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
> > +
> > +	f2fs_balance_fs(sbi);
> > +
> > +	mutex_lock(&inode->i_mutex);
> > +
> > +	/* writeback all dirty pages in the range */
> > +	err = filemap_write_and_wait_range(inode->i_mapping, range->start,
> > +						range->start + range->len);
> > +	if (err)
> > +		goto out;
> > +
> > +	/*
> > +	 * lookup mapping info in extent cache, skip defragmenting if physical
> > +	 * block addresses are continuous.
> > +	 */
> > +	if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
> > +		if (ei.fofs + ei.len >= pg_end)
> > +			goto out;
> > +	}
> > +
> > +	map.m_lblk = pg_start;
> > +	map.m_len = pg_end - pg_start;
> > +
> > +	/*
> > +	 * lookup mapping info in dnode page cache, skip defragmenting if all
> > +	 * physical block addresses are continuous even if there are hole(s)
> > +	 * in logical blocks.
> > +	 */
> > +	while (map.m_lblk < pg_end) {
> > +		map.m_flags = 0;
> > +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> 
> How about using f2fs_fiemap to get the extent information?

Hmm, if we use f2fs_fiemap, we will incur an unneeded memset & copy_to_user
in fiemap_fill_next_extent. Also, struct fiemap_extent is designed for use in
userspace: fi_extents_start, which points to struct fiemap_extent, is declared
as a __user pointer, so we'd better avoid allocating such a type in the kernel,
right? Otherwise it looks very weird. So how about we keep using
f2fs_map_blocks, as its call path is the shortest and it has no copying overhead?

> 
> > +		if (err)
> > +			goto out;
> > +
> > +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> > +			map.m_lblk++;
> > +			map.m_len--;
> > +			continue;
> > +		}
> > +
> > +		if (blk_end && blk_end != map.m_pblk) {
> > +			fragmented = true;
> > +			break;
> > +		}
> > +		blk_end = map.m_pblk + map.m_len;
> > +
> > +		map.m_lblk += map.m_len;
> > +		map.m_len = pg_end - map.m_lblk;
> > +	}
> > +
> > +	if (!fragmented)
> > +		goto out;
> > +
> > +	map.m_lblk = pg_start;
> > +	map.m_len = pg_end - pg_start;
> > +
> > +	sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
> > +
> > +	if (has_not_enough_free_secs(sbi, sec_num))
> 
> Later, ->writepage will handle this?

Right, my intention here is that defragmenting should be executed on a
partition with low fragmentation, so that the blocks finally end up located
in continuous segments; this also decreases the chance of writing back
pages in SSR mode.

Moreover, we should check the IPU policy to ensure that defragmentation will
actually work.

> 
> > +		goto out;
> > +
> > +	while (map.m_lblk < pg_end) {
> > +		pgoff_t idx;
> > +		int cnt = 0;
> 
> What about this?
> 
> 	for_each_extents(extent_info) {
> 		page = get_lock_data_page(inode, idx, true);
> 
> 		set_page_dirty(page);
> 	}
> 	filemap_fdatawrite();

Yeah, that is neater! I will change this. :)

Thanks,

> 
> Thanks,
> 
> > +
> > +do_map:
> > +		map.m_flags = 0;
> > +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> > +		if (err)
> > +			goto out;
> > +
> > +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> > +			map.m_lblk++;
> > +			continue;
> > +		}
> > +
> > +		set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> > +
> > +		idx = map.m_lblk;
> > +		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
> > +			struct address_space *mapping = inode->i_mapping;
> > +			struct page *page;
> > +
> > +			page = find_or_create_page(mapping, idx, GFP_NOFS);
> > +			if (!page) {
> > +				err = -ENOMEM;
> > +				goto out;
> > +			}
> > +
> > +			f2fs_wait_on_page_writeback(page, DATA);
> > +
> > +			if (!PageUptodate(page)) {
> > +				err = mapping->a_ops->readpage(filp, page);
> > +				if (unlikely(err)) {
> > +					f2fs_put_page(page, 0);
> > +					goto out;
> > +				}
> > +
> > +				lock_page_killable(page);
> > +
> > +				if (!PageUptodate(page)) {
> > +					f2fs_put_page(page, 1);
> > +					err = -EIO;
> > +					goto out;
> > +				}
> > +			}
> > +			set_page_dirty(page);
> > +			f2fs_put_page(page, 1);
> > +
> > +			idx++;
> > +			cnt++;
> > +			total++;
> > +		}
> > +
> > +		map.m_lblk = idx;
> > +		map.m_len = pg_end - idx;
> > +
> > +		if (idx < pg_end && cnt < blk_per_seg)
> > +			goto do_map;
> > +
> > +		clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> > +
> > +		err = filemap_fdatawrite(inode->i_mapping);
> > +		if (err)
> > +			goto out;
> > +	}
> > +out:
> > +	mutex_unlock(&inode->i_mutex);
> > +	if (!err)
> > +		range->len = (u64)total << PAGE_CACHE_SHIFT;
> > +	return err;
> > +}
> > +
> > +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
> > +{
> > +	struct inode *inode = file_inode(filp);
> > +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > +	struct f2fs_defragment range;
> > +	int err;
> > +
> > +	if (!capable(CAP_SYS_ADMIN))
> > +		return -EPERM;
> > +
> > +	if (!S_ISREG(inode->i_mode))
> > +		return -EINVAL;
> > +
> > +	err = mnt_want_write_file(filp);
> > +	if (err)
> > +		return err;
> > +
> > +	if (f2fs_readonly(sbi->sb)) {
> > +		err = -EROFS;
> > +		goto out;
> > +	}
> > +
> > +	if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
> > +							sizeof(range))) {
> > +		err = -EFAULT;
> > +		goto out;
> > +	}
> > +
> > +	/* verify alignment of offset & size */
> > +	if (range.start & (F2FS_BLKSIZE - 1) ||
> > +		range.len & (F2FS_BLKSIZE - 1)) {
> > +		err = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	err = f2fs_defragment_range(sbi, filp, &range);
> > +	if (err < 0)
> > +		goto out;
> > +
> > +	if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
> > +							sizeof(range)))
> > +		err = -EFAULT;
> > +out:
> > +	mnt_drop_write_file(filp);
> > +	return err;
> > +}
> > +
> >  long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> >  {
> >  	switch (cmd) {
> > @@ -1679,6 +1877,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> >  		return f2fs_ioc_gc(filp, arg);
> >  	case F2FS_IOC_WRITE_CHECKPOINT:
> >  		return f2fs_ioc_write_checkpoint(filp, arg);
> > +	case F2FS_IOC_DEFRAGMENT:
> > +		return f2fs_ioc_defragment(filp, arg);
> >  	default:
> >  		return -ENOTTY;
> >  	}
> > --
> > 2.6.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] f2fs: support file defragment
  2015-10-23  6:57     ` Chao Yu
  (?)
@ 2015-10-26 23:15     ` Jaegeuk Kim
  -1 siblings, 0 replies; 5+ messages in thread
From: Jaegeuk Kim @ 2015-10-26 23:15 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-f2fs-devel, linux-kernel

Hi,

On Fri, Oct 23, 2015 at 02:57:03PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Friday, October 23, 2015 2:12 AM
> > To: Chao Yu
> > Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH] f2fs: support file defragment
> > 
> > Hi,
> > 
> > On Thu, Oct 22, 2015 at 07:59:14PM +0800, Chao Yu wrote:
> > > This patch introduces a new ioctl F2FS_IOC_DEFRAGMENT to support file
> > > defragment in a specified range of regular file.
> > >
> > > This ioctl can be used in very limited workload: if user expects high
> > > sequential read performance in randomly written file, this interface
> > > can be used for defragmentation, after that file can be written as
> > > continuous as possible in the device.
> > >
> > > Meanwhile, it has side-effect, it will make holes in segments where
> > > blocks located originally, so it's better to trigger GC to eliminate
> > > fragment in segments.
> > >
> > > Signed-off-by: Chao Yu <chao2.yu@samsung.com>
> > > ---
> > >  fs/f2fs/data.c |   6 +-
> > >  fs/f2fs/f2fs.h |   8 +++
> > >  fs/f2fs/file.c | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  3 files changed, 213 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > > index 972eab7..5bb375a 100644
> > > --- a/fs/f2fs/data.c
> > > +++ b/fs/f2fs/data.c
> > > @@ -566,7 +566,7 @@ out:
> > >   *     b. do not use extent cache for better performance
> > >   *     c. give the block addresses to blockdev
> > >   */
> > > -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> > > +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
> > >  						int create, int flag)
> > >  {
> > >  	unsigned int maxblocks = map->m_len;
> > > @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
> > >  			available_free_memory(sbi, DIRTY_DENTS))
> > >  		goto skip_write;
> > >
> > > +	/* skip writing during file defragment */
> > > +	if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
> > > +		goto skip_write;
> > > +
> > >  	/* during POR, we don't need to trigger writepage at all. */
> > >  	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
> > >  		goto skip_write;
> > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > > index 9db5500..068813c 100644
> > > --- a/fs/f2fs/f2fs.h
> > > +++ b/fs/f2fs/f2fs.h
> > > @@ -234,6 +234,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
> > int size,
> > >  #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
> > >  #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
> > >  #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
> > > +#define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
> > >
> > >  #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
> > >  		_IOR('f', 19, struct f2fs_encryption_policy)
> > > @@ -260,6 +261,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
> > int size,
> > >  #define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
> > >  #endif
> > >
> > > +struct f2fs_defragment {
> > > +	u64 start;
> > > +	u64 len;
> > > +};
> > > +
> > >  /*
> > >   * For INODE and NODE manager
> > >   */
> > > @@ -1416,6 +1422,7 @@ enum {
> > >  	FI_DROP_CACHE,		/* drop dirty page cache */
> > >  	FI_DATA_EXIST,		/* indicate data exists */
> > >  	FI_INLINE_DOTS,		/* indicate inline dot dentries */
> > > +	FI_DO_DEFRAG,		/* indicate defragment is running */
> > >  };
> > >
> > >  static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
> > > @@ -1847,6 +1854,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
> > >  struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
> > >  struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
> > >  int do_write_data_page(struct f2fs_io_info *);
> > > +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
> > >  int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
> > >  void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
> > >  int f2fs_release_page(struct page *, gfp_t);
> > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > index a197215..ad59694 100644
> > > --- a/fs/f2fs/file.c
> > > +++ b/fs/f2fs/file.c
> > > @@ -1646,6 +1646,204 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long
> > arg)
> > >  	return 0;
> > >  }
> > >
> > > +static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
> > > +					struct file *filp,
> > > +					struct f2fs_defragment *range)
> > > +{
> > > +	struct inode *inode = file_inode(filp);
> > > +	struct f2fs_map_blocks map;
> > > +	struct extent_info ei;
> > > +	pgoff_t pg_start, pg_end;
> > > +	unsigned int blk_per_seg = 1 << sbi->log_blocks_per_seg;
> > > +	unsigned int total = 0, sec_num;
> > > +	unsigned int pages_per_sec = sbi->segs_per_sec *
> > > +					(1 << sbi->log_blocks_per_seg);
> > > +	block_t blk_end = 0;
> > > +	bool fragmented = false;
> > > +	int err = 0;
> > > +
> > > +	pg_start = range->start >> PAGE_CACHE_SHIFT;
> > > +	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
> > > +
> > > +	f2fs_balance_fs(sbi);
> > > +
> > > +	mutex_lock(&inode->i_mutex);
> > > +
> > > +	/* writeback all dirty pages in the range */
> > > +	err = filemap_write_and_wait_range(inode->i_mapping, range->start,
> > > +						range->start + range->len);
> > > +	if (err)
> > > +		goto out;
> > > +
> > > +	/*
> > > +	 * lookup mapping info in extent cache, skip defragmenting if physical
> > > +	 * block addresses are continuous.
> > > +	 */
> > > +	if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
> > > +		if (ei.fofs + ei.len >= pg_end)
> > > +			goto out;
> > > +	}
> > > +
> > > +	map.m_lblk = pg_start;
> > > +	map.m_len = pg_end - pg_start;
> > > +
> > > +	/*
> > > +	 * lookup mapping info in dnode page cache, skip defragmenting if all
> > > +	 * physical block addresses are continuous even if there are hole(s)
> > > +	 * in logical blocks.
> > > +	 */
> > > +	while (map.m_lblk < pg_end) {
> > > +		map.m_flags = 0;
> > > +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> > 
> > How about using f2fs_fiemap to get the extent information?
> 
> Hmm, if we use f2fs_fiemap, we will encounter unneeded memset & copy_to_user
> in fiemap_fill_next_extent, and struct fiemap_extent is designed for using in
> usersapce, fi_extents_start in struct fiemap_extent has __user * in this
> type, we'd better to avoid allocating such type in kernel, right? otherwise
> it looks very weird. So how about keeping using f2fs_map_blocks? as its call
> path is shortest and has no copying overhead.

Agreed. :)

Thanks,

> 
> > 
> > > +		if (err)
> > > +			goto out;
> > > +
> > > +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> > > +			map.m_lblk++;
> > > +			map.m_len--;
> > > +			continue;
> > > +		}
> > > +
> > > +		if (blk_end && blk_end != map.m_pblk) {
> > > +			fragmented = true;
> > > +			break;
> > > +		}
> > > +		blk_end = map.m_pblk + map.m_len;
> > > +
> > > +		map.m_lblk += map.m_len;
> > > +		map.m_len = pg_end - map.m_lblk;
> > > +	}
> > > +
> > > +	if (!fragmented)
> > > +		goto out;
> > > +
> > > +	map.m_lblk = pg_start;
> > > +	map.m_len = pg_end - pg_start;
> > > +
> > > +	sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
> > > +
> > > +	if (has_not_enough_free_secs(sbi, sec_num))
> > 
> > Later, ->writepage will handle this?
> 
> Right, my intention here is that I hope defragmenting will be executed in
> a low fragmented partition, it will be good that finally blocks are locating
> in continuous segments, and also this can decrease the chance to writeback
> pages in SSR mode.
> 
> Moreover we should check IPU policy to insure defragment will actually
> works.
> 
> > 
> > > +		goto out;
> > > +
> > > +	while (map.m_lblk < pg_end) {
> > > +		pgoff_t idx;
> > > +		int cnt = 0;
> > 
> > What about this?
> > 
> > 	for_each_extents(extent_info) {
> > 		page = get_lock_data_page(inode, idx, true);
> > 
> > 		set_page_dirty(page);
> > 	}
> > 	filemap_fdatawrite();
> 
> Yeah, more neat! I will change this. :)
> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > > +
> > > +do_map:
> > > +		map.m_flags = 0;
> > > +		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
> > > +		if (err)
> > > +			goto out;
> > > +
> > > +		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
> > > +			map.m_lblk++;
> > > +			continue;
> > > +		}
> > > +
> > > +		set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> > > +
> > > +		idx = map.m_lblk;
> > > +		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
> > > +			struct address_space *mapping = inode->i_mapping;
> > > +			struct page *page;
> > > +
> > > +			page = find_or_create_page(mapping, idx, GFP_NOFS);
> > > +			if (!page) {
> > > +				err = -ENOMEM;
> > > +				goto out;
> > > +			}
> > > +
> > > +			f2fs_wait_on_page_writeback(page, DATA);
> > > +
> > > +			if (!PageUptodate(page)) {
> > > +				err = mapping->a_ops->readpage(filp, page);
> > > +				if (unlikely(err)) {
> > > +					f2fs_put_page(page, 0);
> > > +					goto out;
> > > +				}
> > > +
> > > +				lock_page_killable(page);
> > > +
> > > +				if (!PageUptodate(page)) {
> > > +					f2fs_put_page(page, 1);
> > > +					err = -EIO;
> > > +					goto out;
> > > +				}
> > > +			}
> > > +			set_page_dirty(page);
> > > +			f2fs_put_page(page, 1);
> > > +
> > > +			idx++;
> > > +			cnt++;
> > > +			total++;
> > > +		}
> > > +
> > > +		map.m_lblk = idx;
> > > +		map.m_len = pg_end - idx;
> > > +
> > > +		if (idx < pg_end && cnt < blk_per_seg)
> > > +			goto do_map;
> > > +
> > > +		clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
> > > +
> > > +		err = filemap_fdatawrite(inode->i_mapping);
> > > +		if (err)
> > > +			goto out;
> > > +	}
> > > +out:
> > > +	mutex_unlock(&inode->i_mutex);
> > > +	if (!err)
> > > +		range->len = (u64)total << PAGE_CACHE_SHIFT;
> > > +	return err;
> > > +}
> > > +
> > > +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
> > > +{
> > > +	struct inode *inode = file_inode(filp);
> > > +	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > > +	struct f2fs_defragment range;
> > > +	int err;
> > > +
> > > +	if (!capable(CAP_SYS_ADMIN))
> > > +		return -EPERM;
> > > +
> > > +	if (!S_ISREG(inode->i_mode))
> > > +		return -EINVAL;
> > > +
> > > +	err = mnt_want_write_file(filp);
> > > +	if (err)
> > > +		return err;
> > > +
> > > +	if (f2fs_readonly(sbi->sb)) {
> > > +		err = -EROFS;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
> > > +							sizeof(range))) {
> > > +		err = -EFAULT;
> > > +		goto out;
> > > +	}
> > > +
> > > +	/* verify alignment of offset & size */
> > > +	if (range.start & (F2FS_BLKSIZE - 1) ||
> > > +		range.len & (F2FS_BLKSIZE - 1)) {
> > > +		err = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	err = f2fs_defragment_range(sbi, filp, &range);
> > > +	if (err < 0)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
> > > +							sizeof(range)))
> > > +		err = -EFAULT;
> > > +out:
> > > +	mnt_drop_write_file(filp);
> > > +	return err;
> > > +}
> > > +
> > >  long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > >  {
> > >  	switch (cmd) {
> > > @@ -1679,6 +1877,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > >  		return f2fs_ioc_gc(filp, arg);
> > >  	case F2FS_IOC_WRITE_CHECKPOINT:
> > >  		return f2fs_ioc_write_checkpoint(filp, arg);
> > > +	case F2FS_IOC_DEFRAGMENT:
> > > +		return f2fs_ioc_defragment(filp, arg);
> > >  	default:
> > >  		return -ENOTTY;
> > >  	}
> > > --
> > > 2.6.1

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-10-26 23:15 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-22 11:59 [PATCH] f2fs: support file defragment Chao Yu
2015-10-22 18:11 ` Jaegeuk Kim
2015-10-23  6:57   ` Chao Yu
2015-10-23  6:57     ` Chao Yu
2015-10-26 23:15     ` Jaegeuk Kim

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.