[Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support.

ocfs2-devel.oss.oracle.com archive mirror
 help / color / mirror / Atom feed

* [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support.
@ 2011-05-06  9:23 Tao Ma
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Tao Ma @ 2011-05-06  9:23 UTC (permalink / raw)
  To: ocfs2-devel

Hi all,
	These are the patches for adding batched discard support in ocfs2. I
have tested them with xfstests 251 and they passed.

By the way, I have also run some tests against them (bonnie++, postmark, ffsb
and fs_mark) and there is no big difference before and after the discard.

Regards,
Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-05-06  9:23 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
@ 2011-05-06  9:27 ` Tao Ma
  2011-05-09 23:02   ` Sunil Mushran
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 2/3] ocfs2: Add FITRIM ioctl Tao Ma
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 3/3] ocfs2: Add trace event for trim Tao Ma
  2 siblings, 1 reply; 15+ messages in thread
From: Tao Ma @ 2011-05-06  9:27 UTC (permalink / raw)
  To: ocfs2-devel

From: Tao Ma <boyu.mt@taobao.com>

Add ocfs2_trim_fs to support trimming freed clusters in the
volume. A range will be given and all the freed clusters greater
than minlen will be discarded to the block layer.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/alloc.c |  156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h |    1 +
 2 files changed, 157 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7..93a3f92 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -7184,3 +7185,158 @@ out_commit:
 out:
 	return ret;
 }
+
+static int ocfs2_trim_extent(struct super_block *sb,
+			     struct ocfs2_group_desc *gd,
+			     int start, int count)
+{
+	u64 discard;
+
+	count = ocfs2_clusters_to_blocks(sb, count);
+	discard = le64_to_cpu(gd->bg_blkno) +
+			ocfs2_clusters_to_blocks(sb, start);
+
+	return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+			    struct ocfs2_group_desc *gd,
+			    int start, int max, int minbits)
+{
+	int ret = 0, count = 0, next;
+	void *bitmap = gd->bg_bitmap;
+
+	while (start < max) {
+		start = ocfs2_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = ocfs2_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minbits) {
+			ret = ocfs2_trim_extent(sb, gd,
+						start, next - start);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+			break;
+	}
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 start, len, minlen, trimmed, first_group, last_group, group;
+	int ret, cnt, first_bit, last_bit;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_dinode *main_bm;
+	struct ocfs2_group_desc *gd = NULL;
+
+	start = range->start >> osb->s_clustersize_bits;
+	len = range->len >> osb->s_clustersize_bits;
+	minlen = range->minlen >> osb->s_clustersize_bits;
+	trimmed = 0;
+
+	if (!len) {
+		range->len = 0;
+		return 0;
+	}
+
+	if (minlen >= osb->bitmap_cpg)
+		return -EINVAL;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	if (start >= le32_to_cpu(main_bm->i_clusters)) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	if (start + len > le32_to_cpu(main_bm->i_clusters))
+		len = le32_to_cpu(main_bm->i_clusters) - start;
+
+	/* Determine first and last group to examine based on start and len */
+	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+	if (first_group == osb->first_cluster_group_blkno)
+		first_bit = start;
+	else
+		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+	last_bit = osb->bitmap_cpg;
+
+	for (group = first_group; group <= last_group;) {
+		if (first_bit + len >= osb->bitmap_cpg)
+			last_bit = osb->bitmap_cpg;
+		else
+			last_bit = first_bit + len;
+
+		ret = ocfs2_read_group_descriptor(main_bm_inode,
+						  main_bm, group,
+						  &gd_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+		brelse(gd_bh);
+		gd_bh = NULL;
+		if (cnt < 0) {
+			ret = cnt;
+			mlog_errno(ret);
+			break;
+		}
+
+		trimmed += cnt;
+		len -= osb->bitmap_cpg - first_bit;
+		first_bit = 0;
+		if (group == osb->first_cluster_group_blkno)
+			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+		else
+			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+	}
+	range->len = trimmed * sb->s_blocksize;
+out_unlock:
+	ocfs2_inode_unlock(main_bm_inode, 0);
+	brelse(main_bm_bh);
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+out:
+	return ret;
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a0..ca381c5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
 		    struct buffer_head **leaf_bh);
 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
 
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
 /*
  * Helper function to look at the # of clusters in an extent record.
  */
-- 
1.6.3.GIT

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
@ 2011-05-09 23:02   ` Sunil Mushran
  2011-05-10  3:14     ` Tao Ma
  0 siblings, 1 reply; 15+ messages in thread
From: Sunil Mushran @ 2011-05-09 23:02 UTC (permalink / raw)
  To: ocfs2-devel

On 05/06/2011 02:27 AM, Tao Ma wrote:
> From: Tao Ma<boyu.mt@taobao.com>
>
> Add ocfs2_trim_fs to support trimming freed clusters in the
> volume. A range will be given and all the freed clusters greater
> than minlen will be discarded to the block layer.
>
> Signed-off-by: Tao Ma<boyu.mt@taobao.com>
> ---
>   fs/ocfs2/alloc.c |  156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   fs/ocfs2/alloc.h |    1 +
>   2 files changed, 157 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index 48aa9c7..93a3f92 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -29,6 +29,7 @@
>   #include<linux/highmem.h>
>   #include<linux/swap.h>
>   #include<linux/quotaops.h>
> +#include<linux/blkdev.h>
>
>   #include<cluster/masklog.h>
>
> @@ -7184,3 +7185,158 @@ out_commit:
>   out:
>   	return ret;
>   }
> +
> +static int ocfs2_trim_extent(struct super_block *sb,
> +			     struct ocfs2_group_desc *gd,
> +			     int start, int count)

u32 will be better for start and count.

> +{
> +	u64 discard;
> +
> +	count = ocfs2_clusters_to_blocks(sb, count);

ocfs2_clusters_to_blocks() returns u64.

> +	discard = le64_to_cpu(gd->bg_blkno) +
> +			ocfs2_clusters_to_blocks(sb, start);
> +
> +	return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);

> +}
> +
> +static int ocfs2_trim_group(struct super_block *sb,
> +			    struct ocfs2_group_desc *gd,
> +			    int start, int max, int minbits)
> +{
> +	int ret = 0, count = 0, next;
> +	void *bitmap = gd->bg_bitmap;
> +
> +	while (start<  max) {
> +		start = ocfs2_find_next_zero_bit(bitmap, max, start);
> +		if (start>= max)
> +			break;
> +		next = ocfs2_find_next_bit(bitmap, max, start);
> +
> +		if ((next - start)>= minbits) {
> +			ret = ocfs2_trim_extent(sb, gd,
> +						start, next - start);
> +			if (ret<  0) {
> +				mlog_errno(ret);
> +				break;
> +			}
> +			count += next - start;
> +		}
> +		start = next + 1;
> +
> +		if (fatal_signal_pending(current)) {
> +			count = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		if ((le16_to_cpu(gd->bg_free_bits_count) - count)<  minbits)
> +			break;

This check could also be done earlier.

> +	}
> +
> +	if (ret<  0)
> +		count = ret;
> +
> +	return count;
> +}
> +
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> +	struct ocfs2_super *osb = OCFS2_SB(sb);
> +	u64 start, len, minlen, trimmed, first_group, last_group, group;
> +	int ret, cnt, first_bit, last_bit;
> +	struct buffer_head *main_bm_bh = NULL;
> +	struct inode *main_bm_inode = NULL;
> +	struct buffer_head *gd_bh = NULL;
> +	struct ocfs2_dinode *main_bm;
> +	struct ocfs2_group_desc *gd = NULL;
> +
> +	start = range->start>>  osb->s_clustersize_bits;
> +	len = range->len>>  osb->s_clustersize_bits;
> +	minlen = range->minlen>>  osb->s_clustersize_bits;
> +	trimmed = 0;
> +
> +	if (!len) {
> +		range->len = 0;
> +		return 0;
> +	}
> +
> +	if (minlen>= osb->bitmap_cpg)
> +		return -EINVAL;
> +
> +	main_bm_inode = ocfs2_get_system_file_inode(osb,
> +						    GLOBAL_BITMAP_SYSTEM_INODE,
> +						    OCFS2_INVALID_SLOT);
> +	if (!main_bm_inode) {
> +		ret = -EIO;
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	mutex_lock(&main_bm_inode->i_mutex);
> +
> +	ret = ocfs2_inode_lock(main_bm_inode,&main_bm_bh, 0);
> +	if (ret<  0) {
> +		mlog_errno(ret);
> +		goto out_mutex;
> +	}
> +	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
> +
> +	if (start>= le32_to_cpu(main_bm->i_clusters)) {
> +		ret = -EINVAL;
> +		mlog_errno(ret);

User error. No need to log it.

> +		goto out_unlock;
> +	}
> +
> +	if (start + len>  le32_to_cpu(main_bm->i_clusters))
> +		len = le32_to_cpu(main_bm->i_clusters) - start;
> +
> +	/* Determine first and last group to examine based on start and len */
> +	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> +	if (first_group == osb->first_cluster_group_blkno)
> +		first_bit = start;
> +	else
> +		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> +	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> +	last_bit = osb->bitmap_cpg;
> +
> +	for (group = first_group; group<= last_group;) {
> +		if (first_bit + len>= osb->bitmap_cpg)
> +			last_bit = osb->bitmap_cpg;
> +		else
> +			last_bit = first_bit + len;
> +
> +		ret = ocfs2_read_group_descriptor(main_bm_inode,
> +						  main_bm, group,
> +						&gd_bh);
> +		if (ret<  0) {
> +			mlog_errno(ret);
> +			break;
> +		}
> +
> +		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
> +		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
> +		brelse(gd_bh);
> +		gd_bh = NULL;
> +		if (cnt<  0) {
> +			ret = cnt;
> +			mlog_errno(ret);
> +			break;
> +		}
> +
> +		trimmed += cnt;
> +		len -= osb->bitmap_cpg - first_bit;
> +		first_bit = 0;
> +		if (group == osb->first_cluster_group_blkno)
> +			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> +		else
> +			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> +	}
> +	range->len = trimmed * sb->s_blocksize;
> +out_unlock:
> +	ocfs2_inode_unlock(main_bm_inode, 0);
> +	brelse(main_bm_bh);
> +out_mutex:
> +	mutex_unlock(&main_bm_inode->i_mutex);
> +	iput(main_bm_inode);
> +out:
> +	return ret;
> +}
> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
> index 3bd08a0..ca381c5 100644
> --- a/fs/ocfs2/alloc.h
> +++ b/fs/ocfs2/alloc.h
> @@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
>   		    struct buffer_head **leaf_bh);
>   int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
>
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
>   /*
>    * Helper function to look at the # of clusters in an extent record.
>    */

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-05-09 23:02   ` Sunil Mushran
@ 2011-05-10  3:14     ` Tao Ma
  0 siblings, 0 replies; 15+ messages in thread
From: Tao Ma @ 2011-05-10  3:14 UTC (permalink / raw)
  To: ocfs2-devel

Hi Sunil,
	Thanks for the review.
On 05/10/2011 07:02 AM, Sunil Mushran wrote:
> On 05/06/2011 02:27 AM, Tao Ma wrote:
>> From: Tao Ma<boyu.mt@taobao.com>
>>
>> Add ocfs2_trim_fs to support trimming freed clusters in the
>> volume. A range will be given and all the freed clusters greater
>> than minlen will be discarded to the block layer.
>>
>> Signed-off-by: Tao Ma<boyu.mt@taobao.com>
>> ---
>>   fs/ocfs2/alloc.c |  156
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/ocfs2/alloc.h |    1 +
>>   2 files changed, 157 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index 48aa9c7..93a3f92 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -29,6 +29,7 @@
>>   #include<linux/highmem.h>
>>   #include<linux/swap.h>
>>   #include<linux/quotaops.h>
>> +#include<linux/blkdev.h>
>>
>>   #include<cluster/masklog.h>
>>
>> @@ -7184,3 +7185,158 @@ out_commit:
>>   out:
>>       return ret;
>>   }
>> +
>> +static int ocfs2_trim_extent(struct super_block *sb,
>> +                 struct ocfs2_group_desc *gd,
>> +                 int start, int count)
> 
> u32 will be better for start and count.
> 
>> +{
>> +    u64 discard;
>> +
>> +    count = ocfs2_clusters_to_blocks(sb, count);
> 
> ocfs2_clusters_to_blocks() returns u64.
Fine. Actually, my original thought was that both 'start' and 'count' stay
within the range of a single group, so they shouldn't get that large. But if
you prefer, I will change it.
> 
>> +    discard = le64_to_cpu(gd->bg_blkno) +
>> +            ocfs2_clusters_to_blocks(sb, start);
>> +
>> +    return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
> 
>> +}
>> +
>> +static int ocfs2_trim_group(struct super_block *sb,
>> +                struct ocfs2_group_desc *gd,
>> +                int start, int max, int minbits)
>> +{
>> +    int ret = 0, count = 0, next;
>> +    void *bitmap = gd->bg_bitmap;
>> +
>> +    while (start<  max) {
>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>> +        if (start>= max)
>> +            break;
>> +        next = ocfs2_find_next_bit(bitmap, max, start);
>> +
>> +        if ((next - start)>= minbits) {
>> +            ret = ocfs2_trim_extent(sb, gd,
>> +                        start, next - start);
>> +            if (ret<  0) {
>> +                mlog_errno(ret);
>> +                break;
>> +            }
>> +            count += next - start;
>> +        }
>> +        start = next + 1;
>> +
>> +        if (fatal_signal_pending(current)) {
>> +            count = -ERESTARTSYS;
>> +            break;
>> +        }
>> +
>> +        if ((le16_to_cpu(gd->bg_free_bits_count) - count)<  minbits)
>> +            break;
> 
> This check could also be done earlier.
oh, yes, I will add it. Thanks.
> 
>> +    }
>> +
>> +    if (ret<  0)
>> +        count = ret;
>> +
>> +    return count;
>> +}
>> +
>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>> +{
>> +    struct ocfs2_super *osb = OCFS2_SB(sb);
>> +    u64 start, len, minlen, trimmed, first_group, last_group, group;
>> +    int ret, cnt, first_bit, last_bit;
>> +    struct buffer_head *main_bm_bh = NULL;
>> +    struct inode *main_bm_inode = NULL;
>> +    struct buffer_head *gd_bh = NULL;
>> +    struct ocfs2_dinode *main_bm;
>> +    struct ocfs2_group_desc *gd = NULL;
>> +
>> +    start = range->start>>  osb->s_clustersize_bits;
>> +    len = range->len>>  osb->s_clustersize_bits;
>> +    minlen = range->minlen>>  osb->s_clustersize_bits;
>> +    trimmed = 0;
>> +
>> +    if (!len) {
>> +        range->len = 0;
>> +        return 0;
>> +    }
>> +
>> +    if (minlen>= osb->bitmap_cpg)
>> +        return -EINVAL;
>> +
>> +    main_bm_inode = ocfs2_get_system_file_inode(osb,
>> +                            GLOBAL_BITMAP_SYSTEM_INODE,
>> +                            OCFS2_INVALID_SLOT);
>> +    if (!main_bm_inode) {
>> +        ret = -EIO;
>> +        mlog_errno(ret);
>> +        goto out;
>> +    }
>> +
>> +    mutex_lock(&main_bm_inode->i_mutex);
>> +
>> +    ret = ocfs2_inode_lock(main_bm_inode,&main_bm_bh, 0);
>> +    if (ret<  0) {
>> +        mlog_errno(ret);
>> +        goto out_mutex;
>> +    }
>> +    main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>> +
>> +    if (start>= le32_to_cpu(main_bm->i_clusters)) {
>> +        ret = -EINVAL;
>> +        mlog_errno(ret);
> 
> User error. No need to log it.
yeah, thanks for the advice.

Regards,
Tao
> 
>> +        goto out_unlock;
>> +    }
>> +
>> +    if (start + len>  le32_to_cpu(main_bm->i_clusters))
>> +        len = le32_to_cpu(main_bm->i_clusters) - start;
>> +
>> +    /* Determine first and last group to examine based on start and
>> len */
>> +    first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>> +    if (first_group == osb->first_cluster_group_blkno)
>> +        first_bit = start;
>> +    else
>> +        first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>> +    last_group = ocfs2_which_cluster_group(main_bm_inode, start + len
>> - 1);
>> +    last_bit = osb->bitmap_cpg;
>> +
>> +    for (group = first_group; group<= last_group;) {
>> +        if (first_bit + len>= osb->bitmap_cpg)
>> +            last_bit = osb->bitmap_cpg;
>> +        else
>> +            last_bit = first_bit + len;
>> +
>> +        ret = ocfs2_read_group_descriptor(main_bm_inode,
>> +                          main_bm, group,
>> +                        &gd_bh);
>> +        if (ret<  0) {
>> +            mlog_errno(ret);
>> +            break;
>> +        }
>> +
>> +        gd = (struct ocfs2_group_desc *)gd_bh->b_data;
>> +        cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
>> +        brelse(gd_bh);
>> +        gd_bh = NULL;
>> +        if (cnt<  0) {
>> +            ret = cnt;
>> +            mlog_errno(ret);
>> +            break;
>> +        }
>> +
>> +        trimmed += cnt;
>> +        len -= osb->bitmap_cpg - first_bit;
>> +        first_bit = 0;
>> +        if (group == osb->first_cluster_group_blkno)
>> +            group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>> +        else
>> +            group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>> +    }
>> +    range->len = trimmed * sb->s_blocksize;
>> +out_unlock:
>> +    ocfs2_inode_unlock(main_bm_inode, 0);
>> +    brelse(main_bm_bh);
>> +out_mutex:
>> +    mutex_unlock(&main_bm_inode->i_mutex);
>> +    iput(main_bm_inode);
>> +out:
>> +    return ret;
>> +}
>> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
>> index 3bd08a0..ca381c5 100644
>> --- a/fs/ocfs2/alloc.h
>> +++ b/fs/ocfs2/alloc.h
>> @@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
>>               struct buffer_head **leaf_bh);
>>   int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32
>> v_cluster);
>>
>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
>>   /*
>>    * Helper function to look at the # of clusters in an extent record.
>>    */
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 2/3] ocfs2: Add FITRIM ioctl.
  2011-05-06  9:23 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
@ 2011-05-06  9:27 ` Tao Ma
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 3/3] ocfs2: Add trace event for trim Tao Ma
  2 siblings, 0 replies; 15+ messages in thread
From: Tao Ma @ 2011-05-06  9:27 UTC (permalink / raw)
  To: ocfs2-devel

From: Tao Ma <boyu.mt@taobao.com>

Add the corresponding ioctl function for FITRIM.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/ioctl.c |   24 ++++++++++++++++++++++++
 1 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c59..312a28f 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -542,6 +542,29 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			return -EFAULT;
 
 		return ocfs2_info_handle(inode, &info, 0);
+	case FITRIM:
+	{
+		struct super_block *sb = inode->i_sb;
+		struct fstrim_range range;
+		int ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&range, (struct fstrim_range *)arg,
+		    sizeof(range)))
+			return -EFAULT;
+
+		ret = ocfs2_trim_fs(sb, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user((struct fstrim_range *)arg, &range,
+		    sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
 	default:
 		return -ENOTTY;
 	}
@@ -569,6 +592,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_GROUP_EXTEND:
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
+	case FITRIM:
 		break;
 	case OCFS2_IOC_REFLINK:
 		if (copy_from_user(&args, (struct reflink_arguments *)arg,
-- 
1.6.3.GIT

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] ocfs2: Add trace event for trim.
  2011-05-06  9:23 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
  2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 2/3] ocfs2: Add FITRIM ioctl Tao Ma
@ 2011-05-06  9:27 ` Tao Ma
  2 siblings, 0 replies; 15+ messages in thread
From: Tao Ma @ 2011-05-06  9:27 UTC (permalink / raw)
  To: ocfs2-devel

From: Tao Ma <boyu.mt@taobao.com>

Add the corresponding trace event for trim.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/alloc.c       |    7 +++++++
 fs/ocfs2/ocfs2_trace.h |   25 +++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 93a3f92..12f1c33 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7196,6 +7196,8 @@ static int ocfs2_trim_extent(struct super_block *sb,
 	discard = le64_to_cpu(gd->bg_blkno) +
 			ocfs2_clusters_to_blocks(sb, start);
 
+	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, count);
+
 	return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
 }
 
@@ -7206,6 +7208,9 @@ static int ocfs2_trim_group(struct super_block *sb,
 	int ret = 0, count = 0, next;
 	void *bitmap = gd->bg_bitmap;
 
+	trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+			       start, max, minbits);
+
 	while (start < max) {
 		start = ocfs2_find_next_zero_bit(bitmap, max, start);
 		if (start >= max)
@@ -7289,6 +7294,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	if (start + len > le32_to_cpu(main_bm->i_clusters))
 		len = le32_to_cpu(main_bm->i_clusters) - start;
 
+	trace_ocfs2_trim_fs(start, len, minlen);
+
 	/* Determine first and last group to examine based on start and len */
 	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
 	if (first_group == osb->first_cluster_group_blkno)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5b..9ab22a1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
 		  __entry->blkno, __entry->bit)
 );
 
+TRACE_EVENT(ocfs2_trim_extent,
+	TP_PROTO(struct super_block *sb, unsigned long long blk,
+		 unsigned long long count),
+	TP_ARGS(sb, blk, count),
+	TP_STRUCT__entry(
+		__field(int, dev_major)
+		__field(int, dev_minor)
+		__field(unsigned long long, blk)
+		__field(__u64,	count)
+	),
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(sb->s_dev);
+		__entry->dev_minor = MINOR(sb->s_dev);
+		__entry->blk = blk;
+		__entry->count = count;
+	),
+	TP_printk("%d %d %llu %llu",
+		  __entry->dev_major, __entry->dev_minor,
+		  __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
 /* End of trace events for fs/ocfs2/alloc.c. */
 
 /* Trace events for fs/ocfs2/localalloc.c. */
-- 
1.6.3.GIT

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support
@ 2011-03-07 10:02 Tao Ma
  2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
  0 siblings, 1 reply; 15+ messages in thread
From: Tao Ma @ 2011-03-07 10:02 UTC (permalink / raw)
  To: ocfs2-devel

Hi all,
	This patch set adds batched discard support to ocfs2. Please check. Thanks.

Regards,
Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-07 10:02 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
@ 2011-03-07 10:05 ` Tao Ma
  2011-03-08  4:55   ` Tristan Ye
  0 siblings, 1 reply; 15+ messages in thread
From: Tao Ma @ 2011-03-07 10:05 UTC (permalink / raw)
  To: ocfs2-devel

From: Tao Ma <boyu.mt@taobao.com>

Add ocfs2_trim_fs to support trimming freed clusters in the
volume. A range will be given and all the freed clusters greater
than minlen will be discarded to the block layer.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/alloc.c |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h |    1 +
 2 files changed, 155 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b27a0d8..6e1b3b5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -7184,3 +7185,156 @@ out_commit:
 out:
 	return ret;
 }
+
+static int ocfs2_trim_extent(struct super_block *sb,
+			     struct ocfs2_group_desc *gd,
+			     int start, int count)
+{
+	u64 discard;
+
+	count = ocfs2_clusters_to_blocks(sb, count);
+	discard = le64_to_cpu(gd->bg_blkno) +
+			ocfs2_clusters_to_blocks(sb, start);
+
+	return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+			    struct ocfs2_group_desc *gd,
+			    int start, int max, int minbits)
+{
+	int ret = 0, count = 0, next;
+	void *bitmap = gd->bg_bitmap;
+
+	while (start < max) {
+		start = ocfs2_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = ocfs2_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minbits) {
+			ret = ocfs2_trim_extent(sb, gd,
+						start, next - start);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+			break;
+	}
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	u64 start, len, minlen, trimmed, first_group, last_group, group;
+	int ret, cnt, first_bit, last_bit;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_dinode *main_bm;
+	struct ocfs2_group_desc *gd = NULL;
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	start = range->start >> osb->s_clustersize_bits;
+	len = range->len >> osb->s_clustersize_bits;
+	minlen = range->minlen >> osb->s_clustersize_bits;
+	trimmed = 0;
+
+	if (!len || !minlen || minlen >= osb->bitmap_cpg)
+		return -EINVAL;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	if (start >= le32_to_cpu(main_bm->i_clusters)) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	if (start + len > le32_to_cpu(main_bm->i_clusters))
+		len = le32_to_cpu(main_bm->i_clusters) - start;
+
+	/* Determine first and last group to examine based on start and len */
+	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+	if (first_group == osb->first_cluster_group_blkno)
+		first_bit = start;
+	else
+		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+	last_bit = osb->bitmap_cpg;
+
+	for (group = first_group; group <= last_group;) {
+		if (first_bit + len >= osb->bitmap_cpg)
+			last_bit = osb->bitmap_cpg - first_bit;
+		else
+			last_bit = start + len;
+
+		ret = ocfs2_read_group_descriptor(main_bm_inode,
+						  main_bm, group,
+						  &gd_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+		brelse(gd_bh);
+		gd_bh = NULL;
+		if (cnt < 0) {
+			ret = cnt;
+			mlog_errno(ret);
+			break;
+		}
+
+		trimmed += cnt;
+		len -= osb->bitmap_cpg - first_bit;
+		first_bit = 0;
+		if (group == osb->first_cluster_group_blkno)
+			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+		else
+			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+	}
+	range->len = trimmed * sb->s_blocksize;
+out_unlock:
+	ocfs2_inode_unlock(main_bm_inode, 0);
+	brelse(main_bm_bh);
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+out:
+	return ret;
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a0..ca381c5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
 		    struct buffer_head **leaf_bh);
 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
 
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
 /*
  * Helper function to look at the # of clusters in an extent record.
  */
-- 
1.6.3.GIT

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
@ 2011-03-08  4:55   ` Tristan Ye
  2011-03-08  5:53     ` Tao Ma
  2011-03-08  7:53     ` Tao Ma
  0 siblings, 2 replies; 15+ messages in thread
From: Tristan Ye @ 2011-03-08  4:55 UTC (permalink / raw)
  To: ocfs2-devel

Hi Tao,

    Most of the code looks pretty neat to me; a few comments are inlined below:

Tao Ma wrote:
> From: Tao Ma <boyu.mt@taobao.com>
>
> Add ocfs2_trim_fs to support trimming freed clusters in the
> volume. A range will be given and all the freed clusters greater
> than minlen will be discarded to the block layer.
>
> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
> ---
>  fs/ocfs2/alloc.c |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ocfs2/alloc.h |    1 +
>  2 files changed, 155 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index b27a0d8..6e1b3b5 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -29,6 +29,7 @@
>  #include <linux/highmem.h>
>  #include <linux/swap.h>
>  #include <linux/quotaops.h>
> +#include <linux/blkdev.h>
>  
>  #include <cluster/masklog.h>
>  
> @@ -7184,3 +7185,156 @@ out_commit:
>  out:
>  	return ret;
>  }
> +
> +static int ocfs2_trim_extent(struct super_block *sb,
> +			     struct ocfs2_group_desc *gd,
> +			     int start, int count)
> +{
> +	u64 discard;
> +
> +	count = ocfs2_clusters_to_blocks(sb, count);
> +	discard = le64_to_cpu(gd->bg_blkno) +
> +			ocfs2_clusters_to_blocks(sb, start);
> +
> +	return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
> +}
> +
> +static int ocfs2_trim_group(struct super_block *sb,
> +			    struct ocfs2_group_desc *gd,
> +			    int start, int max, int minbits)
> +{
> +	int ret = 0, count = 0, next;
> +	void *bitmap = gd->bg_bitmap;
> +
> +	while (start < max) {
> +		start = ocfs2_find_next_zero_bit(bitmap, max, start);
> +		if (start >= max)
> +			break;

    /* What if the 'start' stands within a hole */

    if (ocfs2_test_bit(...)) {
       start = ocfs2_find_next_zero_bit(...);
       if ((start == -1) || (start >= max))
          break;
    }

> +		next = ocfs2_find_next_bit(bitmap, max, start);
   
    next = ocfs2_find_next_bit(...);
    if (next == -1)
       break;

    if (next > max)
       next = max;
   
> +
> +		if ((next - start) >= minbits) {
> +			ret = ocfs2_trim_extent(sb, gd,
> +						start, next - start);
> +			if (ret < 0) {
> +				mlog_errno(ret);
> +				break;
> +			}
> +			count += next - start;
> +		}
> +		start = next + 1;
> +
> +		if (fatal_signal_pending(current)) {
> +			count = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
> +			break;
> +	}
> +
> +	if (ret < 0)
> +		count = ret;
> +
> +	return count;
> +}
> +
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> +	struct ocfs2_super *osb = OCFS2_SB(sb);
> +	u64 start, len, minlen, trimmed, first_group, last_group, group;
    why not using u32 start, len, minlen, trimmed;
> +	int ret, cnt, first_bit, last_bit;
> +	struct buffer_head *main_bm_bh = NULL;
> +	struct inode *main_bm_inode = NULL;
> +	struct buffer_head *gd_bh = NULL;
> +	struct ocfs2_dinode *main_bm;
> +	struct ocfs2_group_desc *gd = NULL;
> +
> +	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> +		return -EROFS;
> +
> +	start = range->start >> osb->s_clustersize_bits;
> +	len = range->len >> osb->s_clustersize_bits;
> +	minlen = range->minlen >> osb->s_clustersize_bits;

    I guess you may want to count two corner clusters which cover the 
'start' and 'end' bytes,
so the appropriate way might be:

    start = range->start >> osb->s_clustersize_bits;
    len = ocfs2_clusters_for_bytes(osb->sb, range->start  + range->len);
    len -= start;
   
> +	trimmed = 0;
> +
> +	if (!len || !minlen || minlen >= osb->bitmap_cpg)
    'minlen == 0' looks acceptable; it would mean we allow discarding
extents of any size.
What's more, 'len == 0' may not be harmful enough to warrant an
'EINVAL'; returning a legal '0'
to userspace immediately is fine.


> +		return -EINVAL;
> +
> +	main_bm_inode = ocfs2_get_system_file_inode(osb,
> +						    GLOBAL_BITMAP_SYSTEM_INODE,
> +						    OCFS2_INVALID_SLOT);
> +	if (!main_bm_inode) {
> +		ret = -EIO;
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	mutex_lock(&main_bm_inode->i_mutex);
> +
> +	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
> +	if (ret < 0) {
> +		mlog_errno(ret);
> +		goto out_mutex;
> +	}
> +	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
> +
> +	if (start >= le32_to_cpu(main_bm->i_clusters)) {
> +		ret = -EINVAL;
> +		mlog_errno(ret);
> +		goto out_unlock;
> +	}
> +
> +	if (start + len > le32_to_cpu(main_bm->i_clusters))
> +		len = le32_to_cpu(main_bm->i_clusters) - start;
> +
> +	/* Determine first and last group to examine based on start and len */
> +	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> +	if (first_group == osb->first_cluster_group_blkno)
> +		first_bit = start;
> +	else
> +		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> +	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> +	last_bit = osb->bitmap_cpg;
> +
> +	for (group = first_group; group <= last_group;) {
> +		if (first_bit + len >= osb->bitmap_cpg)
> +			last_bit = osb->bitmap_cpg - first_bit;

    Do 'first_bit' and 'last_bit' both represent a local offset within a
cluster group?
Just wondering why last_bit isn't equal to 'osb->bitmap_cpg' in the above
case (I mean the case
of 'first_bit + len >= osb->bitmap_cpg').

> +		else
> +			last_bit = start + len;

    Why is the above case not 'last_bit = first_bit + len'?

> +
> +		ret = ocfs2_read_group_descriptor(main_bm_inode,
> +						  main_bm, group,
> +						  &gd_bh);
> +		if (ret < 0) {
> +			mlog_errno(ret);
> +			break;
> +		}
> +
> +		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
> +		cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
> +		brelse(gd_bh);
> +		gd_bh = NULL;
> +		if (cnt < 0) {
> +			ret = cnt;
> +			mlog_errno(ret);
> +			break;
> +		}
> +
> +		trimmed += cnt;
> +		len -= osb->bitmap_cpg - first_bit;
> +		first_bit = 0;
> +		if (group == osb->first_cluster_group_blkno)
> +			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> +		else
> +			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> +	}
> +	range->len = trimmed * sb->s_blocksize;
> +out_unlock:
> +	ocfs2_inode_unlock(main_bm_inode, 0);
> +	brelse(main_bm_bh);
> +out_mutex:
> +	mutex_unlock(&main_bm_inode->i_mutex);
> +	iput(main_bm_inode);
> +out:
> +	return ret;
> +}
> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
> index 3bd08a0..ca381c5 100644
> --- a/fs/ocfs2/alloc.h
> +++ b/fs/ocfs2/alloc.h
> @@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
>  		    struct buffer_head **leaf_bh);
>  int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
>  
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
>  /*
>   * Helper function to look at the # of clusters in an extent record.
>   */

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-08  4:55   ` Tristan Ye
@ 2011-03-08  5:53     ` Tao Ma
  2011-03-08  6:23       ` Tristan Ye
  2011-03-08  7:53     ` Tao Ma
  1 sibling, 1 reply; 15+ messages in thread
From: Tao Ma @ 2011-03-08  5:53 UTC (permalink / raw)
  To: ocfs2-devel

On 03/08/2011 12:55 PM, Tristan Ye wrote:
> Hi Tao,
> 
>    Most of codes looks pretty neat to me, few comments inlined below:
Thanks for the review.
> 
> Tao Ma wrote:
>> From: Tao Ma <boyu.mt@taobao.com>
>>
>> Add ocfs2_trim_fs to support trimming freed clusters in the
>> volume. A range will be given and all the freed clusters greater
>> than minlen will be discarded to the block layer.
>>
>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>> ---
>>  fs/ocfs2/alloc.c |  154
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/ocfs2/alloc.h |    1 +
>>  2 files changed, 155 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index b27a0d8..6e1b3b5 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -29,6 +29,7 @@
>>  #include <linux/highmem.h>
>>  #include <linux/swap.h>
>>  #include <linux/quotaops.h>
>> +#include <linux/blkdev.h>
>>  
>>  #include <cluster/masklog.h>
>>  
>> @@ -7184,3 +7185,156 @@ out_commit:
>>  out:
>>      return ret;
>>  }
>> +
>> +static int ocfs2_trim_extent(struct super_block *sb,
>> +                 struct ocfs2_group_desc *gd,
>> +                 int start, int count)
>> +{
>> +    u64 discard;
>> +
>> +    count = ocfs2_clusters_to_blocks(sb, count);
>> +    discard = le64_to_cpu(gd->bg_blkno) +
>> +            ocfs2_clusters_to_blocks(sb, start);
>> +
>> +    return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>> +}
>> +
>> +static int ocfs2_trim_group(struct super_block *sb,
>> +                struct ocfs2_group_desc *gd,
>> +                int start, int max, int minbits)
>> +{
>> +    int ret = 0, count = 0, next;
>> +    void *bitmap = gd->bg_bitmap;
>> +
>> +    while (start < max) {
>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>> +        if (start >= max)
>> +            break;
> 
>    /* What if the 'start' stands within a hole */
> 
>    if (ocfs2_test_bit(...)) {
>       start = ocfs2_find_next_zero_bit(...);
>       if ((start == -1) || (start >= max))
>          break;
>    }
> 
>> +        next = ocfs2_find_next_bit(bitmap, max, start);
>      next = ocfs2_find_next_bit(...);
>    if (next == -1)
>       break;
next will be set to "-1"? sorry, but where do you get it?
> 
>    if (next > max)
>       next = max;
Again, will ocfs2_find_next_bit return a value larger than 'max'? I am
afraid not. Otherwise, it would be pointless to pass a 'max' to it.
>  
>> +
>> +        if ((next - start) >= minbits) {
>> +            ret = ocfs2_trim_extent(sb, gd,
>> +                        start, next - start);
>> +            if (ret < 0) {
>> +                mlog_errno(ret);
>> +                break;
>> +            }
>> +            count += next - start;
>> +        }
>> +        start = next + 1;
>> +
>> +        if (fatal_signal_pending(current)) {
>> +            count = -ERESTARTSYS;
>> +            break;
>> +        }
>> +
>> +        if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
>> +            break;
>> +    }
>> +
>> +    if (ret < 0)
>> +        count = ret;
>> +
>> +    return count;
>> +}
>> +
>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>> +{
>> +    struct ocfs2_super *osb = OCFS2_SB(sb);
>> +    u64 start, len, minlen, trimmed, first_group, last_group, group;
>    why not using u32 start, len, minlen, trimmed;
We may use 64-bit clusters later, I guess. What's more, these values are
set by the user and may overflow. Say the user passes a u64
range->len; it will overflow with range->len >> osb->s_clustersize_bits.
>> +    int ret, cnt, first_bit, last_bit;
>> +    struct buffer_head *main_bm_bh = NULL;
>> +    struct inode *main_bm_inode = NULL;
>> +    struct buffer_head *gd_bh = NULL;
>> +    struct ocfs2_dinode *main_bm;
>> +    struct ocfs2_group_desc *gd = NULL;
>> +
>> +    if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
>> +        return -EROFS;
>> +
>> +    start = range->start >> osb->s_clustersize_bits;
>> +    len = range->len >> osb->s_clustersize_bits;
>> +    minlen = range->minlen >> osb->s_clustersize_bits;
> 
>    I guess you may want to count two corner clusters which cover the
> 'start' and 'end' bytes,
> so the appropriate way might be:
> 
>    start = range->start >> osb->s_clustersize_bits;
>    len = ocfs2_clusters_for_bytes(osb->sb, range->start  + range->len);
>    len -= start;
No, I don't want that.. Just want to make it the same as what ext4 did.
See ext4_trim_fs for more details.
>  
>> +    trimmed = 0;
>> +
>> +    if (!len || !minlen || minlen >= osb->bitmap_cpg)
>    'minlen == 0' looks acceptable, which means we allowing discarding
> for all size of extents.
> and what's more, 'len == 0' may not be harmful enough to issue a
> 'EINVAL', returning a legal '0'
> to userspace immediately is fine.
Fair enough. I will change it. Thanks.
> 
> 
>> +        return -EINVAL;
>> +
>> +    main_bm_inode = ocfs2_get_system_file_inode(osb,
>> +                            GLOBAL_BITMAP_SYSTEM_INODE,
>> +                            OCFS2_INVALID_SLOT);
>> +    if (!main_bm_inode) {
>> +        ret = -EIO;
>> +        mlog_errno(ret);
>> +        goto out;
>> +    }
>> +
>> +    mutex_lock(&main_bm_inode->i_mutex);
>> +
>> +    ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
>> +    if (ret < 0) {
>> +        mlog_errno(ret);
>> +        goto out_mutex;
>> +    }
>> +    main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>> +
>> +    if (start >= le32_to_cpu(main_bm->i_clusters)) {
>> +        ret = -EINVAL;
>> +        mlog_errno(ret);
>> +        goto out_unlock;
>> +    }
>> +
>> +    if (start + len > le32_to_cpu(main_bm->i_clusters))
>> +        len = le32_to_cpu(main_bm->i_clusters) - start;
>> +
>> +    /* Determine first and last group to examine based on start and
>> len */
>> +    first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>> +    if (first_group == osb->first_cluster_group_blkno)
>> +        first_bit = start;
>> +    else
>> +        first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>> +    last_group = ocfs2_which_cluster_group(main_bm_inode, start + len
>> - 1);
>> +    last_bit = osb->bitmap_cpg;
>> +
>> +    for (group = first_group; group <= last_group;) {
>> +        if (first_bit + len >= osb->bitmap_cpg)
>> +            last_bit = osb->bitmap_cpg - first_bit;
> 
>    is 'first_bit' and 'last_bit' both represent a local offset within a
> cluster group?
> just wondering why last_bit wasn't equal to 'osb->bitmap_cpg' in above
> case(I meant the case
> of 'first_bit + len >= osb->bitmap_cpg'
> 
>> +        else
>> +            last_bit = start + len;
> 
>    why above case is not 'last_bit = first_bit + len';
you are right.  Thanks.

Regards,
Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-08  5:53     ` Tao Ma
@ 2011-03-08  6:23       ` Tristan Ye
  2011-03-08  6:42         ` Tao Ma
  0 siblings, 1 reply; 15+ messages in thread
From: Tristan Ye @ 2011-03-08  6:23 UTC (permalink / raw)
  To: ocfs2-devel

Tao Ma wrote:
> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>> Hi Tao,
>>
>>    Most of codes looks pretty neat to me, few comments inlined below:
> Thanks for the review.
>> Tao Ma wrote:
>>> From: Tao Ma <boyu.mt@taobao.com>
>>>
>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>> volume. A range will be given and all the freed clusters greater
>>> than minlen will be discarded to the block layer.
>>>
>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>> ---
>>>  fs/ocfs2/alloc.c |  154
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  fs/ocfs2/alloc.h |    1 +
>>>  2 files changed, 155 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index b27a0d8..6e1b3b5 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -29,6 +29,7 @@
>>>  #include <linux/highmem.h>
>>>  #include <linux/swap.h>
>>>  #include <linux/quotaops.h>
>>> +#include <linux/blkdev.h>
>>>  
>>>  #include <cluster/masklog.h>
>>>  
>>> @@ -7184,3 +7185,156 @@ out_commit:
>>>  out:
>>>      return ret;
>>>  }
>>> +
>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>> +                 struct ocfs2_group_desc *gd,
>>> +                 int start, int count)
>>> +{
>>> +    u64 discard;
>>> +
>>> +    count = ocfs2_clusters_to_blocks(sb, count);
>>> +    discard = le64_to_cpu(gd->bg_blkno) +
>>> +            ocfs2_clusters_to_blocks(sb, start);
>>> +
>>> +    return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>> +}
>>> +
>>> +static int ocfs2_trim_group(struct super_block *sb,
>>> +                struct ocfs2_group_desc *gd,
>>> +                int start, int max, int minbits)
>>> +{
>>> +    int ret = 0, count = 0, next;
>>> +    void *bitmap = gd->bg_bitmap;
>>> +
>>> +    while (start < max) {
>>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>> +        if (start >= max)
>>> +            break;
>>    /* What if the 'start' stands within a hole */
>>
>>    if (ocfs2_test_bit(...)) {
>>       start = ocfs2_find_next_zero_bit(...);
>>       if ((start == -1) || (start >= max))
>>          break;
>>    }
>>
>>> +        next = ocfs2_find_next_bit(bitmap, max, start);
>>      next = ocfs2_find_next_bit(...);
>>    if (next == -1)
>>       break;
> next will be set to "-1"? sorry, but where do you get it?
>>    if (next > max)
>>       next = max;
> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.


Say we're handling the last group, and 'start + len' falls within a
hole; then 'max'
is 'first_bit + len', while the next non-zero bit we find may be
larger than 'max'. Isn't
that possible?



>>  
>>> +
>>> +        if ((next - start) >= minbits) {
>>> +            ret = ocfs2_trim_extent(sb, gd,
>>> +                        start, next - start);
>>> +            if (ret < 0) {
>>> +                mlog_errno(ret);
>>> +                break;
>>> +            }
>>> +            count += next - start;
>>> +        }
>>> +        start = next + 1;
>>> +
>>> +        if (fatal_signal_pending(current)) {
>>> +            count = -ERESTARTSYS;
>>> +            break;
>>> +        }
>>> +
>>> +        if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
>>> +            break;
>>> +    }
>>> +
>>> +    if (ret < 0)
>>> +        count = ret;
>>> +
>>> +    return count;
>>> +}
>>> +
>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>> +{
>>> +    struct ocfs2_super *osb = OCFS2_SB(sb);
>>> +    u64 start, len, minlen, trimmed, first_group, last_group, group;
>>    why not using u32 start, len, minlen, trimmed;
> we may use 64 bit clusters later I guess. And what's more, they will be
> set by the user later. and it may overflow. Say the user pass a u64
> range->len, it will overflow with range->len >> osb->s_clustersize_bits.

I just found that we use u32 for counting clusters all around the ocfs2
code, e.g. the truncate/punching_hole
code, which also takes a u64 byte_offset from userspace, so my original
intention was to keep things consistent ;-)

Overflow can theoretically happen anyway; however, it's not very likely
that a 16TB+ byte_offset is passed from userspace.

>>> +    int ret, cnt, first_bit, last_bit;
>>> +    struct buffer_head *main_bm_bh = NULL;
>>> +    struct inode *main_bm_inode = NULL;
>>> +    struct buffer_head *gd_bh = NULL;
>>> +    struct ocfs2_dinode *main_bm;
>>> +    struct ocfs2_group_desc *gd = NULL;
>>> +
>>> +    if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
>>> +        return -EROFS;
>>> +
>>> +    start = range->start >> osb->s_clustersize_bits;
>>> +    len = range->len >> osb->s_clustersize_bits;
>>> +    minlen = range->minlen >> osb->s_clustersize_bits;
>>    I guess you may want to count two corner clusters which cover the
>> 'start' and 'end' bytes,
>> so the appropriate way might be:
>>
>>    start = range->start >> osb->s_clustersize_bits;
>>    len = ocfs2_clusters_for_bytes(osb->sb, range->start  + range->len);
>>    len -= start;
> No, I don't want that.. Just want to make it the same as what ext4 did.
> See ext4_trim_fs for more details.

    All right;-)

>>  
>>> +    trimmed = 0;
>>> +
>>> +    if (!len || !minlen || minlen >= osb->bitmap_cpg)
>>    'minlen == 0' looks acceptable, which means we allowing discarding
>> for all size of extents.
>> and what's more, 'len == 0' may not be harmful enough to issue a
>> 'EINVAL', returning a legal '0'
>> to userspace immediately is fine.
> Fair enough. I will change it. Thanks.
>>
>>> +        return -EINVAL;
>>> +
>>> +    main_bm_inode = ocfs2_get_system_file_inode(osb,
>>> +                            GLOBAL_BITMAP_SYSTEM_INODE,
>>> +                            OCFS2_INVALID_SLOT);
>>> +    if (!main_bm_inode) {
>>> +        ret = -EIO;
>>> +        mlog_errno(ret);
>>> +        goto out;
>>> +    }
>>> +
>>> +    mutex_lock(&main_bm_inode->i_mutex);
>>> +
>>> +    ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
>>> +    if (ret < 0) {
>>> +        mlog_errno(ret);
>>> +        goto out_mutex;
>>> +    }
>>> +    main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>> +
>>> +    if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>> +        ret = -EINVAL;
>>> +        mlog_errno(ret);
>>> +        goto out_unlock;
>>> +    }
>>> +
>>> +    if (start + len > le32_to_cpu(main_bm->i_clusters))
>>> +        len = le32_to_cpu(main_bm->i_clusters) - start;
>>> +
>>> +    /* Determine first and last group to examine based on start and
>>> len */
>>> +    first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> +    if (first_group == osb->first_cluster_group_blkno)
>>> +        first_bit = start;
>>> +    else
>>> +        first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>>> +    last_group = ocfs2_which_cluster_group(main_bm_inode, start + len
>>> - 1);
>>> +    last_bit = osb->bitmap_cpg;
>>> +
>>> +    for (group = first_group; group <= last_group;) {
>>> +        if (first_bit + len >= osb->bitmap_cpg)
>>> +            last_bit = osb->bitmap_cpg - first_bit;
>>    is 'first_bit' and 'last_bit' both represent a local offset within a
>> cluster group?
>> just wondering why last_bit wasn't equal to 'osb->bitmap_cpg' in above
>> case(I meant the case
>> of 'first_bit + len >= osb->bitmap_cpg'
>>
>>> +        else
>>> +            last_bit = start + len;
>>    why above case is not 'last_bit = first_bit + len';
> you are right.  Thanks.
>
> Regards,
> Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-08  6:23       ` Tristan Ye
@ 2011-03-08  6:42         ` Tao Ma
  2011-03-08  6:53           ` Tristan Ye
  0 siblings, 1 reply; 15+ messages in thread
From: Tao Ma @ 2011-03-08  6:42 UTC (permalink / raw)
  To: ocfs2-devel

On 03/08/2011 02:23 PM, Tristan Ye wrote:
> Tao Ma wrote:
>> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>>> Hi Tao,
>>>
>>>    Most of codes looks pretty neat to me, few comments inlined below:
>> Thanks for the review.
>>> Tao Ma wrote:
>>>> From: Tao Ma <boyu.mt@taobao.com>
>>>>
>>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>>> volume. A range will be given and all the freed clusters greater
>>>> than minlen will be discarded to the block layer.
>>>>
>>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>>> ---
>>>>  fs/ocfs2/alloc.c |  154
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  fs/ocfs2/alloc.h |    1 +
>>>>  2 files changed, 155 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>> index b27a0d8..6e1b3b5 100644
>>>> --- a/fs/ocfs2/alloc.c
>>>> +++ b/fs/ocfs2/alloc.c
>>>> @@ -29,6 +29,7 @@
>>>>  #include <linux/highmem.h>
>>>>  #include <linux/swap.h>
>>>>  #include <linux/quotaops.h>
>>>> +#include <linux/blkdev.h>
>>>>  
>>>>  #include <cluster/masklog.h>
>>>>  
>>>> @@ -7184,3 +7185,156 @@ out_commit:
>>>>  out:
>>>>      return ret;
>>>>  }
>>>> +
>>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>>> +                 struct ocfs2_group_desc *gd,
>>>> +                 int start, int count)
>>>> +{
>>>> +    u64 discard;
>>>> +
>>>> +    count = ocfs2_clusters_to_blocks(sb, count);
>>>> +    discard = le64_to_cpu(gd->bg_blkno) +
>>>> +            ocfs2_clusters_to_blocks(sb, start);
>>>> +
>>>> +    return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>>> +}
>>>> +
>>>> +static int ocfs2_trim_group(struct super_block *sb,
>>>> +                struct ocfs2_group_desc *gd,
>>>> +                int start, int max, int minbits)
>>>> +{
>>>> +    int ret = 0, count = 0, next;
>>>> +    void *bitmap = gd->bg_bitmap;
>>>> +
>>>> +    while (start < max) {
>>>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>>> +        if (start >= max)
>>>> +            break;
>>>    /* What if the 'start' stands within a hole */
>>>
>>>    if (ocfs2_test_bit(...)) {
>>>       start = ocfs2_find_next_zero_bit(...);
>>>       if ((start == -1) || (start >= max))
>>>          break;
>>>    }
>>>
>>>> +        next = ocfs2_find_next_bit(bitmap, max, start);
>>>      next = ocfs2_find_next_bit(...);
>>>    if (next == -1)
>>>       break;
>> next will be set to "-1"? sorry, but where do you get it?
>>>    if (next > max)
>>>       next = max;
>> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
>> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
> 
> 
> Say we're handling the last group, and the 'start + len' was within a
> hole, then the 'max'
> is 'first_bit + len', while the next none-zero bit we found may be
> larger than 'max', isn't
> that possible?
ocfs2_find_next_bit (and ext2_find_next_bit) won't parse, check or
return a 'bit' after 'max'. Otherwise there would be a memory overrun
problem (you would read and check memory which isn't owned or handled by
you). So the same goes here. If it could return a value larger than 'max',
every caller would have to check for the overflow. That would be too painful.
>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>> +{
>>>> +    struct ocfs2_super *osb = OCFS2_SB(sb);
>>>> +    u64 start, len, minlen, trimmed, first_group, last_group, group;
>>>    why not using u32 start, len, minlen, trimmed;
>> we may use 64 bit clusters later I guess. And what's more, they will be
>> set by the user later. and it may overflow. Say the user pass a u64
>> range->len, it will overflow with range->len >> osb->s_clustersize_bits.
> 
> I just found we were using u32 for counting clusters all around ocfs2
> codes, e.g truncate/punching_hole
> codes, also passing an u64 byte_offset from userspace, so my original
> intention is to keep an unification;-)
> 
> Overflow can theoretically happen anyway, however, it's not very likely
> to pass a 16TB+ byte_offset from userspace.
I am afraid it is very likely. Say you want to trim all the clusters
within the volume: how would you set 'range->len'? Would you first run fdisk
to get the volume size and then set it accordingly?
Most people will set it to ULLONG_MAX and let the file system handle it.
This is not just my personal view; please check this article:
http://lwn.net/Articles/417809/
Jonathan also suggests setting len to ULLONG_MAX so that you can trim the
whole volume.

Regards,
Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-08  6:42         ` Tao Ma
@ 2011-03-08  6:53           ` Tristan Ye
  2011-03-08  7:47             ` Tao Ma
  0 siblings, 1 reply; 15+ messages in thread
From: Tristan Ye @ 2011-03-08  6:53 UTC (permalink / raw)
  To: ocfs2-devel

Tao Ma wrote:
> On 03/08/2011 02:23 PM, Tristan Ye wrote:
>> Tao Ma wrote:
>>> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>>>> Hi Tao,
>>>>
>>>>    Most of codes looks pretty neat to me, few comments inlined below:
>>> Thanks for the review.
>>>> Tao Ma wrote:
>>>>> From: Tao Ma <boyu.mt@taobao.com>
>>>>>
>>>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>>>> volume. A range will be given and all the freed clusters greater
>>>>> than minlen will be discarded to the block layer.
>>>>>
>>>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>>>> ---
>>>>>  fs/ocfs2/alloc.c |  154
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  fs/ocfs2/alloc.h |    1 +
>>>>>  2 files changed, 155 insertions(+), 0 deletions(-)
>>>>>
>>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>>> index b27a0d8..6e1b3b5 100644
>>>>> --- a/fs/ocfs2/alloc.c
>>>>> +++ b/fs/ocfs2/alloc.c
>>>>> @@ -29,6 +29,7 @@
>>>>>  #include <linux/highmem.h>
>>>>>  #include <linux/swap.h>
>>>>>  #include <linux/quotaops.h>
>>>>> +#include <linux/blkdev.h>
>>>>>  
>>>>>  #include <cluster/masklog.h>
>>>>>  
>>>>> @@ -7184,3 +7185,156 @@ out_commit:
>>>>>  out:
>>>>>      return ret;
>>>>>  }
>>>>> +
>>>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>>>> +                 struct ocfs2_group_desc *gd,
>>>>> +                 int start, int count)
>>>>> +{
>>>>> +    u64 discard;
>>>>> +
>>>>> +    count = ocfs2_clusters_to_blocks(sb, count);
>>>>> +    discard = le64_to_cpu(gd->bg_blkno) +
>>>>> +            ocfs2_clusters_to_blocks(sb, start);
>>>>> +
>>>>> +    return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>>>> +}
>>>>> +
>>>>> +static int ocfs2_trim_group(struct super_block *sb,
>>>>> +                struct ocfs2_group_desc *gd,
>>>>> +                int start, int max, int minbits)
>>>>> +{
>>>>> +    int ret = 0, count = 0, next;
>>>>> +    void *bitmap = gd->bg_bitmap;
>>>>> +
>>>>> +    while (start < max) {
>>>>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>>>> +        if (start >= max)
>>>>> +            break;
>>>>    /* What if the 'start' stands within a hole */
>>>>
>>>>    if (ocfs2_test_bit(...)) {
>>>>       start = ocfs2_find_next_zero_bit(...);
>>>>       if ((start == -1) || (start >= max))
>>>>          break;
>>>>    }
>>>>
>>>>> +        next = ocfs2_find_next_bit(bitmap, max, start);
>>>>      next = ocfs2_find_next_bit(...);
>>>>    if (next == -1)
>>>>       break;
>>> next will be set to "-1"? sorry, but where do you get it?
>>>>    if (next > max)
>>>>       next = max;
>>> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
>>> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
>>
>> Say we're handling the last group, and the 'start + len' was within a
>> hole, then the 'max'
>> is 'first_bit + len', while the next none-zero bit we found may be
>> larger than 'max', isn't
>> that possible?
> ocfs2_find_next_bit(and ext2_find_next_bit) won't parse, check and
> return 'bit' after 'max'. otherwise there should be a problem of memory
> overflow(you read and check some memory which isn't owned and handled by
> you). So the same goes here. If it can return a value larger than 'max',
> every caller will have to check the overflow. That would be too painful.

  Oh, you may have misunderstood my words. The 'max' you pass to
ocfs2_find_next_bit()
may not be the ending edge of the cluster group (bitmap); it may be the
end of what the user specified
for TRIMing. Therefore the 'next' bit (the ending edge of a wanted hole) you
get from ocfs2_find_next_bit()
might be larger than 'max'. Is that possible?

>>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>>> +{
>>>>> +    struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>> +    u64 start, len, minlen, trimmed, first_group, last_group, group;
>>>>    why not using u32 start, len, minlen, trimmed;
>>> we may use 64 bit clusters later I guess. And what's more, they will be
>>> set by the user later. and it may overflow. Say the user pass a u64
>>> range->len, it will overflow with range->len >> osb->s_clustersize_bits.
>> I just found we were using u32 for counting clusters all around ocfs2
>> codes, e.g truncate/punching_hole
>> codes, also passing an u64 byte_offset from userspace, so my original
>> intention is to keep an unification;-)
>>
>> Overflow can theoretically happen anyway, however, it's not very likely
>> to pass a 16TB+ byte_offset from userspace.
> I am afraid it is very likely. So say you want to trim all the clusters
> within the volume, how could you set 'range->len'? Will you first fdisk
> to get the volume size and then set it accordingly?
> Most guys will set it to ULLONG_MAX and let the file system handles it.
> This is not my personal view, please check this article:
> http://lwn.net/Articles/417809/
> Jonathan also suggests to set len to ULLONG_MAX so that you can trim the
> whole volume.

    Nice self-defense ;-). How about the overflow risk in the
truncate/punching-hole
code, where u32 is used for cluster counting?



>
> Regards,
> Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-08  6:53           ` Tristan Ye
@ 2011-03-08  7:47             ` Tao Ma
  0 siblings, 0 replies; 15+ messages in thread
From: Tao Ma @ 2011-03-08  7:47 UTC (permalink / raw)
  To: ocfs2-devel

On 03/08/2011 02:53 PM, Tristan Ye wrote:
> Tao Ma wrote:
>> On 03/08/2011 02:23 PM, Tristan Ye wrote:
>>> Tao Ma wrote:
>>>> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>>>>> Hi Tao,
>>>>>
>>>>>    Most of codes looks pretty neat to me, few comments inlined below:
>>>> Thanks for the review.
>>>>> Tao Ma wrote:
>>>>>> From: Tao Ma <boyu.mt@taobao.com>
>>>>>>
>>>>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>>>>> volume. A range will be given and all the freed clusters greater
>>>>>> than minlen will be discarded to the block layer.
>>>>>>
>>>>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>>>>> ---
>>>>>>  fs/ocfs2/alloc.c |  154
>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  fs/ocfs2/alloc.h |    1 +
>>>>>>  2 files changed, 155 insertions(+), 0 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>>>> index b27a0d8..6e1b3b5 100644
>>>>>> --- a/fs/ocfs2/alloc.c
>>>>>> +++ b/fs/ocfs2/alloc.c
>>>>>> @@ -29,6 +29,7 @@
>>>>>>  #include <linux/highmem.h>
>>>>>>  #include <linux/swap.h>
>>>>>>  #include <linux/quotaops.h>
>>>>>> +#include <linux/blkdev.h>
>>>>>>  
>>>>>>  #include <cluster/masklog.h>
>>>>>>  
>>>>>> @@ -7184,3 +7185,156 @@ out_commit:
>>>>>>  out:
>>>>>>      return ret;
>>>>>>  }
>>>>>> +
>>>>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>>>>> +                 struct ocfs2_group_desc *gd,
>>>>>> +                 int start, int count)
>>>>>> +{
>>>>>> +    u64 discard;
>>>>>> +
>>>>>> +    count = ocfs2_clusters_to_blocks(sb, count);
>>>>>> +    discard = le64_to_cpu(gd->bg_blkno) +
>>>>>> +            ocfs2_clusters_to_blocks(sb, start);
>>>>>> +
>>>>>> +    return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>>>>> +}
>>>>>> +
>>>>>> +static int ocfs2_trim_group(struct super_block *sb,
>>>>>> +                struct ocfs2_group_desc *gd,
>>>>>> +                int start, int max, int minbits)
>>>>>> +{
>>>>>> +    int ret = 0, count = 0, next;
>>>>>> +    void *bitmap = gd->bg_bitmap;
>>>>>> +
>>>>>> +    while (start < max) {
>>>>>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>>>>> +        if (start >= max)
>>>>>> +            break;
>>>>>    /* What if the 'start' stands within a hole */
>>>>>
>>>>>    if (ocfs2_test_bit(...)) {
>>>>>       start = ocfs2_find_next_zero_bit(...);
>>>>>       if ((start == -1) || (start >= max))
>>>>>          break;
>>>>>    }
>>>>>
>>>>>> +        next = ocfs2_find_next_bit(bitmap, max, start);
>>>>>      next = ocfs2_find_next_bit(...);
>>>>>    if (next == -1)
>>>>>       break;
>>>> next will be set to "-1"? sorry, but where do you get it?
>>>>>    if (next > max)
>>>>>       next = max;
>>>> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
>>>> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
>>>
>>> Say we're handling the last group, and the 'start + len' was within a
>>> hole, then the 'max'
>>> is 'first_bit + len', while the next none-zero bit we found may be
>>> larger than 'max', isn't
>>> that possible?
>> ocfs2_find_next_bit(and ext2_find_next_bit) won't parse, check and
>> return 'bit' after 'max'. otherwise there should be a problem of memory
>> overflow(you read and check some memory which isn't owned and handled by
>> you). So the same goes here. If it can return a value larger than 'max',
>> every caller will have to check the overflow. That would be too painful.
> 
>  Oh, you may misunderstood my words, the 'max' you passed to
> ocfs2_find_next_bit()
> may not be the ending-edge of the cluster group(bitmap), it may be the
> end of what user specified
> for TRIMing, therefore the 'next'(ending-edge for a wanted hole) bit you
> found from ocfs2_find_next_bit()
> might be larger than 'max', is that possible?
Please note that ocfs2_find_next_bit knows nothing about what 'max'
means. So no matter whether it is the end of the cluster group or just the
middle of a bitmap, it wouldn't return values after 'max', I think.
> 
>>>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range
>>>>>> *range)
>>>>>> +{
>>>>>> +    struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>>> +    u64 start, len, minlen, trimmed, first_group, last_group, group;
>>>>>    why not using u32 start, len, minlen, trimmed;
>>>> we may use 64 bit clusters later I guess. And what's more, they will be
>>>> set by the user later. and it may overflow. Say the user pass a u64
>>>> range->len, it will overflow with range->len >>
>>>> osb->s_clustersize_bits.
>>> I just found we were using u32 for counting clusters all around ocfs2
>>> codes, e.g truncate/punching_hole
>>> codes, also passing an u64 byte_offset from userspace, so my original
>>> intention is to keep an unification;-)
>>>
>>> Overflow can theoretically happen anyway, however, it's not very likely
>>> to pass a 16TB+ byte_offset from userspace.
>> I am afraid it is very likely. So say you want to trim all the clusters
>> within the volume, how could you set 'range->len'? Will you first fdisk
>> to get the volume size and then set it accordingly?
>> Most guys will set it to ULLONG_MAX and let the file system handles it.
>> This is not my personal view, please check this article:
>> http://lwn.net/Articles/417809/
>> Jonathan also suggests to set len to ULLONG_MAX so that you can trim the
>> whole volume.
> 
>    Nice self-defense;-), how about the overflow risk in
> truncate/punching-hole
> codes, where u32 were being used for cluster counting.
yeah, you can try and fix it.

Regards,
Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-08  4:55   ` Tristan Ye
  2011-03-08  5:53     ` Tao Ma
@ 2011-03-08  7:53     ` Tao Ma
  2011-03-08  7:59       ` Tristan Ye
  1 sibling, 1 reply; 15+ messages in thread
From: Tao Ma @ 2011-03-08  7:53 UTC (permalink / raw)
  To: ocfs2-devel

On 03/08/2011 12:55 PM, Tristan Ye wrote:
> Hi Tao,
> 
>    Most of codes looks pretty neat to me, few comments inlined below:
> 
> Tao Ma wrote:
>> From: Tao Ma <boyu.mt@taobao.com>
>>
>> Add ocfs2_trim_fs to support trimming freed clusters in the
>> volume. A range will be given and all the freed clusters greater
>> than minlen will be discarded to the block layer.
>>
>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>> ---
>>  fs/ocfs2/alloc.c |  154
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/ocfs2/alloc.h |    1 +
>>  2 files changed, 155 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index b27a0d8..6e1b3b5 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
<snip>
>> +static int ocfs2_trim_group(struct super_block *sb,
>> +                struct ocfs2_group_desc *gd,
>> +                int start, int max, int minbits)
>> +{
>> +    int ret = 0, count = 0, next;
>> +    void *bitmap = gd->bg_bitmap;
>> +
>> +    while (start < max) {
>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>> +        if (start >= max)
>> +            break;
> 
>    /* What if the 'start' stands within a hole */
> 
>    if (ocfs2_test_bit(...)) {
>       start = ocfs2_find_next_zero_bit(...);
>       if ((start == -1) || (start >= max))
>          break;
>    }
I just noticed that I forgot to respond to this. What do you define as a
hole? It is within the global bitmap, so each bit is either freed or
allocated. I don't get your meaning of 'hole'. ocfs2_find_next_zero_bit
will do what we expect.

Regards,
Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
  2011-03-08  7:53     ` Tao Ma
@ 2011-03-08  7:59       ` Tristan Ye
  0 siblings, 0 replies; 15+ messages in thread
From: Tristan Ye @ 2011-03-08  7:59 UTC (permalink / raw)
  To: ocfs2-devel

Tao Ma wrote:
> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>> Hi Tao,
>>
>>    Most of codes looks pretty neat to me, few comments inlined below:
>>
>> Tao Ma wrote:
>>> From: Tao Ma <boyu.mt@taobao.com>
>>>
>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>> volume. A range will be given and all the freed clusters greater
>>> than minlen will be discarded to the block layer.
>>>
>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>> ---
>>>  fs/ocfs2/alloc.c |  154
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  fs/ocfs2/alloc.h |    1 +
>>>  2 files changed, 155 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index b27a0d8..6e1b3b5 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
> <snip>
>>> +static int ocfs2_trim_group(struct super_block *sb,
>>> +                struct ocfs2_group_desc *gd,
>>> +                int start, int max, int minbits)
>>> +{
>>> +    int ret = 0, count = 0, next;
>>> +    void *bitmap = gd->bg_bitmap;
>>> +
>>> +    while (start < max) {
>>> +        start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>> +        if (start >= max)
>>> +            break;
>>    /* What if the 'start' stands within a hole */
>>
>>    if (ocfs2_test_bit(...)) {
>>       start = ocfs2_find_next_zero_bit(...);
>>       if ((start == -1) || (start >= max))
>>          break;
>>    }
> I just noticed that I forget to response to this. what do you define a
> hole? It is within the global bitmap, so it is either freed or
> allocated. I don't get your meaning of 'hole'. ocfs2_find_next_zero_bit
> will do as we expected.

	Here a hole means an area/range of contiguous '0' bits in the
bitmap ;-)


> 
> Regards,
> Tao

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2011-05-10  3:14 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-06  9:23 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
2011-05-09 23:02   ` Sunil Mushran
2011-05-10  3:14     ` Tao Ma
2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 2/3] ocfs2: Add FITRIM ioctl Tao Ma
2011-05-06  9:27 ` [Ocfs2-devel] [PATCH 3/3] ocfs2: Add trace event for trim Tao Ma
  -- strict thread matches above, loose matches on Subject: below --
2011-03-07 10:02 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
2011-03-08  4:55   ` Tristan Ye
2011-03-08  5:53     ` Tao Ma
2011-03-08  6:23       ` Tristan Ye
2011-03-08  6:42         ` Tao Ma
2011-03-08  6:53           ` Tristan Ye
2011-03-08  7:47             ` Tao Ma
2011-03-08  7:53     ` Tao Ma
2011-03-08  7:59       ` Tristan Ye

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).