* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-07 10:02 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
@ 2011-03-07 10:05 ` Tao Ma
2011-03-08 4:55 ` Tristan Ye
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 2/3] ocfs2: Add FITRIM ioctl Tao Ma
` (2 subsequent siblings)
3 siblings, 1 reply; 16+ messages in thread
From: Tao Ma @ 2011-03-07 10:05 UTC (permalink / raw)
To: ocfs2-devel
From: Tao Ma <boyu.mt@taobao.com>
Add ocfs2_trim_fs to support trimming freed clusters in the
volume. A range will be given and all the freed clusters greater
than minlen will be discarded to the block layer.
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
fs/ocfs2/alloc.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/alloc.h | 1 +
2 files changed, 155 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b27a0d8..6e1b3b5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include <cluster/masklog.h>
@@ -7184,3 +7185,156 @@ out_commit:
out:
return ret;
}
+
+static int ocfs2_trim_extent(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ int start, int count)
+{
+ u64 discard;
+
+ count = ocfs2_clusters_to_blocks(sb, count);
+ discard = le64_to_cpu(gd->bg_blkno) +
+ ocfs2_clusters_to_blocks(sb, start);
+
+ return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ int start, int max, int minbits)
+{
+ int ret = 0, count = 0, next;
+ void *bitmap = gd->bg_bitmap;
+
+ while (start < max) {
+ start = ocfs2_find_next_zero_bit(bitmap, max, start);
+ if (start >= max)
+ break;
+ next = ocfs2_find_next_bit(bitmap, max, start);
+
+ if ((next - start) >= minbits) {
+ ret = ocfs2_trim_extent(sb, gd,
+ start, next - start);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ count += next - start;
+ }
+ start = next + 1;
+
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+ break;
+ }
+
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 start, len, minlen, trimmed, first_group, last_group, group;
+ int ret, cnt, first_bit, last_bit;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_dinode *main_bm;
+ struct ocfs2_group_desc *gd = NULL;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ start = range->start >> osb->s_clustersize_bits;
+ len = range->len >> osb->s_clustersize_bits;
+ minlen = range->minlen >> osb->s_clustersize_bits;
+ trimmed = 0;
+
+ if (!len || !minlen || minlen >= osb->bitmap_cpg)
+ return -EINVAL;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+ main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
+
+ /* Determine first and last group to examine based on start and len */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+ last_bit = osb->bitmap_cpg;
+
+ for (group = first_group; group <= last_group;) {
+ if (first_bit + len >= osb->bitmap_cpg)
+ last_bit = osb->bitmap_cpg - first_bit;
+ else
+ last_bit = start + len;
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode,
+ main_bm, group,
+ &gd_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ brelse(gd_bh);
+ gd_bh = NULL;
+ if (cnt < 0) {
+ ret = cnt;
+ mlog_errno(ret);
+ break;
+ }
+
+ trimmed += cnt;
+ len -= osb->bitmap_cpg - first_bit;
+ first_bit = 0;
+ if (group == osb->first_cluster_group_blkno)
+ group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ else
+ group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ }
+ range->len = trimmed * sb->s_blocksize;
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 0);
+ brelse(main_bm_bh);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
+ return ret;
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a0..ca381c5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
--
1.6.3.GIT
^ permalink raw reply related [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
@ 2011-03-08 4:55 ` Tristan Ye
2011-03-08 5:53 ` Tao Ma
2011-03-08 7:53 ` Tao Ma
0 siblings, 2 replies; 16+ messages in thread
From: Tristan Ye @ 2011-03-08 4:55 UTC (permalink / raw)
To: ocfs2-devel
Hi Tao,
Most of codes looks pretty neat to me, few comments inlined below:
Tao Ma wrote:
> From: Tao Ma <boyu.mt@taobao.com>
>
> Add ocfs2_trim_fs to support trimming freed clusters in the
> volume. A range will be given and all the freed clusters greater
> than minlen will be discarded to the block layer.
>
> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
> ---
> fs/ocfs2/alloc.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/alloc.h | 1 +
> 2 files changed, 155 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index b27a0d8..6e1b3b5 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -29,6 +29,7 @@
> #include <linux/highmem.h>
> #include <linux/swap.h>
> #include <linux/quotaops.h>
> +#include <linux/blkdev.h>
>
> #include <cluster/masklog.h>
>
> @@ -7184,3 +7185,156 @@ out_commit:
> out:
> return ret;
> }
> +
> +static int ocfs2_trim_extent(struct super_block *sb,
> + struct ocfs2_group_desc *gd,
> + int start, int count)
> +{
> + u64 discard;
> +
> + count = ocfs2_clusters_to_blocks(sb, count);
> + discard = le64_to_cpu(gd->bg_blkno) +
> + ocfs2_clusters_to_blocks(sb, start);
> +
> + return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
> +}
> +
> +static int ocfs2_trim_group(struct super_block *sb,
> + struct ocfs2_group_desc *gd,
> + int start, int max, int minbits)
> +{
> + int ret = 0, count = 0, next;
> + void *bitmap = gd->bg_bitmap;
> +
> + while (start < max) {
> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
> + if (start >= max)
> + break;
/* What if the 'start' stands within a hole */
if (ocfs2_test_bit(...)) {
start = ocfs2_find_next_zero_bit(...);
if ((start == -1) || (start >= max))
break;
}
> + next = ocfs2_find_next_bit(bitmap, max, start);
next = ocfs2_find_next_bit(...);
if (next == -1)
break;
if (next > max)
next = max;
> +
> + if ((next - start) >= minbits) {
> + ret = ocfs2_trim_extent(sb, gd,
> + start, next - start);
> + if (ret < 0) {
> + mlog_errno(ret);
> + break;
> + }
> + count += next - start;
> + }
> + start = next + 1;
> +
> + if (fatal_signal_pending(current)) {
> + count = -ERESTARTSYS;
> + break;
> + }
> +
> + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
> + break;
> + }
> +
> + if (ret < 0)
> + count = ret;
> +
> + return count;
> +}
> +
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> + struct ocfs2_super *osb = OCFS2_SB(sb);
> + u64 start, len, minlen, trimmed, first_group, last_group, group;
why not using u32 start, len, minlen, trimmed;
> + int ret, cnt, first_bit, last_bit;
> + struct buffer_head *main_bm_bh = NULL;
> + struct inode *main_bm_inode = NULL;
> + struct buffer_head *gd_bh = NULL;
> + struct ocfs2_dinode *main_bm;
> + struct ocfs2_group_desc *gd = NULL;
> +
> + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> + return -EROFS;
> +
> + start = range->start >> osb->s_clustersize_bits;
> + len = range->len >> osb->s_clustersize_bits;
> + minlen = range->minlen >> osb->s_clustersize_bits;
I guess you may want to count two corner clusters which cover the
'start' and 'end' bytes,
so the appropriate way might be:
start = range->start >> osb->s_clustersize_bits;
len = ocfs2_clusters_for_bytes(osb->sb, range->start + range->len);
len -= start;
> + trimmed = 0;
> +
> + if (!len || !minlen || minlen >= osb->bitmap_cpg)
'minlen == 0' looks acceptable, which means we allowing discarding
for all size of extents.
and what's more, 'len == 0' may not be harmful enough to issue a
'EINVAL', returning a legal '0'
to userspace immediately is fine.
> + return -EINVAL;
> +
> + main_bm_inode = ocfs2_get_system_file_inode(osb,
> + GLOBAL_BITMAP_SYSTEM_INODE,
> + OCFS2_INVALID_SLOT);
> + if (!main_bm_inode) {
> + ret = -EIO;
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + mutex_lock(&main_bm_inode->i_mutex);
> +
> + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
> + if (ret < 0) {
> + mlog_errno(ret);
> + goto out_mutex;
> + }
> + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
> +
> + if (start >= le32_to_cpu(main_bm->i_clusters)) {
> + ret = -EINVAL;
> + mlog_errno(ret);
> + goto out_unlock;
> + }
> +
> + if (start + len > le32_to_cpu(main_bm->i_clusters))
> + len = le32_to_cpu(main_bm->i_clusters) - start;
> +
> + /* Determine first and last group to examine based on start and len */
> + first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> + if (first_group == osb->first_cluster_group_blkno)
> + first_bit = start;
> + else
> + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> + last_bit = osb->bitmap_cpg;
> +
> + for (group = first_group; group <= last_group;) {
> + if (first_bit + len >= osb->bitmap_cpg)
> + last_bit = osb->bitmap_cpg - first_bit;
is 'first_bit' and 'last_bit' both represent a local offset within a
cluster group?
just wondering why last_bit wasn't equal to 'osb->bitmap_cpg' in above
case(I meant the case
of 'first_bit + len >= osb->bitmap_cpg'
> + else
> + last_bit = start + len;
why above case is not 'last_bit = first_bit + len';
> +
> + ret = ocfs2_read_group_descriptor(main_bm_inode,
> + main_bm, group,
> + &gd_bh);
> + if (ret < 0) {
> + mlog_errno(ret);
> + break;
> + }
> +
> + gd = (struct ocfs2_group_desc *)gd_bh->b_data;
> + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
> + brelse(gd_bh);
> + gd_bh = NULL;
> + if (cnt < 0) {
> + ret = cnt;
> + mlog_errno(ret);
> + break;
> + }
> +
> + trimmed += cnt;
> + len -= osb->bitmap_cpg - first_bit;
> + first_bit = 0;
> + if (group == osb->first_cluster_group_blkno)
> + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> + else
> + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> + }
> + range->len = trimmed * sb->s_blocksize;
> +out_unlock:
> + ocfs2_inode_unlock(main_bm_inode, 0);
> + brelse(main_bm_bh);
> +out_mutex:
> + mutex_unlock(&main_bm_inode->i_mutex);
> + iput(main_bm_inode);
> +out:
> + return ret;
> +}
> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
> index 3bd08a0..ca381c5 100644
> --- a/fs/ocfs2/alloc.h
> +++ b/fs/ocfs2/alloc.h
> @@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
> struct buffer_head **leaf_bh);
> int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
>
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
> /*
> * Helper function to look at the # of clusters in an extent record.
> */
^ permalink raw reply [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-08 4:55 ` Tristan Ye
@ 2011-03-08 5:53 ` Tao Ma
2011-03-08 6:23 ` Tristan Ye
2011-03-08 7:53 ` Tao Ma
1 sibling, 1 reply; 16+ messages in thread
From: Tao Ma @ 2011-03-08 5:53 UTC (permalink / raw)
To: ocfs2-devel
On 03/08/2011 12:55 PM, Tristan Ye wrote:
> Hi Tao,
>
> Most of codes looks pretty neat to me, few comments inlined below:
Thanks for the review.
>
> Tao Ma wrote:
>> From: Tao Ma <boyu.mt@taobao.com>
>>
>> Add ocfs2_trim_fs to support trimming freed clusters in the
>> volume. A range will be given and all the freed clusters greater
>> than minlen will be discarded to the block layer.
>>
>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>> ---
>> fs/ocfs2/alloc.c | 154
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/ocfs2/alloc.h | 1 +
>> 2 files changed, 155 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index b27a0d8..6e1b3b5 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -29,6 +29,7 @@
>> #include <linux/highmem.h>
>> #include <linux/swap.h>
>> #include <linux/quotaops.h>
>> +#include <linux/blkdev.h>
>>
>> #include <cluster/masklog.h>
>>
>> @@ -7184,3 +7185,156 @@ out_commit:
>> out:
>> return ret;
>> }
>> +
>> +static int ocfs2_trim_extent(struct super_block *sb,
>> + struct ocfs2_group_desc *gd,
>> + int start, int count)
>> +{
>> + u64 discard;
>> +
>> + count = ocfs2_clusters_to_blocks(sb, count);
>> + discard = le64_to_cpu(gd->bg_blkno) +
>> + ocfs2_clusters_to_blocks(sb, start);
>> +
>> + return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>> +}
>> +
>> +static int ocfs2_trim_group(struct super_block *sb,
>> + struct ocfs2_group_desc *gd,
>> + int start, int max, int minbits)
>> +{
>> + int ret = 0, count = 0, next;
>> + void *bitmap = gd->bg_bitmap;
>> +
>> + while (start < max) {
>> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
>> + if (start >= max)
>> + break;
>
> /* What if the 'start' stands within a hole */
>
> if (ocfs2_test_bit(...)) {
> start = ocfs2_find_next_zero_bit(...);
> if ((start == -1) || (start >= max))
> break;
> }
>
>> + next = ocfs2_find_next_bit(bitmap, max, start);
> next = ocfs2_find_next_bit(...);
> if (next == -1)
> break;
next will be set to "-1"? sorry, but where do you get it?
>
> if (next > max)
> next = max;
again, ocfs2_find_next_bit will return a value larger than 'max'? I am
afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
>
>> +
>> + if ((next - start) >= minbits) {
>> + ret = ocfs2_trim_extent(sb, gd,
>> + start, next - start);
>> + if (ret < 0) {
>> + mlog_errno(ret);
>> + break;
>> + }
>> + count += next - start;
>> + }
>> + start = next + 1;
>> +
>> + if (fatal_signal_pending(current)) {
>> + count = -ERESTARTSYS;
>> + break;
>> + }
>> +
>> + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
>> + break;
>> + }
>> +
>> + if (ret < 0)
>> + count = ret;
>> +
>> + return count;
>> +}
>> +
>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>> +{
>> + struct ocfs2_super *osb = OCFS2_SB(sb);
>> + u64 start, len, minlen, trimmed, first_group, last_group, group;
> why not using u32 start, len, minlen, trimmed;
we may use 64 bit clusters later I guess. And what's more, they will be
set by the user later. and it may overflow. Say the user pass a u64
range->len, it will overflow with range->len >> osb->s_clustersize_bits.
>> + int ret, cnt, first_bit, last_bit;
>> + struct buffer_head *main_bm_bh = NULL;
>> + struct inode *main_bm_inode = NULL;
>> + struct buffer_head *gd_bh = NULL;
>> + struct ocfs2_dinode *main_bm;
>> + struct ocfs2_group_desc *gd = NULL;
>> +
>> + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
>> + return -EROFS;
>> +
>> + start = range->start >> osb->s_clustersize_bits;
>> + len = range->len >> osb->s_clustersize_bits;
>> + minlen = range->minlen >> osb->s_clustersize_bits;
>
> I guess you may want to count two corner clusters which cover the
> 'start' and 'end' bytes,
> so the appropriate way might be:
>
> start = range->start >> osb->s_clustersize_bits;
> len = ocfs2_clusters_for_bytes(osb->sb, range->start + range->len);
> len -= start;
No, I don't want that.. Just want to make it the same as what ext4 did.
See ext4_trim_fs for more details.
>
>> + trimmed = 0;
>> +
>> + if (!len || !minlen || minlen >= osb->bitmap_cpg)
> 'minlen == 0' looks acceptable, which means we allowing discarding
> for all size of extents.
> and what's more, 'len == 0' may not be harmful enough to issue a
> 'EINVAL', returning a legal '0'
> to userspace immediately is fine.
Fair enough. I will change it. Thanks.
>
>
>> + return -EINVAL;
>> +
>> + main_bm_inode = ocfs2_get_system_file_inode(osb,
>> + GLOBAL_BITMAP_SYSTEM_INODE,
>> + OCFS2_INVALID_SLOT);
>> + if (!main_bm_inode) {
>> + ret = -EIO;
>> + mlog_errno(ret);
>> + goto out;
>> + }
>> +
>> + mutex_lock(&main_bm_inode->i_mutex);
>> +
>> + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
>> + if (ret < 0) {
>> + mlog_errno(ret);
>> + goto out_mutex;
>> + }
>> + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>> +
>> + if (start >= le32_to_cpu(main_bm->i_clusters)) {
>> + ret = -EINVAL;
>> + mlog_errno(ret);
>> + goto out_unlock;
>> + }
>> +
>> + if (start + len > le32_to_cpu(main_bm->i_clusters))
>> + len = le32_to_cpu(main_bm->i_clusters) - start;
>> +
>> + /* Determine first and last group to examine based on start and
>> len */
>> + first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>> + if (first_group == osb->first_cluster_group_blkno)
>> + first_bit = start;
>> + else
>> + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>> + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len
>> - 1);
>> + last_bit = osb->bitmap_cpg;
>> +
>> + for (group = first_group; group <= last_group;) {
>> + if (first_bit + len >= osb->bitmap_cpg)
>> + last_bit = osb->bitmap_cpg - first_bit;
>
> is 'first_bit' and 'last_bit' both represent a local offset within a
> cluster group?
> just wondering why last_bit wasn't equal to 'osb->bitmap_cpg' in above
> case(I meant the case
> of 'first_bit + len >= osb->bitmap_cpg'
>
>> + else
>> + last_bit = start + len;
>
> why above case is not 'last_bit = first_bit + len';
you are right. Thanks.
Regards,
Tao
^ permalink raw reply [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-08 5:53 ` Tao Ma
@ 2011-03-08 6:23 ` Tristan Ye
2011-03-08 6:42 ` Tao Ma
0 siblings, 1 reply; 16+ messages in thread
From: Tristan Ye @ 2011-03-08 6:23 UTC (permalink / raw)
To: ocfs2-devel
Tao Ma wrote:
> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>> Hi Tao,
>>
>> Most of codes looks pretty neat to me, few comments inlined below:
> Thanks for the review.
>> Tao Ma wrote:
>>> From: Tao Ma <boyu.mt@taobao.com>
>>>
>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>> volume. A range will be given and all the freed clusters greater
>>> than minlen will be discarded to the block layer.
>>>
>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>> ---
>>> fs/ocfs2/alloc.c | 154
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>> fs/ocfs2/alloc.h | 1 +
>>> 2 files changed, 155 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index b27a0d8..6e1b3b5 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -29,6 +29,7 @@
>>> #include <linux/highmem.h>
>>> #include <linux/swap.h>
>>> #include <linux/quotaops.h>
>>> +#include <linux/blkdev.h>
>>>
>>> #include <cluster/masklog.h>
>>>
>>> @@ -7184,3 +7185,156 @@ out_commit:
>>> out:
>>> return ret;
>>> }
>>> +
>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>> + struct ocfs2_group_desc *gd,
>>> + int start, int count)
>>> +{
>>> + u64 discard;
>>> +
>>> + count = ocfs2_clusters_to_blocks(sb, count);
>>> + discard = le64_to_cpu(gd->bg_blkno) +
>>> + ocfs2_clusters_to_blocks(sb, start);
>>> +
>>> + return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>> +}
>>> +
>>> +static int ocfs2_trim_group(struct super_block *sb,
>>> + struct ocfs2_group_desc *gd,
>>> + int start, int max, int minbits)
>>> +{
>>> + int ret = 0, count = 0, next;
>>> + void *bitmap = gd->bg_bitmap;
>>> +
>>> + while (start < max) {
>>> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>> + if (start >= max)
>>> + break;
>> /* What if the 'start' stands within a hole */
>>
>> if (ocfs2_test_bit(...)) {
>> start = ocfs2_find_next_zero_bit(...);
>> if ((start == -1) || (start >= max))
>> break;
>> }
>>
>>> + next = ocfs2_find_next_bit(bitmap, max, start);
>> next = ocfs2_find_next_bit(...);
>> if (next == -1)
>> break;
> next will be set to "-1"? sorry, but where do you get it?
>> if (next > max)
>> next = max;
> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
Say we're handling the last group, and the 'start + len' was within a
hole, then the 'max'
is 'first_bit + len', while the next none-zero bit we found may be
larger than 'max', isn't
that possible?
>>
>>> +
>>> + if ((next - start) >= minbits) {
>>> + ret = ocfs2_trim_extent(sb, gd,
>>> + start, next - start);
>>> + if (ret < 0) {
>>> + mlog_errno(ret);
>>> + break;
>>> + }
>>> + count += next - start;
>>> + }
>>> + start = next + 1;
>>> +
>>> + if (fatal_signal_pending(current)) {
>>> + count = -ERESTARTSYS;
>>> + break;
>>> + }
>>> +
>>> + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
>>> + break;
>>> + }
>>> +
>>> + if (ret < 0)
>>> + count = ret;
>>> +
>>> + return count;
>>> +}
>>> +
>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>> +{
>>> + struct ocfs2_super *osb = OCFS2_SB(sb);
>>> + u64 start, len, minlen, trimmed, first_group, last_group, group;
>> why not using u32 start, len, minlen, trimmed;
> we may use 64 bit clusters later I guess. And what's more, they will be
> set by the user later. and it may overflow. Say the user pass a u64
> range->len, it will overflow with range->len >> osb->s_clustersize_bits.
I just found we were using u32 for counting clusters all around ocfs2
codes, e.g truncate/punching_hole
codes, also passing an u64 byte_offset from userspace, so my original
intention is to keep an unification;-)
Overflow can theoretically happen anyway, however, it's not very likely
to pass a 16TB+ byte_offset from userspace.
>>> + int ret, cnt, first_bit, last_bit;
>>> + struct buffer_head *main_bm_bh = NULL;
>>> + struct inode *main_bm_inode = NULL;
>>> + struct buffer_head *gd_bh = NULL;
>>> + struct ocfs2_dinode *main_bm;
>>> + struct ocfs2_group_desc *gd = NULL;
>>> +
>>> + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
>>> + return -EROFS;
>>> +
>>> + start = range->start >> osb->s_clustersize_bits;
>>> + len = range->len >> osb->s_clustersize_bits;
>>> + minlen = range->minlen >> osb->s_clustersize_bits;
>> I guess you may want to count two corner clusters which cover the
>> 'start' and 'end' bytes,
>> so the appropriate way might be:
>>
>> start = range->start >> osb->s_clustersize_bits;
>> len = ocfs2_clusters_for_bytes(osb->sb, range->start + range->len);
>> len -= start;
> No, I don't want that.. Just want to make it the same as what ext4 did.
> See ext4_trim_fs for more details.
All right;-)
>>
>>> + trimmed = 0;
>>> +
>>> + if (!len || !minlen || minlen >= osb->bitmap_cpg)
>> 'minlen == 0' looks acceptable, which means we allowing discarding
>> for all size of extents.
>> and what's more, 'len == 0' may not be harmful enough to issue a
>> 'EINVAL', returning a legal '0'
>> to userspace immediately is fine.
> Fair enough. I will change it. Thanks.
>>
>>> + return -EINVAL;
>>> +
>>> + main_bm_inode = ocfs2_get_system_file_inode(osb,
>>> + GLOBAL_BITMAP_SYSTEM_INODE,
>>> + OCFS2_INVALID_SLOT);
>>> + if (!main_bm_inode) {
>>> + ret = -EIO;
>>> + mlog_errno(ret);
>>> + goto out;
>>> + }
>>> +
>>> + mutex_lock(&main_bm_inode->i_mutex);
>>> +
>>> + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
>>> + if (ret < 0) {
>>> + mlog_errno(ret);
>>> + goto out_mutex;
>>> + }
>>> + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>> +
>>> + if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>> + ret = -EINVAL;
>>> + mlog_errno(ret);
>>> + goto out_unlock;
>>> + }
>>> +
>>> + if (start + len > le32_to_cpu(main_bm->i_clusters))
>>> + len = le32_to_cpu(main_bm->i_clusters) - start;
>>> +
>>> + /* Determine first and last group to examine based on start and
>>> len */
>>> + first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> + if (first_group == osb->first_cluster_group_blkno)
>>> + first_bit = start;
>>> + else
>>> + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>>> + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len
>>> - 1);
>>> + last_bit = osb->bitmap_cpg;
>>> +
>>> + for (group = first_group; group <= last_group;) {
>>> + if (first_bit + len >= osb->bitmap_cpg)
>>> + last_bit = osb->bitmap_cpg - first_bit;
>> is 'first_bit' and 'last_bit' both represent a local offset within a
>> cluster group?
>> just wondering why last_bit wasn't equal to 'osb->bitmap_cpg' in above
>> case(I meant the case
>> of 'first_bit + len >= osb->bitmap_cpg'
>>
>>> + else
>>> + last_bit = start + len;
>> why above case is not 'last_bit = first_bit + len';
> you are right. Thanks.
>
> Regards,
> Tao
^ permalink raw reply [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-08 6:23 ` Tristan Ye
@ 2011-03-08 6:42 ` Tao Ma
2011-03-08 6:53 ` Tristan Ye
0 siblings, 1 reply; 16+ messages in thread
From: Tao Ma @ 2011-03-08 6:42 UTC (permalink / raw)
To: ocfs2-devel
On 03/08/2011 02:23 PM, Tristan Ye wrote:
> Tao Ma wrote:
>> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>>> Hi Tao,
>>>
>>> Most of codes looks pretty neat to me, few comments inlined below:
>> Thanks for the review.
>>> Tao Ma wrote:
>>>> From: Tao Ma <boyu.mt@taobao.com>
>>>>
>>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>>> volume. A range will be given and all the freed clusters greater
>>>> than minlen will be discarded to the block layer.
>>>>
>>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>>> ---
>>>> fs/ocfs2/alloc.c | 154
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>> fs/ocfs2/alloc.h | 1 +
>>>> 2 files changed, 155 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>> index b27a0d8..6e1b3b5 100644
>>>> --- a/fs/ocfs2/alloc.c
>>>> +++ b/fs/ocfs2/alloc.c
>>>> @@ -29,6 +29,7 @@
>>>> #include <linux/highmem.h>
>>>> #include <linux/swap.h>
>>>> #include <linux/quotaops.h>
>>>> +#include <linux/blkdev.h>
>>>>
>>>> #include <cluster/masklog.h>
>>>>
>>>> @@ -7184,3 +7185,156 @@ out_commit:
>>>> out:
>>>> return ret;
>>>> }
>>>> +
>>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>>> + struct ocfs2_group_desc *gd,
>>>> + int start, int count)
>>>> +{
>>>> + u64 discard;
>>>> +
>>>> + count = ocfs2_clusters_to_blocks(sb, count);
>>>> + discard = le64_to_cpu(gd->bg_blkno) +
>>>> + ocfs2_clusters_to_blocks(sb, start);
>>>> +
>>>> + return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>>> +}
>>>> +
>>>> +static int ocfs2_trim_group(struct super_block *sb,
>>>> + struct ocfs2_group_desc *gd,
>>>> + int start, int max, int minbits)
>>>> +{
>>>> + int ret = 0, count = 0, next;
>>>> + void *bitmap = gd->bg_bitmap;
>>>> +
>>>> + while (start < max) {
>>>> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>>> + if (start >= max)
>>>> + break;
>>> /* What if the 'start' stands within a hole */
>>>
>>> if (ocfs2_test_bit(...)) {
>>> start = ocfs2_find_next_zero_bit(...);
>>> if ((start == -1) || (start >= max))
>>> break;
>>> }
>>>
>>>> + next = ocfs2_find_next_bit(bitmap, max, start);
>>> next = ocfs2_find_next_bit(...);
>>> if (next == -1)
>>> break;
>> next will be set to "-1"? sorry, but where do you get it?
>>> if (next > max)
>>> next = max;
>> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
>> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
>
>
> Say we're handling the last group, and the 'start + len' was within a
> hole, then the 'max'
> is 'first_bit + len', while the next none-zero bit we found may be
> larger than 'max', isn't
> that possible?
ocfs2_find_next_bit(and ext2_find_next_bit) won't parse, check and
return 'bit' after 'max'. otherwise there should be a problem of memory
overflow(you read and check some memory which isn't owned and handled by
you). So the same goes here. If it can return a value larger than 'max',
every caller will have to check the overflow. That would be too painful.
>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>> +{
>>>> + struct ocfs2_super *osb = OCFS2_SB(sb);
>>>> + u64 start, len, minlen, trimmed, first_group, last_group, group;
>>> why not using u32 start, len, minlen, trimmed;
>> we may use 64 bit clusters later I guess. And what's more, they will be
>> set by the user later. and it may overflow. Say the user pass a u64
>> range->len, it will overflow with range->len >> osb->s_clustersize_bits.
>
> I just found we were using u32 for counting clusters all around ocfs2
> codes, e.g truncate/punching_hole
> codes, also passing an u64 byte_offset from userspace, so my original
> intention is to keep an unification;-)
>
> Overflow can theoretically happen anyway, however, it's not very likely
> to pass a 16TB+ byte_offset from userspace.
I am afraid it is very likely. So say you want to trim all the clusters
within the volume, how could you set 'range->len'? Will you first fdisk
to get the volume size and then set it accordingly?
Most guys will set it to ULLONG_MAX and let the file system handles it.
This is not my personal view, please check this article:
http://lwn.net/Articles/417809/
Jonathan also suggests to set len to ULLONG_MAX so that you can trim the
whole volume.
Regards,
Tao
^ permalink raw reply [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-08 6:42 ` Tao Ma
@ 2011-03-08 6:53 ` Tristan Ye
2011-03-08 7:47 ` Tao Ma
0 siblings, 1 reply; 16+ messages in thread
From: Tristan Ye @ 2011-03-08 6:53 UTC (permalink / raw)
To: ocfs2-devel
Tao Ma wrote:
> On 03/08/2011 02:23 PM, Tristan Ye wrote:
>> Tao Ma wrote:
>>> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>>>> Hi Tao,
>>>>
>>>> Most of codes looks pretty neat to me, few comments inlined below:
>>> Thanks for the review.
>>>> Tao Ma wrote:
>>>>> From: Tao Ma <boyu.mt@taobao.com>
>>>>>
>>>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>>>> volume. A range will be given and all the freed clusters greater
>>>>> than minlen will be discarded to the block layer.
>>>>>
>>>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>>>> ---
>>>>> fs/ocfs2/alloc.c | 154
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>> fs/ocfs2/alloc.h | 1 +
>>>>> 2 files changed, 155 insertions(+), 0 deletions(-)
>>>>>
>>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>>> index b27a0d8..6e1b3b5 100644
>>>>> --- a/fs/ocfs2/alloc.c
>>>>> +++ b/fs/ocfs2/alloc.c
>>>>> @@ -29,6 +29,7 @@
>>>>> #include <linux/highmem.h>
>>>>> #include <linux/swap.h>
>>>>> #include <linux/quotaops.h>
>>>>> +#include <linux/blkdev.h>
>>>>>
>>>>> #include <cluster/masklog.h>
>>>>>
>>>>> @@ -7184,3 +7185,156 @@ out_commit:
>>>>> out:
>>>>> return ret;
>>>>> }
>>>>> +
>>>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>>>> + struct ocfs2_group_desc *gd,
>>>>> + int start, int count)
>>>>> +{
>>>>> + u64 discard;
>>>>> +
>>>>> + count = ocfs2_clusters_to_blocks(sb, count);
>>>>> + discard = le64_to_cpu(gd->bg_blkno) +
>>>>> + ocfs2_clusters_to_blocks(sb, start);
>>>>> +
>>>>> + return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>>>> +}
>>>>> +
>>>>> +static int ocfs2_trim_group(struct super_block *sb,
>>>>> + struct ocfs2_group_desc *gd,
>>>>> + int start, int max, int minbits)
>>>>> +{
>>>>> + int ret = 0, count = 0, next;
>>>>> + void *bitmap = gd->bg_bitmap;
>>>>> +
>>>>> + while (start < max) {
>>>>> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>>>> + if (start >= max)
>>>>> + break;
>>>> /* What if the 'start' stands within a hole */
>>>>
>>>> if (ocfs2_test_bit(...)) {
>>>> start = ocfs2_find_next_zero_bit(...);
>>>> if ((start == -1) || (start >= max))
>>>> break;
>>>> }
>>>>
>>>>> + next = ocfs2_find_next_bit(bitmap, max, start);
>>>> next = ocfs2_find_next_bit(...);
>>>> if (next == -1)
>>>> break;
>>> next will be set to "-1"? sorry, but where do you get it?
>>>> if (next > max)
>>>> next = max;
>>> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
>>> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
>>
>> Say we're handling the last group, and the 'start + len' was within a
>> hole, then the 'max'
>> is 'first_bit + len', while the next none-zero bit we found may be
>> larger than 'max', isn't
>> that possible?
> ocfs2_find_next_bit(and ext2_find_next_bit) won't parse, check and
> return 'bit' after 'max'. otherwise there should be a problem of memory
> overflow(you read and check some memory which isn't owned and handled by
> you). So the same goes here. If it can return a value larger than 'max',
> every caller will have to check the overflow. That would be too painful.
Oh, you may misunderstood my words, the 'max' you passed to
ocfs2_find_next_bit()
may not be the ending-edge of the cluster group(bitmap), it may be the
end of what user specified
for TRIMing, therefore the 'next'(ending-edge for a wanted hole) bit you
found from ocfs2_find_next_bit()
might be larger than 'max', is that possible?
>>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>>> +{
>>>>> + struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>> + u64 start, len, minlen, trimmed, first_group, last_group, group;
>>>> why not using u32 start, len, minlen, trimmed;
>>> we may use 64 bit clusters later I guess. And what's more, they will be
>>> set by the user later. and it may overflow. Say the user pass a u64
>>> range->len, it will overflow with range->len >> osb->s_clustersize_bits.
>> I just found we were using u32 for counting clusters all around ocfs2
>> codes, e.g truncate/punching_hole
>> codes, also passing an u64 byte_offset from userspace, so my original
>> intention is to keep an unification;-)
>>
>> Overflow can theoretically happen anyway, however, it's not very likely
>> to pass a 16TB+ byte_offset from userspace.
> I am afraid it is very likely. So say you want to trim all the clusters
> within the volume, how could you set 'range->len'? Will you first fdisk
> to get the volume size and then set it accordingly?
> Most guys will set it to ULLONG_MAX and let the file system handles it.
> This is not my personal view, please check this article:
> http://lwn.net/Articles/417809/
> Jonathan also suggests to set len to ULLONG_MAX so that you can trim the
> whole volume.
Nice self-defense;-), how about the overflow risk in
truncate/punching-hole
codes, where u32 were being used for cluster counting.
>
> Regards,
> Tao
^ permalink raw reply [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-08 6:53 ` Tristan Ye
@ 2011-03-08 7:47 ` Tao Ma
0 siblings, 0 replies; 16+ messages in thread
From: Tao Ma @ 2011-03-08 7:47 UTC (permalink / raw)
To: ocfs2-devel
On 03/08/2011 02:53 PM, Tristan Ye wrote:
> Tao Ma wrote:
>> On 03/08/2011 02:23 PM, Tristan Ye wrote:
>>> Tao Ma wrote:
>>>> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>>>>> Hi Tao,
>>>>>
>>>>> Most of codes looks pretty neat to me, few comments inlined below:
>>>> Thanks for the review.
>>>>> Tao Ma wrote:
>>>>>> From: Tao Ma <boyu.mt@taobao.com>
>>>>>>
>>>>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>>>>> volume. A range will be given and all the freed clusters greater
>>>>>> than minlen will be discarded to the block layer.
>>>>>>
>>>>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>>>>> ---
>>>>>> fs/ocfs2/alloc.c | 154
>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>> fs/ocfs2/alloc.h | 1 +
>>>>>> 2 files changed, 155 insertions(+), 0 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>>>> index b27a0d8..6e1b3b5 100644
>>>>>> --- a/fs/ocfs2/alloc.c
>>>>>> +++ b/fs/ocfs2/alloc.c
>>>>>> @@ -29,6 +29,7 @@
>>>>>> #include <linux/highmem.h>
>>>>>> #include <linux/swap.h>
>>>>>> #include <linux/quotaops.h>
>>>>>> +#include <linux/blkdev.h>
>>>>>>
>>>>>> #include <cluster/masklog.h>
>>>>>>
>>>>>> @@ -7184,3 +7185,156 @@ out_commit:
>>>>>> out:
>>>>>> return ret;
>>>>>> }
>>>>>> +
>>>>>> +static int ocfs2_trim_extent(struct super_block *sb,
>>>>>> + struct ocfs2_group_desc *gd,
>>>>>> + int start, int count)
>>>>>> +{
>>>>>> + u64 discard;
>>>>>> +
>>>>>> + count = ocfs2_clusters_to_blocks(sb, count);
>>>>>> + discard = le64_to_cpu(gd->bg_blkno) +
>>>>>> + ocfs2_clusters_to_blocks(sb, start);
>>>>>> +
>>>>>> + return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
>>>>>> +}
>>>>>> +
>>>>>> +static int ocfs2_trim_group(struct super_block *sb,
>>>>>> + struct ocfs2_group_desc *gd,
>>>>>> + int start, int max, int minbits)
>>>>>> +{
>>>>>> + int ret = 0, count = 0, next;
>>>>>> + void *bitmap = gd->bg_bitmap;
>>>>>> +
>>>>>> + while (start < max) {
>>>>>> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>>>>> + if (start >= max)
>>>>>> + break;
>>>>> /* What if the 'start' stands within a hole */
>>>>>
>>>>> if (ocfs2_test_bit(...)) {
>>>>> start = ocfs2_find_next_zero_bit(...);
>>>>> if ((start == -1) || (start >= max))
>>>>> break;
>>>>> }
>>>>>
>>>>>> + next = ocfs2_find_next_bit(bitmap, max, start);
>>>>> next = ocfs2_find_next_bit(...);
>>>>> if (next == -1)
>>>>> break;
>>>> next will be set to "-1"? sorry, but where do you get it?
>>>>> if (next > max)
>>>>> next = max;
>>>> again, ocfs2_find_next_bit will return a value larger than 'max'? I am
>>>> afraid not. Otherwise, it will be nonsense to pass a 'max' to it.
>>>
>>> Say we're handling the last group, and the 'start + len' was within a
>>> hole, then the 'max'
>>> is 'first_bit + len', while the next none-zero bit we found may be
>>> larger than 'max', isn't
>>> that possible?
>> ocfs2_find_next_bit(and ext2_find_next_bit) won't parse, check and
>> return 'bit' after 'max'. otherwise there should be a problem of memory
>> overflow(you read and check some memory which isn't owned and handled by
>> you). So the same goes here. If it can return a value larger than 'max',
>> every caller will have to check the overflow. That would be too painful.
>
> Oh, you may misunderstood my words, the 'max' you passed to
> ocfs2_find_next_bit()
> may not be the ending-edge of the cluster group(bitmap), it may be the
> end of what user specified
> for TRIMing, therefore the 'next'(ending-edge for a wanted hole) bit you
> found from ocfs2_find_next_bit()
> might be larger than 'max', is that possible?
Please note that ocfs2_find_next_bit knows nothing about what 'max'
means. So no matter it will be the end of the cluster group or just the
middle of a bitmap, it would return values after 'max' I think.
>
>>>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range
>>>>>> *range)
>>>>>> +{
>>>>>> + struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>>> + u64 start, len, minlen, trimmed, first_group, last_group, group;
>>>>> why not using u32 start, len, minlen, trimmed;
>>>> we may use 64 bit clusters later I guess. And what's more, they will be
>>>> set by the user later. and it may overflow. Say the user pass a u64
>>>> range->len, it will overflow with range->len >>
>>>> osb->s_clustersize_bits.
>>> I just found we were using u32 for counting clusters all around ocfs2
>>> codes, e.g truncate/punching_hole
>>> codes, also passing an u64 byte_offset from userspace, so my original
>>> intention is to keep an unification;-)
>>>
>>> Overflow can theoretically happen anyway, however, it's not very likely
>>> to pass a 16TB+ byte_offset from userspace.
>> I am afraid it is very likely. So say you want to trim all the clusters
>> within the volume, how could you set 'range->len'? Will you first fdisk
>> to get the volume size and then set it accordingly?
>> Most guys will set it to ULLONG_MAX and let the file system handles it.
>> This is not my personal view, please check this article:
>> http://lwn.net/Articles/417809/
>> Jonathan also suggests to set len to ULLONG_MAX so that you can trim the
>> whole volume.
>
> Nice self-defense;-), how about the overflow risk in
> truncate/punching-hole
> codes, where u32 were being used for cluster counting.
yeah, you can try and fix it.
Regards,
Tao
^ permalink raw reply [flat|nested] 16+ messages in thread
* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-08 4:55 ` Tristan Ye
2011-03-08 5:53 ` Tao Ma
@ 2011-03-08 7:53 ` Tao Ma
2011-03-08 7:59 ` Tristan Ye
1 sibling, 1 reply; 16+ messages in thread
From: Tao Ma @ 2011-03-08 7:53 UTC (permalink / raw)
To: ocfs2-devel
On 03/08/2011 12:55 PM, Tristan Ye wrote:
> Hi Tao,
>
> Most of codes looks pretty neat to me, few comments inlined below:
>
> Tao Ma wrote:
>> From: Tao Ma <boyu.mt@taobao.com>
>>
>> Add ocfs2_trim_fs to support trimming freed clusters in the
>> volume. A range will be given and all the freed clusters greater
>> than minlen will be discarded to the block layer.
>>
>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>> ---
>> fs/ocfs2/alloc.c | 154
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/ocfs2/alloc.h | 1 +
>> 2 files changed, 155 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index b27a0d8..6e1b3b5 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
<snip>
>> +static int ocfs2_trim_group(struct super_block *sb,
>> + struct ocfs2_group_desc *gd,
>> + int start, int max, int minbits)
>> +{
>> + int ret = 0, count = 0, next;
>> + void *bitmap = gd->bg_bitmap;
>> +
>> + while (start < max) {
>> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
>> + if (start >= max)
>> + break;
>
> /* What if the 'start' stands within a hole */
>
> if (ocfs2_test_bit(...)) {
> start = ocfs2_find_next_zero_bit(...);
> if ((start == -1) || (start >= max))
> break;
> }
I just noticed that I forget to response to this. what do you define a
hole? It is within the global bitmap, so it is either freed or
allocated. I don't get your meaning of 'hole'. ocfs2_find_next_zero_bit
will do as we expected.
Regards,
Tao
^ permalink raw reply [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-08 7:53 ` Tao Ma
@ 2011-03-08 7:59 ` Tristan Ye
0 siblings, 0 replies; 16+ messages in thread
From: Tristan Ye @ 2011-03-08 7:59 UTC (permalink / raw)
To: ocfs2-devel
Tao Ma wrote:
> On 03/08/2011 12:55 PM, Tristan Ye wrote:
>> Hi Tao,
>>
>> Most of codes looks pretty neat to me, few comments inlined below:
>>
>> Tao Ma wrote:
>>> From: Tao Ma <boyu.mt@taobao.com>
>>>
>>> Add ocfs2_trim_fs to support trimming freed clusters in the
>>> volume. A range will be given and all the freed clusters greater
>>> than minlen will be discarded to the block layer.
>>>
>>> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
>>> ---
>>> fs/ocfs2/alloc.c | 154
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>> fs/ocfs2/alloc.h | 1 +
>>> 2 files changed, 155 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index b27a0d8..6e1b3b5 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
> <snip>
>>> +static int ocfs2_trim_group(struct super_block *sb,
>>> + struct ocfs2_group_desc *gd,
>>> + int start, int max, int minbits)
>>> +{
>>> + int ret = 0, count = 0, next;
>>> + void *bitmap = gd->bg_bitmap;
>>> +
>>> + while (start < max) {
>>> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
>>> + if (start >= max)
>>> + break;
>> /* What if the 'start' stands within a hole */
>>
>> if (ocfs2_test_bit(...)) {
>> start = ocfs2_find_next_zero_bit(...);
>> if ((start == -1) || (start >= max))
>> break;
>> }
> I just noticed that I forget to response to this. what do you define a
> hole? It is within the global bitmap, so it is either freed or
> allocated. I don't get your meaning of 'hole'. ocfs2_find_next_zero_bit
> will do as we expected.
Here the hole means the area/range where contiguous '0' resides in the
bitmap;-)
>
> Regards,
> Tao
^ permalink raw reply [flat|nested] 16+ messages in thread
* [Ocfs2-devel] [PATCH 2/3] ocfs2: Add FITRIM ioctl.
2011-03-07 10:02 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
@ 2011-03-07 10:05 ` Tao Ma
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 3/3] ocfs2: Add trace event for trim Tao Ma
2011-03-08 15:26 ` [Ocfs2-devel] [PATCH 1/3 v2] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
3 siblings, 0 replies; 16+ messages in thread
From: Tao Ma @ 2011-03-07 10:05 UTC (permalink / raw)
To: ocfs2-devel
From: Tao Ma <boyu.mt@taobao.com>
Add the corresponding ioctl function for FITRIM.
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
fs/ocfs2/ioctl.c | 24 ++++++++++++++++++++++++
1 files changed, 24 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d9bfa90..ed13656 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -520,6 +520,29 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 0);
+ case FITRIM:
+ {
+ struct super_block *sb = inode->i_sb;
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&range, (struct fstrim_range *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ ret = ocfs2_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
default:
return -ENOTTY;
}
@@ -547,6 +570,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
+ case FITRIM:
break;
case OCFS2_IOC_REFLINK:
if (copy_from_user(&args, (struct reflink_arguments *)arg,
--
1.6.3.GIT
^ permalink raw reply related [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 3/3] ocfs2: Add trace event for trim.
2011-03-07 10:02 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 2/3] ocfs2: Add FITRIM ioctl Tao Ma
@ 2011-03-07 10:05 ` Tao Ma
2011-03-08 15:26 ` [Ocfs2-devel] [PATCH 1/3 v2] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
3 siblings, 0 replies; 16+ messages in thread
From: Tao Ma @ 2011-03-07 10:05 UTC (permalink / raw)
To: ocfs2-devel
From: Tao Ma <boyu.mt@taobao.com>
Add the corresponding trace event for trim.
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
fs/ocfs2/alloc.c | 7 +++++++
fs/ocfs2/ocfs2_trace.h | 25 +++++++++++++++++++++++++
2 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 6e1b3b5..da05b14 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7196,6 +7196,8 @@ static int ocfs2_trim_extent(struct super_block *sb,
discard = le64_to_cpu(gd->bg_blkno) +
ocfs2_clusters_to_blocks(sb, start);
+ trace_ocfs2_trim_extent(sb, (unsigned long long)discard, count);
+
return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
}
@@ -7206,6 +7208,9 @@ static int ocfs2_trim_group(struct super_block *sb,
int ret = 0, count = 0, next;
void *bitmap = gd->bg_bitmap;
+ trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+ start, max, minbits);
+
while (start < max) {
start = ocfs2_find_next_zero_bit(bitmap, max, start);
if (start >= max)
@@ -7287,6 +7292,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (start + len > le32_to_cpu(main_bm->i_clusters))
len = le32_to_cpu(main_bm->i_clusters) - start;
+ trace_ocfs2_trim_fs(start, len, minlen);
+
/* Determine first and last group to examine based on start and len */
first_group = ocfs2_which_cluster_group(main_bm_inode, start);
if (first_group == osb->first_cluster_group_blkno)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5b..9ab22a1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
__entry->blkno, __entry->bit)
);
+TRACE_EVENT(ocfs2_trim_extent,
+ TP_PROTO(struct super_block *sb, unsigned long long blk,
+ unsigned long long count),
+ TP_ARGS(sb, blk, count),
+ TP_STRUCT__entry(
+ __field(int, dev_major)
+ __field(int, dev_minor)
+ __field(unsigned long long, blk)
+ __field(__u64, count)
+ ),
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(sb->s_dev);
+ __entry->dev_minor = MINOR(sb->s_dev);
+ __entry->blk = blk;
+ __entry->count = count;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->dev_major, __entry->dev_minor,
+ __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
/* End of trace events for fs/ocfs2/alloc.c. */
/* Trace events for fs/ocfs2/localalloc.c. */
--
1.6.3.GIT
^ permalink raw reply related [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3 v2] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-03-07 10:02 [Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support Tao Ma
` (2 preceding siblings ...)
2011-03-07 10:05 ` [Ocfs2-devel] [PATCH 3/3] ocfs2: Add trace event for trim Tao Ma
@ 2011-03-08 15:26 ` Tao Ma
3 siblings, 0 replies; 16+ messages in thread
From: Tao Ma @ 2011-03-08 15:26 UTC (permalink / raw)
To: ocfs2-devel
Changelog from v1 to v2:
1. remove the check for hard ro and soft ro.
2. fix bug found by tristan.
3. if range->len = 0 return 0 instead of -EINVAL.
4. allow minlen = 0 to go ahead instead of returning -EINVAL.
Regards,
Tao
From 9074413a619f1af32c03c5959d66ff465643496c Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 9 Mar 2011 07:12:10 +0800
Subject: [PATCH 1/3 v2] ocfs2: Add ocfs2_trim_fs for SSD trim support.
Add ocfs2_trim_fs to support trimming freed clusters in the
volume. A range will be given and all the freed clusters greater
than minlen will be discarded to the block layer.
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
fs/ocfs2/alloc.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/alloc.h | 1 +
2 files changed, 157 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b27a0d8..0ff46d9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include <cluster/masklog.h>
@@ -7184,3 +7185,158 @@ out_commit:
out:
return ret;
}
+
+static int ocfs2_trim_extent(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ int start, int count)
+{
+ u64 discard;
+
+ count = ocfs2_clusters_to_blocks(sb, count);
+ discard = le64_to_cpu(gd->bg_blkno) +
+ ocfs2_clusters_to_blocks(sb, start);
+
+ return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ int start, int max, int minbits)
+{
+ int ret = 0, count = 0, next;
+ void *bitmap = gd->bg_bitmap;
+
+ while (start < max) {
+ start = ocfs2_find_next_zero_bit(bitmap, max, start);
+ if (start >= max)
+ break;
+ next = ocfs2_find_next_bit(bitmap, max, start);
+
+ if ((next - start) >= minbits) {
+ ret = ocfs2_trim_extent(sb, gd,
+ start, next - start);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ count += next - start;
+ }
+ start = next + 1;
+
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+ break;
+ }
+
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 start, len, minlen, trimmed, first_group, last_group, group;
+ int ret, cnt, first_bit, last_bit;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_dinode *main_bm;
+ struct ocfs2_group_desc *gd = NULL;
+
+ start = range->start >> osb->s_clustersize_bits;
+ len = range->len >> osb->s_clustersize_bits;
+ minlen = range->minlen >> osb->s_clustersize_bits;
+ trimmed = 0;
+
+ if (!len) {
+ range->len = 0;
+ return 0;
+ }
+
+ if (minlen >= osb->bitmap_cpg)
+ return -EINVAL;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+ main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
+
+ /* Determine first and last group to examine based on start and len */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+ last_bit = osb->bitmap_cpg;
+
+ for (group = first_group; group <= last_group;) {
+ if (first_bit + len >= osb->bitmap_cpg)
+ last_bit = osb->bitmap_cpg;
+ else
+ last_bit = first_bit + len;
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode,
+ main_bm, group,
+ &gd_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ brelse(gd_bh);
+ gd_bh = NULL;
+ if (cnt < 0) {
+ ret = cnt;
+ mlog_errno(ret);
+ break;
+ }
+
+ trimmed += cnt;
+ len -= osb->bitmap_cpg - first_bit;
+ first_bit = 0;
+ if (group == osb->first_cluster_group_blkno)
+ group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ else
+ group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ }
+ range->len = trimmed * sb->s_blocksize;
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 0);
+ brelse(main_bm_bh);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
+ return ret;
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a0..ca381c5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
--
1.6.3.GIT
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Ocfs2-devel] [PATCH 1/3 V2] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-05-23 2:08 [Ocfs2-devel] [PATCH 0/3 V2] ocfs2: Add batched discard support Tao Ma
@ 2011-05-23 2:36 ` Tao Ma
2011-05-23 19:33 ` Sunil Mushran
2011-05-24 6:57 ` Joel Becker
0 siblings, 2 replies; 16+ messages in thread
From: Tao Ma @ 2011-05-23 2:36 UTC (permalink / raw)
To: ocfs2-devel
From: Tao Ma <boyu.mt@taobao.com>
Add ocfs2_trim_fs to support trimming freed clusters in the
volume. A range will be given and all the freed clusters greater
than minlen will be discarded to the block layer.
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
fs/ocfs2/alloc.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/alloc.h | 1 +
2 files changed, 160 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7..ae3ea78 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include <cluster/masklog.h>
@@ -7184,3 +7185,161 @@ out_commit:
out:
return ret;
}
+
+static int ocfs2_trim_extent(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 count)
+{
+ u64 discard, bcount;
+
+ bcount = ocfs2_clusters_to_blocks(sb, count);
+ discard = le64_to_cpu(gd->bg_blkno) +
+ ocfs2_clusters_to_blocks(sb, start);
+
+ return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 max, u32 minbits)
+{
+ int ret = 0, count = 0, next;
+ void *bitmap = gd->bg_bitmap;
+
+ if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+ return 0;
+
+ while (start < max) {
+ start = ocfs2_find_next_zero_bit(bitmap, max, start);
+ if (start >= max)
+ break;
+ next = ocfs2_find_next_bit(bitmap, max, start);
+
+ if ((next - start) >= minbits) {
+ ret = ocfs2_trim_extent(sb, gd,
+ start, next - start);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ count += next - start;
+ }
+ start = next + 1;
+
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+ break;
+ }
+
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 start, len, trimmed, first_group, last_group, group;
+ int ret, cnt;
+ u32 first_bit, last_bit, minlen;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_dinode *main_bm;
+ struct ocfs2_group_desc *gd = NULL;
+
+ start = range->start >> osb->s_clustersize_bits;
+ len = range->len >> osb->s_clustersize_bits;
+ minlen = range->minlen >> osb->s_clustersize_bits;
+ trimmed = 0;
+
+ if (!len) {
+ range->len = 0;
+ return 0;
+ }
+
+ if (minlen >= osb->bitmap_cpg)
+ return -EINVAL;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+ main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
+
+ /* Determine first and last group to examine based on start and len */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+ last_bit = osb->bitmap_cpg;
+
+ for (group = first_group; group <= last_group;) {
+ if (first_bit + len >= osb->bitmap_cpg)
+ last_bit = osb->bitmap_cpg;
+ else
+ last_bit = first_bit + len;
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode,
+ main_bm, group,
+ &gd_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ brelse(gd_bh);
+ gd_bh = NULL;
+ if (cnt < 0) {
+ ret = cnt;
+ mlog_errno(ret);
+ break;
+ }
+
+ trimmed += cnt;
+ len -= osb->bitmap_cpg - first_bit;
+ first_bit = 0;
+ if (group == osb->first_cluster_group_blkno)
+ group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ else
+ group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ }
+ range->len = trimmed * sb->s_blocksize;
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 0);
+ brelse(main_bm_bh);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
+ return ret;
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a0..ca381c5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
--
1.6.3.GIT
^ permalink raw reply related [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3 V2] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-05-23 2:36 ` [Ocfs2-devel] [PATCH 1/3 V2] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
@ 2011-05-23 19:33 ` Sunil Mushran
2011-05-24 6:57 ` Joel Becker
1 sibling, 0 replies; 16+ messages in thread
From: Sunil Mushran @ 2011-05-23 19:33 UTC (permalink / raw)
To: ocfs2-devel
Reviewed-by: Sunil Mushran<sunil.mushran@oracle.com>
On 05/22/2011 07:36 PM, Tao Ma wrote:
> From: Tao Ma<boyu.mt@taobao.com>
>
> Add ocfs2_trim_fs to support trimming freed clusters in the
> volume. A range will be given and all the freed clusters greater
> than minlen will be discarded to the block layer.
>
> Signed-off-by: Tao Ma<boyu.mt@taobao.com>
> ---
> fs/ocfs2/alloc.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/alloc.h | 1 +
> 2 files changed, 160 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index 48aa9c7..ae3ea78 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -29,6 +29,7 @@
> #include<linux/highmem.h>
> #include<linux/swap.h>
> #include<linux/quotaops.h>
> +#include<linux/blkdev.h>
>
> #include<cluster/masklog.h>
>
> @@ -7184,3 +7185,161 @@ out_commit:
> out:
> return ret;
> }
> +
> +static int ocfs2_trim_extent(struct super_block *sb,
> + struct ocfs2_group_desc *gd,
> + u32 start, u32 count)
> +{
> + u64 discard, bcount;
> +
> + bcount = ocfs2_clusters_to_blocks(sb, count);
> + discard = le64_to_cpu(gd->bg_blkno) +
> + ocfs2_clusters_to_blocks(sb, start);
> +
> + return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
> +}
> +
> +static int ocfs2_trim_group(struct super_block *sb,
> + struct ocfs2_group_desc *gd,
> + u32 start, u32 max, u32 minbits)
> +{
> + int ret = 0, count = 0, next;
> + void *bitmap = gd->bg_bitmap;
> +
> + if (le16_to_cpu(gd->bg_free_bits_count)< minbits)
> + return 0;
> +
> + while (start< max) {
> + start = ocfs2_find_next_zero_bit(bitmap, max, start);
> + if (start>= max)
> + break;
> + next = ocfs2_find_next_bit(bitmap, max, start);
> +
> + if ((next - start)>= minbits) {
> + ret = ocfs2_trim_extent(sb, gd,
> + start, next - start);
> + if (ret< 0) {
> + mlog_errno(ret);
> + break;
> + }
> + count += next - start;
> + }
> + start = next + 1;
> +
> + if (fatal_signal_pending(current)) {
> + count = -ERESTARTSYS;
> + break;
> + }
> +
> + if ((le16_to_cpu(gd->bg_free_bits_count) - count)< minbits)
> + break;
> + }
> +
> + if (ret< 0)
> + count = ret;
> +
> + return count;
> +}
> +
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> + struct ocfs2_super *osb = OCFS2_SB(sb);
> + u64 start, len, trimmed, first_group, last_group, group;
> + int ret, cnt;
> + u32 first_bit, last_bit, minlen;
> + struct buffer_head *main_bm_bh = NULL;
> + struct inode *main_bm_inode = NULL;
> + struct buffer_head *gd_bh = NULL;
> + struct ocfs2_dinode *main_bm;
> + struct ocfs2_group_desc *gd = NULL;
> +
> + start = range->start>> osb->s_clustersize_bits;
> + len = range->len>> osb->s_clustersize_bits;
> + minlen = range->minlen>> osb->s_clustersize_bits;
> + trimmed = 0;
> +
> + if (!len) {
> + range->len = 0;
> + return 0;
> + }
> +
> + if (minlen>= osb->bitmap_cpg)
> + return -EINVAL;
> +
> + main_bm_inode = ocfs2_get_system_file_inode(osb,
> + GLOBAL_BITMAP_SYSTEM_INODE,
> + OCFS2_INVALID_SLOT);
> + if (!main_bm_inode) {
> + ret = -EIO;
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + mutex_lock(&main_bm_inode->i_mutex);
> +
> + ret = ocfs2_inode_lock(main_bm_inode,&main_bm_bh, 0);
> + if (ret< 0) {
> + mlog_errno(ret);
> + goto out_mutex;
> + }
> + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
> +
> + if (start>= le32_to_cpu(main_bm->i_clusters)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + if (start + len> le32_to_cpu(main_bm->i_clusters))
> + len = le32_to_cpu(main_bm->i_clusters) - start;
> +
> + /* Determine first and last group to examine based on start and len */
> + first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> + if (first_group == osb->first_cluster_group_blkno)
> + first_bit = start;
> + else
> + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> + last_bit = osb->bitmap_cpg;
> +
> + for (group = first_group; group<= last_group;) {
> + if (first_bit + len>= osb->bitmap_cpg)
> + last_bit = osb->bitmap_cpg;
> + else
> + last_bit = first_bit + len;
> +
> + ret = ocfs2_read_group_descriptor(main_bm_inode,
> + main_bm, group,
> + &gd_bh);
> + if (ret< 0) {
> + mlog_errno(ret);
> + break;
> + }
> +
> + gd = (struct ocfs2_group_desc *)gd_bh->b_data;
> + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
> + brelse(gd_bh);
> + gd_bh = NULL;
> + if (cnt< 0) {
> + ret = cnt;
> + mlog_errno(ret);
> + break;
> + }
> +
> + trimmed += cnt;
> + len -= osb->bitmap_cpg - first_bit;
> + first_bit = 0;
> + if (group == osb->first_cluster_group_blkno)
> + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> + else
> + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> + }
> + range->len = trimmed * sb->s_blocksize;
> +out_unlock:
> + ocfs2_inode_unlock(main_bm_inode, 0);
> + brelse(main_bm_bh);
> +out_mutex:
> + mutex_unlock(&main_bm_inode->i_mutex);
> + iput(main_bm_inode);
> +out:
> + return ret;
> +}
> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
> index 3bd08a0..ca381c5 100644
> --- a/fs/ocfs2/alloc.h
> +++ b/fs/ocfs2/alloc.h
> @@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
> struct buffer_head **leaf_bh);
> int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
>
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
> /*
> * Helper function to look at the # of clusters in an extent record.
> */
^ permalink raw reply [flat|nested] 16+ messages in thread* [Ocfs2-devel] [PATCH 1/3 V2] ocfs2: Add ocfs2_trim_fs for SSD trim support.
2011-05-23 2:36 ` [Ocfs2-devel] [PATCH 1/3 V2] ocfs2: Add ocfs2_trim_fs for SSD trim support Tao Ma
2011-05-23 19:33 ` Sunil Mushran
@ 2011-05-24 6:57 ` Joel Becker
1 sibling, 0 replies; 16+ messages in thread
From: Joel Becker @ 2011-05-24 6:57 UTC (permalink / raw)
To: ocfs2-devel
On Mon, May 23, 2011 at 10:36:43AM +0800, Tao Ma wrote:
> From: Tao Ma <boyu.mt@taobao.com>
>
> Add ocfs2_trim_fs to support trimming freed clusters in the
> volume. A range will be given and all the freed clusters greater
> than minlen will be discarded to the block layer.
>
> Signed-off-by: Tao Ma <boyu.mt@taobao.com>
The TRIM patches are now in the merge-window branch of
ocfs2.git.
Joel
--
"In the room the women come and go
Talking of Michaelangelo."
http://www.jlbec.org/
jlbec at evilplan.org
^ permalink raw reply [flat|nested] 16+ messages in thread