From mboxrd@z Thu Jan 1 00:00:00 1970 From: Kyungmin Park Subject: Re: [PATCH] fat: Batched discard support for fat Date: Fri, 18 Feb 2011 13:21:41 +0900 Message-ID: References: <20110217052220.GA1372@july> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: OGAWA Hirofumi , linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org To: Lukas Czerner Return-path: Received: from mail-wy0-f174.google.com ([74.125.82.174]:58906 "EHLO mail-wy0-f174.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754984Ab1BREVp convert rfc822-to-8bit (ORCPT ); Thu, 17 Feb 2011 23:21:45 -0500 In-Reply-To: Sender: linux-fsdevel-owner@vger.kernel.org List-ID: On Thu, Feb 17, 2011 at 8:16 PM, Lukas Czerner wr= ote: > On Thu, 17 Feb 2011, Kyungmin Park wrote: > >> On Thu, Feb 17, 2011 at 7:48 PM, Lukas Czerner = wrote: >> > On Thu, 17 Feb 2011, Kyungmin Park wrote: >> > >> >> From: Kyungmin Park >> >> >> >> FAT supports batched discard as ext4. >> >> >> >> Cited from Lukas words. >> >> "The current solution is not ideal because of its bad performance= impact. >> >> So basic idea to improve things is to avoid discarding every time= some >> >> blocks are freed. and instead batching is together into bigger tr= ims, >> >> which tends to be more effective." >> >> >> >> You can find an information in detail at following URLs. >> >> http://lwn.net/Articles/397538/ >> >> http://lwn.net/Articles/383933/ >> > >> > Hi Kyungmin, >> > >> > this is really great to see more and more filesystems adding this.= I can >> > not really comment on fat specific code, but anyway I have couple = of >> > comments below. >> > >> > Thanks! 
>> > -Lukas >> > >> >> >> >> Signed-off-by: Kyungmin Park >> >> --- >> >> diff --git a/fs/fat/fat.h b/fs/fat/fat.h >> >> index f504089..08b53e1 100644 >> >> --- a/fs/fat/fat.h >> >> +++ b/fs/fat/fat.h >> >> @@ -299,6 +299,7 @@ extern int fat_alloc_clusters(struct inode *i= node, int *cluster, >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 int nr_cl= uster); >> >> =E1extern int fat_free_clusters(struct inode *inode, int cluster)= ; >> >> =E1extern int fat_count_free_clusters(struct super_block *sb); >> >> +extern int fat_trim_fs(struct super_block *sb, struct fstrim_ran= ge *range); >> >> >> >> =E1/* fat/file.c */ >> >> =E1extern long fat_generic_ioctl(struct file *filp, unsigned int = cmd, >> >> diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c >> >> index b47d2c9..777094b 100644 >> >> --- a/fs/fat/fatent.c >> >> +++ b/fs/fat/fatent.c >> >> @@ -1,6 +1,8 @@ >> >> =E1/* >> >> =E1 * Copyright (C) 2004, OGAWA Hirofumi >> >> =E1 * Released under GPL v2. >> >> + * >> >> + * Batched discard support by Kyungmin Park >> >> =E1 */ >> >> >> >> =E1#include >> >> @@ -541,6 +543,16 @@ out: >> >> =E1 =E1 =E1 return err; >> >> =E1} >> >> >> >> +static int fat_issue_discard(struct super_block *sb, int cluster= , int nr_clus) >> >> +{ >> >> + =E1 =E1 struct msdos_sb_info *sbi =3D MSDOS_SB(sb); >> >> + =E1 =E1 sector_t block, nr_blocks; >> >> + >> >> + =E1 =E1 =E1 =E1block =3D fat_clus_to_blknr(sbi, cluster); >> >> + =E1 =E1 =E1 =E1nr_blocks =3D nr_clus * sbi->sec_per_clus; >> >> + =E1 =E1 =E1 =E1return sb_issue_discard(sb, block, nr_blocks, GF= P_NOFS, 0); >> > >> > Use tabs for code indent. 
>> > >> >> +} >> >> + >> >> =E1int fat_free_clusters(struct inode *inode, int cluster) >> >> =E1{ >> >> =E1 =E1 =E1 struct super_block *sb =3D inode->i_sb; >> >> @@ -575,11 +587,7 @@ int fat_free_clusters(struct inode *inode, i= nt cluster) >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 if (cluster !=3D fate= nt.entry + 1) { >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 int n= r_clus =3D fatent.entry - first_cl + 1; >> >> >> >> - =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 sb_issu= e_discard(sb, >> >> - =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 fat_clus_to_blknr(sbi, first_cl), >> >> - =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 nr_clus * sbi->sec_per_clus, >> >> - =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 GFP_NOFS, 0); >> >> - >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 fat_iss= ue_discard(sb, first_cl, nr_clus); >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 first= _cl =3D cluster; >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 } >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 } >> >> @@ -683,3 +691,73 @@ out: >> >> =E1 =E1 =E1 unlock_fat(sbi); >> >> =E1 =E1 =E1 return err; >> >> =E1} >> >> + >> >> +int fat_trim_fs(struct super_block *sb, struct fstrim_range *ran= ge) >> >> +{ >> >> + =E1 =E1 struct msdos_sb_info *sbi =3D MSDOS_SB(sb); >> >> + =E1 =E1 struct fatent_operations *ops =3D sbi->fatent_ops; >> >> + =E1 =E1 struct fat_entry fatent; >> >> + =E1 =E1 unsigned long reada_blocks, reada_mask, cur_block; >> >> + =E1 =E1 int err =3D 0, free; >> >> + =E1 =E1 unsigned int start, len, minlen, trimmed; >> >> + =E1 =E1 int entry =3D 0; >> >> + >> >> + =E1 =E1 start =3D range->start >> sb->s_blocksize_bits; >> > >> > Start is not used anywhere either. Let me just explain the reason = for >> > having start, and len. 
>> > >> > start - address the first Byte of the filesystem (or better, the d= evice, >> > =E1or part of the device underneath the filesystem itself). You ar= e >> > =E1supposed to use this as a starting point for doing discard. >> > >> > len - defines the length in Bytes of the filesystem (or better, th= e >> > =E1device, or part of the device underneath the filesystem itself)= , user >> > =E1want to discard from the "start". >> >> Good explanation. yes I know it's not used at current patch but it's >> added for ext4 does. >> If anyone complaint it, I can remove it for this time. > > Well, I think you misunderstood the reason behind having one ioctl fo= r > all the filesystems. The reason is, that the behavior is the same! fo= r > all the filesystems which implements it, so the user can use one inte= rface > an can rely on the fact that it will do the same thing! So simply > removing start and len is not really a solution! > > It just has to be done the way it accepts "start" and "len" and use i= t > as it is suppose to. Updated. next version handles the "start" and "len" properly. Are there any fat specific comments? Thank you, Kyungmin Park > >> >> > >> > This two values gives us more flexibility in the way we are discar= ding >> > the filesystem's free blocks. For example, if you have huge filesy= stem, >> > or your device has bad discard performance, you probably do not wa= nt to >> > do the FITRIM all-at-once, but rather per-partes, to not disturb o= ther >> > ongoing IO too much. Basically it allows you to spread the load th= rough >> > longer period of time. >> > >> > Also, for bigger filesystems we might want to inform user that som= ething >> > is really happening in the form of progress bar (or whatever), whi= ch you >> > can not do otherwise. >> > >> >> + =E1 =E1 len =3D range->len >> sb->s_blocksize_bits; >> > >> > "len" does not seem to be used anywhere, does it mean that you are >> > discarding free extents on the whole filesystem with one run ? 
Thi= s >> > operation can take pretty long time on devices with slow discard >> > capability. Can you consider doing it per-partes as we are doing i= n >> > ext4 for example ? I really can't say how much work does it mean i= n >> > fat, but it might be worth it. Especially because of ... -> >> > >> >> + =E1 =E1 minlen =3D range->minlen >> sb->s_blocksize_bits; >> >> + =E1 =E1 trimmed =3D 0; >> >> + >> >> + =E1 =E1 minlen =3D minlen / sbi->sec_per_clus; >> >> + >> >> + =E1 =E1 lock_fat(sbi); >> > =E1 =E1 =E1 =E1^^^^^^^^^^^^^^ >> > =E1 =E1 =E1 =E1-> this >> > You are holding this mutex the whole time you are discarding free >> > extents and I think it would be devastating, because the users wil= l see >> > long stalls. The bigger the filesystem is and the worse discard >> > performance of the device is, the longer stall will be. Please cor= rect >> > me if I am mistaken. Have you done any testing ? >> Now I assume it's not perform during fat operation at normal case. >> It's performed at specific time instead of parallel IO as ext4 does. >> e.g., it's performed after disconnected from windows and so on. >> another reason is can't guarantee the fat table entry consistency >> during trim, I just lock the fat at start time as count the free spa= ce >> at fat codes. > > Ok, if there is no way around the mutex I am ok with that. However, y= ou > really need to implement the per-partes approach FITRIM interface for= ces > you to. > > By the way, you can not just rely on users not using the filesystem > while FITRIM is going on, because they just will not care, especially= if > FITRIM will be invoked periodically from cron (for example). 
> >> > >> >> + =E1 =E1 if (sbi->free_clusters !=3D -1 && sbi->free_clus_valid) >> >> + =E1 =E1 =E1 =E1 =E1 =E1 goto out; >> >> + >> >> + =E1 =E1 reada_blocks =3D FAT_READA_SIZE >> sb->s_blocksize_bits= ; >> >> + =E1 =E1 reada_mask =3D reada_blocks - 1; >> >> + =E1 =E1 cur_block =3D 0; >> >> + >> >> + =E1 =E1 free =3D 0; >> >> + =E1 =E1 fatent_init(&fatent); >> >> + >> >> + =E1 =E1 /* >> >> + =E1 =E1 =E1* REVISIT: scan from the last free block. >> >> + =E1 =E1 =E1*/ >> >> + =E1 =E1 fatent_set_entry(&fatent, FAT_START_ENT); >> >> + =E1 =E1 while (fatent.entry < sbi->max_cluster) { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 /* readahead of fat blocks */ >> >> + =E1 =E1 =E1 =E1 =E1 =E1 if ((cur_block & reada_mask) =3D=3D 0) = { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 unsigned long rest =3D = sbi->fat_length - cur_block; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 fat_ent_reada(sb, &fate= nt, min(reada_blocks, rest)); >> >> + =E1 =E1 =E1 =E1 =E1 =E1 } >> >> + =E1 =E1 =E1 =E1 =E1 =E1 cur_block++; >> >> + >> >> + =E1 =E1 =E1 =E1 =E1 =E1 err =3D fat_ent_read_block(sb, &fatent)= ; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 if (err) >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 goto out; >> >> + >> >> + =E1 =E1 =E1 =E1 =E1 =E1 do { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 if (ops->ent_get(&faten= t) =3D=3D FAT_ENT_FREE) { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 free++; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 if (!en= try) >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 entry =3D fatent.entry; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 } else if (entry) { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 if (fre= e >=3D minlen) { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 fat_issue_discard(sb, entry, free); >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 trimmed +=3D free; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 
=E1 =E1 =E1 =E1 =E1 =E1 =E1 } >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 free =3D= 0; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 entry =3D= 0; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 } >> >> + =E1 =E1 =E1 =E1 =E1 =E1 } while (fat_ent_next(sbi, &fatent)); >> >> + =E1 =E1 } >> >> + =E1 =E1 if (free >=3D minlen) { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 fat_issue_discard(sb, entry, free); >> >> + =E1 =E1 =E1 =E1 =E1 =E1 trimmed +=3D free; >> >> + =E1 =E1 } >> >> + =E1 =E1 range->len =3D (trimmed * sbi->sec_per_clus) * sb->s_bl= ocksize; >> >> + =E1 =E1 fatent_brelse(&fatent); >> >> +out: >> >> + =E1 =E1 unlock_fat(sbi); >> >> + =E1 =E1 return err; >> >> +} >> >> diff --git a/fs/fat/file.c b/fs/fat/file.c >> >> index 7257752..bfdd558 100644 >> >> --- a/fs/fat/file.c >> >> +++ b/fs/fat/file.c >> >> @@ -125,6 +125,30 @@ long fat_generic_ioctl(struct file *filp, un= signed int cmd, unsigned long arg) >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 return fat_ioctl_get_attributes(inode= , user_attr); >> >> =E1 =E1 =E1 case FAT_IOCTL_SET_ATTRIBUTES: >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 return fat_ioctl_set_attributes(filp,= user_attr); >> >> + =E1 =E1 case FITRIM: >> >> + =E1 =E1 { >> >> + =E1 =E1 =E1 =E1 =E1 =E1 struct super_block *sb =3D inode->i_sb; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 struct fstrim_range range; >> >> + =E1 =E1 =E1 =E1 =E1 =E1 int ret =3D 0; >> >> + >> >> + =E1 =E1 =E1 =E1 =E1 =E1 if (!capable(CAP_SYS_ADMIN)) >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 return -EPERM; >> > >> > You might want to add check whether the device actually support di= scard. >> > See: http://www.spinics.net/lists/linux-ext4/msg23144.html >> >> Good. I didn't see the patch, next version I'll add it. 
>> >> Thank you, >> Kyungmin Park >> > >> >> + >> >> + =E1 =E1 =E1 =E1 =E1 =E1 if (copy_from_user(&range, (struct fstr= im_range *)arg, >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 sizeof(range))) >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 return -EFAULT; >> >> + >> >> + =E1 =E1 =E1 =E1 =E1 =E1 ret =3D fat_trim_fs(sb, &range); >> >> + =E1 =E1 =E1 =E1 =E1 =E1 if (ret < 0) >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 return ret; >> >> + >> >> + =E1 =E1 =E1 =E1 =E1 =E1 if (copy_to_user((struct fstrim_range *= )arg, &range, >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1= =E1 =E1 sizeof(range))) >> >> + =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 =E1 return -EFAULT; >> >> + >> >> + =E1 =E1 =E1 =E1 =E1 =E1 return 0; >> >> + =E1 =E1 } >> >> + >> >> =E1 =E1 =E1 default: >> >> =E1 =E1 =E1 =E1 =E1 =E1 =E1 return -ENOTTY; /* Inappropriate ioct= l for device */ >> >> =E1 =E1 =E1 } >> >> >> > >> > -- >> > >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-fsde= vel" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html >> > > -- -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel= " in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html