* [PATCH 1/3] f2fs: introduce cp_control structure @ 2014-09-23 4:53 Jaegeuk Kim 2014-09-23 4:53 ` [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl Jaegeuk Kim ` (2 more replies) 0 siblings, 3 replies; 9+ messages in thread From: Jaegeuk Kim @ 2014-09-23 4:53 UTC (permalink / raw) To: linux-kernel, linux-fsdevel, linux-f2fs-devel; +Cc: Jaegeuk Kim This patch add a new data structure to control checkpoint parameters. Currently, it presents the reason of checkpoint such as is_umount and normal sync. Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/checkpoint.c | 16 ++++++++-------- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/gc.c | 7 +++++-- fs/f2fs/recovery.c | 5 ++++- fs/f2fs/super.c | 13 ++++++++++--- include/trace/events/f2fs.h | 15 ++++++++++----- 6 files changed, 47 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e519aaf..e401ffd 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -826,7 +826,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -894,7 +894,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); - if (is_umount) { + if (cpc->reason == CP_UMOUNT) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ cp_payload_blks + data_sum_blocks + @@ -948,7 +948,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; - if (is_umount) { + if (cpc->reason == CP_UMOUNT) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } @@ -988,12 +988,12 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* * We guarantee that this checkpoint procedure will not fail. 
*/ -void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); mutex_lock(&sbi->cp_mutex); @@ -1004,7 +1004,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) if (block_operations(sbi)) goto out; - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_submit_merged_bio(sbi, DATA, WRITE); f2fs_submit_merged_bio(sbi, NODE, WRITE); @@ -1023,13 +1023,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) flush_sit_entries(sbi); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, is_umount); + do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3b70b01..5298924 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -96,6 +96,15 @@ enum { SIT_BITMAP }; +enum { + CP_UMOUNT, + CP_SYNC, +}; + +struct cp_control { + int reason; +}; + /* * For CP/NAT/SIT/SSA readahead */ @@ -1314,7 +1323,7 @@ void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, bool); +void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7bf8392..e88fcf6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -694,6 +694,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi) int gc_type = BG_GC; int nfree = 0; int ret = -1; + struct cp_control cpc = { + .reason = CP_SYNC, + }; INIT_LIST_HEAD(&ilist); gc_more: @@ -704,7 +707,7 @@ gc_more: if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); } if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) @@ -729,7 +732,7 @@ gc_more: goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a3fcea3..4189003 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -542,8 +542,11 @@ out: set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { + struct cp_control cpc = { + .reason = CP_SYNC, + }; mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); } else { mutex_unlock(&sbi->cp_mutex); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a91a38..128c420 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -434,8 +434,12 @@ static void f2fs_put_super(struct super_block *sb) stop_gc_thread(sbi); /* We don't need to do checkpoint when it's clean */ - if (sbi->s_dirty) - write_checkpoint(sbi, true); + if (sbi->s_dirty) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + 
write_checkpoint(sbi, &cpc); + } /* * normally superblock is clean, so we need to release this. @@ -466,8 +470,11 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); if (sync) { + struct cp_control cpc = { + .reason = CP_SYNC, + }; mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } else { f2fs_balance_fs(sbi); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index d06d443..66eaace 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -69,6 +69,11 @@ { GC_GREEDY, "Greedy" }, \ { GC_CB, "Cost-Benefit" }) +#define show_cpreason(type) \ + __print_symbolic(type, \ + { CP_UMOUNT, "Umount" }, \ + { CP_SYNC, "Sync" }) + struct victim_sel_policy; DECLARE_EVENT_CLASS(f2fs__inode, @@ -944,25 +949,25 @@ TRACE_EVENT(f2fs_submit_page_mbio, TRACE_EVENT(f2fs_write_checkpoint, - TP_PROTO(struct super_block *sb, bool is_umount, char *msg), + TP_PROTO(struct super_block *sb, int reason, char *msg), - TP_ARGS(sb, is_umount, msg), + TP_ARGS(sb, reason, msg), TP_STRUCT__entry( __field(dev_t, dev) - __field(bool, is_umount) + __field(int, reason) __field(char *, msg) ), TP_fast_assign( __entry->dev = sb->s_dev; - __entry->is_umount = is_umount; + __entry->reason = reason; __entry->msg = msg; ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry), - __entry->is_umount ? "clean umount" : "consistency", + show_cpreason(__entry->reason), __entry->msg) ); -- 1.8.5.2 (Apple Git-48) ------------------------------------------------------------------------------ Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk ^ permalink raw reply related [flat|nested] 9+ messages in thread
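
The calling convention this patch establishes is easy to see in isolation. Below is a minimal standalone C sketch of it: the enum and struct mirror the f2fs.h additions above, while write_checkpoint_model() and main() are stand-ins for the kernel callers, not actual f2fs code. The point is that callers now describe why a checkpoint is requested, so later patches can add reasons (patch 2/3 adds CP_DISCARD) without changing every call site's signature again.

#include <stdio.h>

/* Mirrors the additions to fs/f2fs/f2fs.h. */
enum { CP_UMOUNT, CP_SYNC };

struct cp_control {
    int reason;
};

/* Stand-in for write_checkpoint(): instead of a bare bool is_umount,
 * behavior keys off cpc->reason. */
static void write_checkpoint_model(const struct cp_control *cpc)
{
    if (cpc->reason == CP_UMOUNT)
        printf("umount checkpoint: set CP_UMOUNT_FLAG, write node summaries\n");
    else
        printf("sync checkpoint: normal consistency checkpoint\n");
}

int main(void)
{
    /* f2fs_put_super() builds the control on its stack with CP_UMOUNT... */
    struct cp_control umount_cpc = { .reason = CP_UMOUNT };
    /* ...while f2fs_sync_fs(), f2fs_gc() and recovery pass CP_SYNC. */
    struct cp_control sync_cpc = { .reason = CP_SYNC };

    write_checkpoint_model(&umount_cpc);
    write_checkpoint_model(&sync_cpc);
    return 0;
}
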
* [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl 2014-09-23 4:53 [PATCH 1/3] f2fs: introduce cp_control structure Jaegeuk Kim @ 2014-09-23 4:53 ` Jaegeuk Kim 2014-09-30 6:01 ` Chao Yu 2014-09-23 4:53 ` [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops Jaegeuk Kim 2014-09-30 6:01 ` [PATCH 1/3] f2fs: introduce cp_control structure Chao Yu 2 siblings, 1 reply; 9+ messages in thread From: Jaegeuk Kim @ 2014-09-23 4:53 UTC (permalink / raw) To: linux-kernel, linux-fsdevel, linux-f2fs-devel; +Cc: Jaegeuk Kim This patch introduces FITRIM in f2fs_ioctl. In this case, f2fs will issue small discards and prefree discards as many as possible for the given area. Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/checkpoint.c | 4 +- fs/f2fs/f2fs.h | 9 +++- fs/f2fs/file.c | 29 ++++++++++++ fs/f2fs/segment.c | 110 +++++++++++++++++++++++++++++++++++++++----- fs/f2fs/super.c | 1 + include/trace/events/f2fs.h | 3 +- 6 files changed, 141 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e401ffd..5d793ba 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -997,7 +997,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); - if (!sbi->s_dirty) + if (!sbi->s_dirty && cpc->reason != CP_DISCARD) goto out; if (unlikely(f2fs_cp_error(sbi))) goto out; @@ -1020,7 +1020,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write cached NAT/SIT entries to NAT/SIT area */ flush_nat_entries(sbi); - flush_sit_entries(sbi); + flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ do_checkpoint(sbi, cpc); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5298924..7b1e1d2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -99,10 +99,15 @@ enum { enum { CP_UMOUNT, CP_SYNC, + CP_DISCARD, }; struct cp_control { int reason; + __u64 trim_start; + __u64 trim_end; + __u64 trim_minlen; + __u64 trimmed; }; /* @@ -1276,9 +1281,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); +void release_discard_addrs(struct f2fs_sb_info *); void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); +int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, @@ -1295,7 +1302,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *); +void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); int __init create_segment_manager_caches(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ac8c680..1184207 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -860,6 +860,35 @@ out: mnt_drop_write_file(filp); return ret; } + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if 
(!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = f2fs_trim_fs(F2FS_SB(sb), &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } default: return -ENOTTY; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3125a3d..b423005 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -386,45 +386,92 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static void add_discard_addrs(struct f2fs_sb_info *sbi, - unsigned int segno, struct seg_entry *se) +static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct list_head *head = &SM_I(sbi)->discard_list; struct discard_entry *new; int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; + struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; - unsigned long dmap[entries]; + unsigned long *dmap; unsigned int start = 0, end = -1; + bool force = (cpc->reason == CP_DISCARD); int i; - if (!test_opt(sbi, DISCARD)) + if (!force && !test_opt(sbi, DISCARD)) return; + if (force && !se->valid_blocks) { + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + /* + * if this segment is registered in the prefree list, then + * we should skip adding a discard candidate, and let the + * checkpoint do that later. + */ + mutex_lock(&dirty_i->seglist_lock); + if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) { + mutex_unlock(&dirty_i->seglist_lock); + cpc->trimmed += sbi->blocks_per_seg; + return; + } + mutex_unlock(&dirty_i->seglist_lock); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, cpc->trim_start); + new->len = sbi->blocks_per_seg; + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += sbi->blocks_per_seg; + cpc->trimmed += sbi->blocks_per_seg; + return; + } + /* zero block will be discarded through the prefree list */ if (!se->valid_blocks || se->valid_blocks == max_blocks) return; + dmap = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!dmap) + return; + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (end - start < cpc->trim_minlen) + continue; + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, segno) + start; + new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; new->len = end - start; + cpc->trimmed += end - start; list_add_tail(&new->list, head); SM_I(sbi)->nr_discards += end - start; } + kfree(dmap); +} + +void release_discard_addrs(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_entry *entry, *this; + + /* drop caches */ + list_for_each_entry_safe(entry, this, head, list) { + 
list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); + } } /* @@ -897,6 +944,40 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) +{ + block_t start_addr = SM_I(sbi)->main_blkaddr; + __u64 start = range->start >> sbi->log_blocksize; + __u64 end = start + (range->len >> sbi->log_blocksize) - 1; + __u64 segment = 1 << (sbi->log_blocksize + sbi->log_blocks_per_seg); + unsigned int start_segno, end_segno; + struct cp_control cpc; + + if (range->minlen > segment || start >= TOTAL_BLKS(sbi) || + range->len < sbi->blocksize) + return -EINVAL; + + if (end <= start_addr) + goto out; + + /* start/end segment number in main_area */ + start_segno = (start <= start_addr) ? 0 : GET_SEGNO(sbi, start); + end_segno = (end >= TOTAL_BLKS(sbi)) ? TOTAL_SEGS(sbi) - 1 : + GET_SEGNO(sbi, end); + + cpc.reason = CP_DISCARD; + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; + cpc.trim_minlen = range->minlen >> sbi->log_blocksize; + cpc.trimmed = 0; + + /* do checkpoint to issue discard commands safely */ + write_checkpoint(sbi, &cpc); +out: + range->len = cpc.trimmed << sbi->log_blocksize; + return 0; +} + static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1524,7 +1605,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi) * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ -void flush_sit_entries(struct f2fs_sb_info *sbi) +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; @@ -1534,6 +1615,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) struct list_head *head = &SM_I(sbi)->sit_entry_set; unsigned long nsegs = TOTAL_SEGS(sbi); bool to_journal = true; + struct seg_entry *se; mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); @@ -1580,11 +1662,14 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) /* flush dirty sit entries in region of current sit set */ for_each_set_bit_from(segno, bitmap, end) { int offset, sit_offset; - struct seg_entry *se = get_seg_entry(sbi, segno); + + se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) - add_discard_addrs(sbi, segno, se); + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) { + cpc->trim_start = segno; + add_discard_addrs(sbi, cpc); + } if (to_journal) { offset = lookup_journal_in_cursum(sum, @@ -1614,8 +1699,11 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); - out: + if (cpc->reason == CP_DISCARD) { + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) + add_discard_addrs(sbi, cpc); + } mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 128c420..bb6b568 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -446,6 +446,7 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. 
*/ release_dirty_inode(sbi); + release_discard_addrs(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 66eaace..bbc4de9 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -72,7 +72,8 @@ #define show_cpreason(type) \ __print_symbolic(type, \ { CP_UMOUNT, "Umount" }, \ - { CP_SYNC, "Sync" }) + { CP_SYNC, "Sync" }, \ + { CP_DISCARD, "Discard" }) struct victim_sel_policy; -- 1.8.5.2 (Apple Git-48) ------------------------------------------------------------------------------ Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk ^ permalink raw reply related [flat|nested] 9+ messages in thread
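
For context, FITRIM is the generic VFS trim ioctl that ext4, xfs and others already implement; this patch wires f2fs into the same interface, and userspace drives it the way fstrim(8) does. A minimal, self-contained example follows (assumptions: run as root on an f2fs mount point, since the handler above requires CAP_SYS_ADMIN; error handling kept to the essentials):

/* Build: cc -o f2fs_trim f2fs_trim.c */
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */
#include <unistd.h>

int main(int argc, char **argv)
{
    struct fstrim_range range;
    int fd;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <f2fs mount point>\n", argv[0]);
        return 1;
    }

    fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* Trim the whole filesystem; minlen is raised to the device's
     * discard granularity by the ioctl handler. */
    memset(&range, 0, sizeof(range));
    range.start = 0;
    range.len = UINT64_MAX;
    range.minlen = 0;

    if (ioctl(fd, FITRIM, &range) < 0) {
        perror("FITRIM");       /* EOPNOTSUPP if discard is unsupported */
        close(fd);
        return 1;
    }

    /* f2fs_trim_fs() reports the trimmed byte count back in range.len. */
    printf("trimmed %llu bytes\n", (unsigned long long)range.len);
    close(fd);
    return 0;
}
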
* Re: [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl
  2014-09-23  4:53 ` [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl Jaegeuk Kim
@ 2014-09-30  6:01   ` Chao Yu
  2014-10-01 15:07     ` Jaegeuk Kim
  0 siblings, 1 reply; 9+ messages in thread
From: Chao Yu @ 2014-09-30  6:01 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Tuesday, September 23, 2014 12:53 PM
> To: linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org;
> linux-f2fs-devel@lists.sourceforge.net
> Cc: Jaegeuk Kim
> Subject: [f2fs-dev] [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl
>
> This patch introduces FITRIM in f2fs_ioctl.
> In this case, f2fs will issue small discards and prefree discards as many as
> possible for the given area.
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>

Looks good!

Reviewed-by: Chao Yu <chao2.yu@samsung.com>

> +
>     /* zero block will be discarded through the prefree list */
>     if (!se->valid_blocks || se->valid_blocks == max_blocks)
>         return;
>
> +   dmap = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);

We can afford 64 bytes allocation in stack, how about altering dmap to a local
array to reduce memory pressure at that moment and avoid delay of allocation?

> +   if (!dmap)
> +       return;
> +

^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl
  2014-09-30  6:01 ` Chao Yu
@ 2014-10-01 15:07   ` Jaegeuk Kim
  0 siblings, 0 replies; 9+ messages in thread
From: Jaegeuk Kim @ 2014-10-01 15:07 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On Tue, Sep 30, 2014 at 02:01:58PM +0800, Chao Yu wrote:
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Tuesday, September 23, 2014 12:53 PM
> > To: linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org;
> > linux-f2fs-devel@lists.sourceforge.net
> > Cc: Jaegeuk Kim
> > Subject: [f2fs-dev] [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl
> >
> > This patch introduces FITRIM in f2fs_ioctl.
> > In this case, f2fs will issue small discards and prefree discards as many as
> > possible for the given area.
> >
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>
> Looks good!
>
> Reviewed-by: Chao Yu <chao2.yu@samsung.com>
>
> > +
> >     /* zero block will be discarded through the prefree list */
> >     if (!se->valid_blocks || se->valid_blocks == max_blocks)
> >         return;
> >
> > +   dmap = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
>
> We can afford 64 bytes allocation in stack, how about altering dmap to a local
> array to reduce memory pressure at that moment and avoid delay of allocation?

It's not a big deal. Fixed.

Thanks,

> > +   if (!dmap)
> > +       return;
> > +

^ permalink raw reply [flat|nested] 9+ messages in thread
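
The shape of the change being discussed, in isolation: with a fixed 64-byte validity bitmap, the scratch map can live on the stack instead of being allocated on the checkpoint path. The sketch below is illustration only -- count_discard_candidates() and the sample bitmaps are made up for this example; only the candidate-bit formula and SIT_VBLOCK_MAP_SIZE come from the patch, and this is not necessarily the final in-tree code.

#include <stdio.h>

#define SIT_VBLOCK_MAP_SIZE 64  /* one 512-bit segment validity bitmap */
#define ENTRIES             (SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long))

/* Candidate-bit computation from add_discard_addrs(): blocks that were
 * valid at the last checkpoint but have been invalidated since.  With a
 * fixed 64-byte map, dmap can be a local array, avoiding a kzalloc()
 * that may stall or fail on the checkpoint path. */
static unsigned long count_discard_candidates(const unsigned long *cur_map,
                                              const unsigned long *ckpt_map)
{
    unsigned long dmap[ENTRIES];    /* Chao's suggestion: stack array */
    unsigned long i, bits = 0;

    for (i = 0; i < ENTRIES; i++) {
        dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
        bits += __builtin_popcountl(dmap[i]);
    }
    return bits;
}

int main(void)
{
    unsigned long cur[ENTRIES] = { 0x0f }, ckpt[ENTRIES] = { 0xff };

    /* bits 4..7 were valid at checkpoint time and are invalid now */
    printf("%lu discard candidate blocks\n",
           count_discard_candidates(cur, ckpt));
    return 0;
}
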
* [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops 2014-09-23 4:53 [PATCH 1/3] f2fs: introduce cp_control structure Jaegeuk Kim 2014-09-23 4:53 ` [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl Jaegeuk Kim @ 2014-09-23 4:53 ` Jaegeuk Kim 2014-09-30 6:04 ` [f2fs-dev] " Chao Yu 2014-10-11 13:11 ` Sasha Levin 2014-09-30 6:01 ` [PATCH 1/3] f2fs: introduce cp_control structure Chao Yu 2 siblings, 2 replies; 9+ messages in thread From: Jaegeuk Kim @ 2014-09-23 4:53 UTC (permalink / raw) To: linux-kernel, linux-fsdevel, linux-f2fs-devel; +Cc: Jaegeuk Kim Previously, f2fs tries to reorganize the dirty nat entries into multiple sets according to its nid ranges. This can improve the flushing nat pages, however, if there are a lot of cached nat entries, it becomes a bottleneck. This patch introduces a new set management flow by removing dirty nat list and adding a series of set operations when the nat entry becomes dirty. Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> --- fs/f2fs/f2fs.h | 13 +-- fs/f2fs/node.c | 299 +++++++++++++++++++++++++++++---------------------------- fs/f2fs/node.h | 9 +- 3 files changed, 162 insertions(+), 159 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7b1e1d2..94cfdc4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -164,6 +164,9 @@ struct fsync_inode_entry { #define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) #define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) +#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) + static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) { int before = nats_in_cursum(rs); @@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, int type) { if (type == NAT_JOURNAL) - return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES; - - return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES; + return size <= MAX_NAT_JENTRIES(sum); + return size <= MAX_SIT_JENTRIES(sum); } /* @@ -292,11 +294,10 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ + struct radix_tree_root nat_set_root;/* root of the nat set cache */ rwlock_t nat_tree_lock; /* protect nat_tree_lock */ - unsigned int nat_cnt; /* the # of cached nat entries */ struct list_head nat_entries; /* cached nat entry list (clean) */ - struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ - struct list_head nat_entry_set; /* nat entry set list */ + unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ /* free node ids management */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 21ed91b..f5a21f4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) kmem_cache_free(nat_entry_slab, e); } +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; + struct nat_entry_set *head; + + if (get_nat_flag(ne, IS_DIRTY)) + return; +retry: + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (!head) { + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + + INIT_LIST_HEAD(&head->entry_list); + INIT_LIST_HEAD(&head->set_list); + head->set = set; + head->entry_cnt = 0; + + if (radix_tree_insert(&nm_i->nat_set_root, set, head)) { + 
cond_resched(); + goto retry; + } + } + list_move_tail(&ne->list, &head->entry_list); + nm_i->dirty_nat_cnt++; + head->entry_cnt++; + set_nat_flag(ne, IS_DIRTY, true); +} + +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; + struct nat_entry_set *head; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (head) { + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + head->entry_cnt--; + nm_i->dirty_nat_cnt--; + } +} + +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry_set **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, + start, nr); +} + bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1739,79 +1790,6 @@ skip: return err; } -static struct nat_entry_set *grab_nat_entry_set(void) -{ - struct nat_entry_set *nes = - f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); - - nes->entry_cnt = 0; - INIT_LIST_HEAD(&nes->set_list); - INIT_LIST_HEAD(&nes->entry_list); - return nes; -} - -static void release_nat_entry_set(struct nat_entry_set *nes, - struct f2fs_nm_info *nm_i) -{ - nm_i->dirty_nat_cnt -= nes->entry_cnt; - list_del(&nes->set_list); - kmem_cache_free(nat_entry_set_slab, nes); -} - -static void adjust_nat_entry_set(struct nat_entry_set *nes, - struct list_head *head) -{ - struct nat_entry_set *next = nes; - - if (list_is_last(&nes->set_list, head)) - return; - - list_for_each_entry_continue(next, head, set_list) - if (nes->entry_cnt <= next->entry_cnt) - break; - - list_move_tail(&nes->set_list, &next->set_list); -} - -static void add_nat_entry(struct nat_entry *ne, struct list_head *head) -{ - struct nat_entry_set *nes; - nid_t start_nid = START_NID(ne->ni.nid); - - list_for_each_entry(nes, head, set_list) { - if (nes->start_nid == start_nid) { - list_move_tail(&ne->list, &nes->entry_list); - nes->entry_cnt++; - adjust_nat_entry_set(nes, head); - return; - } - } - - nes = grab_nat_entry_set(); - - nes->start_nid = start_nid; - list_move_tail(&ne->list, &nes->entry_list); - nes->entry_cnt++; - list_add(&nes->set_list, head); -} - -static void merge_nats_in_set(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct list_head *dirty_list = &nm_i->dirty_nat_entries; - struct list_head *set_list = &nm_i->nat_entry_set; - struct nat_entry *ne, *tmp; - - write_lock(&nm_i->nat_tree_lock); - list_for_each_entry_safe(ne, tmp, dirty_list, list) { - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; - add_nat_entry(ne, set_list); - nm_i->dirty_nat_cnt++; - } - write_unlock(&nm_i->nat_tree_lock); -} - static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1846,101 +1824,129 @@ found: mutex_unlock(&curseg->curseg_mutex); } -/* - * This function is called during the checkpointing process. 
- */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +static void __adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head, int max) { - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; - struct nat_entry_set *nes, *tmp; - struct list_head *head = &nm_i->nat_entry_set; - bool to_journal = true; + struct nat_entry_set *cur; + nid_t dirty_cnt = 0; - /* merge nat entries of dirty list to nat entry set temporarily */ - merge_nats_in_set(sbi); + if (nes->entry_cnt >= max) + goto add_out; - /* - * if there are no enough space in journal to store dirty nat - * entries, remove all entries from journal and merge them - * into nat entry set. - */ - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) { - remove_nats_in_journal(sbi); - - /* - * merge nat entries of dirty list to nat entry set temporarily - */ - merge_nats_in_set(sbi); + list_for_each_entry(cur, head, set_list) { + dirty_cnt += cur->entry_cnt; + if (dirty_cnt > max) + break; + if (cur->entry_cnt >= nes->entry_cnt) { + list_add(&nes->set_list, cur->set_list.prev); + return; + } } +add_out: + list_add_tail(&nes->set_list, head); +} - if (!nm_i->dirty_nat_cnt) - return; +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, + struct nat_entry_set *set) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; + bool to_journal = true; + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page = NULL; /* * there are two steps to flush nat entries: * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. 
*/ - list_for_each_entry_safe(nes, tmp, head, set_list) { - struct f2fs_nat_block *nat_blk; - struct nat_entry *ne, *cur; - struct page *page; - nid_t start_nid = nes->start_nid; + if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + to_journal = false; - if (to_journal && - !__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL)) - to_journal = false; + if (to_journal) { + mutex_lock(&curseg->curseg_mutex); + } else { + page = get_next_nat_page(sbi, start_nid); + nat_blk = page_address(page); + f2fs_bug_on(sbi, !nat_blk); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; if (to_journal) { - mutex_lock(&curseg->curseg_mutex); + offset = lookup_journal_in_cursum(sum, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(sbi, offset < 0); + raw_ne = &nat_in_journal(sum, offset); + nid_in_journal(sum, offset) = cpu_to_le32(nid); } else { - page = get_next_nat_page(sbi, start_nid); - nat_blk = page_address(page); - f2fs_bug_on(sbi, !nat_blk); + raw_ne = &nat_blk->entries[nid - start_nid]; } + raw_nat_from_node_info(raw_ne, &ne->ni); - /* flush dirty nats in nat entry set */ - list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { - struct f2fs_nat_entry *raw_ne; - nid_t nid = nat_get_nid(ne); - int offset; + write_lock(&NM_I(sbi)->nat_tree_lock); + nat_reset_flag(ne); + __clear_nat_cache_dirty(NM_I(sbi), ne); + write_unlock(&NM_I(sbi)->nat_tree_lock); - if (to_journal) { - offset = lookup_journal_in_cursum(sum, - NAT_JOURNAL, nid, 1); - f2fs_bug_on(sbi, offset < 0); - raw_ne = &nat_in_journal(sum, offset); - nid_in_journal(sum, offset) = cpu_to_le32(nid); - } else { - raw_ne = &nat_blk->entries[nid - start_nid]; - } - raw_nat_from_node_info(raw_ne, &ne->ni); + if (nat_get_blkaddr(ne) == NULL_ADDR) + add_free_nid(sbi, nid, false); + } - if (nat_get_blkaddr(ne) == NULL_ADDR && - add_free_nid(sbi, nid, false) <= 0) { - write_lock(&nm_i->nat_tree_lock); - __del_from_nat_cache(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } else { - write_lock(&nm_i->nat_tree_lock); - nat_reset_flag(ne); - __clear_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } - } + if (to_journal) + mutex_unlock(&curseg->curseg_mutex); + else + f2fs_put_page(page, 1); - if (to_journal) - mutex_unlock(&curseg->curseg_mutex); - else - f2fs_put_page(page, 1); + f2fs_bug_on(sbi, set->entry_cnt); + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); +} - f2fs_bug_on(sbi, !list_empty(&nes->entry_list)); - release_nat_entry_set(nes, nm_i); +/* + * This function is called during the checkpointing process. + */ +void flush_nat_entries(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + struct nat_entry_set *setvec[NATVEC_SIZE]; + struct nat_entry_set *set, *tmp; + unsigned int found; + nid_t set_idx = 0; + LIST_HEAD(sets); + + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. 
+ */ + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + remove_nats_in_journal(sbi); + + if (!nm_i->dirty_nat_cnt) + return; + + while ((found = __gang_lookup_nat_set(nm_i, + set_idx, NATVEC_SIZE, setvec))) { + unsigned idx; + set_idx = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) + __adjust_nat_entry_set(setvec[idx], &sets, + MAX_NAT_JENTRIES(sum)); } - f2fs_bug_on(sbi, !list_empty(head)); + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(set, tmp, &sets, set_list) + __flush_nat_entry_set(sbi, set); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); - INIT_LIST_HEAD(&nm_i->dirty_nat_entries); - INIT_LIST_HEAD(&nm_i->nat_entry_set); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index b8ba63c..bd826d9 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -43,6 +43,7 @@ enum { IS_CHECKPOINTED, /* is it checkpointed before? */ HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? */ }; struct nat_entry { @@ -60,10 +61,6 @@ struct nat_entry { #define nat_get_version(nat) (nat->ni.version) #define nat_set_version(nat, v) (nat->ni.version = v) -#define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); -#define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); #define inc_node_version(version) (++version) static inline void set_nat_flag(struct nat_entry *ne, @@ -113,9 +110,9 @@ enum mem_type { }; struct nat_entry_set { - struct list_head set_list; /* link with all nat sets */ + struct list_head set_list; /* link with other nat sets */ struct list_head entry_list; /* link with dirty nat entries */ - nid_t start_nid; /* start nid of nats in set */ + nid_t set; /* set number*/ unsigned int entry_cnt; /* the # of nat entries in set */ }; -- 1.8.5.2 (Apple Git-48) ------------------------------------------------------------------------------ Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk ^ permalink raw reply related [flat|nested] 9+ messages in thread
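
The core idea of the refactoring -- file a NAT entry under its per-block set at the moment it becomes dirty, instead of re-sorting one big dirty list at checkpoint time -- can be modeled in a few lines of standalone C. This is a toy sketch: the kernel keys the sets in the new nat_set_root radix tree, while a plain array stands in for it here, and the 455 entries-per-NAT-block figure (4KB block / 9-byte raw entry) is quoted only for illustration.

#include <stdio.h>

#define NAT_ENTRY_PER_BLOCK 455 /* entries in one 4KB NAT block */
#define NR_SETS             8   /* enough for this toy example */

struct nat_set_model {
    unsigned int entry_cnt;     /* dirty entries in this NAT block */
};

static struct nat_set_model sets[NR_SETS];
static unsigned int dirty_nat_cnt;

/* Analogue of __set_nat_cache_dirty(): the set is chosen once, when the
 * entry becomes dirty, so flush time only has to walk per-set lists. */
static void set_nat_dirty(unsigned int nid)
{
    unsigned int set = nid / NAT_ENTRY_PER_BLOCK;

    sets[set].entry_cnt++;
    dirty_nat_cnt++;
}

int main(void)
{
    unsigned int nids[] = { 3, 7, 460, 910, 915, 2000 };
    unsigned int i;

    for (i = 0; i < sizeof(nids) / sizeof(nids[0]); i++)
        set_nat_dirty(nids[i]);

    for (i = 0; i < NR_SETS; i++)
        if (sets[i].entry_cnt)
            printf("set %u: %u dirty entries (nids %u..%u)\n",
                   i, sets[i].entry_cnt,
                   i * NAT_ENTRY_PER_BLOCK,
                   (i + 1) * NAT_ENTRY_PER_BLOCK - 1);
    printf("dirty_nat_cnt = %u\n", dirty_nat_cnt);
    return 0;
}
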
* RE: [f2fs-dev] [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops 2014-09-23 4:53 ` [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops Jaegeuk Kim @ 2014-09-30 6:04 ` Chao Yu 2014-10-01 15:06 ` Jaegeuk Kim 2014-10-11 13:11 ` Sasha Levin 1 sibling, 1 reply; 9+ messages in thread From: Chao Yu @ 2014-09-30 6:04 UTC (permalink / raw) To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel Hi Jaegeuk, > -----Original Message----- > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org] > Sent: Tuesday, September 23, 2014 12:53 PM > To: linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org; > linux-f2fs-devel@lists.sourceforge.net > Cc: Jaegeuk Kim > Subject: [f2fs-dev] [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing > ops > > Previously, f2fs tries to reorganize the dirty nat entries into multiple sets > according to its nid ranges. This can improve the flushing nat pages, however, > if there are a lot of cached nat entries, it becomes a bottleneck. > > This patch introduces a new set management flow by removing dirty nat list and > adding a series of set operations when the nat entry becomes dirty. > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > --- > fs/f2fs/f2fs.h | 13 +-- > fs/f2fs/node.c | 299 +++++++++++++++++++++++++++++---------------------------- > fs/f2fs/node.h | 9 +- > 3 files changed, 162 insertions(+), 159 deletions(-) > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index 7b1e1d2..94cfdc4 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -164,6 +164,9 @@ struct fsync_inode_entry { > #define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) > #define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) > > +#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) > +#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) > + > static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) > { > int before = nats_in_cursum(rs); > @@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int > size, > int type) > { > if (type == NAT_JOURNAL) > - return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES; > - > - return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES; > + return size <= MAX_NAT_JENTRIES(sum); > + return size <= MAX_SIT_JENTRIES(sum); > } > > /* > @@ -292,11 +294,10 @@ struct f2fs_nm_info { > > /* NAT cache management */ > struct radix_tree_root nat_root;/* root of the nat entry cache */ > + struct radix_tree_root nat_set_root;/* root of the nat set cache */ > rwlock_t nat_tree_lock; /* protect nat_tree_lock */ > - unsigned int nat_cnt; /* the # of cached nat entries */ > struct list_head nat_entries; /* cached nat entry list (clean) */ > - struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ > - struct list_head nat_entry_set; /* nat entry set list */ > + unsigned int nat_cnt; /* the # of cached nat entries */ > unsigned int dirty_nat_cnt; /* total num of nat entries in set */ > > /* free node ids management */ > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c > index 21ed91b..f5a21f4 100644 > --- a/fs/f2fs/node.c > +++ b/fs/f2fs/node.c > @@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct > nat_entry *e) > kmem_cache_free(nat_entry_slab, e); > } > > +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, > + struct nat_entry *ne) > +{ > + nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); > + 
struct nat_entry_set *head; > + > + if (get_nat_flag(ne, IS_DIRTY)) > + return; > +retry: > + head = radix_tree_lookup(&nm_i->nat_set_root, set); > + if (!head) { > + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); > + > + INIT_LIST_HEAD(&head->entry_list); > + INIT_LIST_HEAD(&head->set_list); > + head->set = set; > + head->entry_cnt = 0; > + > + if (radix_tree_insert(&nm_i->nat_set_root, set, head)) { > + cond_resched(); > + goto retry; > + } > + } > + list_move_tail(&ne->list, &head->entry_list); > + nm_i->dirty_nat_cnt++; > + head->entry_cnt++; > + set_nat_flag(ne, IS_DIRTY, true); > +} > + > +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, > + struct nat_entry *ne) > +{ > + nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; > + struct nat_entry_set *head; > + > + head = radix_tree_lookup(&nm_i->nat_set_root, set); > + if (head) { > + list_move_tail(&ne->list, &nm_i->nat_entries); > + set_nat_flag(ne, IS_DIRTY, false); > + head->entry_cnt--; > + nm_i->dirty_nat_cnt--; > + } > +} > + > +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, > + nid_t start, unsigned int nr, struct nat_entry_set **ep) > +{ > + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, > + start, nr); > +} > + > bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) > { > struct f2fs_nm_info *nm_i = NM_I(sbi); > @@ -1739,79 +1790,6 @@ skip: > return err; > } > > -static struct nat_entry_set *grab_nat_entry_set(void) > -{ > - struct nat_entry_set *nes = > - f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); > - > - nes->entry_cnt = 0; > - INIT_LIST_HEAD(&nes->set_list); > - INIT_LIST_HEAD(&nes->entry_list); > - return nes; > -} > - > -static void release_nat_entry_set(struct nat_entry_set *nes, > - struct f2fs_nm_info *nm_i) > -{ > - nm_i->dirty_nat_cnt -= nes->entry_cnt; > - list_del(&nes->set_list); > - kmem_cache_free(nat_entry_set_slab, nes); > -} > - > -static void adjust_nat_entry_set(struct nat_entry_set *nes, > - struct list_head *head) > -{ > - struct nat_entry_set *next = nes; > - > - if (list_is_last(&nes->set_list, head)) > - return; > - > - list_for_each_entry_continue(next, head, set_list) > - if (nes->entry_cnt <= next->entry_cnt) > - break; > - > - list_move_tail(&nes->set_list, &next->set_list); > -} > - > -static void add_nat_entry(struct nat_entry *ne, struct list_head *head) > -{ > - struct nat_entry_set *nes; > - nid_t start_nid = START_NID(ne->ni.nid); > - > - list_for_each_entry(nes, head, set_list) { > - if (nes->start_nid == start_nid) { > - list_move_tail(&ne->list, &nes->entry_list); > - nes->entry_cnt++; > - adjust_nat_entry_set(nes, head); > - return; > - } > - } > - > - nes = grab_nat_entry_set(); > - > - nes->start_nid = start_nid; > - list_move_tail(&ne->list, &nes->entry_list); > - nes->entry_cnt++; > - list_add(&nes->set_list, head); > -} > - > -static void merge_nats_in_set(struct f2fs_sb_info *sbi) > -{ > - struct f2fs_nm_info *nm_i = NM_I(sbi); > - struct list_head *dirty_list = &nm_i->dirty_nat_entries; > - struct list_head *set_list = &nm_i->nat_entry_set; > - struct nat_entry *ne, *tmp; > - > - write_lock(&nm_i->nat_tree_lock); > - list_for_each_entry_safe(ne, tmp, dirty_list, list) { > - if (nat_get_blkaddr(ne) == NEW_ADDR) > - continue; Shouldn't we move this condition judgment into __flush_nat_entry_set? 
> - add_nat_entry(ne, set_list); > - nm_i->dirty_nat_cnt++; > - } > - write_unlock(&nm_i->nat_tree_lock); > -} > - > static void remove_nats_in_journal(struct f2fs_sb_info *sbi) > { > struct f2fs_nm_info *nm_i = NM_I(sbi); > @@ -1846,101 +1824,129 @@ found: > mutex_unlock(&curseg->curseg_mutex); > } > > -/* > - * This function is called during the checkpointing process. > - */ > -void flush_nat_entries(struct f2fs_sb_info *sbi) > +static void __adjust_nat_entry_set(struct nat_entry_set *nes, > + struct list_head *head, int max) > { > - struct f2fs_nm_info *nm_i = NM_I(sbi); > - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); > - struct f2fs_summary_block *sum = curseg->sum_blk; > - struct nat_entry_set *nes, *tmp; > - struct list_head *head = &nm_i->nat_entry_set; > - bool to_journal = true; > + struct nat_entry_set *cur; > + nid_t dirty_cnt = 0; > > - /* merge nat entries of dirty list to nat entry set temporarily */ > - merge_nats_in_set(sbi); > + if (nes->entry_cnt >= max) > + goto add_out; > > - /* > - * if there are no enough space in journal to store dirty nat > - * entries, remove all entries from journal and merge them > - * into nat entry set. > - */ > - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) { > - remove_nats_in_journal(sbi); > - > - /* > - * merge nat entries of dirty list to nat entry set temporarily > - */ > - merge_nats_in_set(sbi); > + list_for_each_entry(cur, head, set_list) { > + dirty_cnt += cur->entry_cnt; > + if (dirty_cnt > max) > + break; Seems a problem here, For example: set no: 1 2 3 4 5 6 7 nat number: 4 3 2 1 1 1 1 max = 6 set list will be: 3--4--2--1--1--1--1 Then we will prior flush nat set with more entries in the set list, result in a degradation. We'd better not break the adjustment until we finish to traverse the whole sequential sorted set list to avoid above condition, so how about removing the codes relate to dirty_cnt? Thanks, Yu > + if (cur->entry_cnt >= nes->entry_cnt) { > + list_add(&nes->set_list, cur->set_list.prev); > + return; > + } > } > +add_out: > + list_add_tail(&nes->set_list, head); > +} > > - if (!nm_i->dirty_nat_cnt) > - return; > +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, > + struct nat_entry_set *set) > +{ > + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); > + struct f2fs_summary_block *sum = curseg->sum_blk; > + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; > + bool to_journal = true; > + struct f2fs_nat_block *nat_blk; > + struct nat_entry *ne, *cur; > + struct page *page = NULL; > > /* > * there are two steps to flush nat entries: > * #1, flush nat entries to journal in current hot data summary block. > * #2, flush nat entries to nat page. 
> */ > - list_for_each_entry_safe(nes, tmp, head, set_list) { > - struct f2fs_nat_block *nat_blk; > - struct nat_entry *ne, *cur; > - struct page *page; > - nid_t start_nid = nes->start_nid; > + if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) > + to_journal = false; > > - if (to_journal && > - !__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL)) > - to_journal = false; > + if (to_journal) { > + mutex_lock(&curseg->curseg_mutex); > + } else { > + page = get_next_nat_page(sbi, start_nid); > + nat_blk = page_address(page); > + f2fs_bug_on(sbi, !nat_blk); > + } > + > + /* flush dirty nats in nat entry set */ > + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { > + struct f2fs_nat_entry *raw_ne; > + nid_t nid = nat_get_nid(ne); > + int offset; > > if (to_journal) { > - mutex_lock(&curseg->curseg_mutex); > + offset = lookup_journal_in_cursum(sum, > + NAT_JOURNAL, nid, 1); > + f2fs_bug_on(sbi, offset < 0); > + raw_ne = &nat_in_journal(sum, offset); > + nid_in_journal(sum, offset) = cpu_to_le32(nid); > } else { > - page = get_next_nat_page(sbi, start_nid); > - nat_blk = page_address(page); > - f2fs_bug_on(sbi, !nat_blk); > + raw_ne = &nat_blk->entries[nid - start_nid]; > } > + raw_nat_from_node_info(raw_ne, &ne->ni); > > - /* flush dirty nats in nat entry set */ > - list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { > - struct f2fs_nat_entry *raw_ne; > - nid_t nid = nat_get_nid(ne); > - int offset; > + write_lock(&NM_I(sbi)->nat_tree_lock); > + nat_reset_flag(ne); > + __clear_nat_cache_dirty(NM_I(sbi), ne); > + write_unlock(&NM_I(sbi)->nat_tree_lock); > > - if (to_journal) { > - offset = lookup_journal_in_cursum(sum, > - NAT_JOURNAL, nid, 1); > - f2fs_bug_on(sbi, offset < 0); > - raw_ne = &nat_in_journal(sum, offset); > - nid_in_journal(sum, offset) = cpu_to_le32(nid); > - } else { > - raw_ne = &nat_blk->entries[nid - start_nid]; > - } > - raw_nat_from_node_info(raw_ne, &ne->ni); > + if (nat_get_blkaddr(ne) == NULL_ADDR) > + add_free_nid(sbi, nid, false); > + } > > - if (nat_get_blkaddr(ne) == NULL_ADDR && > - add_free_nid(sbi, nid, false) <= 0) { > - write_lock(&nm_i->nat_tree_lock); > - __del_from_nat_cache(nm_i, ne); > - write_unlock(&nm_i->nat_tree_lock); > - } else { > - write_lock(&nm_i->nat_tree_lock); > - nat_reset_flag(ne); > - __clear_nat_cache_dirty(nm_i, ne); > - write_unlock(&nm_i->nat_tree_lock); > - } > - } > + if (to_journal) > + mutex_unlock(&curseg->curseg_mutex); > + else > + f2fs_put_page(page, 1); > > - if (to_journal) > - mutex_unlock(&curseg->curseg_mutex); > - else > - f2fs_put_page(page, 1); > + f2fs_bug_on(sbi, set->entry_cnt); > + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); > + kmem_cache_free(nat_entry_set_slab, set); > +} > > - f2fs_bug_on(sbi, !list_empty(&nes->entry_list)); > - release_nat_entry_set(nes, nm_i); > +/* > + * This function is called during the checkpointing process. > + */ > +void flush_nat_entries(struct f2fs_sb_info *sbi) > +{ > + struct f2fs_nm_info *nm_i = NM_I(sbi); > + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); > + struct f2fs_summary_block *sum = curseg->sum_blk; > + struct nat_entry_set *setvec[NATVEC_SIZE]; > + struct nat_entry_set *set, *tmp; > + unsigned int found; > + nid_t set_idx = 0; > + LIST_HEAD(sets); > + > + /* > + * if there are no enough space in journal to store dirty nat > + * entries, remove all entries from journal and merge them > + * into nat entry set. 
> + */ > + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) > + remove_nats_in_journal(sbi); > + > + if (!nm_i->dirty_nat_cnt) > + return; > + > + while ((found = __gang_lookup_nat_set(nm_i, > + set_idx, NATVEC_SIZE, setvec))) { > + unsigned idx; > + set_idx = setvec[found - 1]->set + 1; > + for (idx = 0; idx < found; idx++) > + __adjust_nat_entry_set(setvec[idx], &sets, > + MAX_NAT_JENTRIES(sum)); > } > > - f2fs_bug_on(sbi, !list_empty(head)); > + /* flush dirty nats in nat entry set */ > + list_for_each_entry_safe(set, tmp, &sets, set_list) > + __flush_nat_entry_set(sbi, set); > + > f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); > } > > @@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) > INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); > INIT_LIST_HEAD(&nm_i->free_nid_list); > INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); > + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC); > INIT_LIST_HEAD(&nm_i->nat_entries); > - INIT_LIST_HEAD(&nm_i->dirty_nat_entries); > - INIT_LIST_HEAD(&nm_i->nat_entry_set); > > mutex_init(&nm_i->build_lock); > spin_lock_init(&nm_i->free_nid_list_lock); > diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h > index b8ba63c..bd826d9 100644 > --- a/fs/f2fs/node.h > +++ b/fs/f2fs/node.h > @@ -43,6 +43,7 @@ enum { > IS_CHECKPOINTED, /* is it checkpointed before? */ > HAS_FSYNCED_INODE, /* is the inode fsynced before? */ > HAS_LAST_FSYNC, /* has the latest node fsync mark? */ > + IS_DIRTY, /* this nat entry is dirty? */ > }; > > struct nat_entry { > @@ -60,10 +61,6 @@ struct nat_entry { > #define nat_get_version(nat) (nat->ni.version) > #define nat_set_version(nat, v) (nat->ni.version = v) > > -#define __set_nat_cache_dirty(nm_i, ne) \ > - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); > -#define __clear_nat_cache_dirty(nm_i, ne) \ > - list_move_tail(&ne->list, &nm_i->nat_entries); > #define inc_node_version(version) (++version) > > static inline void set_nat_flag(struct nat_entry *ne, > @@ -113,9 +110,9 @@ enum mem_type { > }; > > struct nat_entry_set { > - struct list_head set_list; /* link with all nat sets */ > + struct list_head set_list; /* link with other nat sets */ > struct list_head entry_list; /* link with dirty nat entries */ > - nid_t start_nid; /* start nid of nats in set */ > + nid_t set; /* set number*/ > unsigned int entry_cnt; /* the # of nat entries in set */ > }; > > -- > 1.8.5.2 (Apple Git-48) > > > ------------------------------------------------------------------------------ > Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer > Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports > Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper > Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer > http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk > _______________________________________________ > Linux-f2fs-devel mailing list > Linux-f2fs-devel@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [f2fs-dev] [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops 2014-09-30 6:04 ` [f2fs-dev] " Chao Yu @ 2014-10-01 15:06 ` Jaegeuk Kim 0 siblings, 0 replies; 9+ messages in thread From: Jaegeuk Kim @ 2014-10-01 15:06 UTC (permalink / raw) To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel On Tue, Sep 30, 2014 at 02:04:20PM +0800, Chao Yu wrote: > Hi Jaegeuk, > > > -----Original Message----- > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org] > > Sent: Tuesday, September 23, 2014 12:53 PM > > To: linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org; > > linux-f2fs-devel@lists.sourceforge.net > > Cc: Jaegeuk Kim > > Subject: [f2fs-dev] [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing > > ops > > > > Previously, f2fs tries to reorganize the dirty nat entries into multiple sets > > according to its nid ranges. This can improve the flushing nat pages, however, > > if there are a lot of cached nat entries, it becomes a bottleneck. > > > > This patch introduces a new set management flow by removing dirty nat list and > > adding a series of set operations when the nat entry becomes dirty. > > > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org> > > --- > > fs/f2fs/f2fs.h | 13 +-- > > fs/f2fs/node.c | 299 +++++++++++++++++++++++++++++---------------------------- > > fs/f2fs/node.h | 9 +- > > 3 files changed, 162 insertions(+), 159 deletions(-) > > > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > > index 7b1e1d2..94cfdc4 100644 > > --- a/fs/f2fs/f2fs.h > > +++ b/fs/f2fs/f2fs.h > > @@ -164,6 +164,9 @@ struct fsync_inode_entry { > > #define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) > > #define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) > > > > +#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) > > +#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) > > + > > static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) > > { > > int before = nats_in_cursum(rs); > > @@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int > > size, > > int type) > > { > > if (type == NAT_JOURNAL) > > - return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES; > > - > > - return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES; > > + return size <= MAX_NAT_JENTRIES(sum); > > + return size <= MAX_SIT_JENTRIES(sum); > > } > > > > /* > > @@ -292,11 +294,10 @@ struct f2fs_nm_info { > > > > /* NAT cache management */ > > struct radix_tree_root nat_root;/* root of the nat entry cache */ > > + struct radix_tree_root nat_set_root;/* root of the nat set cache */ > > rwlock_t nat_tree_lock; /* protect nat_tree_lock */ > > - unsigned int nat_cnt; /* the # of cached nat entries */ > > struct list_head nat_entries; /* cached nat entry list (clean) */ > > - struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ > > - struct list_head nat_entry_set; /* nat entry set list */ > > + unsigned int nat_cnt; /* the # of cached nat entries */ > > unsigned int dirty_nat_cnt; /* total num of nat entries in set */ > > > > /* free node ids management */ > > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c > > index 21ed91b..f5a21f4 100644 > > --- a/fs/f2fs/node.c > > +++ b/fs/f2fs/node.c > > @@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct > > nat_entry *e) > > kmem_cache_free(nat_entry_slab, e); > > } > > > > +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, > > + struct nat_entry *ne) > > +{ > > + 
nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; > > nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); > > > + struct nat_entry_set *head; > > + > > + if (get_nat_flag(ne, IS_DIRTY)) > > + return; > > +retry: > > + head = radix_tree_lookup(&nm_i->nat_set_root, set); > > + if (!head) { > > + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); > > + > > + INIT_LIST_HEAD(&head->entry_list); > > + INIT_LIST_HEAD(&head->set_list); > > + head->set = set; > > + head->entry_cnt = 0; > > + > > + if (radix_tree_insert(&nm_i->nat_set_root, set, head)) { > > + cond_resched(); > > + goto retry; > > + } > > + } > > + list_move_tail(&ne->list, &head->entry_list); > > + nm_i->dirty_nat_cnt++; > > + head->entry_cnt++; > > + set_nat_flag(ne, IS_DIRTY, true); > > +} > > + > > +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, > > + struct nat_entry *ne) > > +{ > > + nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; > > + struct nat_entry_set *head; > > + > > + head = radix_tree_lookup(&nm_i->nat_set_root, set); > > + if (head) { > > + list_move_tail(&ne->list, &nm_i->nat_entries); > > + set_nat_flag(ne, IS_DIRTY, false); > > + head->entry_cnt--; > > + nm_i->dirty_nat_cnt--; > > + } > > +} > > + > > +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, > > + nid_t start, unsigned int nr, struct nat_entry_set **ep) > > +{ > > + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, > > + start, nr); > > +} > > + > > bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) > > { > > struct f2fs_nm_info *nm_i = NM_I(sbi); > > @@ -1739,79 +1790,6 @@ skip: > > return err; > > } > > > > -static struct nat_entry_set *grab_nat_entry_set(void) > > -{ > > - struct nat_entry_set *nes = > > - f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); > > - > > - nes->entry_cnt = 0; > > - INIT_LIST_HEAD(&nes->set_list); > > - INIT_LIST_HEAD(&nes->entry_list); > > - return nes; > > -} > > - > > -static void release_nat_entry_set(struct nat_entry_set *nes, > > - struct f2fs_nm_info *nm_i) > > -{ > > - nm_i->dirty_nat_cnt -= nes->entry_cnt; > > - list_del(&nes->set_list); > > - kmem_cache_free(nat_entry_set_slab, nes); > > -} > > - > > -static void adjust_nat_entry_set(struct nat_entry_set *nes, > > - struct list_head *head) > > -{ > > - struct nat_entry_set *next = nes; > > - > > - if (list_is_last(&nes->set_list, head)) > > - return; > > - > > - list_for_each_entry_continue(next, head, set_list) > > - if (nes->entry_cnt <= next->entry_cnt) > > - break; > > - > > - list_move_tail(&nes->set_list, &next->set_list); > > -} > > - > > -static void add_nat_entry(struct nat_entry *ne, struct list_head *head) > > -{ > > - struct nat_entry_set *nes; > > - nid_t start_nid = START_NID(ne->ni.nid); > > - > > - list_for_each_entry(nes, head, set_list) { > > - if (nes->start_nid == start_nid) { > > - list_move_tail(&ne->list, &nes->entry_list); > > - nes->entry_cnt++; > > - adjust_nat_entry_set(nes, head); > > - return; > > - } > > - } > > - > > - nes = grab_nat_entry_set(); > > - > > - nes->start_nid = start_nid; > > - list_move_tail(&ne->list, &nes->entry_list); > > - nes->entry_cnt++; > > - list_add(&nes->set_list, head); > > -} > > - > > -static void merge_nats_in_set(struct f2fs_sb_info *sbi) > > -{ > > - struct f2fs_nm_info *nm_i = NM_I(sbi); > > - struct list_head *dirty_list = &nm_i->dirty_nat_entries; > > - struct list_head *set_list = &nm_i->nat_entry_set; > > - struct nat_entry *ne, *tmp; > > - > > - write_lock(&nm_i->nat_tree_lock); > > - list_for_each_entry_safe(ne, tmp, dirty_list, list) { 
> > -		if (nat_get_blkaddr(ne) == NEW_ADDR)
> > -			continue;
> 
> Shouldn't we move this condition check into __flush_nat_entry_set?
> 
> > -		add_nat_entry(ne, set_list);
> > -		nm_i->dirty_nat_cnt++;
> > -	}
> > -	write_unlock(&nm_i->nat_tree_lock);
> > -}
> > -
> >  static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> >  {
> >  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > @@ -1846,101 +1824,129 @@ found:
> >  	mutex_unlock(&curseg->curseg_mutex);
> >  }
> >  
> > -/*
> > - * This function is called during the checkpointing process.
> > - */
> > -void flush_nat_entries(struct f2fs_sb_info *sbi)
> > +static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> > +					struct list_head *head, int max)
> >  {
> > -	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > -	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> > -	struct f2fs_summary_block *sum = curseg->sum_blk;
> > -	struct nat_entry_set *nes, *tmp;
> > -	struct list_head *head = &nm_i->nat_entry_set;
> > -	bool to_journal = true;
> > +	struct nat_entry_set *cur;
> > +	nid_t dirty_cnt = 0;
> >  
> > -	/* merge nat entries of dirty list to nat entry set temporarily */
> > -	merge_nats_in_set(sbi);
> > +	if (nes->entry_cnt >= max)
> > +		goto add_out;
> >  
> > -	/*
> > -	 * if there are no enough space in journal to store dirty nat
> > -	 * entries, remove all entries from journal and merge them
> > -	 * into nat entry set.
> > -	 */
> > -	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) {
> > -		remove_nats_in_journal(sbi);
> > -
> > -		/*
> > -		 * merge nat entries of dirty list to nat entry set temporarily
> > -		 */
> > -		merge_nats_in_set(sbi);
> > +	list_for_each_entry(cur, head, set_list) {
> > +		dirty_cnt += cur->entry_cnt;
> > +		if (dirty_cnt > max)
> > +			break;
> 
> There seems to be a problem here. For example:
> set no:     1  2  3  4  5  6  7
> nat number: 4  3  2  1  1  1  1
> max = 6
> set list will be: 3--4--2--1--1--1--1
> Then we will flush the nat sets with more entries first, resulting in
> a degradation.
> We'd better not break out of the adjustment until we have traversed the
> whole sorted set list, to avoid the above condition, so how about removing
> the code related to dirty_cnt?

Hi Chao,

Agreed. I fixed that.

Thanks,

> 
> Thanks,
> Yu
> 
> > +		if (cur->entry_cnt >= nes->entry_cnt) {
> > +			list_add(&nes->set_list, cur->set_list.prev);
> > +			return;
> > +		}
> >  	}
> > +add_out:
> > +	list_add_tail(&nes->set_list, head);
> > +}
> >  
> > -	if (!nm_i->dirty_nat_cnt)
> > -		return;
> > +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> > +					struct nat_entry_set *set)
> > +{
> > +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> > +	struct f2fs_summary_block *sum = curseg->sum_blk;
> > +	nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
> > +	bool to_journal = true;
> > +	struct f2fs_nat_block *nat_blk;
> > +	struct nat_entry *ne, *cur;
> > +	struct page *page = NULL;
> >  
> >  	/*
> >  	 * there are two steps to flush nat entries:
> >  	 * #1, flush nat entries to journal in current hot data summary block.
> >  	 * #2, flush nat entries to nat page.
> >  	 */
> > -	list_for_each_entry_safe(nes, tmp, head, set_list) {
> > -		struct f2fs_nat_block *nat_blk;
> > -		struct nat_entry *ne, *cur;
> > -		struct page *page;
> > -		nid_t start_nid = nes->start_nid;
> > +	if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
> > +		to_journal = false;
> >  
> > -		if (to_journal &&
> > -			!__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL))
> > -			to_journal = false;
> > +	if (to_journal) {
> > +		mutex_lock(&curseg->curseg_mutex);
> > +	} else {
> > +		page = get_next_nat_page(sbi, start_nid);
> > +		nat_blk = page_address(page);
> > +		f2fs_bug_on(sbi, !nat_blk);
> > +	}
> > +
> > +	/* flush dirty nats in nat entry set */
> > +	list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
> > +		struct f2fs_nat_entry *raw_ne;
> > +		nid_t nid = nat_get_nid(ne);
> > +		int offset;
> >  
> >  		if (to_journal) {
> > -			mutex_lock(&curseg->curseg_mutex);
> > +			offset = lookup_journal_in_cursum(sum,
> > +							NAT_JOURNAL, nid, 1);
> > +			f2fs_bug_on(sbi, offset < 0);
> > +			raw_ne = &nat_in_journal(sum, offset);
> > +			nid_in_journal(sum, offset) = cpu_to_le32(nid);
> >  		} else {
> > -			page = get_next_nat_page(sbi, start_nid);
> > -			nat_blk = page_address(page);
> > -			f2fs_bug_on(sbi, !nat_blk);
> > +			raw_ne = &nat_blk->entries[nid - start_nid];
> >  		}
> > +		raw_nat_from_node_info(raw_ne, &ne->ni);
> >  
> > -		/* flush dirty nats in nat entry set */
> > -		list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
> > -			struct f2fs_nat_entry *raw_ne;
> > -			nid_t nid = nat_get_nid(ne);
> > -			int offset;
> > +		write_lock(&NM_I(sbi)->nat_tree_lock);
> > +		nat_reset_flag(ne);
> > +		__clear_nat_cache_dirty(NM_I(sbi), ne);
> > +		write_unlock(&NM_I(sbi)->nat_tree_lock);
> >  
> > -			if (to_journal) {
> > -				offset = lookup_journal_in_cursum(sum,
> > -						NAT_JOURNAL, nid, 1);
> > -				f2fs_bug_on(sbi, offset < 0);
> > -				raw_ne = &nat_in_journal(sum, offset);
> > -				nid_in_journal(sum, offset) = cpu_to_le32(nid);
> > -			} else {
> > -				raw_ne = &nat_blk->entries[nid - start_nid];
> > -			}
> > -			raw_nat_from_node_info(raw_ne, &ne->ni);
> > +		if (nat_get_blkaddr(ne) == NULL_ADDR)
> > +			add_free_nid(sbi, nid, false);
> > +	}
> >  
> > -			if (nat_get_blkaddr(ne) == NULL_ADDR &&
> > -				add_free_nid(sbi, nid, false) <= 0) {
> > -				write_lock(&nm_i->nat_tree_lock);
> > -				__del_from_nat_cache(nm_i, ne);
> > -				write_unlock(&nm_i->nat_tree_lock);
> > -			} else {
> > -				write_lock(&nm_i->nat_tree_lock);
> > -				nat_reset_flag(ne);
> > -				__clear_nat_cache_dirty(nm_i, ne);
> > -				write_unlock(&nm_i->nat_tree_lock);
> > -			}
> > -		}
> > +	if (to_journal)
> > +		mutex_unlock(&curseg->curseg_mutex);
> > +	else
> > +		f2fs_put_page(page, 1);
> >  
> > -		if (to_journal)
> > -			mutex_unlock(&curseg->curseg_mutex);
> > -		else
> > -			f2fs_put_page(page, 1);
> > +	f2fs_bug_on(sbi, set->entry_cnt);
> > +	radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> > +	kmem_cache_free(nat_entry_set_slab, set);
> > +}
> >  
> > -		f2fs_bug_on(sbi, !list_empty(&nes->entry_list));
> > -		release_nat_entry_set(nes, nm_i);
> > +/*
> > + * This function is called during the checkpointing process.
> > + */
> > +void flush_nat_entries(struct f2fs_sb_info *sbi)
> > +{
> > +	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> > +	struct f2fs_summary_block *sum = curseg->sum_blk;
> > +	struct nat_entry_set *setvec[NATVEC_SIZE];
> > +	struct nat_entry_set *set, *tmp;
> > +	unsigned int found;
> > +	nid_t set_idx = 0;
> > +	LIST_HEAD(sets);
> > +
> > +	/*
> > +	 * if there are no enough space in journal to store dirty nat
> > +	 * entries, remove all entries from journal and merge them
> > +	 * into nat entry set.
> > +	 */
> > +	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
> > +		remove_nats_in_journal(sbi);
> > +
> > +	if (!nm_i->dirty_nat_cnt)
> > +		return;
> > +
> > +	while ((found = __gang_lookup_nat_set(nm_i,
> > +					set_idx, NATVEC_SIZE, setvec))) {
> > +		unsigned idx;
> > +		set_idx = setvec[found - 1]->set + 1;
> > +		for (idx = 0; idx < found; idx++)
> > +			__adjust_nat_entry_set(setvec[idx], &sets,
> > +							MAX_NAT_JENTRIES(sum));
> >  	}
> >  
> > -	f2fs_bug_on(sbi, !list_empty(head));
> > +	/* flush dirty nats in nat entry set */
> > +	list_for_each_entry_safe(set, tmp, &sets, set_list)
> > +		__flush_nat_entry_set(sbi, set);
> > +
> >  	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
> >  }
> >  
> > @@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> >  	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
> >  	INIT_LIST_HEAD(&nm_i->free_nid_list);
> >  	INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
> > +	INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
> >  	INIT_LIST_HEAD(&nm_i->nat_entries);
> > -	INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
> > -	INIT_LIST_HEAD(&nm_i->nat_entry_set);
> >  
> >  	mutex_init(&nm_i->build_lock);
> >  	spin_lock_init(&nm_i->free_nid_list_lock);
> > diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> > index b8ba63c..bd826d9 100644
> > --- a/fs/f2fs/node.h
> > +++ b/fs/f2fs/node.h
> > @@ -43,6 +43,7 @@ enum {
> >  	IS_CHECKPOINTED,	/* is it checkpointed before? */
> >  	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
> >  	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
> > +	IS_DIRTY,		/* this nat entry is dirty? */
> >  };
> >  
> >  struct nat_entry {
> > @@ -60,10 +61,6 @@ struct nat_entry {
> >  #define nat_get_version(nat)		(nat->ni.version)
> >  #define nat_set_version(nat, v)		(nat->ni.version = v)
> >  
> > -#define __set_nat_cache_dirty(nm_i, ne)		\
> > -	list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
> > -#define __clear_nat_cache_dirty(nm_i, ne)	\
> > -	list_move_tail(&ne->list, &nm_i->nat_entries);
> >  #define inc_node_version(version)	(++version)
> >  
> >  static inline void set_nat_flag(struct nat_entry *ne,
> > @@ -113,9 +110,9 @@ enum mem_type {
> >  };
> >  
> >  struct nat_entry_set {
> > -	struct list_head set_list;	/* link with all nat sets */
> > +	struct list_head set_list;	/* link with other nat sets */
> >  	struct list_head entry_list;	/* link with dirty nat entries */
> > -	nid_t start_nid;		/* start nid of nats in set */
> > +	nid_t set;			/* set number*/
> >  	unsigned int entry_cnt;		/* the # of nat entries in set */
> >  };
> >  
> > --
> > 1.8.5.2 (Apple Git-48)
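To make the ordering rule in Chao's example above easier to see, here is a small, self-contained userspace sketch in plain C. The struct, the adjust() helper, and the main() driver are all invented for illustration and are not the kernel code (which works on struct nat_entry_set and list_head); the sketch only mirrors the agreed-upon rule: insert each dirty NAT set into the flush list in ascending entry-count order, push a set that cannot fit in the journal straight to the tail, and never break out of the scan early based on an accumulated dirty count.

#include <stdio.h>

struct set_node {
	int set_no;
	int entry_cnt;
	struct set_node *next;
};

/*
 * Insert 's' into the singly linked list '*headp', keeping ascending
 * entry_cnt order; a set that cannot fit in the journal (entry_cnt >= max)
 * always goes to the tail, and the scan is never cut short.
 */
static void adjust(struct set_node **headp, struct set_node *s, int max)
{
	struct set_node **pp = headp;

	if (s->entry_cnt < max) {
		while (*pp && (*pp)->entry_cnt < s->entry_cnt)
			pp = &(*pp)->next;
	} else {
		while (*pp)		/* oversized sets: append at the tail */
			pp = &(*pp)->next;
	}
	s->next = *pp;
	*pp = s;
}

int main(void)
{
	/* Chao's example: entry counts 4 3 2 1 1 1 1, journal max = 6 */
	struct set_node v[] = {
		{1, 4}, {2, 3}, {3, 2}, {4, 1}, {5, 1}, {6, 1}, {7, 1}
	};
	struct set_node *head = NULL, *p;
	int i;

	for (i = 0; i < 7; i++)
		adjust(&head, &v[i], 6);

	for (p = head; p; p = p->next)
		printf("set %d (cnt %d)%s", p->set_no, p->entry_cnt,
		       p->next ? " -> " : "\n");
	return 0;
}

Run with Chao's numbers (entry counts 4 3 2 1 1 1 1, journal limit 6), it prints the sets in ascending entry-count order (1 1 1 1 2 3 4), so the small sets that can still go to the journal are flushed first and the large ones fall back to the NAT pages last.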
* Re: [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops
  2014-09-23 4:53 ` [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops Jaegeuk Kim
  2014-09-30 6:04   ` [f2fs-dev] " Chao Yu
@ 2014-10-11 13:11   ` Sasha Levin
  1 sibling, 0 replies; 9+ messages in thread
From: Sasha Levin @ 2014-10-11 13:11 UTC (permalink / raw)
To: Jaegeuk Kim, linux-kernel, linux-fsdevel, linux-f2fs-devel

On 09/23/2014 12:53 AM, Jaegeuk Kim wrote:
> +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> +						struct nat_entry *ne)
> +{
> +	nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
> +	struct nat_entry_set *head;
> +
> +	if (get_nat_flag(ne, IS_DIRTY))
> +		return;
> +retry:
> +	head = radix_tree_lookup(&nm_i->nat_set_root, set);
> +	if (!head) {
> +		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);

This is funny, you call f2fs_kmem_cache_alloc() here with GFP_ATOMIC because
of disabled preemption, but f2fs_kmem_cache_alloc() will attempt to
cond_resched() in case of failed allocations:

retry:
	entry = kmem_cache_alloc(cachep, flags);
	if (!entry) {
		cond_resched();
		goto retry;
	}

So in reality, f2fs_kmem_cache_alloc can't really work with GFP_ATOMIC, and
right now there are two different locations that call it with that flag.

Thanks,
Sasha
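One conventional way around the problem Sasha points out is to do every sleepable allocation before the lock is taken and let radix_tree_preload() cover the tree insertion. The sketch below is only a hypothetical rework for illustration, not the fix that was eventually merged: the _sketch name and the int return type are invented here, and it assumes the call sites can be restructured so that nat_tree_lock is taken inside this helper rather than held by its callers.

/*
 * Hypothetical rework (illustration only): allocate with a sleepable mask
 * before nat_tree_lock is taken, and rely on radix_tree_preload() so the
 * insert itself never needs to allocate while preemption is disabled.
 */
static int __set_nat_cache_dirty_sketch(struct f2fs_nm_info *nm_i,
					struct nat_entry *ne)
{
	nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
	struct nat_entry_set *head, *new_set;

	/* sleepable allocation, outside any spinlock */
	new_set = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
	INIT_LIST_HEAD(&new_set->entry_list);
	INIT_LIST_HEAD(&new_set->set_list);
	new_set->set = set;
	new_set->entry_cnt = 0;

	/* reserve radix tree nodes; disables preemption until _end() */
	if (radix_tree_preload(GFP_NOFS)) {
		kmem_cache_free(nat_entry_set_slab, new_set);
		return -ENOMEM;
	}

	write_lock(&nm_i->nat_tree_lock);
	head = radix_tree_lookup(&nm_i->nat_set_root, set);
	if (!head) {
		head = new_set;
		new_set = NULL;
		/* nodes were preloaded and the slot is empty */
		radix_tree_insert(&nm_i->nat_set_root, set, head);
	}
	if (!get_nat_flag(ne, IS_DIRTY)) {
		list_move_tail(&ne->list, &head->entry_list);
		nm_i->dirty_nat_cnt++;
		head->entry_cnt++;
		set_nat_flag(ne, IS_DIRTY, true);
	}
	write_unlock(&nm_i->nat_tree_lock);
	radix_tree_preload_end();

	/* someone else already had a set for this block; drop the spare */
	if (new_set)
		kmem_cache_free(nat_entry_set_slab, new_set);
	return 0;
}

radix_tree_preload() keeps preemption disabled until radix_tree_preload_end(), so the insert performed under the write lock should not fail for lack of memory, and everything that may sleep happens before the lock is taken.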
* Re: [PATCH 1/3] f2fs: introduce cp_control structure
  2014-09-23 4:53 [PATCH 1/3] f2fs: introduce cp_control structure Jaegeuk Kim
  2014-09-23 4:53 ` [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl Jaegeuk Kim
  2014-09-23 4:53 ` [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops Jaegeuk Kim
@ 2014-09-30 6:01 ` Chao Yu
  2 siblings, 0 replies; 9+ messages in thread
From: Chao Yu @ 2014-09-30 6:01 UTC (permalink / raw)
To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Tuesday, September 23, 2014 12:53 PM
> To: linux-kernel@vger.kernel.org; linux-fsdevel@vger.kernel.org;
> linux-f2fs-devel@lists.sourceforge.net
> Cc: Jaegeuk Kim
> Subject: [f2fs-dev] [PATCH 1/3] f2fs: introduce cp_control structure
>
> This patch add a new data structure to control checkpoint parameters.
> Currently, it presents the reason of checkpoint such as is_umount and normal
> sync.
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>

Looks good!

Reviewed-by: Chao Yu <chao2.yu@samsung.com>
end of thread, other threads: [~2014-10-11 13:11 UTC | newest]

Thread overview: 9+ messages
2014-09-23 4:53 [PATCH 1/3] f2fs: introduce cp_control structure Jaegeuk Kim
2014-09-23 4:53 ` [PATCH 2/3] f2fs: introduce FITRIM in f2fs_ioctl Jaegeuk Kim
2014-09-30 6:01   ` Chao Yu
2014-10-01 15:07     ` Jaegeuk Kim
2014-09-23 4:53 ` [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops Jaegeuk Kim
2014-09-30 6:04   ` [f2fs-dev] " Chao Yu
2014-10-01 15:06     ` Jaegeuk Kim
2014-10-11 13:11   ` Sasha Levin
2014-09-30 6:01 ` [PATCH 1/3] f2fs: introduce cp_control structure Chao Yu