From: Nick Piggin
Subject: [patch] fs: use fast counters for vfs caches
Date: Mon, 29 Nov 2010 21:57:33 +1100
Message-ID: <20101129105733.GA3241@amd>
To: linux-fsdevel@vger.kernel.org, Linus Torvalds, Al Viro, Christoph Hellwig

Sorry, forgot to cc linux-fsdevel

Date: Mon, 29 Nov 2010 21:49:52 +1100
From: Nick Piggin
To: Linus Torvalds, Al Viro, Christoph Hellwig
Subject: [patch] fs: use fast counters for vfs caches
Message-ID: <20101129104952.GA3185@amd>
User-Agent: Mutt/1.5.20 (2009-06-14)

Hey,

What was the reason behind not using my approach of fast per-cpu counters
for the inode and dentry counters, and instead using the percpu_counter
lib? That library is only useful when very fast approximate access to the
global counter is required, or when performance is not critical -- which
is somewhat of an oxymoron if you're using per-cpu counters in the first
place.

It is the difference between this:

	incl %gs:nr_dentry	# nr_dentry

and this horrible thing:

	movl	percpu_counter_batch(%rip), %edx	# percpu_counter_batch,
	movl	$1, %esi	#,
	movq	$nr_dentry, %rdi	#,
	call	__percpu_counter_add	# (plus I clobber registers)

__percpu_counter_add:
	pushq	%rbp	#
	movq	%rsp, %rbp	#,
	subq	$32, %rsp	#,
	movq	%rbx, -24(%rbp)	#,
	movq	%r12, -16(%rbp)	#,
	movq	%r13, -8(%rbp)	#,
	movq	%rdi, %rbx	# fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
	movq %gs:kernel_stack,%rax	#, pfo_ret__
# 0 "" 2
#NO_APP
	incl	-8124(%rax)	# .preempt_count
	movq	32(%rdi), %r12	# .counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
	add %gs:this_cpu_off, %r12	# this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
	movslq	(%r12),%r13	#* tcp_ptr__, tmp73
	movslq	%edx,%rax	# batch, batch
	addq	%rsi, %r13	# amount, count
	cmpq	%rax, %r13	# batch, count
	jge	.L27	#,
	negl	%edx	# tmp76
	movslq	%edx,%rdx	# tmp76, tmp77
	cmpq	%rdx, %r13	# tmp77, count
	jg	.L28	#,
.L27:
	movq	%rbx, %rdi	# fbc,
	call	_raw_spin_lock	#
	addq	%r13, 8(%rbx)	# count, .count
	movq	%rbx, %rdi	# fbc,
	movl	$0, (%r12)	#,* tcp_ptr__
	call	_raw_spin_unlock	#
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
	movq %gs:kernel_stack,%rax	#, pfo_ret__
# 0 "" 2
#NO_APP
	decl	-8124(%rax)	# .preempt_count
	movq	-8136(%rax), %rax	#, D.14625
	testb	$8, %al	#, D.14625
	jne	.L32	#,
.L31:
	movq	-24(%rbp), %rbx	#,
	movq	-16(%rbp), %r12	#,
	movq	-8(%rbp), %r13	#,
	leave
	ret
	.p2align 4,,10
	.p2align 3
.L28:
	movl	%r13d, (%r12)	# count,*
	jmp	.L29	#
.L32:
	call	preempt_schedule	#
	.p2align 4,,6
	jmp	.L31	#
	.size	__percpu_counter_add, .-__percpu_counter_add
	.p2align 4,,15

So, fix it.

Signed-off-by: Nick Piggin
---

I was really trying to be very careful with single thread performance and
to count the cycles of every change made :( No matter what happens, fine
grained locking will bloat things up and slow them down, so every effort
has to be made to ensure it is done with as little impact as possible.
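For reference, in isolation the counter pattern the patch moves to looks
roughly like this (a minimal sketch; nr_foo and the helper names are
illustrative, not code from the diff below):

/*
 * Sketch of the fast per-cpu counter pattern: the hot paths touch only
 * the local CPU's slot, and the rare readers (/proc reporting, shrinker
 * heuristics) sum over all possible CPUs and clamp at zero.
 */
#include <linux/percpu.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(unsigned int, nr_foo);

static inline void foo_count_inc(void)
{
	this_cpu_inc(nr_foo);		/* a single incl %gs:nr_foo on x86 */
}

static inline void foo_count_dec(void)
{
	this_cpu_dec(nr_foo);
}

static int get_nr_foo(void)
{
	int i;
	int sum = 0;

	for_each_possible_cpu(i)
		sum += per_cpu(nr_foo, i);
	/* per-cpu slots can be transiently skewed; never report negative */
	return sum < 0 ? 0 : sum;
}

The read side is O(nr_cpus) and only approximate, but the /proc reporting
and the shrinker heuristics only need an approximation, so what matters is
keeping the write-side fast path to a single instruction.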
Also, making LRU counters per-cpu is not really that useful because the LRU
never becomes a percpu structure. In my patches, they become per-zone
counters, which is reasonable since all the other LRU manipulations are
per-zone as well. I'll have to change that back at some point, too.

---
 fs/dcache.c |   43 +++++++++++++++++++++++++++++--------------
 fs/inode.c  |   46 ++++++++++++++++++++++++++--------------------
 2 files changed, 55 insertions(+), 34 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c	2010-11-29 21:10:35.000000000 +1100
+++ linux-2.6/fs/dcache.c	2010-11-29 21:15:56.000000000 +1100
@@ -67,15 +67,33 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(unsigned int, nr_dentry_unused);
+
+static int get_nr_dentry(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static int get_nr_dentry_unused(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
-	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
-	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	dentry_stat.nr_dentry = get_nr_dentry();
+	dentry_stat.nr_unused = get_nr_dentry_unused();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -95,7 +113,7 @@ static void __d_free(struct rcu_head *he
  */
 static void d_free(struct dentry *dentry)
 {
-	percpu_counter_dec(&nr_dentry);
+	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
@@ -140,7 +158,7 @@ static void dentry_lru_add(struct dentry
 	if (list_empty(&dentry->d_lru)) {
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		this_cpu_inc(nr_dentry_unused);
 	}
 }
 
@@ -149,7 +167,7 @@ static void dentry_lru_del(struct dentry
 	if (!list_empty(&dentry->d_lru)) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		percpu_counter_dec(&nr_dentry_unused);
+		this_cpu_dec(nr_dentry_unused);
 	}
 }
 
@@ -158,7 +176,7 @@ static void dentry_lru_move_tail(struct
 	if (list_empty(&dentry->d_lru)) {
 		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		this_cpu_inc(nr_dentry_unused);
 	} else {
 		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
@@ -546,7 +564,7 @@ static void prune_dcache(int count)
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	int unused = get_nr_dentry_unused();
 	int prune_ratio;
 	int pruned;
 
@@ -916,7 +934,7 @@ static int shrink_dcache_memory(struct s
 		prune_dcache(nr);
 	}
 
-	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	nr_unused = get_nr_dentry_unused();
 	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
@@ -986,7 +1004,7 @@ struct dentry *d_alloc(struct dentry * p
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	spin_unlock(&dcache_lock);
 
-	percpu_counter_inc(&nr_dentry);
+	this_cpu_inc(nr_dentry);
 
 	return dentry;
 }
 
@@ -2427,9 +2445,6 @@ static void __init dcache_init(void)
 {
 	int loop;
 
-	percpu_counter_init(&nr_dentry, 0);
-	percpu_counter_init(&nr_dentry_unused, 0);
-
 	/*
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2010-11-29 21:20:46.000000000 +1100
+++ linux-2.6/fs/inode.c	2010-11-29 21:31:30.000000000 +1100
@@ -102,26 +102,34 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
-static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
+static DEFINE_PER_CPU(unsigned int, nr_inodes_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes);
-}
-
-static inline int get_nr_inodes_unused(void)
-{
-	return percpu_counter_sum_positive(&nr_inodes_unused);
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static int get_nr_inodes_unused(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes_unused, i);
+	return sum < 0 ? 0 : sum;
 }
 
 int get_nr_dirty_inodes(void)
 {
+	/* not actually dirty inodes, but a wild approximation */
 	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
-
 }
 
 /*
@@ -224,7 +232,7 @@ int inode_init_always(struct super_block
 	inode->i_fsnotify_mask = 0;
 #endif
 
-	percpu_counter_inc(&nr_inodes);
+	this_cpu_inc(nr_inodes);
 
 	return 0;
 out:
@@ -266,7 +274,7 @@ void __destroy_inode(struct inode *inode
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
-	percpu_counter_dec(&nr_inodes);
+	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
@@ -335,7 +343,7 @@ static void inode_lru_list_add(struct in
 {
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
-		percpu_counter_inc(&nr_inodes_unused);
+		this_cpu_inc(nr_inodes_unused);
 	}
 }
 
@@ -343,7 +351,7 @@ static void inode_lru_list_del(struct in
 {
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
-		percpu_counter_dec(&nr_inodes_unused);
+		this_cpu_dec(nr_inodes_unused);
 	}
 }
 
@@ -513,7 +521,7 @@ void evict_inodes(struct super_block *sb
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -554,7 +562,7 @@ int invalidate_inodes(struct super_block
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -616,7 +624,7 @@ static void prune_icache(int nr_to_scan)
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			list_del_init(&inode->i_lru);
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 			continue;
 		}
 
@@ -650,7 +658,7 @@ static void prune_icache(int nr_to_scan)
 		 */
 		list_move(&inode->i_lru, &freeable);
 		list_del_init(&inode->i_wb_list);
-		percpu_counter_dec(&nr_inodes_unused);
+		this_cpu_dec(nr_inodes_unused);
 	}
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1648,8 +1656,6 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 			 init_once);
 	register_shrinker(&icache_shrinker);
-	percpu_counter_init(&nr_inodes, 0);
-	percpu_counter_init(&nr_inodes_unused, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)