linux-fsdevel.vger.kernel.org archive mirror
* [patch] fs: use fast counters for vfs caches
@ 2010-11-29 10:57 Nick Piggin
  2010-12-09  5:43 ` Dave Chinner
  0 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2010-11-29 10:57 UTC (permalink / raw)
  To: linux-fsdevel, Linus Torvalds, Al Viro, Christoph Hellwig

[-- Attachment #1: Type: text/plain, Size: 34 bytes --]

Sorry, forgot to cc linux-fsdevel

[-- Attachment #2: Type: message/rfc822, Size: 11235 bytes --]

From: Nick Piggin <npiggin@kernel.dk>
To: Linus Torvalds <torvalds@linux-foundation.org>, Al Viro <viro@ZenIV.linux.org.uk>, Christoph Hellwig <hch@lst.de>
Subject: [patch] fs: use fast counters for vfs caches
Date: Mon, 29 Nov 2010 21:49:52 +1100
Message-ID: <20101129104952.GA3185@amd>

Hey,

What was the reason behind not using my approach of fast per-cpu
counters for the inode and dentry counters, and instead using the
percpu_counter lib (which is not useful unless very fast approximate
access to the global counter is required, or performance is not
critical, which is somewhat of an oxymoron if you're using per-cpu
counters in the first place)? It is the difference between this:

        incl %gs:nr_dentry      # nr_dentry

versus this horrible thing:

        movl    percpu_counter_batch(%rip), %edx        # percpu_counter_batch,
        movl    $1, %esi        #,
        movq    $nr_dentry, %rdi        #,
        call    __percpu_counter_add    # (plus I clobber registers)

__percpu_counter_add:
        pushq   %rbp    #
        movq    %rsp, %rbp      #,
        subq    $32, %rsp       #,
        movq    %rbx, -24(%rbp) #,
        movq    %r12, -16(%rbp) #,
        movq    %r13, -8(%rbp)  #,
        movq    %rdi, %rbx      # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        incl    -8124(%rax)     # <variable>.preempt_count
        movq    32(%rdi), %r12  # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
        add %gs:this_cpu_off, %r12      # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
        movslq  (%r12),%r13     #* tcp_ptr__, tmp73
        movslq  %edx,%rax       # batch, batch
        addq    %rsi, %r13      # amount, count
        cmpq    %rax, %r13      # batch, count
        jge     .L27    #,
        negl    %edx    # tmp76
        movslq  %edx,%rdx       # tmp76, tmp77
        cmpq    %rdx, %r13      # tmp77, count
        jg      .L28    #,
.L27:
        movq    %rbx, %rdi      # fbc,
        call    _raw_spin_lock  #
        addq    %r13, 8(%rbx)   # count, <variable>.count
        movq    %rbx, %rdi      # fbc,
        movl    $0, (%r12)      #,* tcp_ptr__
        call    _raw_spin_unlock        #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        decl    -8124(%rax)     # <variable>.preempt_count
        movq    -8136(%rax), %rax       #, D.14625
        testb   $8, %al #, D.14625
        jne     .L32    #,
.L31:
        movq    -24(%rbp), %rbx #,
        movq    -16(%rbp), %r12 #,
        movq    -8(%rbp), %r13  #,
        leave
        ret
        .p2align 4,,10
        .p2align 3
.L28:
        movl    %r13d, (%r12)   # count,*
        jmp     .L29    #
.L32:
        call    preempt_schedule        #
        .p2align 4,,6
        jmp     .L31    #
        .size   __percpu_counter_add, .-__percpu_counter_add
        .p2align 4,,15
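
At the C level, the same contrast looks roughly like this (a minimal
sketch for illustration only; the helper names are made up and are not
part of the patch):

        /* Static per-cpu variable: the update compiles to a single
         * %gs-relative instruction on x86-64. */
        static DEFINE_PER_CPU(unsigned int, nr_dentry);

        static void count_dentry_alloc(void)
        {
                this_cpu_inc(nr_dentry);
        }

        /* Library counter: batching, a spinlock, and an indirection
         * through fbc->counters on every update. */
        static struct percpu_counter nr_dentry_fbc;

        static void count_dentry_alloc_lib(void)
        {
                percpu_counter_inc(&nr_dentry_fbc);
        }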

So, fix it.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>

---
I was really trying to be very careful with single-thread performance,
and to count cycles for every change made :( No matter what happens,
fine-grained locking will bloat things up and slow them down, so every
effort has to be made to ensure it is done with as little impact as
possible.

Also, making the LRU counters per-cpu is not really that useful, because
the LRU itself never becomes a per-cpu structure. In my patches, they
become per-zone counters, which is reasonable since all the other LRU
manipulations are per-zone as well. I'll have to change that back at
some point, too.
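
For reference, the rough shape of the per-zone arrangement described
above (purely illustrative; the struct and field names here are
assumptions, not code from that patchset):

        struct zone_lru {
                spinlock_t       lock;     /* guards both list and count */
                struct list_head list;     /* the LRU itself */
                long             nr_items; /* modified only under lock */
        };

Because the counter lives next to the list it counts, under the same
lock, reading it is a plain load and no cross-CPU summing is needed.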

---
 fs/dcache.c |   43 +++++++++++++++++++++++++++++--------------
 fs/inode.c  |   46 ++++++++++++++++++++++++++--------------------
 2 files changed, 55 insertions(+), 34 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c	2010-11-29 21:10:35.000000000 +1100
+++ linux-2.6/fs/dcache.c	2010-11-29 21:15:56.000000000 +1100
@@ -67,15 +67,33 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(unsigned int, nr_dentry_unused);
+
+static int get_nr_dentry(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static int get_nr_dentry_unused(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
-	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
-	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	dentry_stat.nr_dentry = get_nr_dentry();
+	dentry_stat.nr_unused = get_nr_dentry_unused();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -95,7 +113,7 @@ static void __d_free(struct rcu_head *he
  */
 static void d_free(struct dentry *dentry)
 {
-	percpu_counter_dec(&nr_dentry);
+	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
@@ -140,7 +158,7 @@ static void dentry_lru_add(struct dentry
 	if (list_empty(&dentry->d_lru)) {
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		this_cpu_inc(nr_dentry_unused);
 	}
 }
 
@@ -149,7 +167,7 @@ static void dentry_lru_del(struct dentry
 	if (!list_empty(&dentry->d_lru)) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		percpu_counter_dec(&nr_dentry_unused);
+		this_cpu_dec(nr_dentry_unused);
 	}
 }
 
@@ -158,7 +176,7 @@ static void dentry_lru_move_tail(struct
 	if (list_empty(&dentry->d_lru)) {
 		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		this_cpu_inc(nr_dentry_unused);
 	} else {
 		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
@@ -546,7 +564,7 @@ static void prune_dcache(int count)
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	int unused = get_nr_dentry_unused();
 	int prune_ratio;
 	int pruned;
 
@@ -916,7 +934,7 @@ static int shrink_dcache_memory(struct s
 		prune_dcache(nr);
 	}
 
-	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	nr_unused = get_nr_dentry_unused();
 	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
@@ -986,7 +1004,7 @@ struct dentry *d_alloc(struct dentry * p
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	spin_unlock(&dcache_lock);
 
-	percpu_counter_inc(&nr_dentry);
+	this_cpu_inc(nr_dentry);
 
 	return dentry;
 }
@@ -2427,9 +2445,6 @@ static void __init dcache_init(void)
 {
 	int loop;
 
-	percpu_counter_init(&nr_dentry, 0);
-	percpu_counter_init(&nr_dentry_unused, 0);
-
 	/* 
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2010-11-29 21:20:46.000000000 +1100
+++ linux-2.6/fs/inode.c	2010-11-29 21:31:30.000000000 +1100
@@ -102,26 +102,34 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
-static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
+static DEFINE_PER_CPU(unsigned int, nr_inodes_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes);
-}
-
-static inline int get_nr_inodes_unused(void)
-{
-	return percpu_counter_sum_positive(&nr_inodes_unused);
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static int get_nr_inodes_unused(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes_unused, i);
+	return sum < 0 ? 0 : sum;
 }
 
 int get_nr_dirty_inodes(void)
 {
+	/* not actually dirty inodes, but a wild approximation */
 	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
-
 }
 
 /*
@@ -224,7 +232,7 @@ int inode_init_always(struct super_block
 	inode->i_fsnotify_mask = 0;
 #endif
 
-	percpu_counter_inc(&nr_inodes);
+	this_cpu_inc(nr_inodes);
 
 	return 0;
 out:
@@ -266,7 +274,7 @@ void __destroy_inode(struct inode *inode
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
-	percpu_counter_dec(&nr_inodes);
+	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
@@ -335,7 +343,7 @@ static void inode_lru_list_add(struct in
 {
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
-		percpu_counter_inc(&nr_inodes_unused);
+		this_cpu_inc(nr_inodes_unused);
 	}
 }
 
@@ -343,7 +351,7 @@ static void inode_lru_list_del(struct in
 {
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
-		percpu_counter_dec(&nr_inodes_unused);
+		this_cpu_dec(nr_inodes_unused);
 	}
 }
 
@@ -513,7 +521,7 @@ void evict_inodes(struct super_block *sb
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -554,7 +562,7 @@ int invalidate_inodes(struct super_block
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -616,7 +624,7 @@ static void prune_icache(int nr_to_scan)
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			list_del_init(&inode->i_lru);
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 			continue;
 		}
 
@@ -650,7 +658,7 @@ static void prune_icache(int nr_to_scan)
 		 */
 		list_move(&inode->i_lru, &freeable);
 		list_del_init(&inode->i_wb_list);
-		percpu_counter_dec(&nr_inodes_unused);
+		this_cpu_dec(nr_inodes_unused);
 	}
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1648,8 +1656,6 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 					 init_once);
 	register_shrinker(&icache_shrinker);
-	percpu_counter_init(&nr_inodes, 0);
-	percpu_counter_init(&nr_inodes_unused, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)

* Re: [patch] fs: use fast counters for vfs caches
  2010-11-29 10:57 [patch] fs: use fast counters for vfs caches Nick Piggin
@ 2010-12-09  5:43 ` Dave Chinner
  2010-12-09  6:16   ` Nick Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Dave Chinner @ 2010-12-09  5:43 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel, Linus Torvalds, Al Viro, Christoph Hellwig

On Mon, Nov 29, 2010 at 09:57:33PM +1100, Nick Piggin wrote:
> Hey,
> 
> What was the reason behind not using my approach of fast per-cpu
> counters for the inode and dentry counters, and instead using the
> percpu_counter lib (which is not useful unless very fast approximate
> access to the global counter is required, or performance is not
> critical, which is somewhat of an oxymoron if you're using per-cpu
> counters in the first place)? It is the difference between this:

Hi Nick - sorry for being slow to answer this - I only just found
this email.

The reason for using the generic counters is that the shrinkers
read the current value of the global counter on every call, and hence
they can be read thousands of times a second. The only way to do that
efficiently is to use the approximate value the generic counters
provide.

IIRC, it's also used in a couple of places in the writeback code, but
that involves significantly fewer reads of the summed value. It's
probably best to revisit this once the shrinkers no longer use a
global counter value.
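
For reference, a simplified sketch of the two read paths (written from
memory, not the exact lib/percpu_counter.c source):

        /* O(1): returns just the batched global value, which can lag
         * the true count by up to nr_cpus * batch. */
        s64 percpu_counter_read_positive(struct percpu_counter *fbc)
        {
                s64 ret = fbc->count;

                return ret > 0 ? ret : 0;
        }

        /* O(nr_cpus): folds every CPU's local delta into the global
         * count under the counter's spinlock. */
        s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
        {
                s64 ret;
                int cpu;

                spin_lock(&fbc->lock);
                ret = fbc->count;
                for_each_online_cpu(cpu)
                        ret += *per_cpu_ptr(fbc->counters, cpu);
                spin_unlock(&fbc->lock);
                return ret > 0 ? ret : 0;
        }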

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

* Re: [patch] fs: use fast counters for vfs caches
  2010-12-09  5:43 ` Dave Chinner
@ 2010-12-09  6:16   ` Nick Piggin
  2010-12-09  6:40     ` Nick Piggin
  2010-12-09  7:45     ` [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches) Dave Chinner
  0 siblings, 2 replies; 10+ messages in thread
From: Nick Piggin @ 2010-12-09  6:16 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Nick Piggin, linux-fsdevel, Linus Torvalds, Al Viro,
	Christoph Hellwig

On Thu, Dec 09, 2010 at 04:43:43PM +1100, Dave Chinner wrote:
> On Mon, Nov 29, 2010 at 09:57:33PM +1100, Nick Piggin wrote:
> > Hey,
> > 
> > What was the reason behind not using my approach of fast per-cpu
> > counters for the inode and dentry counters, and instead using the
> > percpu_counter lib (which is not useful unless very fast approximate
> > access to the global counter is required, or performance is not
> > critical, which is somewhat of an oxymoron if you're using per-cpu
> > counters in the first place)? It is the difference between this:
> 
> Hi Nick - sorry for being slow to answer this - I only just found
> this email.
> 
> The reason for using the generic counters is that the shrinkers
> read the current value of the global counter on every call, and hence
> they can be read thousands of times a second. The only way to do that
> efficiently is to use the approximate value the generic counters
> provide.

That is not what is happening, though, so I assume that no measurements
were done.

In fact what happens now is that *both* types of counters use the crappy
percpu_counter library, and the shrinkers actually do a per-cpu loop
over the counters to get the sum.

But anyway, even if that operation were fast, it is silly to use a per-cpu
counter for nr_unused, because it is tied fundamentally to the LRU,
so you can't get any more scalability than the LRU operations anyway!

I'm all for breaking out patches and pulling things ahead where they
make sense, but it seems like things have just been done without much
thought or measurements or any critical discussion of why changes were
made.

There wasn't even any point making the total counter per-cpu yet either:
seeing as there is still a lot of global locking in there, it would not
have made any difference to scalability, and only slowed things down.

What it _should_ look like is exactly what I had in my tree: proper,
fast total object counters with a per-cpu loop for the sum once the
global locks in the create/destroy path are lifted, and a per-LRU
counter for nr_unused, protected together with the LRU lock.
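
A minimal sketch of that layout (names abbreviated for illustration; it
mirrors the pattern in the patch at the top of this thread):

        /* Total count: static per-cpu, summed only on the rare read
         * side (/proc). */
        static DEFINE_PER_CPU(unsigned int, nr_dentry);

        /* Unused count: a plain global, only ever touched under the
         * same lock that protects the LRU list itself. */
        static int nr_dentry_unused;

        static int get_nr_dentry(void)
        {
                int cpu, sum = 0;

                for_each_possible_cpu(cpu)
                        sum += per_cpu(nr_dentry, cpu);
                return sum < 0 ? 0 : sum;
        }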


* Re: [patch] fs: use fast counters for vfs caches
  2010-12-09  6:16   ` Nick Piggin
@ 2010-12-09  6:40     ` Nick Piggin
  2010-12-10  4:51       ` [patch 1/2] fs: revert percpu nr_unused counters for dentry and inodes Nick Piggin
  2010-12-09  7:45     ` [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches) Dave Chinner
  1 sibling, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2010-12-09  6:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Dave Chinner, linux-fsdevel, Linus Torvalds, Al Viro,
	Christoph Hellwig

On Thu, Dec 09, 2010 at 05:16:44PM +1100, Nick Piggin wrote:
> On Thu, Dec 09, 2010 at 04:43:43PM +1100, Dave Chinner wrote:
> > On Mon, Nov 29, 2010 at 09:57:33PM +1100, Nick Piggin wrote:
> > > Hey,
> > > 
> > > What was the reason behind not using my approach of fast per-cpu
> > > counters for the inode and dentry counters, and instead using the
> > > percpu_counter lib (which is not useful unless very fast approximate
> > > access to the global counter is required, or performance is not
> > > critical, which is somewhat of an oxymoron if you're using per-cpu
> > > counters in the first place)? It is the difference between this:
> > 
> > Hi Nick - sorry for being slow to answer this - I only just found
> > this email.
> > 
> > The reason for using the generic counters is that the shrinkers
> > read the current value of the global counter on every call, and hence
> > they can be read thousands of times a second. The only way to do that
> > efficiently is to use the approximate value the generic counters
> > provide.
> 
> That is not what is happening, though, so I assume that no measurements
> were done.
> 
> In fact what happens now is that *both* types of counters use the crappy
> percpu_counter library, and the shrinkers actually do a per-cpu loop
> over the counters to get the sum.
> 
> > But anyway, even if that operation were fast, it is silly to use a per-cpu
> > counter for nr_unused, because it is tied fundamentally to the LRU,
> > so you can't get any more scalability than the LRU operations anyway!
> 
> I'm all for breaking out patches and pulling things ahead where they
> make sense, but it seems like things have just been done without much
> thought or measurements or any critical discussion of why changes were
> made.
> 
> There wasn't even any point making the total counter per-cpu yet either:
> seeing as there is still a lot of global locking in there, it would not
> have made any difference to scalability, and only slowed things down.
> 
> What it _should_ look like is exactly what I had in my tree: proper,
> fast total object counters with a per-cpu loop for the sum once the
> global locks in the create/destroy path are lifted, and a per-LRU
> counter for nr_unused, protected together with the LRU lock.

In fact, I should revise my regression fix to go back to global LRU
counters until per-zone LRUs are implemented. Sigh, yay for more tree
breakage.


* [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches)
  2010-12-09  6:16   ` Nick Piggin
  2010-12-09  6:40     ` Nick Piggin
@ 2010-12-09  7:45     ` Dave Chinner
  2010-12-09 12:24       ` Nick Piggin
  1 sibling, 1 reply; 10+ messages in thread
From: Dave Chinner @ 2010-12-09  7:45 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel, Linus Torvalds, Al Viro, Christoph Hellwig

On Thu, Dec 09, 2010 at 05:16:44PM +1100, Nick Piggin wrote:
> On Thu, Dec 09, 2010 at 04:43:43PM +1100, Dave Chinner wrote:
> > On Mon, Nov 29, 2010 at 09:57:33PM +1100, Nick Piggin wrote:
> > > Hey,
> > > 
> > > What was the reason behind not using my approach of fast per-cpu
> > > counters for the inode and dentry counters, and instead using the
> > > percpu_counter lib (which is not useful unless very fast approximate
> > > access to the global counter is required, or performance is not
> > > critical, which is somewhat of an oxymoron if you're using per-cpu
> > > counters in the first place)? It is the difference between this:
> > 
> > Hi Nick - sorry for being slow to answer this - I only just found
> > this email.
> > 
> > The reason for using the generic counters is that the shrinkers
> > read the current value of the global counter on every call, and hence
> > they can be read thousands of times a second. The only way to do that
> > efficiently is to use the approximate value the generic counters
> > provide.
> 
> That is not what is happening, though, so I assume that no measurements
> were done.
>
> In fact what happens now is that *both* types of counters use the crappy
> percpu_counter library, and the shrinkers actually do a per-cpu loop
> over the counters to get the sum.

More likely that the overhead was hidden in the noise on the size of
machines most people test on. It certainly wasn't measurable on my
16p machine, and nobody who reviewed it at the time (several people)
picked it up. So thanks for reviewing it - the simple fix is below.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

fs: Use approximate values for number of inodes and dentries

From: Dave Chinner <dchinner@redhat.com>

The numbers of inodes and dentries are kept in percpu counters that
are summed by the shrinkers. This can result in summing across all
CPUs thousands of times per second per counter, which is very
inefficient. The approximate counter value should be used instead to
keep the overhead to a minimum.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/dcache.c |    4 ++--
 fs/inode.c  |    4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 23702a9..a533184 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -546,7 +546,7 @@ static void prune_dcache(int count)
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	int unused = percpu_counter_read_positive(&nr_dentry_unused);
 	int prune_ratio;
 	int pruned;
 
@@ -916,7 +916,7 @@ static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 		prune_dcache(nr);
 	}
 
-	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	nr_unused = percpu_counter_read_positive(&nr_dentry_unused);
 	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index ae2727a..975a651 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -109,12 +109,12 @@ static struct kmem_cache *inode_cachep __read_mostly;
 
 static inline int get_nr_inodes(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes);
+	return percpu_counter_read_positive(&nr_inodes);
 }
 
 static inline int get_nr_inodes_unused(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes_unused);
+	return percpu_counter_read_positive(&nr_inodes_unused);
 }
 
 int get_nr_dirty_inodes(void)

* Re: [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches)
  2010-12-09  7:45     ` [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches) Dave Chinner
@ 2010-12-09 12:24       ` Nick Piggin
  2010-12-09 23:30         ` Dave Chinner
  0 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2010-12-09 12:24 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Nick Piggin, linux-fsdevel, Linus Torvalds, Al Viro,
	Christoph Hellwig

On Thu, Dec 9, 2010 at 6:45 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Thu, Dec 09, 2010 at 05:16:44PM +1100, Nick Piggin wrote:
>> On Thu, Dec 09, 2010 at 04:43:43PM +1100, Dave Chinner wrote:
>> > On Mon, Nov 29, 2010 at 09:57:33PM +1100, Nick Piggin wrote:
>> > > Hey,
>> > >
>> > > What was the reason behind not using my approach of fast per-cpu
>> > > counters for the inode and dentry counters, and instead using the
>> > > percpu_counter lib (which is not useful unless very fast approximate
>> > > access to the global counter is required, or performance is not
>> > > critical, which is somewhat of an oxymoron if you're using per-cpu
>> > > counters in the first place)? It is the difference between this:
>> >
>> > Hi Nick - sorry for being slow to answer this - I only just found
>> > this email.
>> >
>> > The reason for using the generic counters is that the shrinkers
>> > read the current value of the global counter on every call, and hence
>> > they can be read thousands of times a second. The only way to do that
>> > efficiently is to use the approximate value the generic counters
>> > provide.
>>
>> That is not what is happening, though, so I assume that no measurements
>> were done.
>>
>> In fact what happens now is that *both* types of counters use the crappy
>> percpu_counter library, and the shrinkers actually do a per-cpu loop
>> over the counters to get the sum.
>
> More likely that the overhead was hidden in the noise on the size of
> machines most people test on.

No. I was referring to the decision to use the heavyweight percpu_counter
code over the superior per-cpu data that I was using.

Also, the unrelated change to make nr_unused per-cpu was not
right, and I will revert that back to a global variable (again, unless
you have numbers).

> It certainly wasn't measurable on my
> 16p machine, and nobody who reviewed it at the time (several people)
> picked it up. So thanks for reviewing it - the simple fix is below.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
>
> fs: Use approximate values for number of inodes and dentries
>
> From: Dave Chinner <dchinner@redhat.com>

Nack. Can you please address my points and actually explain why this
is better than my proposed approach?

* Re: [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches)
  2010-12-09 12:24       ` Nick Piggin
@ 2010-12-09 23:30         ` Dave Chinner
  2010-12-10  2:23           ` Nick Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Dave Chinner @ 2010-12-09 23:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Nick Piggin, linux-fsdevel, Linus Torvalds, Al Viro,
	Christoph Hellwig

On Thu, Dec 09, 2010 at 11:24:38PM +1100, Nick Piggin wrote:
> On Thu, Dec 9, 2010 at 6:45 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Thu, Dec 09, 2010 at 05:16:44PM +1100, Nick Piggin wrote:
> >> On Thu, Dec 09, 2010 at 04:43:43PM +1100, Dave Chinner wrote:
> >> > On Mon, Nov 29, 2010 at 09:57:33PM +1100, Nick Piggin wrote:
> >> > > Hey,
> >> > >
> >> > > What was the reason behind not using my approach of fast per-cpu
> >> > > counters for the inode and dentry counters, and instead using the
> >> > > percpu_counter lib (which is not useful unless very fast approximate
> >> > > access to the global counter is required, or performance is not
> >> > > critical, which is somewhat of an oxymoron if you're using per-cpu
> >> > > counters in the first place)? It is the difference between this:
> >> >
> >> > Hi Nick - sorry for being slow to answer this - I only just found
> >> > this email.
> >> >
> >> > The reason for using the generic counters is that the shrinkers
> >> > read the current value of the global counter on every call, and hence
> >> > they can be read thousands of times a second. The only way to do that
> >> > efficiently is to use the approximate value the generic counters
> >> > provide.
> >>
> >> That is not what is happening, though, so I assume that no measurements
> >> were done.
> >>
> >> In fact what happens now is that *both* types of counters use the crappy
> >> percpu_counter library, and the shrinkers actually do a per-cpu loop
> >> over the counters to get the sum.
> >
> > More likely that the overhead was hidden in the noise on the size of
> > machines most people test on.
> 
> No. I was referring to the decision to use the heavyweight percpu_counter
> code over the superior per-cpu data that I was using.

Your "superior" solution is only superior when you don't have to sum
the counters regularly.

I'll repeat what Andrew Morton said early on when your per-cpu
counter approach was first discussed: If you think the generic
percpu counters are too heavyweight, then _fix the generic counters_
rather than hack around them. That way everyone who uses the generic
infrastructure benefits and it reduces the desire for every subsystem
to roll their own specialised percpu counters...

> Also, the unrelated change to make nr_unused per-cpu was not
> right, and I will revert that back to a global variable (again, unless
> you have numbers).

What "nr_unused" variable? nr_dentrys_unused, nr_inodes_unused or
some other variable? And, apart from the overhead, why is it wrong -
does it give incorrect values?

> > It certainly wasn't measurable on my
> > 16p machine, and nobody who reviewed it at the time (several people)
> > picked it up. So thanks for reviewing it - the simple fix is below.
> >
> > Cheers,
> >
> > Dave.
> > --
> > Dave Chinner
> > david@fromorbit.com
> >
> > fs: Use approximate values for number of inodes and dentries
> >
> > From: Dave Chinner <dchinner@redhat.com>
> 
> Nack. Can you please address my points and actually explain why this
> is better than my proposed approach?

FFS. What bit of "need to sum the counters thousands of times a
second" don't you understand?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

* Re: [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches)
  2010-12-09 23:30         ` Dave Chinner
@ 2010-12-10  2:23           ` Nick Piggin
  0 siblings, 0 replies; 10+ messages in thread
From: Nick Piggin @ 2010-12-10  2:23 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Nick Piggin, Nick Piggin, linux-fsdevel, Linus Torvalds, Al Viro,
	Christoph Hellwig

On Fri, Dec 10, 2010 at 10:30:28AM +1100, Dave Chinner wrote:
> On Thu, Dec 09, 2010 at 11:24:38PM +1100, Nick Piggin wrote:
> > No. I was referring to the decision to use the heavyweight percpu_counter
> > code over the superior per-cpu data that I was using.
> 
> Your "superior" solution is only superior when you don't have to sum
> the counters regularly.

I was talking about using a per-cpu variable only for the total counts.
The unused counts would be per-LRU (i.e. a global variable in this
case).

 
> I'll repeat what Andrew Morton said early on when your per-cpu
> counter approach was first discussed: If you think the generic
> percpu counters are too heavyweight, then _fix the generic counters_
> rather than hack around them. That way everyone who uses the generic
> infrastructure benefits and it reduces the desire for every subsystem
> to roll their own specialised percpu counters...

So why was the percpu_counter patch merged without addressing *my*
concern that it is too heavyweight? Hmm?


> > Also, the unrelated change to make nr_unused per-cpu was not
> > right, and I will revert that back to a global variable (again, unless
> > you have numbers).
> 
> What "nr_unused" variable? nr_dentrys_unused, nr_inodes_unused or
> some other variable? And, apart from the overhead, why is it wrong -
> does it give incorrect values?

It's wrong because it is tied completely to LRU operation and can't
be at all scalable anyway. I said that in this thread already: there
is no point adding the overhead of a per-cpu counter for operations
that are done under a lock anyway.

 
> > > It certainly wasn't measurable on my
> > > 16p machine, and nobody who reviewed it at the time (several people)
> > > picked it up. So thanks for reviewing it - the simple fix is below.
> > >
> > > Cheers,
> > >
> > > Dave.
> > > --
> > > Dave Chinner
> > > david@fromorbit.com
> > >
> > > fs: Use approximate values for number of inodes and dentries
> > >
> > > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Nack. Can you please address my points and actually explain why this
> > is better than my proposed approach?
> 
> FFS. What bit of "need to sum the counters thousands of times a
> second" don't you understand?

The part where reclaim only sums the nr_unused counter, which I
said should not be per-cpu.


* [patch 1/2] fs: revert percpu nr_unused counters for dentry and inodes
  2010-12-09  6:40     ` Nick Piggin
@ 2010-12-10  4:51       ` Nick Piggin
  2010-12-10  4:55         ` [patch 2/2] fs: use fast counters for vfs caches Nick Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2010-12-10  4:51 UTC (permalink / raw)
  To: Al Viro; +Cc: Dave Chinner, linux-fsdevel, Linus Torvalds, Christoph Hellwig

On Thu, Dec 09, 2010 at 05:40:28PM +1100, Nick Piggin wrote:
> > What it _should_ look like is exactly what I had in my tree: proper,
> > fast total object counters with a per-cpu loop for the sum once the
> > global locks in the create/destroy path are lifted, and a per-LRU
> > counter for nr_unused, protected together with the LRU lock.
> 
> In fact, I should revise my regression fix to go back to global LRU
> counters until per-zone LRUs are implemented. Sigh, yay for more tree
> breakage.


vfs: revert per-cpu nr_unused counters for dentry and inodes

The nr_unused counters count the number of objects on an LRU, and as such
they are synchronized with LRU object insertion, removal and scanning,
and protected under the LRU lock.

Making them per-cpu does not actually buy any concurrency improvement,
because of this lock; summing the counter becomes much slower, and
incrementing/decrementing it costs more code size and is slower too.

These counters should stay per-LRU, which currently means global.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>

---
 fs/dcache.c |   16 +++++-----------
 fs/inode.c  |   17 +++++++----------
 2 files changed, 12 insertions(+), 21 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c	2010-12-10 15:39:54.000000000 +1100
+++ linux-2.6/fs/dcache.c	2010-12-10 15:47:38.000000000 +1100
@@ -68,14 +68,12 @@ struct dentry_stat_t dentry_stat = {
 };
 
 static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
-	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -140,7 +138,7 @@ static void dentry_lru_add(struct dentry
 	if (list_empty(&dentry->d_lru)) {
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		dentry_stat.nr_unused++;
 	}
 }
 
@@ -149,7 +147,7 @@ static void dentry_lru_del(struct dentry
 	if (!list_empty(&dentry->d_lru)) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		percpu_counter_dec(&nr_dentry_unused);
+		dentry_stat.nr_unused--;
 	}
 }
 
@@ -158,7 +156,7 @@ static void dentry_lru_move_tail(struct
 	if (list_empty(&dentry->d_lru)) {
 		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		dentry_stat.nr_unused++;
 	} else {
 		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
@@ -546,7 +544,7 @@ static void prune_dcache(int count)
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	int unused = dentry_stat.nr_unused;
 	int prune_ratio;
 	int pruned;
 
@@ -908,16 +906,13 @@ EXPORT_SYMBOL(shrink_dcache_parent);
  */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
-	int nr_unused;
-
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 		prune_dcache(nr);
 	}
 
-	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
-	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
+	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dcache_shrinker = {
@@ -2428,7 +2423,6 @@ static void __init dcache_init(void)
 	int loop;
 
 	percpu_counter_init(&nr_dentry, 0);
-	percpu_counter_init(&nr_dentry_unused, 0);
 
 	/* 
 	 * A constructor could be added for stable state like the lists,
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2010-12-10 15:39:54.000000000 +1100
+++ linux-2.6/fs/inode.c	2010-12-10 15:47:38.000000000 +1100
@@ -103,7 +103,6 @@ static DECLARE_RWSEM(iprune_sem);
 struct inodes_stat_t inodes_stat;
 
 static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
-static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
@@ -114,7 +113,7 @@ static inline int get_nr_inodes(void)
 
 static inline int get_nr_inodes_unused(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes_unused);
+	return inodes_stat.nr_unused;
 }
 
 int get_nr_dirty_inodes(void)
@@ -132,7 +131,6 @@ int proc_nr_inodes(ctl_table *table, int
 		   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	inodes_stat.nr_inodes = get_nr_inodes();
-	inodes_stat.nr_unused = get_nr_inodes_unused();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -335,7 +333,7 @@ static void inode_lru_list_add(struct in
 {
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
-		percpu_counter_inc(&nr_inodes_unused);
+		inodes_stat.nr_unused++;
 	}
 }
 
@@ -343,7 +341,7 @@ static void inode_lru_list_del(struct in
 {
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
-		percpu_counter_dec(&nr_inodes_unused);
+		inodes_stat.nr_unused--;
 	}
 }
 
@@ -513,7 +511,7 @@ void evict_inodes(struct super_block *sb
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			inodes_stat.nr_unused--;
 	}
 	spin_unlock(&inode_lock);
 
@@ -554,7 +552,7 @@ int invalidate_inodes(struct super_block
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			inodes_stat.nr_unused--;
 	}
 	spin_unlock(&inode_lock);
 
@@ -616,7 +614,7 @@ static void prune_icache(int nr_to_scan)
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			list_del_init(&inode->i_lru);
-			percpu_counter_dec(&nr_inodes_unused);
+			inodes_stat.nr_unused--;
 			continue;
 		}
 
@@ -650,7 +648,7 @@ static void prune_icache(int nr_to_scan)
 		 */
 		list_move(&inode->i_lru, &freeable);
 		list_del_init(&inode->i_wb_list);
-		percpu_counter_dec(&nr_inodes_unused);
+		inodes_stat.nr_unused--;
 	}
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1649,7 +1647,6 @@ void __init inode_init(void)
 					 init_once);
 	register_shrinker(&icache_shrinker);
 	percpu_counter_init(&nr_inodes, 0);
-	percpu_counter_init(&nr_inodes_unused, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)

* [patch 2/2] fs: use fast counters for vfs caches
  2010-12-10  4:51       ` [patch 1/2] fs: revert percpu nr_unused counters for dentry and inodes Nick Piggin
@ 2010-12-10  4:55         ` Nick Piggin
  0 siblings, 0 replies; 10+ messages in thread
From: Nick Piggin @ 2010-12-10  4:55 UTC (permalink / raw)
  To: Al Viro
  Cc: Al Viro, Dave Chinner, linux-fsdevel, Linus Torvalds,
	Christoph Hellwig

The percpu_counter library generates quite nasty code, so unless you
need to dynamically allocate counters or take a fast approximate value,
a simple per-cpu set of counters is much better.

percpu_counter can never be made to work as well, because it has an
indirection from a pointer to the percpu memory, and it can't use the
direct this_cpu_inc() interfaces because it doesn't use static per-CPU
data, so the code will always be worse.
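
A simplified sketch of where that indirection shows up in the update
path (condensed from lib/percpu_counter.c, not the verbatim source):

        void __percpu_counter_add(struct percpu_counter *fbc, s64 amount,
                                  s32 batch)
        {
                s64 count;

                preempt_disable();
                /* A pointer load plus a per-cpu offset add just to find
                 * the local slot - exactly what a static DEFINE_PER_CPU
                 * variable avoids. */
                count = __this_cpu_read(*fbc->counters) + amount;
                if (count >= batch || count <= -batch) {
                        spin_lock(&fbc->lock);
                        fbc->count += count;
                        __this_cpu_write(*fbc->counters, 0);
                        spin_unlock(&fbc->lock);
                } else {
                        __this_cpu_write(*fbc->counters, count);
                }
                preempt_enable();
        }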

In the fastpath, it is the difference between this:

        incl %gs:nr_dentry      # nr_dentry

and this:

        movl    percpu_counter_batch(%rip), %edx        # percpu_counter_batch,
        movl    $1, %esi        #,
        movq    $nr_dentry, %rdi        #,
        call    __percpu_counter_add    # (plus I clobber registers)

__percpu_counter_add:
        pushq   %rbp    #
        movq    %rsp, %rbp      #,
        subq    $32, %rsp       #,
        movq    %rbx, -24(%rbp) #,
        movq    %r12, -16(%rbp) #,
        movq    %r13, -8(%rbp)  #,
        movq    %rdi, %rbx      # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        incl    -8124(%rax)     # <variable>.preempt_count
        movq    32(%rdi), %r12  # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
        add %gs:this_cpu_off, %r12      # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
        movslq  (%r12),%r13     #* tcp_ptr__, tmp73
        movslq  %edx,%rax       # batch, batch
        addq    %rsi, %r13      # amount, count
        cmpq    %rax, %r13      # batch, count
        jge     .L27    #,
        negl    %edx    # tmp76
        movslq  %edx,%rdx       # tmp76, tmp77
        cmpq    %rdx, %r13      # tmp77, count
        jg      .L28    #,
.L27:
        movq    %rbx, %rdi      # fbc,
        call    _raw_spin_lock  #
        addq    %r13, 8(%rbx)   # count, <variable>.count
        movq    %rbx, %rdi      # fbc,
        movl    $0, (%r12)      #,* tcp_ptr__
        call    _raw_spin_unlock        #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        decl    -8124(%rax)     # <variable>.preempt_count
        movq    -8136(%rax), %rax       #, D.14625
        testb   $8, %al #, D.14625
        jne     .L32    #,
.L31:
        movq    -24(%rbp), %rbx #,
        movq    -16(%rbp), %r12 #,
        movq    -8(%rbp), %r13  #,
        leave
        ret
        .p2align 4,,10
        .p2align 3
.L28:
        movl    %r13d, (%r12)   # count,*
        jmp     .L29    #
.L32:
        call    preempt_schedule        #
        .p2align 4,,6
        jmp     .L31    #
        .size   __percpu_counter_add, .-__percpu_counter_add
        .p2align 4,,15

Signed-off-by: Nick Piggin <npiggin@kernel.dk>

---
 fs/dcache.c |   43 +++++++++++++++++++++++++++++--------------
 fs/inode.c  |   46 ++++++++++++++++++++++++++--------------------
 2 files changed, 55 insertions(+), 34 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c	2010-12-10 15:39:56.000000000 +1100
+++ linux-2.6/fs/dcache.c	2010-12-10 15:42:38.000000000 +1100
@@ -67,13 +67,22 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+static int get_nr_dentry(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
-	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+	dentry_stat.nr_dentry = get_nr_dentry();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -93,7 +102,7 @@ static void __d_free(struct rcu_head *he
  */
 static void d_free(struct dentry *dentry)
 {
-	percpu_counter_dec(&nr_dentry);
+	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
@@ -981,7 +990,7 @@ struct dentry *d_alloc(struct dentry * p
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	spin_unlock(&dcache_lock);
 
-	percpu_counter_inc(&nr_dentry);
+	this_cpu_inc(nr_dentry);
 
 	return dentry;
 }
@@ -2422,8 +2431,6 @@ static void __init dcache_init(void)
 {
 	int loop;
 
-	percpu_counter_init(&nr_dentry, 0);
-
 	/* 
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2010-12-10 15:39:56.000000000 +1100
+++ linux-2.6/fs/inode.c	2010-12-10 15:43:59.000000000 +1100
@@ -102,13 +102,17 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes);
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes, i);
+	return sum < 0 ? 0 : sum;
 }
 
 static inline int get_nr_inodes_unused(void)
@@ -118,9 +122,9 @@ static inline int get_nr_inodes_unused(v
 
 int get_nr_dirty_inodes(void)
 {
+	/* not actually dirty inodes, but a wild approximation */
 	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
-
 }
 
 /*
@@ -222,7 +226,7 @@ int inode_init_always(struct super_block
 	inode->i_fsnotify_mask = 0;
 #endif
 
-	percpu_counter_inc(&nr_inodes);
+	this_cpu_inc(nr_inodes);
 
 	return 0;
 out:
@@ -264,7 +268,7 @@ void __destroy_inode(struct inode *inode
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
-	percpu_counter_dec(&nr_inodes);
+	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
@@ -1646,7 +1650,6 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 					 init_once);
 	register_shrinker(&icache_shrinker);
-	percpu_counter_init(&nr_inodes, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)

Thread overview: 10+ messages
2010-11-29 10:57 [patch] fs: use fast counters for vfs caches Nick Piggin
2010-12-09  5:43 ` Dave Chinner
2010-12-09  6:16   ` Nick Piggin
2010-12-09  6:40     ` Nick Piggin
2010-12-10  4:51       ` [patch 1/2] fs: revert percpu nr_unused counters for dentry and inodes Nick Piggin
2010-12-10  4:55         ` [patch 2/2] fs: use fast counters for vfs caches Nick Piggin
2010-12-09  7:45     ` [PATCH] fs: use approximate counter values for inodes and dentries. (was Re: [patch] fs: use fast counters for vfs caches) Dave Chinner
2010-12-09 12:24       ` Nick Piggin
2010-12-09 23:30         ` Dave Chinner
2010-12-10  2:23           ` Nick Piggin
