public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Alex Tomas <bzzz@tmi.comex.ru>
To: Andreas Dilger <adilger@clusterfs.com>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	ext2-devel@lists.sourceforge.net, Andrew Morton <akpm@digeo.com>
Subject: Re: [Ext2-devel] [PATCH] distributed counters for ext2 to avoid group scaning
Date: 17 Mar 2003 00:55:17 +0300	[thread overview]
Message-ID: <m3bs0bugca.fsf@lexa.home.net> (raw)
In-Reply-To: <20030316104447.D12806@schatzie.adilger.int>


hi!

>>>>> Andreas Dilger (AD) writes:
> > +	dcounter_init(&EXT2_SB(sb)->free_blocks_dc, total_free, 1);
> > +	dcounter_init(&EXT2_SB(sb)->free_inodes_dc,
> > +			le32_to_cpu (es->s_free_inodes_count), 1);
>
> So, why is it that the minimum free blocks/inodes is 1?

damn! it's my mistake. thank you.

> > +struct dcounter {
> > +	int base;
> > +	int min;
> > +	int diff[NR_CPUS];
> > +	seqlock_t lock;
> > +};
>
> For a generic struct, it probably makes more sense to make these fields
> "long" instead of "int".

fixed

> Also, while your goal is to reduce cache ping-pong between CPUs, we will
> now have cache ping-pong for the "diff" array.  We need to do per-cpu
> values or make each value cacheline aligned to avoid ping-pong.

yes, you're right. fixed

> Just for sanity's sake, it would be good to call these fields something
> other than "min" and "lock", since that makes life just hell with tags
> (it always bugs me when structs have fields called list_head, list_entry,
> page, inode, etc).
>
> Maybe something like "dc_min", "dc_lock", etc. would be much nicer?
> The same goes for the fields in the block group info - it would be nice
> if they had a "bgi_" prefix.

it makes sense. I've corrected this for dcounters and hope to correct for
block group info in future cleanups. 

again, corrected patch is against 2.5.64-mm8.

with best regards, Alex

diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c	Sun Mar 16 17:21:33 2003
+++ edited/fs/ext2/balloc.c	Sun Mar 16 17:40:28 2003
@@ -138,6 +138,7 @@
 	desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
 
 	spin_unlock(&bgi->balloc_lock);
+	dcounter_add(&EXT2_SB(sb)->free_blocks_dc, -count);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh);
 	return count;
@@ -156,6 +157,7 @@
 		desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
 
 		spin_unlock(&bgi->balloc_lock);
+		dcounter_add(&EXT2_SB(sb)->free_blocks_dc, count);
 		sb->s_dirt = 1;
 		mark_buffer_dirty(bh);
 	}
@@ -491,7 +493,12 @@
 	goto out_release;
 }
 
-unsigned long ext2_count_free_blocks (struct super_block * sb)
+unsigned long ext2_count_free_blocks(struct super_block *sb)
+{
+	return dcounter_value(&EXT2_SB(sb)->free_blocks_dc);
+}
+
+unsigned long ext2_count_free_blocks_old(struct super_block *sb)
 {
 	struct ext2_group_desc * desc;
 	unsigned long desc_count = 0;
diff -uNr linux/fs/ext2/ialloc.c edited/fs/ext2/ialloc.c
--- linux/fs/ext2/ialloc.c	Sun Mar 16 17:21:33 2003
+++ edited/fs/ext2/ialloc.c	Mon Mar 17 00:29:25 2003
@@ -88,10 +88,13 @@
 	spin_lock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock);
 	desc->bg_free_inodes_count =
 		cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) - 1);
-	if (dir)
+	if (dir) {
 		desc->bg_used_dirs_count =
 			cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) + 1);
+		dcounter_add(&EXT2_SB(sb)->dirs_dc, 1);
+	}
 	spin_unlock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock);
+	dcounter_add(&EXT2_SB(sb)->free_inodes_dc, -1);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh);
 }
@@ -111,10 +114,13 @@
 	spin_lock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock);
 	desc->bg_free_inodes_count =
 		cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) + 1);
-	if (dir)
+	if (dir) {
 		desc->bg_used_dirs_count =
 			cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1);
+		dcounter_add(&EXT2_SB(sb)->dirs_dc, -1);
+	}
 	spin_unlock(&EXT2_SB(sb)->s_bgi[group].ialloc_lock);
+	dcounter_add(&EXT2_SB(sb)->free_inodes_dc, 1);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh);
 }
@@ -317,7 +323,7 @@
 	int free_blocks = ext2_count_free_blocks(sb);
 	int avefreeb = free_blocks / ngroups;
 	int blocks_per_dir;
-	int ndirs = ext2_count_dirs(sb);
+	int ndirs = dcounter_value(&sbi->dirs_dc);
 	int max_debt, max_dirs, min_blocks, min_inodes;
 	int group = -1, i;
 	struct ext2_group_desc *desc;
@@ -624,7 +630,12 @@
 	goto repeat;
 }
 
-unsigned long ext2_count_free_inodes (struct super_block * sb)
+unsigned long ext2_count_free_inodes(struct super_block *sb)
+{
+	return dcounter_value(&EXT2_SB(sb)->free_inodes_dc);
+}
+
+unsigned long ext2_count_free_inodes_old(struct super_block *sb)
 {
 	struct ext2_group_desc *desc;
 	unsigned long desc_count = 0;
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c	Sun Mar 16 17:21:33 2003
+++ edited/fs/ext2/super.c	Mon Mar 17 00:30:35 2003
@@ -35,6 +35,8 @@
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct super_block * sb, struct statfs * buf);
+unsigned long ext2_count_free_inodes_old(struct super_block *sb);
+unsigned long ext2_count_free_blocks_old (struct super_block * sb);
 
 static char error_buf[1024];
 
@@ -508,9 +510,11 @@
 		gdp++;
 	}
 	
-	/* restore free blocks counter in SB -bzzz */
-	es->s_free_blocks_count = total_free = ext2_count_free_blocks(sb);
-	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
+	total_free = le32_to_cpu (es->s_free_blocks_count);
+	dcounter_init(&EXT2_SB(sb)->free_blocks_dc, total_free, 0);
+	dcounter_init(&EXT2_SB(sb)->free_inodes_dc,
+			le32_to_cpu (es->s_free_inodes_count), 0);
+	dcounter_init(&EXT2_SB(sb)->dirs_dc, ext2_count_dirs(sb), 1);
 
 	/* distribute reserved blocks over groups -bzzz */
 	for(i = sbi->s_groups_count-1; reserved && total_free && i >= 0; i--) {
@@ -870,8 +874,22 @@
 
 static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 {
+	if (dcounter_value(&EXT2_SB(sb)->dirs_dc) != ext2_count_dirs(sb))
+		printk("EXT2-fs: invalid dirs_dc %d (real %d)\n",
+				(int) dcounter_value(&EXT2_SB(sb)->dirs_dc),
+				(int) ext2_count_dirs(sb));
+	if (ext2_count_free_blocks(sb) != ext2_count_free_blocks_old(sb))
+		printk("EXT2-fs: invalid free blocks dcounter %d (real %d)\n",
+				(int) ext2_count_free_blocks(sb),
+				(int) ext2_count_free_blocks_old(sb));
 	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
+
+	if (ext2_count_free_inodes(sb) != ext2_count_free_inodes_old(sb))
+		printk("EXT2-fs: invalid free inodes dcounter %d (real %d)\n",
+			(int) ext2_count_free_inodes(sb),
+			(int) ext2_count_free_inodes_old(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
+
 	es->s_wtime = cpu_to_le32(get_seconds());
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
@@ -900,8 +918,25 @@
 			ext2_debug ("setting valid to 0\n");
 			es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) &
 						  ~EXT2_VALID_FS);
-			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
-			es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
+			if (dcounter_value(&EXT2_SB(sb)->dirs_dc) != ext2_count_dirs(sb))
+				printk("EXT2-fs: invalid dirs_dc %d (real %d)\n",
+					(int) dcounter_value(&EXT2_SB(sb)->dirs_dc),
+					(int) ext2_count_dirs(sb));
+
+			es->s_free_blocks_count =
+				cpu_to_le32(ext2_count_free_blocks(sb));
+			if (ext2_count_free_blocks(sb) != ext2_count_free_blocks_old(sb)) 
+				printk("EXT2-fs: invalid free blocks dcounter %d (real %d)\n",
+					(int)ext2_count_free_blocks(sb),
+					(int)ext2_count_free_blocks_old(sb));
+
+			es->s_free_inodes_count =
+				cpu_to_le32(ext2_count_free_inodes(sb));
+			if (ext2_count_free_inodes(sb) != ext2_count_free_inodes_old(sb))
+				 printk("EXT2-fs: invalid free inodes dcounter %d (real %d)\n",
+					(int)ext2_count_free_inodes(sb),
+					(int)ext2_count_free_inodes_old(sb));
+
 			es->s_mtime = cpu_to_le32(get_seconds());
 			ext2_sync_super(sb, es);
 		} else
diff -uNr linux/include/linux/dcounter.h edited/include/linux/dcounter.h
--- linux/include/linux/dcounter.h	Thu Jan  1 03:00:00 1970
+++ edited/include/linux/dcounter.h	Mon Mar 17 00:28:25 2003
@@ -0,0 +1,85 @@
+#ifndef _DCOUNTER_H_
+#define _DCOUNTER_H_
+/*
+ * Distrubuted counters:
+ * 
+ * Problem:
+ *   1) we have to support global counter for some subsystems
+ *      for example, ext2
+ *   2) we do not want to use spinlocks/atomic_t because of cache ping-pong
+ *   3) counter may have some fluctuation
+ *      for example, number of free blocks in ext2
+ *
+ * Solution:
+ *   1) there is 'base' counter
+ *   2) each CPU supports own 'diff'
+ *   3) global value calculated as sum of base and all diff'es
+ *   4) sometimes diff goes to base in order to prevent int overflow.
+ *      this 'syncronization' uses seqlock
+ *   
+ *
+ *   written by Alex Tomas <bzzz@tmi.comex.ru>
+ */
+
+#include <linux/smp.h>
+#include <linux/seqlock.h>
+#include <linux/string.h>
+
+#define DCOUNTER_MAX_DIFF	((1 << 31) / NR_CPUS - 1000)
+
+struct dcounter_diff {
+	long dd_value; 
+} ____cacheline_aligned_in_smp;
+
+struct dcounter {
+	long dc_base;
+	long dc_min;
+	struct dcounter_diff dc_diff[NR_CPUS];
+	seqlock_t dc_lock;
+};
+
+static inline void dcounter_init(struct dcounter *dc, int value, int min)
+{
+	seqlock_init(&dc->dc_lock);
+	dc->dc_base = value;
+	dc->dc_min = min;
+	memset(dc->dc_diff, 0, sizeof(struct dcounter_diff) * NR_CPUS);
+}
+
+static inline int dcounter_value(struct dcounter *dc)
+{
+	int i;
+	int counter;
+	int seq;
+
+	do {
+		seq = read_seqbegin(&dc->dc_lock);
+		counter = dc->dc_base;
+		for (i = 0; i < NR_CPUS; i++)
+			counter += dc->dc_diff[i].dd_value;
+	} while (read_seqretry(&dc->dc_lock, seq));
+
+	if (counter < dc->dc_min)
+		counter = dc->dc_min;	
+	return counter;
+}
+
+static inline void dcounter_add(struct dcounter *dc, int value)
+{
+	int cpu;
+	
+	preempt_disable();
+	cpu = smp_processor_id();
+	dc->dc_diff[cpu].dd_value += value;
+	if (dc->dc_diff[cpu].dd_value > DCOUNTER_MAX_DIFF ||
+		dc->dc_diff[cpu].dd_value < -DCOUNTER_MAX_DIFF) {
+		write_seqlock(&dc->dc_lock);
+		dc->dc_base += dc->dc_diff[cpu].dd_value;
+		dc->dc_diff[cpu].dd_value = 0;
+		write_sequnlock(&dc->dc_lock);
+	}
+	preempt_enable();
+}
+
+#endif /* _DCOUNTER_H_ */
+
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h	Sun Mar 16 17:21:34 2003
+++ edited/include/linux/ext2_fs_sb.h	Mon Mar 17 00:12:00 2003
@@ -16,6 +16,8 @@
 #ifndef _LINUX_EXT2_FS_SB
 #define _LINUX_EXT2_FS_SB
 
+#include <linux/dcounter.h>
+
 struct ext2_bg_info {
 	u8 debts;
 	spinlock_t balloc_lock;
@@ -52,6 +54,9 @@
 	u32 s_next_generation;
 	unsigned long s_dir_count;
 	struct ext2_bg_info *s_bgi;
+	struct dcounter free_blocks_dc;
+	struct dcounter free_inodes_dc;
+	struct dcounter dirs_dc;
 };
 
 #endif	/* _LINUX_EXT2_FS_SB */



  reply	other threads:[~2003-03-16 21:52 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-03-16 15:01 [PATCH] distributed counters for ext2 to avoid group scaning Alex Tomas
2003-03-16 17:44 ` [Ext2-devel] " Andreas Dilger
2003-03-16 21:55   ` Alex Tomas [this message]
2003-03-17 15:11     ` Matthew Wilcox
2003-03-17 15:09       ` Alex Tomas
2003-03-17 15:27         ` Matthew Wilcox
2003-03-17 15:25           ` Alex Tomas
2003-03-17 20:23           ` Andrew Morton
2003-03-17 20:27             ` Matthew Wilcox
2003-03-17 20:40             ` Alex Tomas
2003-03-17  9:37 ` William Lee Irwin III
2003-03-17  9:48   ` William Lee Irwin III
2003-03-17 10:41   ` Alex Tomas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=m3bs0bugca.fsf@lexa.home.net \
    --to=bzzz@tmi.comex.ru \
    --cc=adilger@clusterfs.com \
    --cc=akpm@digeo.com \
    --cc=ext2-devel@lists.sourceforge.net \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox