Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support 2^32-1 blocks (e2fsprogs)

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support 2^32-1 blocks (e2fsprogs)
@ 2006-03-25 13:33 sho
  2006-03-26 22:37 ` Badari Pulavarty
  2006-03-27 18:45 ` Mingming Cao
  0 siblings, 2 replies; 49+ messages in thread
From: sho @ 2006-03-25 13:33 UTC (permalink / raw)
  To: pbadari; +Cc: linux-kernel, Ext2-devel

Hi,

>More information. I ran the test with "-onoreservation" thinking that
>the patch didn't address "reservation code" issues and I still ran
>into block allocation problems. Hope this helps.

As you said, the previous patches were corrupted by my mailer,
and parts of them would have been rejected when applied.
I'm re-sending them; the patches themselves are unchanged, only the
mailer used to send them is different.
Could you try the new patches and check what happens?
I have run fsx with these patches several times and could not
reproduce the problems.

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
---
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext2/balloc.c linux-2.6.16-rc6-4g/fs/ext2/balloc.c
--- linux-2.6.16-rc6.org/fs/ext2/balloc.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext2/balloc.c	2006-03-14 09:29:01.000000000 +0900
@@ -99,14 +99,14 @@ error_out:
  * Set sb->s_dirt here because the superblock was "logically" altered.  We
  * need to recalculate its free blocks count and flush it out.
  */
-static int reserve_blocks(struct super_block *sb, int count)
+static unsigned int reserve_blocks(struct super_block *sb, unsigned int count)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
-	unsigned free_blocks;
+	unsigned int free_blocks;
 	unsigned root_blocks;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks = percpu_llcounter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(es->s_r_blocks_count);
 
 	if (free_blocks < count)
@@ -125,23 +125,23 @@ static int reserve_blocks(struct super_b
 			return 0;
 	}
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -count);
+	percpu_llcounter_mod(&sbi->s_freeblocks_counter, -count);
 	sb->s_dirt = 1;
 	return count;
 }
 
-static void release_blocks(struct super_block *sb, int count)
+static void release_blocks(struct super_block *sb, unsigned int count)
 {
 	if (count) {
 		struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-		percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+		percpu_llcounter_mod(&sbi->s_freeblocks_counter, count);
 		sb->s_dirt = 1;
 	}
 }
 
-static int group_reserve_blocks(struct ext2_sb_info *sbi, int group_no,
-	struct ext2_group_desc *desc, struct buffer_head *bh, int count)
+static unsigned int group_reserve_blocks(struct ext2_sb_info *sbi, int group_no,
+	struct ext2_group_desc *desc, struct buffer_head *bh, unsigned int count)
 {
 	unsigned free_blocks;
 
@@ -159,7 +159,7 @@ static int group_reserve_blocks(struct e
 }
 
 static void group_release_blocks(struct super_block *sb, int group_no,
-	struct ext2_group_desc *desc, struct buffer_head *bh, int count)
+	struct ext2_group_desc *desc, struct buffer_head *bh, unsigned int count)
 {
 	if (count) {
 		struct ext2_sb_info *sbi = EXT2_SB(sb);
@@ -324,7 +324,7 @@ got_it:
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext2_new_block(struct inode *inode, unsigned long goal,
+unsigned int ext2_new_block(struct inode *inode, unsigned long goal,
 			u32 *prealloc_count, u32 *prealloc_block, int *err)
 {
 	struct buffer_head *bitmap_bh = NULL;
@@ -333,8 +333,8 @@ int ext2_new_block(struct inode *inode, 
 	int group_no;			/* i */
 	int ret_block;			/* j */
 	int group_idx;			/* k */
-	int target_block;		/* tmp */
-	int block = 0;
+	unsigned int target_block;	/* tmp */
+	unsigned int block = 0;
 	struct super_block *sb = inode->i_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
@@ -447,7 +447,6 @@ retry:
 		group_alloc = 0;
 		goto retry;
 	}
-
 got_block:
 	ext2_debug("using block group %d(%d)\n",
 		group_no, desc->bg_free_blocks_count);
@@ -465,7 +464,7 @@ got_block:
 
 	if (target_block >= le32_to_cpu(es->s_blocks_count)) {
 		ext2_error (sb, "ext2_new_block",
-			    "block(%d) >= blocks count(%d) - "
+			    "block(%d) >= blocks count(%u) - "
 			    "block_group = %d, es == %p ", ret_block,
 			le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto io_error;
@@ -504,7 +503,7 @@ got_block:
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
-	ext2_debug ("allocating block %d. ", block);
+	ext2_debug ("allocating block %u. ", block);
 
 	*err = 0;
 out_release:
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext2/ext2.h linux-2.6.16-rc6-4g/fs/ext2/ext2.h
--- linux-2.6.16-rc6.org/fs/ext2/ext2.h	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext2/ext2.h	2006-03-14 09:29:01.000000000 +0900
@@ -91,7 +91,7 @@ static inline struct ext2_inode_info *EX
 /* balloc.c */
 extern int ext2_bg_has_super(struct super_block *sb, int group);
 extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group);
-extern int ext2_new_block (struct inode *, unsigned long,
+extern unsigned int ext2_new_block (struct inode *, unsigned long,
 			   __u32 *, __u32 *, int *);
 extern void ext2_free_blocks (struct inode *, unsigned long,
 			      unsigned long);
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext2/ialloc.c linux-2.6.16-rc6-4g/fs/ext2/ialloc.c
--- linux-2.6.16-rc6.org/fs/ext2/ialloc.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext2/ialloc.c	2006-03-14 09:29:01.000000000 +0900
@@ -83,7 +83,7 @@ static void ext2_release_inode(struct su
 			cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1);
 	spin_unlock(sb_bgl_lock(EXT2_SB(sb), group));
 	if (dir)
-		percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter);
+		percpu_llcounter_dec(&EXT2_SB(sb)->s_dirs_counter);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh);
 }
@@ -276,22 +276,20 @@ static int find_group_orlov(struct super
 	struct ext2_super_block *es = sbi->s_es;
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
-	int freei;
+	unsigned long freei, free_blocks, ndirs;
 	int avefreei;
-	int free_blocks;
 	int avefreeb;
 	int blocks_per_dir;
-	int ndirs;
 	int max_debt, max_dirs, min_blocks, min_inodes;
 	int group = -1, i;
 	struct ext2_group_desc *desc;
 	struct buffer_head *bh;
 
-	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	freei = percpu_llcounter_read_positive(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks = percpu_llcounter_read_positive(&sbi->s_freeblocks_counter);
 	avefreeb = free_blocks / ngroups;
-	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+	ndirs = percpu_llcounter_read_positive(&sbi->s_dirs_counter);
 
 	if ((parent == sb->s_root->d_inode) ||
 	    (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) {
@@ -328,7 +326,7 @@ static int find_group_orlov(struct super
 	}
 
 	if (ndirs == 0)
-		ndirs = 1;	/* percpu_counters are approximate... */
+		ndirs = 1;	/* percpu_llcounters are approximate... */
 
 	blocks_per_dir = (le32_to_cpu(es->s_blocks_count)-free_blocks) / ndirs;
 
@@ -543,9 +541,9 @@ got:
 		goto fail;
 	}
 
-	percpu_counter_mod(&sbi->s_freeinodes_counter, -1);
+	percpu_llcounter_mod(&sbi->s_freeinodes_counter, -1);
 	if (S_ISDIR(mode))
-		percpu_counter_inc(&sbi->s_dirs_counter);
+		percpu_llcounter_inc(&sbi->s_dirs_counter);
 
 	spin_lock(sb_bgl_lock(sbi, group));
 	gdp->bg_free_inodes_count =
@@ -670,7 +668,7 @@ unsigned long ext2_count_free_inodes (st
 	}
 	brelse(bitmap_bh);
 	printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
-		percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter),
+		percpu_llcounter_read(&EXT2_SB(sb)->s_freeinodes_counter),
 		desc_count, bitmap_count);
 	unlock_super(sb);
 	return desc_count;
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext2/inode.c linux-2.6.16-rc6-4g/fs/ext2/inode.c
--- linux-2.6.16-rc6.org/fs/ext2/inode.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext2/inode.c	2006-03-15 21:16:51.000000000 +0900
@@ -107,7 +107,7 @@ void ext2_discard_prealloc (struct inode
 #endif
 }
 
-static int ext2_alloc_block (struct inode * inode, unsigned long goal, int *err)
+static unsigned int ext2_alloc_block (struct inode * inode, unsigned int goal, int *err)
 {
 #ifdef EXT2FS_DEBUG
 	static unsigned long alloc_hits, alloc_attempts;
@@ -193,8 +193,8 @@ static inline int verify_chain(Indirect 
  * get there at all.
  */
 
-static int ext2_block_to_path(struct inode *inode,
-			long i_block, int offsets[4], int *boundary)
+static int ext2_block_to_path(struct inode *inode, unsigned long i_block, 
+				unsigned int offsets[4], int *boundary)
 {
 	int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb);
 	int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -263,7 +263,7 @@ static int ext2_block_to_path(struct ino
  */
 static Indirect *ext2_get_branch(struct inode *inode,
 				 int depth,
-				 int *offsets,
+				 unsigned int *offsets,
 				 Indirect chain[4],
 				 int *err)
 {
@@ -363,7 +363,7 @@ static unsigned long ext2_find_near(stru
  */
 
 static inline int ext2_find_goal(struct inode *inode,
-				 long block,
+				 unsigned long block,
 				 Indirect chain[4],
 				 Indirect *partial,
 				 unsigned long *goal)
@@ -418,20 +418,20 @@ static inline int ext2_find_goal(struct 
 static int ext2_alloc_branch(struct inode *inode,
 			     int num,
 			     unsigned long goal,
-			     int *offsets,
+			     unsigned int *offsets,
 			     Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int n = 0;
 	int err;
 	int i;
-	int parent = ext2_alloc_block(inode, goal, &err);
+	unsigned int parent = ext2_alloc_block(inode, goal, &err);
 
 	branch[0].key = cpu_to_le32(parent);
 	if (parent) for (n = 1; n < num; n++) {
 		struct buffer_head *bh;
 		/* Allocate the next block */
-		int nr = ext2_alloc_block(inode, parent, &err);
+		unsigned int nr = ext2_alloc_block(inode, parent, &err);
 		if (!nr)
 			break;
 		branch[n].key = cpu_to_le32(nr);
@@ -489,7 +489,7 @@ static int ext2_alloc_branch(struct inod
  */
 
 static inline int ext2_splice_branch(struct inode *inode,
-				     long block,
+				     unsigned long block,
 				     Indirect chain[4],
 				     Indirect *where,
 				     int num)
@@ -547,7 +547,7 @@ changed:
 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
 {
 	int err = -EIO;
-	int offsets[4];
+	unsigned int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	unsigned long goal;
@@ -776,7 +776,7 @@ static inline int all_zeroes(__le32 *p, 
 
 static Indirect *ext2_find_shared(struct inode *inode,
 				int depth,
-				int offsets[4],
+				unsigned int offsets[4],
 				Indirect chain[4],
 				__le32 *top)
 {
@@ -892,7 +892,7 @@ static void ext2_free_branches(struct in
 			 */ 
 			if (!bh) {
 				ext2_error(inode->i_sb, "ext2_free_branches",
-					"Read failure, inode=%ld, block=%ld",
+					"Read failure, inode=%lu, block=%lu",
 					inode->i_ino, nr);
 				continue;
 			}
@@ -912,12 +912,12 @@ void ext2_truncate (struct inode * inode
 {
 	__le32 *i_data = EXT2_I(inode)->i_data;
 	int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
-	int offsets[4];
+	unsigned int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
 	int n;
-	long iblock;
+	unsigned long iblock;
 	unsigned blocksize;
 
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext2/super.c linux-2.6.16-rc6-4g/fs/ext2/super.c
--- linux-2.6.16-rc6.org/fs/ext2/super.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext2/super.c	2006-03-14 09:29:01.000000000 +0900
@@ -126,9 +126,9 @@ static void ext2_put_super (struct super
 			brelse (sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
 	kfree(sbi->s_debts);
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_llcounter_destroy(&sbi->s_freeblocks_counter);
+	percpu_llcounter_destroy(&sbi->s_freeinodes_counter);
+	percpu_llcounter_destroy(&sbi->s_dirs_counter);
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -836,9 +836,9 @@ static int ext2_fill_super(struct super_
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
+	percpu_llcounter_init(&sbi->s_freeblocks_counter);
+	percpu_llcounter_init(&sbi->s_freeinodes_counter);
+	percpu_llcounter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
 			       GFP_KERNEL);
@@ -888,11 +888,11 @@ static int ext2_fill_super(struct super_
 		ext2_warning(sb, __FUNCTION__,
 			"mounting ext3 filesystem as ext2");
 	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	percpu_llcounter_mod(&sbi->s_freeblocks_counter,
 				ext2_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	percpu_llcounter_mod(&sbi->s_freeinodes_counter,
 				ext2_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
+	percpu_llcounter_mod(&sbi->s_dirs_counter,
 				ext2_count_dirs(sb));
 	return 0;
 
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext2/xattr.c linux-2.6.16-rc6-4g/fs/ext2/xattr.c
--- linux-2.6.16-rc6.org/fs/ext2/xattr.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext2/xattr.c	2006-03-14 09:29:01.000000000 +0900
@@ -71,7 +71,7 @@
 
 #ifdef EXT2_XATTR_DEBUG
 # define ea_idebug(inode, f...) do { \
-		printk(KERN_DEBUG "inode %s:%ld: ", \
+		printk(KERN_DEBUG "inode %s:%lu: ", \
 			inode->i_sb->s_id, inode->i_ino); \
 		printk(f); \
 		printk("\n"); \
@@ -164,7 +164,7 @@ ext2_xattr_get(struct inode *inode, int 
 	error = -ENODATA;
 	if (!EXT2_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT2_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -175,7 +175,7 @@ ext2_xattr_get(struct inode *inode, int 
 	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
 	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
 bad_block:	ext2_error(inode->i_sb, "ext2_xattr_get",
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %lu: bad block %u", inode->i_ino,
 			EXT2_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -264,7 +264,7 @@ ext2_xattr_list(struct inode *inode, cha
 	error = 0;
 	if (!EXT2_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT2_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -275,7 +275,7 @@ ext2_xattr_list(struct inode *inode, cha
 	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
 	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
 bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %lu: bad block %u", inode->i_ino,
 			EXT2_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -411,7 +411,7 @@ ext2_xattr_set(struct inode *inode, int 
 		if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
 		    header->h_blocks != cpu_to_le32(1)) {
 bad_block:		ext2_error(sb, "ext2_xattr_set",
-				"inode %ld: bad block %d", inode->i_ino, 
+				"inode %lu: bad block %u", inode->i_ino,
 				   EXT2_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -664,15 +664,15 @@ ext2_xattr_set2(struct inode *inode, str
 			ext2_xattr_cache_insert(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			int goal = le32_to_cpu(EXT2_SB(sb)->s_es->
+			unsigned int goal = le32_to_cpu(EXT2_SB(sb)->s_es->
 						           s_first_data_block) +
 				   EXT2_I(inode)->i_block_group *
 				   EXT2_BLOCKS_PER_GROUP(sb);
-			int block = ext2_new_block(inode, goal,
+			unsigned int block = ext2_new_block(inode, goal,
 						   NULL, NULL, &error);
 			if (error)
 				goto cleanup;
-			ea_idebug(inode, "creating block %d", block);
+			ea_idebug(inode, "creating block %u", block);
 
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
@@ -772,7 +772,7 @@ ext2_xattr_delete_inode(struct inode *in
 	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
 	if (!bh) {
 		ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
-			"inode %ld: block %d read error", inode->i_ino,
+			"inode %lu: block %u read error", inode->i_ino,
 			EXT2_I(inode)->i_file_acl);
 		goto cleanup;
 	}
@@ -780,7 +780,7 @@ ext2_xattr_delete_inode(struct inode *in
 	if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
 	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %lu: bad block %u", inode->i_ino,
 			EXT2_I(inode)->i_file_acl);
 		goto cleanup;
 	}
@@ -931,13 +931,13 @@ again:
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
-				"inode %ld: block %ld read error",
+				"inode %lu: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else {
 			lock_buffer(bh);
 			if (le32_to_cpu(HDR(bh)->h_refcount) >
 				   EXT2_XATTR_REFCOUNT_MAX) {
-				ea_idebug(inode, "block %ld refcount %d>%d",
+				ea_idebug(inode, "block %lu refcount %d>%d",
 					  (unsigned long) ce->e_block,
 					  le32_to_cpu(HDR(bh)->h_refcount),
 					  EXT2_XATTR_REFCOUNT_MAX);
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext2/xip.c linux-2.6.16-rc6-4g/fs/ext2/xip.c
--- linux-2.6.16-rc6.org/fs/ext2/xip.c	2006-01-03 12:21:10.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext2/xip.c	2006-03-14 09:29:01.000000000 +0900
@@ -44,8 +44,8 @@ __ext2_get_sector(struct inode *inode, s
 	return rc;
 }
 
-int
-ext2_clear_xip_target(struct inode *inode, int block)
+unsigned int
+ext2_clear_xip_target(struct inode *inode, unsigned int block)
 {
 	sector_t sector = block * (PAGE_SIZE/512);
 	unsigned long data;
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/balloc.c linux-2.6.16-rc6-4g/fs/ext3/balloc.c
--- linux-2.6.16-rc6.org/fs/ext3/balloc.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext3/balloc.c	2006-03-14 09:29:01.000000000 +0900
@@ -36,7 +36,6 @@
  * when a file system is mounted (see ext3_read_super).
  */
 
-
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
 struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
@@ -467,7 +466,7 @@ do_more:
 		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
 			group_freed);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+	percpu_llcounter_mod(&sbi->s_freeblocks_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1118,9 +1117,10 @@ out:
 
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-	int free_blocks, root_blocks;
+	unsigned long free_blocks;
+	int  root_blocks;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks = percpu_llcounter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
 	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
@@ -1154,19 +1154,20 @@ int ext3_should_retry_alloc(struct super
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext3_new_block(handle_t *handle, struct inode *inode,
+unsigned int ext3_new_block(handle_t *handle, struct inode *inode,
 			unsigned long goal, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
 	int group_no;
 	int goal_group;
-	int ret_block;
+	unsigned int ret_block;
 	int bgi;			/* blockgroup iteration index */
-	int target_block;
+	unsigned int target_block;
 	int fatal = 0, err;
 	int performed_allocation = 0;
 	int free_blocks;
+	int group_block;
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -1238,17 +1239,19 @@ retry:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+		group_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, ret_block, my_rsv, &fatal);
+		group_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
+					bitmap_bh, group_block, my_rsv, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (group_block >= 0) {
+			ret_block = group_block;
 			goto allocated;
+		}
 	}
 
 	ngroups = EXT3_SB(sb)->s_groups_count;
@@ -1280,12 +1283,14 @@ retry:
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
+		group_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
 					bitmap_bh, -1, my_rsv, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0) 
+		if (group_block >= 0) {
+			ret_block = group_block;
 			goto allocated;
+		}
 	}
 	/*
 	 * We may end up a bogus ealier ENOSPC error due to
@@ -1347,7 +1352,7 @@ allocated:
 				"b_committed_data\n", __FUNCTION__);
 		}
 	}
-	ext3_debug("found bit %d\n", ret_block);
+	ext3_debug("found bit %u\n", ret_block);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	jbd_unlock_bh_state(bitmap_bh);
 #endif
@@ -1357,8 +1362,8 @@ allocated:
 
 	if (ret_block >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
-			    "block(%d) >= blocks count(%d) - "
-			    "block_group = %d, es == %p ", ret_block,
+			    "block(%u) >= blocks count(%u) - "
+			    "block_group = %u, es == %p ", ret_block,
 			le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto out;
 	}
@@ -1368,14 +1373,14 @@ allocated:
 	 * list of some description.  We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+	ext3_debug("allocating block %u. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
+	percpu_llcounter_mod(&sbi->s_freeblocks_counter, -1);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext3_journal_dirty_metadata(handle, gdp_bh);
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/ialloc.c linux-2.6.16-rc6-4g/fs/ext3/ialloc.c
--- linux-2.6.16-rc6.org/fs/ext3/ialloc.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext3/ialloc.c	2006-03-14 09:29:01.000000000 +0900
@@ -170,9 +170,9 @@ void ext3_free_inode (handle_t *handle, 
 				gdp->bg_used_dirs_count = cpu_to_le16(
 				  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
 			spin_unlock(sb_bgl_lock(sbi, block_group));
-			percpu_counter_inc(&sbi->s_freeinodes_counter);
+			percpu_llcounter_inc(&sbi->s_freeinodes_counter);
 			if (is_directory)
-				percpu_counter_dec(&sbi->s_dirs_counter);
+				percpu_llcounter_dec(&sbi->s_dirs_counter);
 
 		}
 		BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
@@ -202,12 +202,13 @@ error_return:
 static int find_group_dir(struct super_block *sb, struct inode *parent)
 {
 	int ngroups = EXT3_SB(sb)->s_groups_count;
-	int freei, avefreei;
+	unsigned long freei;
+	int avefreei;
 	struct ext3_group_desc *desc, *best_desc = NULL;
 	struct buffer_head *bh;
 	int group, best_group = -1;
 
-	freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
+	freei = percpu_llcounter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
 	avefreei = freei / ngroups;
 
 	for (group = 0; group < ngroups; group++) {
@@ -261,19 +262,20 @@ static int find_group_orlov(struct super
 	struct ext3_super_block *es = sbi->s_es;
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
-	int freei, avefreei;
-	int freeb, avefreeb;
-	int blocks_per_dir, ndirs;
+	unsigned long freei, freeb, ndirs;
+	int avefreei;
+	int avefreeb;
+	int blocks_per_dir;
 	int max_debt, max_dirs, min_blocks, min_inodes;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
 
-	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	freei = percpu_llcounter_read_positive(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
-	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	freeb = percpu_llcounter_read_positive(&sbi->s_freeblocks_counter);
 	avefreeb = freeb / ngroups;
-	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+	ndirs = percpu_llcounter_read_positive(&sbi->s_dirs_counter);
 
 	if ((parent == sb->s_root->d_inode) ||
 	    (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
@@ -539,9 +541,9 @@ got:
 	err = ext3_journal_dirty_metadata(handle, bh2);
 	if (err) goto fail;
 
-	percpu_counter_dec(&sbi->s_freeinodes_counter);
+	percpu_llcounter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
-		percpu_counter_inc(&sbi->s_dirs_counter);
+		percpu_llcounter_inc(&sbi->s_dirs_counter);
 	sb->s_dirt = 1;
 
 	inode->i_uid = current->fsuid;
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/inode.c linux-2.6.16-rc6-4g/fs/ext3/inode.c
--- linux-2.6.16-rc6.org/fs/ext3/inode.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext3/inode.c	2006-03-14 09:29:01.000000000 +0900
@@ -64,7 +64,7 @@ static inline int ext3_inode_is_fast_sym
 
 int ext3_forget(handle_t *handle, int is_metadata,
 		       struct inode *inode, struct buffer_head *bh,
-		       int blocknr)
+		       unsigned int blocknr)
 {
 	int err;
 
@@ -235,10 +235,10 @@ no_delete:
 	clear_inode(inode);	/* We must guarantee clearing of inode... */
 }
 
-static int ext3_alloc_block (handle_t *handle,
-			struct inode * inode, unsigned long goal, int *err)
+static unsigned int ext3_alloc_block (handle_t *handle,
+			struct inode * inode, unsigned int goal, int *err)
 {
-	unsigned long result;
+	unsigned int result;
 
 	result = ext3_new_block(handle, inode, goal, err);
 	return result;
@@ -296,7 +296,7 @@ static inline int verify_chain(Indirect 
  */
 
 static int ext3_block_to_path(struct inode *inode,
-			long i_block, int offsets[4], int *boundary)
+			unsigned long i_block, unsigned int offsets[4], int *boundary)
 {
 	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
 	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -363,7 +363,7 @@ static int ext3_block_to_path(struct ino
  *	or when it reads all @depth-1 indirect blocks successfully and finds
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
  */
-static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
+static Indirect *ext3_get_branch(struct inode *inode, int depth, unsigned int *offsets,
 				 Indirect chain[4], int *err)
 {
 	struct super_block *sb = inode->i_sb;
@@ -460,7 +460,7 @@ static unsigned long ext3_find_near(stru
  *	stores it in *@goal and returns zero.
  */
 
-static unsigned long ext3_find_goal(struct inode *inode, long block,
+static unsigned long ext3_find_goal(struct inode *inode, unsigned long block,
 		Indirect chain[4], Indirect *partial)
 {
 	struct ext3_block_alloc_info *block_i =  EXT3_I(inode)->i_block_alloc_info;
@@ -505,21 +505,21 @@ static unsigned long ext3_find_goal(stru
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 			     int num,
 			     unsigned long goal,
-			     int *offsets,
+			     unsigned int *offsets,
 			     Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int n = 0, keys = 0;
 	int err = 0;
 	int i;
-	int parent = ext3_alloc_block(handle, inode, goal, &err);
+	unsigned int parent = ext3_alloc_block(handle, inode, goal, &err);
 
 	branch[0].key = cpu_to_le32(parent);
 	if (parent) {
 		for (n = 1; n < num; n++) {
 			struct buffer_head *bh;
 			/* Allocate the next block */
-			int nr = ext3_alloc_block(handle, inode, parent, &err);
+			unsigned int nr = ext3_alloc_block(handle, inode, parent, &err);
 			if (!nr)
 				break;
 			branch[n].key = cpu_to_le32(nr);
@@ -585,7 +585,7 @@ static int ext3_alloc_branch(handle_t *h
  *	chain to new block and return 0.
  */
 
-static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
+static int ext3_splice_branch(handle_t *handle, struct inode *inode, unsigned long block,
 			      Indirect chain[4], Indirect *where, int num)
 {
 	int i;
@@ -676,7 +676,7 @@ ext3_get_block_handle(handle_t *handle, 
 		struct buffer_head *bh_result, int create, int extend_disksize)
 {
 	int err = -EIO;
-	int offsets[4];
+	unsigned int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	unsigned long goal;
@@ -852,7 +852,7 @@ get_block:
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
-				long block, int create, int * errp)
+				unsigned long block, int create, int * errp)
 {
 	struct buffer_head dummy;
 	int fatal = 0, err;
@@ -907,7 +907,7 @@ err:
 }
 
 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
-			       int block, int create, int *err)
+			       unsigned int block, int create, int *err)
 {
 	struct buffer_head * bh;
 
@@ -1754,7 +1754,7 @@ static inline int all_zeroes(__le32 *p, 
 
 static Indirect *ext3_find_shared(struct inode *inode,
 				int depth,
-				int offsets[4],
+				unsigned int offsets[4],
 				Indirect chain[4],
 				__le32 *top)
 {
@@ -1967,7 +1967,7 @@ static void ext3_free_branches(handle_t 
 			 */
 			if (!bh) {
 				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%ld, block=%ld",
+					   "Read failure, inode=%lu, block=%lu",
 					   inode->i_ino, nr);
 				continue;
 			}
@@ -2084,12 +2084,12 @@ void ext3_truncate(struct inode * inode)
 	__le32 *i_data = ei->i_data;
 	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
-	int offsets[4];
+	unsigned int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
 	int n;
-	long last_block;
+	unsigned long last_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
 	struct page *page;
 
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/namei.c linux-2.6.16-rc6-4g/fs/ext3/namei.c
--- linux-2.6.16-rc6.org/fs/ext3/namei.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext3/namei.c	2006-03-14 09:29:01.000000000 +0900
@@ -816,7 +816,8 @@ static struct buffer_head * ext3_find_en
 	int ra_ptr = 0;		/* Current index into readahead
 				   buffer */
 	int num = 0;
-	int nblocks, i, err;
+	unsigned int nblocks;
+	int i, err;
 	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
 	const u8 *name;
@@ -1910,8 +1911,8 @@ int ext3_orphan_add(handle_t *handle, st
 	if (!err)
 		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
 
-	jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
-	jbd_debug(4, "orphan inode %ld will point to %d\n",
+	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+	jbd_debug(4, "orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
 	unlock_super(sb);
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/resize.c linux-2.6.16-rc6-4g/fs/e
xt3/resize.c
--- linux-2.6.16-rc6.org/fs/ext3/resize.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext3/resize.c	2006-03-14 09:29:01.000000000 +0900
@@ -37,7 +37,7 @@ static int verify_group_input(struct sup
 		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
 	unsigned metaend = start + overhead;
 	struct buffer_head *bh = NULL;
-	int free_blocks_count;
+	long long free_blocks_count;
 	int err = -EINVAL;
 
 	input->free_blocks_count = free_blocks_count =
@@ -45,7 +45,7 @@ static int verify_group_input(struct sup
 
 	if (test_opt(sb, DEBUG))
 		printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
-		       "(%d free, %u reserved)\n",
+		       "(%lld free, %u reserved)\n",
 		       ext3_bg_has_super(sb, input->group) ? "normal" :
 		       "no-super", input->group, input->blocks_count,
 		       free_blocks_count, input->reserved_blocks);
@@ -138,14 +138,14 @@ static struct buffer_head *bclean(handle
  * need to use it within a single byte (to ensure we get endianness right).
  * We can use memset for the rest of the bitmap as there are no other users.
  */
-static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+static void mark_bitmap_end(unsigned int start_bit, unsigned int end_bit, char *bitmap)
 {
-	int i;
+	unsigned int i;
 
 	if (start_bit >= end_bit)
 		return;
 
-	ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
+	ext3_debug("mark end bits +%u through +%u used\n", start_bit, end_bit);
 	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
 		ext3_set_bit(i, bitmap);
 	if (i < end_bit)
@@ -340,7 +340,7 @@ static int verify_reserved_gdb(struct su
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %ld missing grp %d (%ld)",
+				     "reserved GDT %ld missing grp %d (%lu)",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -619,7 +619,7 @@ exit_free:
  * at this time.  The resize which changed s_groups_count will backup again.
  */
 static void update_backups(struct super_block *sb,
-			   int blk_off, char *data, int size)
+			   unsigned int blk_off, char *data, int size)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	const unsigned long last = sbi->s_groups_count;
@@ -869,9 +869,9 @@ int ext3_group_add(struct super_block *s
 		input->reserved_blocks);
 
 	/* Update the free space counts */
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	percpu_llcounter_mod(&sbi->s_freeblocks_counter,
 			   input->free_blocks_count);
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	percpu_llcounter_mod(&sbi->s_freeinodes_counter,
 			   EXT3_INODES_PER_GROUP(sb));
 
 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
@@ -990,10 +990,10 @@ int ext3_group_extend(struct super_block
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freed blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/super.c linux-2.6.16-rc6-4g/fs/ex
t3/super.c
--- linux-2.6.16-rc6.org/fs/ext3/super.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext3/super.c	2006-03-14 09:29:01.000000000 +0900
@@ -377,7 +377,7 @@ static void dump_orphan_list(struct supe
 	list_for_each(l, &sbi->s_orphan) {
 		struct inode *inode = orphan_list_entry(l);
 		printk(KERN_ERR "  "
-		       "inode %s:%ld at %p: mode %o, nlink %d, next %d\n",
+		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
 		       inode->i_sb->s_id, inode->i_ino, inode,
 		       inode->i_mode, inode->i_nlink, 
 		       NEXT_ORPHAN(inode));
@@ -403,9 +403,9 @@ static void ext3_put_super (struct super
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_llcounter_destroy(&sbi->s_freeblocks_counter);
+	percpu_llcounter_destroy(&sbi->s_freeinodes_counter);
+	percpu_llcounter_destroy(&sbi->s_dirs_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -1253,17 +1253,17 @@ static void ext3_orphan_cleanup (struct 
 		DQUOT_INIT(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
-				"%s: truncating inode %ld to %Ld bytes\n",
+				"%s: truncating inode %lu to %Ld bytes\n",
 				__FUNCTION__, inode->i_ino, inode->i_size);
-			jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
+			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext3_truncate(inode);
 			nr_truncates++;
 		} else {
 			printk(KERN_DEBUG
-				"%s: deleting unreferenced inode %ld\n",
+				"%s: deleting unreferenced inode %lu\n",
 				__FUNCTION__, inode->i_ino);
-			jbd_debug(2, "deleting unreferenced inode %ld\n",
+			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
 			nr_orphans++;
 		}
@@ -1578,9 +1578,9 @@ static int ext3_fill_super (struct super
 		goto failed_mount;
 	}
 
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
+	percpu_llcounter_init(&sbi->s_freeblocks_counter);
+	percpu_llcounter_init(&sbi->s_freeinodes_counter);
+	percpu_llcounter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -1728,11 +1728,11 @@ static int ext3_fill_super (struct super
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	percpu_llcounter_mod(&sbi->s_freeblocks_counter,
 		ext3_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	percpu_llcounter_mod(&sbi->s_freeinodes_counter,
 		ext3_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
+	percpu_llcounter_mod(&sbi->s_dirs_counter,
 		ext3_count_dirs(sb));
 
 	lock_kernel();
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/xattr.c linux-2.6.16-rc6-4g/fs/ex
t3/xattr.c
--- linux-2.6.16-rc6.org/fs/ext3/xattr.c	2006-03-14 09:09:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/ext3/xattr.c	2006-03-14 09:29:01.000000000 +0900
@@ -75,7 +75,7 @@
 
 #ifdef EXT3_XATTR_DEBUG
 # define ea_idebug(inode, f...) do { \
-		printk(KERN_DEBUG "inode %s:%ld: ", \
+		printk(KERN_DEBUG "inode %s:%lu: ", \
 			inode->i_sb->s_id, inode->i_ino); \
 		printk(f); \
 		printk("\n"); \
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode
 	error = -ENODATA;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh)
 		goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %lu: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inod
 	error = 0;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inod
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %lu: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inod
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block %d", inode->i_ino,
+				"inode %lu: bad block %u", inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -792,14 +792,14 @@ inserted:
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			int goal = le32_to_cpu(
+			unsigned int goal = le32_to_cpu(
 					EXT3_SB(sb)->s_es->s_first_data_block) +
 				EXT3_I(inode)->i_block_group *
 				EXT3_BLOCKS_PER_GROUP(sb);
-			int block = ext3_new_block(handle, inode, goal, &error);
+			unsigned int block = ext3_new_block(handle, inode, goal, &error);
 			if (error)
 				goto cleanup;
-			ea_idebug(inode, "creating block %d", block);
+			ea_idebug(inode, "creating block %u", block);
 
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
@@ -847,7 +847,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block %d", inode->i_ino,
+		   "inode %lu: bad block %u", inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1076,14 +1076,14 @@ ext3_xattr_delete_inode(handle_t *handle
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block %d read error", inode->i_ino,
+			"inode %lu: block %u read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %lu: bad block %u", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
@@ -1210,11 +1210,11 @@ again:
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
 			ext3_error(inode->i_sb, __FUNCTION__,
-				"inode %ld: block %ld read error",
+				"inode %lu: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT3_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %ld refcount %d>=%d",
+			ea_idebug(inode, "block %lu refcount %d>=%d",
 				  (unsigned long) ce->e_block,
 				  le32_to_cpu(BHDR(bh)->h_refcount),
 					  EXT3_XATTR_REFCOUNT_MAX);
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/jbd/journal.c linux-2.6.16-rc6-4g/fs/j
bd/journal.c
--- linux-2.6.16-rc6.org/fs/jbd/journal.c	2006-01-03 12:21:10.000000000 +0900
+++ linux-2.6.16-rc6-4g/fs/jbd/journal.c	2006-03-14 09:29:01.000000000 +0900
@@ -761,7 +761,7 @@ journal_t * journal_init_inode (struct i
 	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
 	journal->j_inode = inode;
 	jbd_debug(1,
-		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+		  "journal %p: inode %s/%u, size %Ld, bits %d, blksize %ld\n",
 		  journal, inode->i_sb->s_id, inode->i_ino, 
 		  (long long) inode->i_size,
 		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/include/linux/ext2_fs_sb.h linux-2.6.16-r
c6-4g/include/linux/ext2_fs_sb.h
--- linux-2.6.16-rc6.org/include/linux/ext2_fs_sb.h	2006-01-03 12:21:10.000000000 +0900
+++ linux-2.6.16-rc6-4g/include/linux/ext2_fs_sb.h	2006-03-14 12:06:21.000000000 +0900
@@ -17,7 +17,7 @@
 #define _LINUX_EXT2_FS_SB
 
 #include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
+#include <linux/percpu_llcounter.h>
 
 /*
  * second extended-fs super-block data in memory
@@ -49,9 +49,9 @@ struct ext2_sb_info {
 	u32 s_next_generation;
 	unsigned long s_dir_count;
 	u8 *s_debts;
-	struct percpu_counter s_freeblocks_counter;
-	struct percpu_counter s_freeinodes_counter;
-	struct percpu_counter s_dirs_counter;
+	struct percpu_llcounter s_freeblocks_counter;
+	struct percpu_llcounter s_freeinodes_counter;
+	struct percpu_llcounter s_dirs_counter;
 	struct blockgroup_lock s_blockgroup_lock;
 };
 
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/include/linux/ext3_fs.h linux-2.6.16-rc6-
4g/include/linux/ext3_fs.h
--- linux-2.6.16-rc6.org/include/linux/ext3_fs.h	2006-01-03 12:21:10.000000000 +0900
+++ linux-2.6.16-rc6-4g/include/linux/ext3_fs.h	2006-03-14 09:29:01.000000000 +0900
@@ -731,7 +731,7 @@ struct dir_private_info {
 /* balloc.c */
 extern int ext3_bg_has_super(struct super_block *sb, int group);
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
-extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+extern unsigned int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
 extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
 			      unsigned long);
 extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
@@ -761,7 +761,6 @@ extern int ext3_sync_file (struct file *
 extern int ext3fs_dirhash(const char *name, int len, struct
 			  dx_hash_info *hinfo);
 
-/* ialloc.c */
 extern struct inode * ext3_new_inode (handle_t *, struct inode *, int);
 extern void ext3_free_inode (handle_t *, struct inode *);
 extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
@@ -772,9 +771,9 @@ extern unsigned long ext3_count_free (st
 
 
 /* inode.c */
-extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
-extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
-extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, unsigned int);
+extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, unsigned long, int, int *);
+extern struct buffer_head * ext3_bread (handle_t *, struct inode *, unsigned int, int, int *);
 
 extern void ext3_read_inode (struct inode *);
 extern int  ext3_write_inode (struct inode *, int);
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/include/linux/ext3_fs_sb.h linux-2.6.16-r
c6-4g/include/linux/ext3_fs_sb.h
--- linux-2.6.16-rc6.org/include/linux/ext3_fs_sb.h	2006-01-03 12:21:10.000000000 +0900
+++ linux-2.6.16-rc6-4g/include/linux/ext3_fs_sb.h	2006-03-14 12:06:35.000000000 +0900
@@ -20,7 +20,7 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
+#include <linux/percpu_llcounter.h>
 #endif
 #include <linux/rbtree.h>
 
@@ -54,9 +54,9 @@ struct ext3_sb_info {
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
-	struct percpu_counter s_freeblocks_counter;
-	struct percpu_counter s_freeinodes_counter;
-	struct percpu_counter s_dirs_counter;
+	struct percpu_llcounter s_freeblocks_counter;
+	struct percpu_llcounter s_freeinodes_counter;
+	struct percpu_llcounter s_dirs_counter;
 	struct blockgroup_lock s_blockgroup_lock;
 
 	/* root of the per fs reservation window tree */
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/include/linux/percpu_llcounter.h linux-2.
6.16-rc6-4g/include/linux/percpu_llcounter.h
--- linux-2.6.16-rc6.org/include/linux/percpu_llcounter.h	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.16-rc6-4g/include/linux/percpu_llcounter.h	2006-03-14 13:50:54.000000000 +0900
@@ -0,0 +1,113 @@
+#ifndef _LINUX_LLPERCPU_COUNTER_H
+#define _LINUX_LLPERCPU_COUNTER_H
+/*
+ * A simple "approximate counter" for use in ext2 and ext3 superblocks.
+ *
+ * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
+ */
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+
+#ifdef CONFIG_SMP
+
+struct percpu_llcounter {
+	spinlock_t lock;
+	long long count;
+	long long *counters;
+};
+
+#if NR_CPUS >= 16
+#define FBC_BATCH	(NR_CPUS*2)
+#else
+#define FBC_BATCH	(NR_CPUS*4)
+#endif
+
+static inline void percpu_llcounter_init(struct percpu_llcounter *fbc)
+{
+	spin_lock_init(&fbc->lock);
+	fbc->count = 0;
+	fbc->counters = alloc_percpu(long long);
+}
+
+static inline void percpu_llcounter_destroy(struct percpu_llcounter *fbc)
+{
+	free_percpu(fbc->counters);
+}
+
+void percpu_llcounter_mod(struct percpu_llcounter *fbc, long long amount);
+long long percpu_llcounter_sum(struct percpu_llcounter *fbc);
+
+static inline long long percpu_llcounter_read(struct percpu_llcounter *fbc)
+{
+	return fbc->count;
+}
+
+/*
+ * It is possible for the percpu_llcounter_read() to return a small negative
+ * number for some counter which should never be negative.
+ */
+static inline long long percpu_llcounter_read_positive(struct percpu_llcounter *fbc)
+{
+	long long ret = fbc->count;
+
+	barrier();		/* Prevent reloads of fbc->count */
+	if (ret > 0)
+		return ret;
+	return 1;
+}
+
+#else
+
+struct percpu_llcounter {
+	long long count;
+};
+
+static inline void percpu_llcounter_init(struct percpu_llcounter *fbc)
+{
+	fbc->count = 0;
+}
+
+static inline void percpu_llcounter_destroy(struct percpu_llcounter *fbc)
+{
+}
+
+static inline void
+percpu_llcounter_mod(struct percpu_llcounter *fbc, long long amount)
+{
+	preempt_disable();
+	fbc->count += amount;
+	preempt_enable();
+}
+
+static inline long long percpu_llcounter_read(struct percpu_llcounter *fbc)
+{
+	return fbc->count;
+}
+
+static inline long long percpu_llcounter_read_positive(struct percpu_llcounter *fbc)
+{
+	return fbc->count;
+}
+
+static inline long long percpu_llcounter_sum(struct percpu_llcounter *fbc)
+{
+	return percpu_llcounter_read_positive(fbc);
+}
+
+#endif	/* CONFIG_SMP */
+
+static inline void percpu_llcounter_inc(struct percpu_llcounter *fbc)
+{
+	percpu_llcounter_mod(fbc, 1);
+}
+
+static inline void percpu_llcounter_dec(struct percpu_llcounter *fbc)
+{
+	percpu_llcounter_mod(fbc, -1);
+}
+
+#endif /* _LINUX_LLPERCPU_COUNTER_H */
diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/mm/swap.c linux-2.6.16-rc6-4g/mm/swap.c
--- linux-2.6.16-rc6.org/mm/swap.c	2006-03-14 09:09:07.000000000 +0900
+++ linux-2.6.16-rc6-4g/mm/swap.c	2006-03-14 13:47:18.000000000 +0900
@@ -26,6 +26,7 @@
 #include <linux/buffer_head.h>	/* for try_to_release_page() */
 #include <linux/module.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu_llcounter.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
@@ -498,6 +499,27 @@ void percpu_counter_mod(struct percpu_co
 }
 EXPORT_SYMBOL(percpu_counter_mod);
 
+void percpu_llcounter_mod(struct percpu_llcounter *fbc, long long amount)
+{
+	long long count;
+	long long *pcount;
+	int cpu = get_cpu();
+
+	pcount = per_cpu_ptr(fbc->counters, cpu);
+	count = *pcount + amount;
+	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+		spin_lock(&fbc->lock);
+		fbc->count += count;
+		*pcount = 0;
+		spin_unlock(&fbc->lock);
+	} else {
+		*pcount = count;
+	}
+	put_cpu();
+}
+EXPORT_SYMBOL(percpu_llcounter_mod);
+
+
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
@@ -517,6 +539,26 @@ long percpu_counter_sum(struct percpu_co
 	return ret < 0 ? 0 : ret;
 }
 EXPORT_SYMBOL(percpu_counter_sum);
+
+/*
+ * Add up all the per-cpu counts, return the result.  This is a more accurate
+ * but much slower version of percpu_llcounter_read_positive()
+ */
+long long percpu_llcounter_sum(struct percpu_llcounter *fbc)
+{
+	long long ret;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	ret = fbc->count;
+	for_each_cpu(cpu) {
+		long long *pcount = per_cpu_ptr(fbc->counters, cpu);
+		ret += *pcount;
+	}
+	spin_unlock(&fbc->lock);
+	return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_llcounter_sum);
 #endif
 
 /*


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-25 13:33 [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs) sho
@ 2006-03-26 22:37 ` Badari Pulavarty
  2006-03-27  4:17   ` Takashi Sato
  2006-03-27 18:45 ` Mingming Cao
  1 sibling, 1 reply; 49+ messages in thread
From: Badari Pulavarty @ 2006-03-26 22:37 UTC (permalink / raw)
  To: sho; +Cc: linux-kernel, Ext2-devel



sho@tnes.nec.co.jp wrote:

>Hi,
>
>>More information. I ran the test with "-onoreservation" thinking that
>>the patch didn't address "reservation code" issues and I still ran
>>into block allocation problems. Hope this helps.
>>
>
>As you said, the previous patches were broken because of my mailer,
>and part of them would be rejected.
>I'm re-sending them;  I have not changed them other than the mailer.
>Could you try new patches and check what happened?
>I have run fsx with these patches several times and the problems
>weren't reproduced.
>
>Signed-off-by: Takashi Sato sho@tnes.nec.co.jp
>---
>

Sure. I will give it a spin.

BTW, did you really test them with > 8TB filesystem ?
I ran a bunch of "dd"s to create a few files and then ran multiple copies of
"fsx" tests.
Then I ran into problems within the first few seconds of the tests.

Thanks,
Badari

>




^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-26 22:37 ` Badari Pulavarty
@ 2006-03-27  4:17   ` Takashi Sato
  0 siblings, 0 replies; 49+ messages in thread
From: Takashi Sato @ 2006-03-27  4:17 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Ext2-devel, linux-kernel

Hi,

> Sure. I will give it a spin.
> 
> BTW, did you really test them with > 8TB filesystem ?
> I ran a bunch of "dd"s to create a few files and then ran multiple copies of
> "fsx" tests.
> Then I ran into problems within the first few seconds of the tests.

I ran five fsx programs and five dd commands creating 400MB files
concurrently on a 9TB filesystem, and no problems occurred.

--
Takashi Sato

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-25 13:33 [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs) sho
  2006-03-26 22:37 ` Badari Pulavarty
@ 2006-03-27 18:45 ` Mingming Cao
  2006-03-27 21:10   ` Andrew Morton
  1 sibling, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-03-27 18:45 UTC (permalink / raw)
  To: sho, kiran, akpm; +Cc: pbadari, linux-kernel, Ext2-devel, Laurent.Vivier

I am wondering if we have (or plan to have) a "long long" type of percpu
counter?  Andrew, Kiran, do you know?

It seems that right now the percpu counters are used mostly by ext2/3 for
filesystem free-block accounting. The counter is currently of type "long",
which is not enough if we want to extend the filesystem limit from 2**31
to 2**32 blocks on a 32-bit machine.

The patch from Takashi copies the whole percpu_counter.h and creates a new
percpu_llcounter.h to support "long long" type percpu counters. I am
wondering whether there is a better way to do this.

Mingming

On Sat, 2006-03-25 at 22:33 +0900, sho@tnes.nec.co.jp wrote:

> As you said, the previous patches were broken because of my mailer,
> and part of them would be rejected.
> I'm re-sending them;  I have not changed them other than the mailer.
> Could you try new patches and check what happened?
> I have run fsx with these patches several times and the problems
> weren't reproduced.
> 
> Signed-off-by: Takashi Sato sho@tnes.nec.co.jp
> ---
> diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/fs/ext3/balloc.c linux-2.6.16-rc6-4g/fs/e
> xt3/balloc.c
> --- linux-2.6.16-rc6.org/fs/ext3/balloc.c	2006-03-14 09:09:00.000000000 +0900
> +++ linux-2.6.16-rc6-4g/fs/ext3/balloc.c	2006-03-14 09:29:01.000000000 +0900
> @@ -36,7 +36,6 @@
>   * when a file system is mounted (see ext3_read_super).
>   */
>  
> -
>  #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
>  
>  struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
> @@ -467,7 +466,7 @@ do_more:
>  		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
>  			group_freed);
>  	spin_unlock(sb_bgl_lock(sbi, block_group));
> -	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
> +	percpu_llcounter_mod(&sbi->s_freeblocks_counter, count);
>  
>  	/* We dirtied the bitmap block */
>  	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
> @@ -1118,9 +1117,10 @@ out:

[...]
> diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/include/linux/percpu_llcounter.h linux-2.
> 6.16-rc6-4g/include/linux/percpu_llcounter.h
> --- linux-2.6.16-rc6.org/include/linux/percpu_llcounter.h	1970-01-01 09:00:00.000000000 +0900
> +++ linux-2.6.16-rc6-4g/include/linux/percpu_llcounter.h	2006-03-14 13:50:54.000000000 +0900
> @@ -0,0 +1,113 @@
> +#ifndef _LINUX_LLPERCPU_COUNTER_H
> +#define _LINUX_LLPERCPU_COUNTER_H
> +/*
> + * A simple "approximate counter" for use in ext2 and ext3 superblocks.
> + *
> + * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
> + */
> +
> +#include <linux/config.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/threads.h>
> +#include <linux/percpu.h>
> +
> +#ifdef CONFIG_SMP
> +
> +struct percpu_llcounter {
> +	spinlock_t lock;
> +	long long count;
> +	long long *counters;
> +};
> +
> +#if NR_CPUS >= 16
> +#define FBC_BATCH	(NR_CPUS*2)
> +#else
> +#define FBC_BATCH	(NR_CPUS*4)
> +#endif
> +
> +static inline void percpu_llcounter_init(struct percpu_llcounter *fbc)
> +{
> +	spin_lock_init(&fbc->lock);
> +	fbc->count = 0;
> +	fbc->counters = alloc_percpu(long long);
> +}
> +
> +static inline void percpu_llcounter_destroy(struct percpu_llcounter *fbc)
> +{
> +	free_percpu(fbc->counters);
> +}
> +
> +void percpu_llcounter_mod(struct percpu_llcounter *fbc, long long amount);
> +long long percpu_llcounter_sum(struct percpu_llcounter *fbc);
> +
> +static inline long long percpu_llcounter_read(struct percpu_llcounter *fbc)
> +{
> +	return fbc->count;
> +}
> +
> +/*
> + * It is possible for the percpu_llcounter_read() to return a small negative
> + * number for some counter which should never be negative.
> + */
> +static inline long long percpu_llcounter_read_positive(struct percpu_llcounter *fbc)
> +{
> +	long long ret = fbc->count;
> +
> +	barrier();		/* Prevent reloads of fbc->count */
> +	if (ret > 0)
> +		return ret;
> +	return 1;
> +}
> +
> +#else
> +
> +struct percpu_llcounter {
> +	long long count;
> +};
> +
> +static inline void percpu_llcounter_init(struct percpu_llcounter *fbc)
> +{
> +	fbc->count = 0;
> +}
> +
> +static inline void percpu_llcounter_destroy(struct percpu_llcounter *fbc)
> +{
> +}
> +
> +static inline void
> +percpu_llcounter_mod(struct percpu_llcounter *fbc, long long amount)
> +{
> +	preempt_disable();
> +	fbc->count += amount;
> +	preempt_enable();
> +}
> +
> +static inline long long percpu_llcounter_read(struct percpu_llcounter *fbc)
> +{
> +	return fbc->count;
> +}
> +
> +static inline long long percpu_llcounter_read_positive(struct percpu_llcounter *fbc)
> +{
> +	return fbc->count;
> +}
> +
> +static inline long long percpu_llcounter_sum(struct percpu_llcounter *fbc)
> +{
> +	return percpu_llcounter_read_positive(fbc);
> +}
> +
> +#endif	/* CONFIG_SMP */
> +
> +static inline void percpu_llcounter_inc(struct percpu_llcounter *fbc)
> +{
> +	percpu_llcounter_mod(fbc, 1);
> +}
> +
> +static inline void percpu_llcounter_dec(struct percpu_llcounter *fbc)
> +{
> +	percpu_llcounter_mod(fbc, -1);
> +}
> +
> +#endif /* _LINUX_LLPERCPU_COUNTER_H */
> diff -uprN -X linux-2.6.16-rc6.org/Documentation/dontdiff linux-2.6.16-rc6.org/mm/swap.c linux-2.6.16-rc6-4g/mm/swap.c
> --- linux-2.6.16-rc6.org/mm/swap.c	2006-03-14 09:09:07.000000000 +0900
> +++ linux-2.6.16-rc6-4g/mm/swap.c	2006-03-14 13:47:18.000000000 +0900
> @@ -26,6 +26,7 @@
>  #include <linux/buffer_head.h>	/* for try_to_release_page() */
>  #include <linux/module.h>
>  #include <linux/percpu_counter.h>
> +#include <linux/percpu_llcounter.h>
>  #include <linux/percpu.h>
>  #include <linux/cpu.h>
>  #include <linux/notifier.h>
> @@ -498,6 +499,27 @@ void percpu_counter_mod(struct percpu_co
>  }
>  EXPORT_SYMBOL(percpu_counter_mod);
>  
> +void percpu_llcounter_mod(struct percpu_llcounter *fbc, long long amount)
> +{
> +	long long count;
> +	long long *pcount;
> +	int cpu = get_cpu();
> +
> +	pcount = per_cpu_ptr(fbc->counters, cpu);
> +	count = *pcount + amount;
> +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> +		spin_lock(&fbc->lock);
> +		fbc->count += count;
> +		*pcount = 0;
> +		spin_unlock(&fbc->lock);
> +	} else {
> +		*pcount = count;
> +	}
> +	put_cpu();
> +}
> +EXPORT_SYMBOL(percpu_llcounter_mod);
> +
> +
>  /*
>   * Add up all the per-cpu counts, return the result.  This is a more accurate
>   * but much slower version of percpu_counter_read_positive()
> @@ -517,6 +539,26 @@ long percpu_counter_sum(struct percpu_co
>  	return ret < 0 ? 0 : ret;
>  }
>  EXPORT_SYMBOL(percpu_counter_sum);
> +
> +/*
> + * Add up all the per-cpu counts, return the result.  This is a more accurate
> + * but much slower version of percpu_llcounter_read_positive()
> + */
> +long long percpu_llcounter_sum(struct percpu_llcounter *fbc)
> +{
> +	long long ret;
> +	int cpu;
> +
> +	spin_lock(&fbc->lock);
> +	ret = fbc->count;
> +	for_each_cpu(cpu) {
> +		long long *pcount = per_cpu_ptr(fbc->counters, cpu);
> +		ret += *pcount;
> +	}
> +	spin_unlock(&fbc->lock);
> +	return ret < 0 ? 0 : ret;
> +}
> +EXPORT_SYMBOL(percpu_llcounter_sum);
>  #endif
>  
>  /*
> 
> 
> 
> -------------------------------------------------------
> This SF.Net email is sponsored by xPML, a groundbreaking scripting language
> that extends applications into web and mobile media. Attend the live webcast
> and join the prime developer group breaking into this new coding territory!
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=110944&bid=241720&dat=121642
> _______________________________________________
> Ext2-devel mailing list
> Ext2-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/ext2-devel


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-27 18:45 ` Mingming Cao
@ 2006-03-27 21:10   ` Andrew Morton
  2006-03-27 22:58     ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 49+ messages in thread
From: Andrew Morton @ 2006-03-27 21:10 UTC (permalink / raw)
  To: cmm; +Cc: sho, kiran, pbadari, linux-kernel, Ext2-devel, Laurent.Vivier

Mingming Cao <cmm@us.ibm.com> wrote:
>
> I am wondering if we have (or plan to have) "long long " type of percpu
>  counters?  Andrew, Kiran, do you know?  
> 
>  It seems right now the percpu counters are used mostly by ext2/3 for
>  filesystem free blocks accounting. Right now the counter is "long" type,
>  which is not enough if we want to extend the filesystem limit from 2**31
>  to 2**32 on 32 bit machine.
> 
>  The patch from Takashi copies the whole percpu_count.h  and create a new
>  percpu_llcounter.h to support longlong type percpu counters. I am
>  wondering is there any better way for this?
> 

I can't immediately think of anything smarter.

One could of course implement a 64-bit percpu counter by simply
concatenating two 32-bit counters.  That would be a little less efficient,
but would introduce less source code and would mean that we don't need to
keep two different implementations in sync.  But one would need to do a bit
of implementation to see how bad it looks.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-27 21:10   ` Andrew Morton
@ 2006-03-27 22:58     ` Ravikiran G Thirumalai
  2006-03-28  7:15       ` Laurent Vivier
  0 siblings, 1 reply; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-03-27 22:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: cmm, sho, pbadari, linux-kernel, Ext2-devel, Laurent.Vivier

On Mon, Mar 27, 2006 at 01:10:49PM -0800, Andrew Morton wrote:
> Mingming Cao <cmm@us.ibm.com> wrote:
> >
> > I am wondering if we have (or plan to have) "long long " type of percpu
> >  counters?  Andrew, Kiran, do you know?  
> > 
> >  It seems right now the percpu counters are used mostly by ext2/3 for
> >  filesystem free blocks accounting. Right now the counter is "long" type,
> >  which is not enough if we want to extend the filesystem limit from 2**31
> >  to 2**32 on 32 bit machine.
> > 
> >  The patch from Takashi copies the whole percpu_count.h  and create a new
> >  percpu_llcounter.h to support longlong type percpu counters. I am
> >  wondering is there any better way for this?
> > 
> 
> I can't immediately think of anything smarter.
> 
> One could of course implement a 64-bit percpu counter by simply
> concatenating two 32-bit counters.  That would be a little less efficient,
> but would introduce less source code and would mean that we don't need to
> keep two different implemetations in sync.  But one would need to do a bit
> of implementation, see how bad it looks.

Since long long is 64 bits on both 32bit and 64 bit arches, we can just
change percpu_counter type to long long (or s64) and just have one
implementation of percpu_counter?  
But reads and writes on 64 bit counters may not be atomic on all 32 bit arches.
So the implementation might have to be reviewed for that.

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-27 22:58     ` Ravikiran G Thirumalai
@ 2006-03-28  7:15       ` Laurent Vivier
  2006-03-28  8:02         ` Ravikiran G Thirumalai
  2006-03-28 18:01         ` Mingming Cao
  0 siblings, 2 replies; 49+ messages in thread
From: Laurent Vivier @ 2006-03-28  7:15 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, Mingming Cao, Takashi Sato, Badari Pulavarty,
	linux-kernel, ext2-devel


[-- Attachment #1.1: Type: text/plain, Size: 1956 bytes --]

Le mar 28/03/2006 à 00:58, Ravikiran G Thirumalai a écrit :
> On Mon, Mar 27, 2006 at 01:10:49PM -0800, Andrew Morton wrote:
> > Mingming Cao <cmm@us.ibm.com> wrote:
> > >
> > > I am wondering if we have (or plan to have) "long long " type of percpu
> > >  counters?  Andrew, Kiran, do you know?  
> > > 
> > >  It seems right now the percpu counters are used mostly by ext2/3 for
> > >  filesystem free blocks accounting. Right now the counter is "long" type,
> > >  which is not enough if we want to extend the filesystem limit from 2**31
> > >  to 2**32 on 32 bit machine.
> > > 
> > >  The patch from Takashi copies the whole percpu_count.h  and create a new
> > >  percpu_llcounter.h to support longlong type percpu counters. I am
> > >  wondering is there any better way for this?
> > > 
> > 
> > I can't immediately think of anything smarter.
> > 
> > One could of course implement a 64-bit percpu counter by simply
> > concatenating two 32-bit counters.  That would be a little less efficient,
> > but would introduce less source code and would mean that we don't need to
> > keep two different implemetations in sync.  But one would need to do a bit
> > of implementation, see how bad it looks.
> 
> Since long long is 64 bits on both 32bit and 64 bit arches, we can just
> change percpu_counter type to long long (or s64) and just have one
> implementation of percpu_counter?  
> But reads and writes on 64 bit counters may not be atomic on all 32 bit arches.
> So the implementation might have to be reviewed for that.

As 64bit per cpu counter is used only by ext3 and needed only on 64bit
architecture and when CONFIG_LBD is set, perhaps we can have only one
implementation, 32bit in the case of 32bit arch and 64bit in the case of
64bit arch + LBD, as I did in my 64bit patches for ext3 ?

Cheers,
Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #1.2: percpu.patch --]
[-- Type: text/x-patch, Size: 4594 bytes --]

Index: linux-2.6.16-lv/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.16-lv.orig/include/linux/percpu_counter.h	2006-03-27 15:47:03.000000000 +0200
+++ linux-2.6.16-lv/include/linux/percpu_counter.h	2006-03-27 15:47:14.000000000 +0200
@@ -16,8 +16,13 @@
 
 struct percpu_counter {
 	spinlock_t lock;
+#ifdef CONFIG_LBD
+	long long count;
+	long long *counters;
+#else
 	long count;
 	long *counters;
+#endif
 };
 
 #if NR_CPUS >= 16
@@ -30,7 +35,11 @@ static inline void percpu_counter_init(s
 {
 	spin_lock_init(&fbc->lock);
 	fbc->count = 0;
+#ifdef CONFIG_LBD
+	fbc->counters = alloc_percpu(long long);
+#else
 	fbc->counters = alloc_percpu(long);
+#endif
 }
 
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
@@ -38,10 +47,17 @@ static inline void percpu_counter_destro
 	free_percpu(fbc->counters);
 }
 
+#ifdef CONFIG_LBD
+void percpu_counter_mod(struct percpu_counter *fbc, long long amount);
+long long percpu_counter_sum(struct percpu_counter *fbc);
+
+static inline long long percpu_counter_read(struct percpu_counter *fbc)
+#else
 void percpu_counter_mod(struct percpu_counter *fbc, long amount);
 long percpu_counter_sum(struct percpu_counter *fbc);
 
 static inline long percpu_counter_read(struct percpu_counter *fbc)
+#endif
 {
 	return fbc->count;
 }
@@ -50,9 +66,15 @@ static inline long percpu_counter_read(s
  * It is possible for the percpu_counter_read() to return a small negative
  * number for some counter which should never be negative.
  */
+#ifdef CONFIG_LBD
+static inline long long percpu_counter_read_positive(struct percpu_counter *fbc)
+{
+	long long ret = fbc->count;
+#else
 static inline long percpu_counter_read_positive(struct percpu_counter *fbc)
 {
 	long ret = fbc->count;
+#endif
 
 	barrier();		/* Prevent reloads of fbc->count */
 	if (ret > 0)
@@ -63,7 +85,11 @@ static inline long percpu_counter_read_p
 #else
 
 struct percpu_counter {
+#ifdef CONFIG_LBD
+	long long count;
+#else
 	long count;
+#endif
 };
 
 static inline void percpu_counter_init(struct percpu_counter *fbc)
@@ -76,24 +102,40 @@ static inline void percpu_counter_destro
 }
 
 static inline void
+#ifdef CONFIG_LBD
+percpu_counter_mod(struct percpu_counter *fbc, long long amount)
+#else
 percpu_counter_mod(struct percpu_counter *fbc, long amount)
+#endif
 {
 	preempt_disable();
 	fbc->count += amount;
 	preempt_enable();
 }
 
+#ifdef CONFIG_LBD
+static inline long long percpu_counter_read(struct percpu_counter *fbc)
+#else
 static inline long percpu_counter_read(struct percpu_counter *fbc)
+#endif
 {
 	return fbc->count;
 }
 
+#ifdef CONFIG_LBD
+static inline long long percpu_counter_read_positive(struct percpu_counter *fbc)
+#else
 static inline long percpu_counter_read_positive(struct percpu_counter *fbc)
+#endif
 {
 	return fbc->count;
 }
 
+#ifdef CONFIG_LBD
+static inline long long percpu_counter_sum(struct percpu_counter *fbc)
+#else
 static inline long percpu_counter_sum(struct percpu_counter *fbc)
+#endif
 {
 	return percpu_counter_read_positive(fbc);
 }
Index: linux-2.6.16-lv/mm/swap.c
===================================================================
--- linux-2.6.16-lv.orig/mm/swap.c	2006-03-27 15:47:03.000000000 +0200
+++ linux-2.6.16-lv/mm/swap.c	2006-03-27 15:47:14.000000000 +0200
@@ -479,10 +479,17 @@ static int cpu_swap_callback(struct noti
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_SMP
+#ifdef CONFIG_LBD
+void percpu_counter_mod(struct percpu_counter *fbc, long long amount)
+{
+	long long count;
+	long long *pcount;
+#else
 void percpu_counter_mod(struct percpu_counter *fbc, long amount)
 {
 	long count;
 	long *pcount;
+#endif
 	int cpu = get_cpu();
 
 	pcount = per_cpu_ptr(fbc->counters, cpu);
@@ -503,15 +510,26 @@ EXPORT_SYMBOL(percpu_counter_mod);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
+#ifdef CONFIG_LBD
+long long percpu_counter_sum(struct percpu_counter *fbc)
+{
+	long long ret;
+#else
 long percpu_counter_sum(struct percpu_counter *fbc)
 {
 	long ret;
+#endif
 	int cpu;
 
 	spin_lock(&fbc->lock);
 	ret = fbc->count;
 	for_each_cpu(cpu) {
-		long *pcount = per_cpu_ptr(fbc->counters, cpu);
+#ifdef CONFIG_LBD
+		long long *pcount;
+#else
+		long *pcount;
+#endif
+		pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
 	spin_unlock(&fbc->lock);

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-28  7:15       ` Laurent Vivier
@ 2006-03-28  8:02         ` Ravikiran G Thirumalai
  2006-03-28 10:34           ` Laurent Vivier
  2006-03-28 18:01         ` Mingming Cao
  1 sibling, 1 reply; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-03-28  8:02 UTC (permalink / raw)
  To: Laurent Vivier
  Cc: Andrew Morton, Mingming Cao, Takashi Sato, Badari Pulavarty,
	linux-kernel, ext2-devel

On Tue, Mar 28, 2006 at 09:15:26AM +0200, Laurent Vivier wrote:
> Le mar 28/03/2006 à 00:58, Ravikiran G Thirumalai a écrit :
> > On Mon, Mar 27, 2006 at 01:10:49PM -0800, Andrew Morton wrote:
> > > Mingming Cao <cmm@us.ibm.com> wrote:
> 
> As 64bit per cpu counter is used only by ext3 and needed only on 64bit

No, per-cpu counters are generic, and used for nr_files counter in vfs, and
struct  proto.memory_allocated in net (on current -mm). 


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-28  8:02         ` Ravikiran G Thirumalai
@ 2006-03-28 10:34           ` Laurent Vivier
  0 siblings, 0 replies; 49+ messages in thread
From: Laurent Vivier @ 2006-03-28 10:34 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, Mingming Cao, Takashi Sato, Badari Pulavarty,
	linux-kernel, ext2-devel

[-- Attachment #1: Type: text/plain, Size: 757 bytes --]

Le mar 28/03/2006 à 10:02, Ravikiran G Thirumalai a écrit :
> On Tue, Mar 28, 2006 at 09:15:26AM +0200, Laurent Vivier wrote:
> > Le mar 28/03/2006 à 00:58, Ravikiran G Thirumalai a écrit :
> > > On Mon, Mar 27, 2006 at 01:10:49PM -0800, Andrew Morton wrote:
> > > > Mingming Cao <cmm@us.ibm.com> wrote:
> > 
> > As 64bit per cpu counter is used only by ext3 and needed only on 64bit
> 
> No, per-cpu counters are generic, and used for nr_files counter in vfs, and
> struct  proto.memory_allocated in net (on current -mm). 

In fact, I'm wondering if it is really a problem, as on 64bit arch
sizeof(long) = sizeof(long long) = 8 ...

Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-28  7:15       ` Laurent Vivier
  2006-03-28  8:02         ` Ravikiran G Thirumalai
@ 2006-03-28 18:01         ` Mingming Cao
  2006-03-29  9:13           ` Laurent Vivier
  1 sibling, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-03-28 18:01 UTC (permalink / raw)
  To: Laurent Vivier
  Cc: Ravikiran G Thirumalai, Andrew Morton, Takashi Sato,
	Badari Pulavarty, linux-kernel, ext2-devel

On Tue, 2006-03-28 at 09:15 +0200, Laurent Vivier wrote:
> Le mar 28/03/2006 à 00:58, Ravikiran G Thirumalai a écrit :
> > On Mon, Mar 27, 2006 at 01:10:49PM -0800, Andrew Morton wrote:
> > > Mingming Cao <cmm@us.ibm.com> wrote:
> > > >
> > > > I am wondering if we have (or plan to have) "long long " type of percpu
> > > >  counters?  Andrew, Kiran, do you know?  
> > > > 
> > > >  It seems right now the percpu counters are used mostly by ext2/3 for
> > > >  filesystem free blocks accounting. Right now the counter is "long" type,
> > > >  which is not enough if we want to extend the filesystem limit from 2**31
> > > >  to 2**32 on 32 bit machine.
> > > > 
> > > >  The patch from Takashi copies the whole percpu_count.h  and create a new
> > > >  percpu_llcounter.h to support longlong type percpu counters. I am
> > > >  wondering is there any better way for this?
> > > > 
> > > 
> > > I can't immediately think of anything smarter.
> > > 
> > > One could of course implement a 64-bit percpu counter by simply
> > > concatenating two 32-bit counters.  That would be a little less efficient,
> > > but would introduce less source code and would mean that we don't need to
> > > keep two different implemetations in sync.  But one would need to do a bit
> > > of implementation, see how bad it looks.
> > 
> > Since long long is 64 bits on both 32bit and 64 bit arches, we can just
> > change percpu_counter type to long long (or s64) and just have one
> > implementation of percpu_counter?  
> > But reads and writes on 64 bit counters may not be atomic on all 32 bit arches.
> > So the implementation might have to be reviewed for that.
> 
> As 64bit per cpu counter is used only by ext3 and needed only on 64bit
> architecture and when CONFIG_LBD is set, perhaps we can have only one
> implementation, 32bit in the case of 32bit arch and 64bit in the case of
> 64bit arch + LBD, as I did in my 64bit patches for ext3 ?
> 

The current percpu counter on 32 bit machine is "long", a signed value.
It's a problem for ext3 on 32 bit arch also, as the total number of free
blocks in ext3 is a type of u32. Isn't it? Did I miss something?


Mingming
> Cheers,
> Laurent


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-28 18:01         ` Mingming Cao
@ 2006-03-29  9:13           ` Laurent Vivier
       [not found]             ` <1143657317.4045.12.camel@dyn9047017067.beaverton.ibm.com>
                               ` (3 more replies)
  0 siblings, 4 replies; 49+ messages in thread
From: Laurent Vivier @ 2006-03-29  9:13 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Ravikiran G Thirumalai, Andrew Morton, Takashi Sato,
	Badari Pulavarty, linux-kernel, ext2-devel

[-- Attachment #1: Type: text/plain, Size: 2673 bytes --]

Le mar 28/03/2006 à 20:01, Mingming Cao a écrit :
> On Tue, 2006-03-28 at 09:15 +0200, Laurent Vivier wrote:
> > Le mar 28/03/2006 à 00:58, Ravikiran G Thirumalai a écrit :
> > > On Mon, Mar 27, 2006 at 01:10:49PM -0800, Andrew Morton wrote:
> > > > Mingming Cao <cmm@us.ibm.com> wrote:
> > > > >
> > > > > I am wondering if we have (or plan to have) "long long " type of percpu
> > > > >  counters?  Andrew, Kiran, do you know?  
> > > > > 
> > > > >  It seems right now the percpu counters are used mostly by ext2/3 for
> > > > >  filesystem free blocks accounting. Right now the counter is "long" type,
> > > > >  which is not enough if we want to extend the filesystem limit from 2**31
> > > > >  to 2**32 on 32 bit machine.
> > > > > 
> > > > >  The patch from Takashi copies the whole percpu_count.h  and create a new
> > > > >  percpu_llcounter.h to support longlong type percpu counters. I am
> > > > >  wondering is there any better way for this?
> > > > > 
> > > > 
> > > > I can't immediately think of anything smarter.
> > > > 
> > > > One could of course implement a 64-bit percpu counter by simply
> > > > concatenating two 32-bit counters.  That would be a little less efficient,
> > > > but would introduce less source code and would mean that we don't need to
> > > > keep two different implemetations in sync.  But one would need to do a bit
> > > > of implementation, see how bad it looks.
> > > 
> > > Since long long is 64 bits on both 32bit and 64 bit arches, we can just
> > > change percpu_counter type to long long (or s64) and just have one
> > > implementation of percpu_counter?  
> > > But reads and writes on 64 bit counters may not be atomic on all 32 bit arches.
> > > So the implementation might have to be reviewed for that.
> > 
> > As 64bit per cpu counter is used only by ext3 and needed only on 64bit
> > architecture and when CONFIG_LBD is set, perhaps we can have only one
> > implementation, 32bit in the case of 32bit arch and 64bit in the case of
> > 64bit arch + LBD, as I did in my 64bit patches for ext3 ?
> > 
> 
> The current percpu counter on 32 bit machine is "long", a signed value.
> It's a problem for ext3 on 32 bit arch also, as the total number of free
> blocks in ext3 is a type of u32. Isn't it? Did I miss something?

You're right, Mingming.

But I think that instead of changing "long" to "long long" we
should think about changing "long" to "unsigned long" in the per-cpu
counter structure.

Does anyone know why this counter is signed?

Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

[parent not found: <1143657317.4045.12.camel@dyn9047017067.beaverton.ibm.com>]

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
       [not found]             ` <1143657317.4045.12.camel@dyn9047017067.beaverton.ibm.com>
@ 2006-03-29 20:00               ` Ravikiran G Thirumalai
  2006-03-29 20:38                 ` Mingming Cao
  0 siblings, 1 reply; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-03-29 20:00 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Laurent Vivier, Andrew Morton, Takashi Sato, Badari Pulavarty,
	linux-kernel

On Wed, Mar 29, 2006 at 10:35:10AM -0800, Mingming Cao wrote:
> On Wed, 2006-03-29 at 11:13 +0200, Laurent Vivier wrote:
> > 
> > You're right, Mingming.
> > 
> > But I think instead of thinking to change "long" by "long long" we
> > should think about changing "long" by "unsigned long" in the per-cpu
> > counter structure.
> > 
> > Is there someone knowing why this counter is signed ?
> 
> I am wondering the same thing asked by Laurent. Initially I thought the
> signed value is there to prevent overflow, or to maintain a "int" type
> counters. Are those the intentions, kiran?

I don't know if the local counter version values can be unsigned in this
case.  Consider a case like this with the initial counter value to be 0,
and FBC_BATCH is 32 (8cpusx4)

cpu 1				cpu 2			cpu 3
--------			-------			--------
add(10)
//local = 10 fbc = 0.
				sub(5)
				//local = -5 fbc = 0
							add(31)
							//local = 31 fbc = 0

				sub(30)
				//local = 0 fbc = -35
				--------------->(A)

Now if the local counters were unsigned, and the global counters unsigned
too, counter read at A would result in a large value, which would mislead
the app.  Maybe it doesn't matter if we use percpu_counter_exceeds at
critical places, so these get caught, but that would mean going on all cpus
more often than before..and that would also mean weird values when we just
use percpu_counter_read to print these counters.

So maybe using long long is a simpler solution here? Andrew, thoughts?

> 
> But it seems the per cpu counters used in ext2/3 are all number of free
> blocks/inodes/directories.  So it should be always positive values.  It
> seems fine to change the percpu counters to type "unsigned long" for
> ext2/3 itself. But I am not sure if this will cause issues for other
> users of percpu counters.  Kiran, could you please confirm this?

I guess most of the uses for per-cpu counters will be up counters, we don't
need the signedness if it wasn't for the issues above.  The nr_files,
memory_allocated counters are up counters too.

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-29 20:00               ` Ravikiran G Thirumalai
@ 2006-03-29 20:38                 ` Mingming Cao
  2006-03-30  8:41                   ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-03-29 20:38 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Laurent Vivier, Andrew Morton, Takashi Sato, Badari Pulavarty,
	linux-kernel

On Wed, 2006-03-29 at 12:00 -0800, Ravikiran G Thirumalai wrote:
> On Wed, Mar 29, 2006 at 10:35:10AM -0800, Mingming Cao wrote:
> > On Wed, 2006-03-29 at 11:13 +0200, Laurent Vivier wrote:
> > > 
> > > You're right, Mingming.
> > > 
> > > But I think instead of thinking to change "long" by "long long" we
> > > should think about changing "long" by "unsigned long" in the per-cpu
> > > counter structure.
> > > 
> > > Is there someone knowing why this counter is signed ?
> > 
> > I am wondering the same thing asked by Laurent. Initially I thought the
> > signed value is there to prevent overflow, or to maintain a "int" type
> > counters. Are those the intentions, kiran?
> 
> I don't know if the local counter version values can be unsigned in this
> case.  Consider a case like this with the initial counter value to be 0,
> and FBC_BATCH is 32 (8cpusx4)
> 
> cpu 1				cpu 2			cpu 3
> --------			-------			--------
> add(10)
> //local = 10 fbc = 0.
> 				sub(5)
> 				//local = -5 fbc = 0
> 							add(31)
> 							//local = 31 fbc = 0
> 
> 				sub(30)
> 				//local = 0 fbc = -35
> 				--------------->(A)
> 
> Now if the local counters were unsigned, and the global counters unsigned
> too, counter read at A would result in a large value, which would mislead
> the app. 

I was thinking to change the global count to "unsigned long", but we
still need to use signed value (long) for the per cpu counters(local
counter), as they are relative values and could be negative.

Something like this:

struct percpu_counter {
        spinlock_t lock;
-       long count;
+	unsigned long count;
        long *counters;
};

This works for ext2/3, as the global value is always initialized to some
positive value (e.g. the # of free blocks when the filesystem is
created). But I am concerned about the other two current users of percpu
counters (nr_files in VFS and memory_allocated in network code), where
the global value could be initialized to 0, and which will have the issue
that you just described.


>  Maybe it doesn't matter if we use percpu_counter_exceeds at
> critical places, so these get caught, but that would mean going on all cpus
> more often than before..and that would also mean weird values when we just
> use percpu_counter_read to print these counters.
> 
Wild suggestion, how about we don't update the global counter if the
result is negative?
 
> So maybe using long long is a simpler solution here? Andrew, thoughts?
> 
> > 
> > But it seems the per cpu counters used in ext2/3 are all number of free
> > blocks/inodes/directories.  So it should be always positive values.  It
> > seems fine to change the percpu counters to type "unsigned long" for
> > ext2/3 itself. But I am not sure if this will cause issues for other
> > users of percpu counters.  Kiran, could you please confirm this?
> 
> I guess most of the uses for per-cpu counters will be up counters, we don't
> need the signedness if it wasn't for the issues above.  The nr_files,
> memory_allocated counters are up counters too.
> 
Okey, that's good to know. 
> Thanks,
> Kiran


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs)
  2006-03-29 20:38                 ` Mingming Cao
@ 2006-03-30  8:41                   ` Ravikiran G Thirumalai
  0 siblings, 0 replies; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-03-30  8:41 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Laurent Vivier, Andrew Morton, Takashi Sato, Badari Pulavarty,
	linux-kernel

On Wed, Mar 29, 2006 at 12:38:37PM -0800, Mingming Cao wrote:
> On Wed, 2006-03-29 at 12:00 -0800, Ravikiran G Thirumalai wrote:
> > On Wed, Mar 29, 2006 at 10:35:10AM -0800, Mingming Cao wrote:
> > 
> Wild suggestion, how about we don't update the global counter is the
> result is negative?

You mean just keep the local version even below -FBC_BATCH 
and only empty it to the global unsigned counter if the result is going to 
be +ve?  That would work I think.

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-29  9:13           ` Laurent Vivier
       [not found]             ` <1143657317.4045.12.camel@dyn9047017067.beaverton.ibm.com>
@ 2006-03-30  1:38             ` Mingming Cao
  2006-03-30  1:54               ` Andrew Morton
                                 ` (3 more replies)
  2006-03-30  1:39             ` [RFC][PATCH 1/2]ext3 block allocation/reservation fixes to support 2**32 block numbers Mingming Cao
  2006-03-30  1:39             ` [RFC][PATCH 2/2]Other ext3 in-kernel block number type fix " Mingming Cao
  3 siblings, 4 replies; 49+ messages in thread
From: Mingming Cao @ 2006-03-30  1:38 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Takashi Sato, Laurent Vivier, linux-kernel, ext2-devel,
	linux-fsdevel

There are places in the ext3 code that use "int" to represent block numbers
in the kernel (not on-disk). This seems to be the "only" reason why we can
only have 8TB ext3 rather than 16TB.  Most of the time it is just a bug, with
no particular reason not to use an unsigned 32-bit value, so the fix is easy.

However, the fix is not so straightforward for the ext3 block allocation
code, as ext3_new_block() returns a block number, and "-1" to indicate
block allocation failure. The ext3 block reservation code, called by
ext3_new_block(), thus also uses "int" for block numbers in some places.

The following patches fixed both the ext3 block allocation code, as well
as the simple ones.

This work is inspired by Takashi's work on extending the ext2/3
file/filesystem limits, but it focuses on the ext3 filesystem limit only, and
fixes the block allocation/reservation code to support in-kernel 2**32
block numbers. Also thanks to Laurent for his review.

I have verified these two patches on a 64 bit machine with a 10TB ext3
filesystem; fsx runs fine for a few hours. Also tested on a 32 bit machine
with <8TB ext3.

Please review these patches; I would appreciate comments.

The thing that remains to be done to complete this work is the issue with
the current percpu counter, which cannot handle a u32 type count well.




^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30  1:38             ` [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB Mingming Cao
@ 2006-03-30  1:54               ` Andrew Morton
  2006-03-31 22:42                 ` Mingming Cao
  2006-04-10  9:11                 ` [Ext2-devel] " Laurent Vivier
  2006-03-30 17:36               ` Andreas Dilger
                                 ` (2 subsequent siblings)
  3 siblings, 2 replies; 49+ messages in thread
From: Andrew Morton @ 2006-03-30  1:54 UTC (permalink / raw)
  To: cmm; +Cc: sho, Laurent.Vivier, linux-kernel, ext2-devel, linux-fsdevel

Mingming Cao <cmm@us.ibm.com> wrote:
>
> The things need to be done to complete this work is the issue with
>  current percpu counter, which could not handle u32 type count well. 

I'm surprised there's much of a problem here.  It is a 32-bit value, so it
should mainly be a matter of treating the return value from
percpu_counter_read() as unsigned long.

However a stickier problem is when dealing with a filesystem which has,
say, 0xffff_ff00 blocks.  Because percpu counters are approximate, and a
counter which really has a value of 0xffff_feee might return 0x00000123. 
What do we do then?

Of course the simple option is to nuke the percpu counters in ext3 and use
atomic_long_t (which is signed, so appropriate treat-it-as-unsigned code
would be needed).  I doubt if the percpu counters in ext3 are gaining us
much.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30  1:54               ` Andrew Morton
@ 2006-03-31 22:42                 ` Mingming Cao
  2006-04-02 20:13                   ` Mingming Cao
  2006-04-10  9:11                 ` [Ext2-devel] " Laurent Vivier
  1 sibling, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-03-31 22:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: sho, Laurent.Vivier, linux-kernel, ext2-devel, linux-fsdevel

On Wed, 2006-03-29 at 17:54 -0800, Andrew Morton wrote:
> Mingming Cao <cmm@us.ibm.com> wrote:
> >
> > The things need to be done to complete this work is the issue with
> >  current percpu counter, which could not handle u32 type count well. 
> 
> I'm surprised there's much of a problem here.  It is a 32-bit value, so it
> should mainly be a matter of treating the return value from
> percpu_counter_read() as unsigned long.
> 
> However a stickier problem is when dealing with a filesystem which has,
> say, 0xffff_ff00 blocks.  Because percpu counters are approximate, and a
> counter which really has a value of 0xffff_feee might return 0x00000123. 
> What do we do then?
> 

Hmm... I think we had this issue already even with today's 2**31 ext3.
Since ext2/3 always use percpu_counter_read_positive() to get the total
number of free blocks, so if the real free blocks is 0x0fff_feee, and
the approximate value from the percpu counter is 0xf000_0123, the
percpu_counter_read_positive() will return back 0x0000123.

> Of course the simple option is to nuke the percpu counters in ext3 and use
> atomic_long_t (which is signed, so appropriate treat-it-as-unsigned code
> would be needed).  I doubt if the percpu counters in ext3 are gaining us
> much.

Sounds like the simple solution so far.

Mingming


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-31 22:42                 ` Mingming Cao
@ 2006-04-02 20:13                   ` Mingming Cao
  0 siblings, 0 replies; 49+ messages in thread
From: Mingming Cao @ 2006-04-02 20:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: sho, Laurent.Vivier, linux-kernel, ext2-devel, linux-fsdevel

On Fri, 2006-03-31 at 14:42 -0800, Mingming Cao wrote: 
> On Wed, 2006-03-29 at 17:54 -0800, Andrew Morton wrote:
> > Mingming Cao <cmm@us.ibm.com> wrote:
> > >
> > > The things need to be done to complete this work is the issue with
> > >  current percpu counter, which could not handle u32 type count well. 
> > 
> > I'm surprised there's much of a problem here.  It is a 32-bit value, so it
> > should mainly be a matter of treating the return value from
> > percpu_counter_read() as unsigned long.
> > 
> > However a stickier problem is when dealing with a filesystem which has,
> > say, 0xffff_ff00 blocks.  Because percpu counters are approximate, and a
> > counter which really has a value of 0xffff_feee might return 0x00000123. 
> > What do we do then?
> > 
> 
> Hmm... I think we had this issue already even with today's 2**31 ext3.
> Since ext2/3 always use percpu_counter_read_positive() to get the total
> number of free blocks, so if the real free blocks is 0x0fff_feee, and
> the approximate value from the percpu counter is 0xf000_0123, the
> percpu_counter_read_positive() will return back 0x0000123.
> 

In fact, even worse, percpu_counter_read_positive() always returns 1 if
the value is negative (>2**31). So this is not suitable for ext3's
2**32 block numbers. I think we should use percpu_counter_read() and
cast it to unsigned long for ext3's free blocks (and probably for free
inodes also).

Thinking it over again, I think we could fix the possible overflow issue
(caused by the approximate value) that Andrew was concerned about: before
updating the global counter, check whether we are trying to increase the
global counter but get a smaller value, or we are trying to decrease the
global counter but instead get a larger value. If either of these is true, we
should not update the global counter at that moment. This check only
happens when trying to update the global counter from a local counter, and
is probably not needed for those who don't care about unsigned long
counters. This way we should not get ridiculous values from the counter.

Comments?

Mingming

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30  1:54               ` Andrew Morton
  2006-03-31 22:42                 ` Mingming Cao
@ 2006-04-10  9:11                 ` Laurent Vivier
  2006-04-10  8:24                   ` Andrew Morton
  2006-04-10 16:57                   ` Mingming Cao
  1 sibling, 2 replies; 49+ messages in thread
From: Laurent Vivier @ 2006-04-10  9:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mingming Cao, Takashi Sato, linux-kernel, ext2-devel,
	linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1155 bytes --]

Le jeu 30/03/2006 à 03:54, Andrew Morton a écrit :
> Mingming Cao <cmm@us.ibm.com> wrote:
> >
> > The things need to be done to complete this work is the issue with
> >  current percpu counter, which could not handle u32 type count well. 
> 
> I'm surprised there's much of a problem here.  It is a 32-bit value, so it
> should mainly be a matter of treating the return value from
> percpu_counter_read() as unsigned long.
> 
> However a stickier problem is when dealing with a filesystem which has,
> say, 0xffff_ff00 blocks.  Because percpu counters are approximate, and a
> counter which really has a value of 0xffff_feee might return 0x00000123. 
> What do we do then?
> 
> Of course the simple option is to nuke the percpu counters in ext3 and use
> atomic_long_t (which is signed, so appropriate treat-it-as-unsigned code
> would be needed).  I doubt if the percpu counters in ext3 are gaining us
> much.

I tried to do something along these lines.
Does the attached patch look like the thing you thought about?

Regards,
Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: cpu_counter --]
[-- Type: text/x-patch, Size: 6792 bytes --]

Index: linux-2.6.16-lv/fs/ext3/balloc.c
===================================================================
--- linux-2.6.16-lv.orig/fs/ext3/balloc.c	2006-04-07 16:27:11.000000000 +0200
+++ linux-2.6.16-lv/fs/ext3/balloc.c	2006-04-07 17:05:28.000000000 +0200
@@ -471,7 +471,7 @@ do_more:
 		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
 			group_freed);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+	atomic_long_set(&sbi->s_freeblocks_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1129,7 +1129,7 @@ static int ext3_has_free_blocks(struct e
 {
 	sector_t free_blocks, root_blocks;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks = (sector_t)atomic_long_read(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
 	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
@@ -1381,7 +1381,7 @@ allocated:
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
+	atomic_long_dec(&sbi->s_freeblocks_counter);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext3_journal_dirty_metadata(handle, gdp_bh);
Index: linux-2.6.16-lv/include/linux/ext3_fs_sb.h
===================================================================
--- linux-2.6.16-lv.orig/include/linux/ext3_fs_sb.h	2006-04-07 16:27:11.000000000 +0200
+++ linux-2.6.16-lv/include/linux/ext3_fs_sb.h	2006-04-07 17:01:23.000000000 +0200
@@ -20,7 +20,6 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
 #endif
 #include <linux/rbtree.h>
 
@@ -54,9 +53,9 @@ struct ext3_sb_info {
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
-	struct percpu_counter s_freeblocks_counter;
-	struct percpu_counter s_freeinodes_counter;
-	struct percpu_counter s_dirs_counter;
+	atomic_long_t s_freeblocks_counter;
+	atomic_long_t s_freeinodes_counter;
+	atomic_long_t s_dirs_counter;
 	struct blockgroup_lock s_blockgroup_lock;
 
 	/* root of the per fs reservation window tree */
Index: linux-2.6.16-lv/fs/ext3/super.c
===================================================================
--- linux-2.6.16-lv.orig/fs/ext3/super.c	2006-04-07 16:27:11.000000000 +0200
+++ linux-2.6.16-lv/fs/ext3/super.c	2006-04-07 17:14:22.000000000 +0200
@@ -404,9 +404,6 @@ static void ext3_put_super (struct super
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -1580,9 +1577,9 @@ static int ext3_fill_super (struct super
 		goto failed_mount;
 	}
 
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
+	atomic_long_set(&sbi->s_freeblocks_counter, 0);
+	atomic_long_set(&sbi->s_freeinodes_counter, 0);
+	atomic_long_set(&sbi->s_dirs_counter, 0);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -1730,11 +1727,11 @@ static int ext3_fill_super (struct super
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	atomic_long_set(&sbi->s_freeblocks_counter,
 		ext3_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	atomic_long_set(&sbi->s_freeinodes_counter,
 		ext3_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
+	atomic_long_set(&sbi->s_dirs_counter,
 		ext3_count_dirs(sb));
 
 	lock_kernel();
Index: linux-2.6.16-lv/fs/ext3/resize.c
===================================================================
--- linux-2.6.16-lv.orig/fs/ext3/resize.c	2006-04-07 16:27:11.000000000 +0200
+++ linux-2.6.16-lv/fs/ext3/resize.c	2006-04-07 17:12:13.000000000 +0200
@@ -871,9 +871,9 @@ int ext3_group_add(struct super_block *s
 		input->reserved_blocks);
 
 	/* Update the free space counts */
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	atomic_long_set(&sbi->s_freeblocks_counter,
 			   input->free_blocks_count);
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	atomic_long_set(&sbi->s_freeinodes_counter,
 			   EXT3_INODES_PER_GROUP(sb));
 
 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
Index: linux-2.6.16-lv/fs/ext3/ialloc.c
===================================================================
--- linux-2.6.16-lv.orig/fs/ext3/ialloc.c	2006-04-07 16:27:11.000000000 +0200
+++ linux-2.6.16-lv/fs/ext3/ialloc.c	2006-04-07 17:09:54.000000000 +0200
@@ -170,9 +170,9 @@ void ext3_free_inode (handle_t *handle, 
 				gdp->bg_used_dirs_count = cpu_to_le16(
 				  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
 			spin_unlock(sb_bgl_lock(sbi, block_group));
-			percpu_counter_inc(&sbi->s_freeinodes_counter);
+			atomic_long_inc(&sbi->s_freeinodes_counter);
 			if (is_directory)
-				percpu_counter_dec(&sbi->s_dirs_counter);
+				atomic_long_dec(&sbi->s_dirs_counter);
 
 		}
 		BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
@@ -207,7 +207,7 @@ static int find_group_dir(struct super_b
 	struct buffer_head *bh;
 	int group, best_group = -1;
 
-	freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
+	freei = (sector_t)atomic_long_read(&EXT3_SB(sb)->s_freeinodes_counter);
 	avefreei = freei / ngroups;
 
 	for (group = 0; group < ngroups; group++) {
@@ -269,11 +269,11 @@ static int find_group_orlov(struct super
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
 
-	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	freei = (sector_t)atomic_long_read(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
-	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	freeb = (sector_t)atomic_long_read(&sbi->s_freeblocks_counter);
 	avefreeb = freeb / ngroups;
-	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+	ndirs = (sector_t)atomic_long_read(&sbi->s_dirs_counter);
 
 	if ((parent == sb->s_root->d_inode) ||
 	    (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
@@ -539,9 +539,9 @@ got:
 	err = ext3_journal_dirty_metadata(handle, bh2);
 	if (err) goto fail;
 
-	percpu_counter_dec(&sbi->s_freeinodes_counter);
+	atomic_long_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
-		percpu_counter_inc(&sbi->s_dirs_counter);
+		atomic_long_inc(&sbi->s_dirs_counter);
 	sb->s_dirt = 1;
 
 	inode->i_uid = current->fsuid;

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-10  9:11                 ` [Ext2-devel] " Laurent Vivier
@ 2006-04-10  8:24                   ` Andrew Morton
  2006-04-13 15:26                     ` Laurent Vivier
  2006-04-10 16:57                   ` Mingming Cao
  1 sibling, 1 reply; 49+ messages in thread
From: Andrew Morton @ 2006-04-10  8:24 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: cmm, sho, linux-kernel, ext2-devel, linux-fsdevel

Laurent Vivier <Laurent.Vivier@bull.net> wrote:
>
> Does the attached patch look like the thing you though about ?

I guess so.  But it'll need a lot of performance testing on big SMP
to work out what the impact is.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-10  8:24                   ` Andrew Morton
@ 2006-04-13 15:26                     ` Laurent Vivier
  2006-04-17 21:07                       ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 49+ messages in thread
From: Laurent Vivier @ 2006-04-13 15:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mingming Cao, Takashi Sato, linux-kernel, ext2-devel,
	linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 816 bytes --]

Le lun 10/04/2006 à 10:24, Andrew Morton a écrit :
> Laurent Vivier <Laurent.Vivier@bull.net> wrote:
> >
> > Does the attached patch look like the thing you though about ?
> 
> I guess so.  But it'll need a lot of performance testing on big SMP
> to work out what the impact is.

I made some tests with dbench:

IBM x440: 8 CPUs hyperthreaded = 16 CPUs (Xeon at 1.4 Ghz)

with percpu_counter:

        Throughput 188.365 MB/sec 16 procs
        Throughput 226.164 MB/sec 32 procs
        Throughput 142.913 MB/sec 64 procs

with atomic_long_t:

        Throughput 194.385 MB/sec 16 procs
        Throughput 237.273 MB/sec 32 procs
        Throughput 160.751 MB/sec 64 procs

Regards,
Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-13 15:26                     ` Laurent Vivier
@ 2006-04-17 21:07                       ` Ravikiran G Thirumalai
  2006-04-17 21:09                         ` Arjan van de Ven
  0 siblings, 1 reply; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-04-17 21:07 UTC (permalink / raw)
  To: Laurent Vivier
  Cc: Andrew Morton, Mingming Cao, Takashi Sato, linux-kernel,
	ext2-devel, linux-fsdevel

On Thu, Apr 13, 2006 at 05:26:39PM +0200, Laurent Vivier wrote:
> Le lun 10/04/2006 à 10:24, Andrew Morton a écrit :
> > Laurent Vivier <Laurent.Vivier@bull.net> wrote:
> > >
> > > Does the attached patch look like the thing you though about ?
> > 
> > I guess so.  But it'll need a lot of performance testing on big SMP
> > to work out what the impact is.
> 
> I made some tests with dbench:
> 
> IBM x440: 8 CPUs hyperthreaded = 16 CPUs (Xeon at 1.4 Ghz)
> 

I ran the same tests on a 16 core EM64T box very similar to the one you ran
dbench on :). Dbench results on ext3 vary quite a bit.  I couldn't get
to a statistically significant conclusion.  For example,

With atomic counters, 32 clients, 3 runs
Throughput 187.712 MB/sec 32 procs
Throughput 197.059 MB/sec 32 procs
Throughput 203.522 MB/sec 32 procs

Without atomic counters (per-cpu counters), 32 clients, 3 runs
Throughput 228.805 MB/sec 32 procs
Throughput 155.831 MB/sec 32 procs
Throughput 134.777 MB/sec 32 procs

The oprofile profile for the atomic counter case looks like this:

CPU: P4 / Xeon with 2 hyper-threads, speed 3002.77 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events (time during which processor is not
stopped) with a unit mask of 0x01 (mandatory) count 100000
samples  %        app name                 symbol name
180505286 57.7844  vmlinux-t                poll_idle
51944524 16.6288  vmlinux-t                ext3_test_allocatable
43648955 13.9731  vmlinux-t                bitmap_search_next_usable_block
2892251   0.9259  vmlinux-t                copy_user_generic
2099969   0.6723  vmlinux-t                do_get_write_access
1459523   0.4672  vmlinux-t                journal_dirty_metadata
1393413   0.4461  vmlinux-t                journal_stop

So the atomic counters in question are not even hotspots on this workload,
so IMHO, dbench cannot be used to come to any conclusion regarding per-cpu
counters vs atomics here.

Thanks,
Kiran


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-17 21:07                       ` Ravikiran G Thirumalai
@ 2006-04-17 21:09                         ` Arjan van de Ven
  2006-04-17 21:32                           ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 49+ messages in thread
From: Arjan van de Ven @ 2006-04-17 21:09 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Laurent Vivier, Andrew Morton, Mingming Cao, Takashi Sato,
	linux-kernel, ext2-devel, linux-fsdevel

On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> 
> 
> I ran the same tests on a 16 core EM64T box very similar to the one
> you ran
> dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> get 
> to a statistically significant conclusion  For eg,


dbench is not a good performance benchmark. At all. Don't use it for
that ;)



^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-17 21:09                         ` Arjan van de Ven
@ 2006-04-17 21:32                           ` Ravikiran G Thirumalai
  2006-04-18  7:14                             ` Laurent Vivier
  0 siblings, 1 reply; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-04-17 21:32 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Laurent Vivier, Andrew Morton, Mingming Cao, Takashi Sato,
	linux-kernel, ext2-devel, linux-fsdevel

On Mon, Apr 17, 2006 at 11:09:36PM +0200, Arjan van de Ven wrote:
> On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> > 
> > 
> > I ran the same tests on a 16 core EM64T box very similar to the one
> > you ran
> > dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> > get 
> > to a statistically significant conclusion  For eg,
> 
> 
> dbench is not a good performance benchmark. At all. Don't use it for
> that ;)

Agreed. (I did not mean to use it in the first place :).  I was just trying 
to verify the benchmark results posted earlier)

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-17 21:32                           ` Ravikiran G Thirumalai
@ 2006-04-18  7:14                             ` Laurent Vivier
  2006-04-18  7:30                               ` Arjan van de Ven
  0 siblings, 1 reply; 49+ messages in thread
From: Laurent Vivier @ 2006-04-18  7:14 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Arjan van de Ven, Andrew Morton, Mingming Cao, Takashi Sato,
	linux-kernel, ext2-devel, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 934 bytes --]

Le lun 17/04/2006 à 23:32, Ravikiran G Thirumalai a écrit :
> On Mon, Apr 17, 2006 at 11:09:36PM +0200, Arjan van de Ven wrote:
> > On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> > > 
> > > 
> > > I ran the same tests on a 16 core EM64T box very similar to the one
> > > you ran
> > > dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> > > get 
> > > to a statistically significant conclusion  For eg,
> > 
> > 
> > dbench is not a good performance benchmark. At all. Don't use it for
> > that ;)
> 
> Agreed. (I did not mean to use it in the first place :).  I was just trying 
> to verify the benchmark results posted earlier)
> 
> Thanks,
> Kiran

What would be a good performance benchmark to determine whether we should
use atomic_t instead of percpu_counter?

Regards,
Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-18  7:14                             ` Laurent Vivier
@ 2006-04-18  7:30                               ` Arjan van de Ven
  2006-04-18 10:57                                 ` Laurent Vivier
                                                   ` (2 more replies)
  0 siblings, 3 replies; 49+ messages in thread
From: Arjan van de Ven @ 2006-04-18  7:30 UTC (permalink / raw)
  To: Laurent Vivier
  Cc: Ravikiran G Thirumalai, Andrew Morton, Mingming Cao, Takashi Sato,
	linux-kernel, ext2-devel, linux-fsdevel

On Tue, 2006-04-18 at 09:14 +0200, Laurent Vivier wrote:
> Le lun 17/04/2006 à 23:32, Ravikiran G Thirumalai a écrit :
> > On Mon, Apr 17, 2006 at 11:09:36PM +0200, Arjan van de Ven wrote:
> > > On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> > > > 
> > > > 
> > > > I ran the same tests on a 16 core EM64T box very similar to the one
> > > > you ran
> > > > dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> > > > get 
> > > > to a statistically significant conclusion  For eg,
> > > 
> > > 
> > > dbench is not a good performance benchmark. At all. Don't use it for
> > > that ;)
> > 
> > Agreed. (I did not mean to use it in the first place :).  I was just trying 
> > to verify the benchmark results posted earlier)
> > 
> > Thanks,
> > Kiran
> 
> What is the good performance benchmark to know if we should use atomic_t
> instead of percpu_counter ?

you probably want something like postal/postmark instead or so (although
that's not ideal either), at least that's reproducible


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-18  7:30                               ` Arjan van de Ven
@ 2006-04-18 10:57                                 ` Laurent Vivier
  2006-04-18 19:08                                   ` Ravikiran G Thirumalai
  2006-04-18 14:09                                 ` Laurent Vivier
  2006-04-18 21:01                                 ` Mingming Cao
  2 siblings, 1 reply; 49+ messages in thread
From: Laurent Vivier @ 2006-04-18 10:57 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ravikiran G Thirumalai, Andrew Morton, Mingming Cao, Takashi Sato,
	linux-kernel, ext2-devel, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2751 bytes --]

Le mar 18/04/2006 à 09:30, Arjan van de Ven a écrit :
> On Tue, 2006-04-18 at 09:14 +0200, Laurent Vivier wrote:
> > Le lun 17/04/2006 à 23:32, Ravikiran G Thirumalai a écrit :
> > > On Mon, Apr 17, 2006 at 11:09:36PM +0200, Arjan van de Ven wrote:
> > > > On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> > > > > 
> > > > > 
> > > > > I ran the same tests on a 16 core EM64T box very similar to the one
> > > > > you ran
> > > > > dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> > > > > get 
> > > > > to a statistically significant conclusion  For eg,
> > > > 
> > > > 
> > > > dbench is not a good performance benchmark. At all. Don't use it for
> > > > that ;)
> > > 
> > > Agreed. (I did not mean to use it in the first place :).  I was just trying 
> > > to verify the benchmark results posted earlier)
> > > 
> > > Thanks,
> > > Kiran
> > 
> > What is the good performance benchmark to know if we should use atomic_t
> > instead of percpu_counter ?
> 
> you probably want something like postal/postmark instead or so (although
> that's not ideal either), at least that's reproducable

I ran tests on the same system (x440) with postmark-1.51:

pm> set numbers 100000
pm> set transactions 250000
pm> run

With atomic_t:

Time:
        3761 seconds total
        2414 seconds of transactions (103 per second)

Files:
        225064 created (59 per second)
                Creation alone: 100000 files (87 per second)
                Mixed with transactions: 125064 files (51 per second)
        124961 read (51 per second)
        124895 appended (51 per second)
        225064 deleted (59 per second)
                Deletion alone: 100128 files (503 per second)
                Mixed with transactions: 124936 files (51 per second)

Data:
        731.14 megabytes read (199.07 kilobytes per second)
        1359.02 megabytes written (370.02 kilobytes per second)

With percpu_counter:

Time:
        3787 seconds total
        2422 seconds of transactions (103 per second)

Files:
        225064 created (59 per second)
                Creation alone: 100000 files (85 per second)
                Mixed with transactions: 125064 files (51 per second)
        124961 read (51 per second)
        124895 appended (51 per second)
        225064 deleted (59 per second)
                Deletion alone: 100128 files (503 per second)
                Mixed with transactions: 124936 files (51 per second)

Data:
        731.14 megabytes read (197.70 kilobytes per second)
        1359.02 megabytes written (367.48 kilobytes per second)

-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-18 10:57                                 ` Laurent Vivier
@ 2006-04-18 19:08                                   ` Ravikiran G Thirumalai
  0 siblings, 0 replies; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-04-18 19:08 UTC (permalink / raw)
  To: Laurent Vivier
  Cc: Arjan van de Ven, Andrew Morton, Mingming Cao, Takashi Sato,
	linux-kernel, ext2-devel, linux-fsdevel

On Tue, Apr 18, 2006 at 12:57:00PM +0200, Laurent Vivier wrote:
> 
> I made tests on same system (x440) with postmark-1.51 :
> 
> pm> set numbers 100000
> pm> set transactions 250000
> pm> run
> 
> With atomic_t:
> 
> Time:
>         3761 seconds total
>         2414 seconds of transactions (103 per second)
> 
> Files:
>         225064 created (59 per second)
>                 Creation alone: 100000 files (87 per second)
>                 Mixed with transactions: 125064 files (51 per second)
>         124961 read (51 per second)
>         124895 appended (51 per second)
>         225064 deleted (59 per second)
>                 Deletion alone: 100128 files (503 per second)
>                 Mixed with transactions: 124936 files (51 per second)
> 
> Data:
>         731.14 megabytes read (199.07 kilobytes per second)
>         1359.02 megabytes written (370.02 kilobytes per second)
> 
> With percpu_counter:
> 
> Time:
>         3787 seconds total
>         2422 seconds of transactions (103 per second)
> 
> Files:
>         225064 created (59 per second)
>                 Creation alone: 100000 files (85 per second)
>                 Mixed with transactions: 125064 files (51 per second)
>         124961 read (51 per second)
>         124895 appended (51 per second)
>         225064 deleted (59 per second)
>                 Deletion alone: 100128 files (503 per second)
>                 Mixed with transactions: 124936 files (51 per second)
> 
> Data:
>         731.14 megabytes read (197.70 kilobytes per second)
>         1359.02 megabytes written (367.48 kilobytes per second)

Can we get oprofile output for these tests please?  It will give us a clue as
to how much of a hot spot the ext3 atomic counters are with this benchmark.
Also, it will be nice to have results for 3-5 iterations of the test
to make sure we are looking at statistically significant numbers.

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-18  7:30                               ` Arjan van de Ven
  2006-04-18 10:57                                 ` Laurent Vivier
@ 2006-04-18 14:09                                 ` Laurent Vivier
  2006-04-18 21:01                                 ` Mingming Cao
  2 siblings, 0 replies; 49+ messages in thread
From: Laurent Vivier @ 2006-04-18 14:09 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ravikiran G Thirumalai, Andrew Morton, Mingming Cao, Takashi Sato,
	linux-kernel, ext2-devel, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 4300 bytes --]

Le mar 18/04/2006 à 09:30, Arjan van de Ven a écrit :
> On Tue, 2006-04-18 at 09:14 +0200, Laurent Vivier wrote:
> > Le lun 17/04/2006 à 23:32, Ravikiran G Thirumalai a écrit :
> > > On Mon, Apr 17, 2006 at 11:09:36PM +0200, Arjan van de Ven wrote:
> > > > On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> > > > > 
> > > > > 
> > > > > I ran the same tests on a 16 core EM64T box very similar to the one
> > > > > you ran
> > > > > dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> > > > > get 
> > > > > to a statistically significant conclusion  For eg,
> > > > 
> > > > 
> > > > dbench is not a good performance benchmark. At all. Don't use it for
> > > > that ;)
> > > 
> > > Agreed. (I did not mean to use it in the first place :).  I was just trying 
> > > to verify the benchmark results posted earlier)
> > > 
> > > Thanks,
> > > Kiran
> > 
> > What is the good performance benchmark to know if we should use atomic_t
> > instead of percpu_counter ?
> 
> you probably want something like postal/postmark instead or so (although
> that's not ideal either), at least that's reproducable

I made some tests with kernbench too:

***** With percpu_counter:

16 cpus found
Cleaning source tree...
Caching kernel source in ram...
No old config found, using defconfig
Making mrproper
Making defconfig...
Kernel 2.6.16
Performing 5 runs of
make -j 8
make -j 64
make -j

All data logged to kernbench.log
Warmup run...
Half load -j 8 run number 1...
Half load -j 8 run number 2...
Half load -j 8 run number 3...
Half load -j 8 run number 4...
Half load -j 8 run number 5...
Average Half load -j 8 Run (std deviation):
Elapsed Time 120.68 (0.425558)
User Time 583.488 (0.54099)
System Time 84.716 (0.345948)
Percent CPU 553 (2)
Context Switches 13146.4 (66.3272)
Sleeps 26998.2 (297.078)

Optimal load -j 64 run number 1...
Optimal load -j 64 run number 2...
Optimal load -j 64 run number 3...
Optimal load -j 64 run number 4...
Optimal load -j 64 run number 5...
Average Optimal load -j 64 Run (std deviation):
Elapsed Time 86.496 (0.335827)
User Time 809.699 (238.449)
System Time 103.137 (19.423)
Percent CPU 945.3 (413.544)
Context Switches 32549.5 (20471.2)
Sleeps 34308 (7795.17)

Maximal load -j run number 1...
Maximal load -j run number 2...
Maximal load -j run number 3...
Maximal load -j run number 4...
Maximal load -j run number 5...
Average Maximal load -j Run (std deviation):
Elapsed Time 86.47 (0.321636)
User Time 883.568 (219.647)
System Time 108.728 (17.597)
Percent CPU 1073.8 (381.226)
Context Switches 31920.4 (16443.3)
Sleeps 30472.5 (8402.01)

***** With atomic_long_t

16 cpus found
Cleaning source tree...
Caching kernel source in ram...
No old config found, using defconfig
Making mrproper
Making defconfig...
Kernel 2.6.16
Performing 5 runs of
make -j 8
make -j 64
make -j

All data logged to kernbench.log
Warmup run...
Half load -j 8 run number 1...
Half load -j 8 run number 2...
Half load -j 8 run number 3...
Half load -j 8 run number 4...
Half load -j 8 run number 5...
Average Half load -j 8 Run (std deviation):
Elapsed Time 120.468 (0.724134)
User Time 581.226 (0.497624)
System Time 84.358 (0.45417)
Percent CPU 551.8 (3.19374)
Context Switches 13085.6 (108.579)
Sleeps 26965.8 (189.384)

Optimal load -j 64 run number 1...
Optimal load -j 64 run number 2...
Optimal load -j 64 run number 3...
Optimal load -j 64 run number 4...
Optimal load -j 64 run number 5...
Average Optimal load -j 64 Run (std deviation):
Elapsed Time 86.25 (0.263439)
User Time 805.828 (236.752)
System Time 102.262 (18.8792)
Percent CPU 942.7 (412.059)
Context Switches 32339.7 (20299.6)
Sleeps 34301.9 (7741.15)

Maximal load -j run number 1...
Maximal load -j run number 2...
Maximal load -j run number 3...
Maximal load -j run number 4...
Maximal load -j run number 5...
Average Maximal load -j Run (std deviation):
Elapsed Time 85.868 (0.757905)
User Time 879.129 (218.053)
System Time 107.847 (17.2136)
Percent CPU 1072.73 (381.349)
Context Switches 31854.5 (16297.1)
Sleeps 30436.2 (8399.98)


Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-18  7:30                               ` Arjan van de Ven
  2006-04-18 10:57                                 ` Laurent Vivier
  2006-04-18 14:09                                 ` Laurent Vivier
@ 2006-04-18 21:01                                 ` Mingming Cao
  2006-04-20 11:28                                   ` Laurent Vivier
       [not found]                                   ` <1145543970.5872.38.camel@openx2.frec.bull.fr>
  2 siblings, 2 replies; 49+ messages in thread
From: Mingming Cao @ 2006-04-18 21:01 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Laurent Vivier, Ravikiran G Thirumalai, Andrew Morton,
	Takashi Sato, linux-kernel, ext2-devel, linux-fsdevel

On Tue, 2006-04-18 at 09:30 +0200, Arjan van de Ven wrote:
> On Tue, 2006-04-18 at 09:14 +0200, Laurent Vivier wrote:
> > Le lun 17/04/2006 à 23:32, Ravikiran G Thirumalai a écrit :
> > > On Mon, Apr 17, 2006 at 11:09:36PM +0200, Arjan van de Ven wrote:
> > > > On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> > > > > 
> > > > > 
> > > > > I ran the same tests on a 16 core EM64T box very similar to the one
> > > > > you ran
> > > > > dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> > > > > get 
> > > > > to a statistically significant conclusion  For eg,
> > > > 
> > > > 
> > > > dbench is not a good performance benchmark. At all. Don't use it for
> > > > that ;)
> > > 
> > > Agreed. (I did not mean to use it in the first place :).  I was just trying 
> > > to verify the benchmark results posted earlier)
> > > 
> > > Thanks,
> > > Kiran
> > 
> > What is the good performance benchmark to know if we should use atomic_t
> > instead of percpu_counter ?
> 
> you probably want something like postal/postmark instead or so (although
> that's not ideal either), at least that's reproducable
> 
postmark is a single threaded benchmark.

The ext3 filesystem free blocks counter is mostly updated in the block
allocation and free code. So, a test with many, many threads doing block
allocation/deallocation simultaneously will stress the free blocks
counter accounting better than a single-threaded fs benchmark. After
all, the main reason we chose to use a percpu counter for the free blocks
counter in the first place, I believe, was to support parallel block
allocation.

I would suggest running tiobench with many threads (>256), or even better,
running tiobench with many dd tests in the background.


Mingming


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-18 21:01                                 ` Mingming Cao
@ 2006-04-20 11:28                                   ` Laurent Vivier
       [not found]                                   ` <1145543970.5872.38.camel@openx2.frec.bull.fr>
  1 sibling, 0 replies; 49+ messages in thread
From: Laurent Vivier @ 2006-04-20 11:28 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Arjan van de Ven, Ravikiran G Thirumalai, Andrew Morton,
	Takashi Sato, linux-kernel, ext2-devel, linux-fsdevel


[-- Attachment #1.1: Type: text/plain, Size: 2305 bytes --]

Le mar 18/04/2006 à 23:01, Mingming Cao a écrit :
> On Tue, 2006-04-18 at 09:30 +0200, Arjan van de Ven wrote:
> > On Tue, 2006-04-18 at 09:14 +0200, Laurent Vivier wrote:
> > > Le lun 17/04/2006 à 23:32, Ravikiran G Thirumalai a écrit :
> > > > On Mon, Apr 17, 2006 at 11:09:36PM +0200, Arjan van de Ven wrote:
> > > > > On Mon, 2006-04-17 at 14:07 -0700, Ravikiran G Thirumalai wrote:
> > > > > > 
> > > > > > 
> > > > > > I ran the same tests on a 16 core EM64T box very similar to the one
> > > > > > you ran
> > > > > > dbench on :). Dbench results on ext3 varies quite a bit.  I couldn't
> > > > > > get 
> > > > > > to a statistically significant conclusion  For eg,
> > > > > 
> > > > > 
> > > > > dbench is not a good performance benchmark. At all. Don't use it for
> > > > > that ;)
> > > > 
> > > > Agreed. (I did not mean to use it in the first place :).  I was just trying 
> > > > to verify the benchmark results posted earlier)
> > > > 
> > > > Thanks,
> > > > Kiran
> > > 
> > > What is the good performance benchmark to know if we should use atomic_t
> > > instead of percpu_counter ?
> > 
> > you probably want something like postal/postmark instead or so (although
> > that's not ideal either), at least that's reproducable
> > 
> postmark is a single threaded benchmark.
> 
> The ext3 filesystem free blocks counter is mostly being updated at block
> allocation and free code. So, a test with many many threads doing block
> allocation/deallocation simultaneously will stress the free blocks
> counter accounting better than a single threaded fs benchmark. After
> all, the main reason we choose to use percpu counter for the free blocks
> counter at the first place, I believe, was to support parallel block
> allocation. 
> 
> I would suggest run tiobench with many threads (>256), or even better,
> run tiobench with many dd tests at the background.

You can find attached my results with tiobench (256 threads, always on
x440 with 8 CPUs hyperthreaded = 16).

But, as the results are very different, I think we can't really
conclude... in fact, I think atomic_t or percpu_counter have no impact
on the results.

Regards,
Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #1.2: tiobench.txt --]
[-- Type: text/plain, Size: 9165 bytes --]

tiobench.pl --size 16 --numruns 10 --threads 256

Unit information
================
File size = megabytes
Blk Size  = bytes
Rate      = megabytes per second
CPU%      = percentage of CPU used during the test
Latency   = milliseconds
Lat%      = percent of requests that took longer than X seconds
CPU Eff   = Rate divided by CPU% - throughput per cpu load

Sequential Reads
atomic_long_t                  16   4096  256  394.38 1540.%     2.822     5960.36   0.00000  0.00000    26
atomic_long_t                  16   4096  256  384.70 1547.%     2.087     6187.27   0.00000  0.00000    25
atomic_long_t                  16   4096  256  395.69 1548.%     2.001     5988.64   0.00000  0.00000    26
atomic_long_t                  16   4096  256  396.10 1544.%     1.987     5836.74   0.00000  0.00000    26
atomic_long_t                  16   4096  256  377.66 1551.%     2.435     6038.05   0.00000  0.00000    24
atomic_long_t                  16   4096  256  378.35 1550.%     2.145     6369.26   0.00000  0.00000    24
atomic_long_t                  16   4096  256  375.38 1545.%     2.236     6318.84   0.00000  0.00000    24
atomic_long_t                  16   4096  256  399.97 1546.%     2.021     5911.10   0.00000  0.00000    26
atomic_long_t                  16   4096  256  386.59 1542.%     2.045     5968.13   0.00000  0.00000    25
atomic_long_t                  16   4096  256  401.45 1547.%     2.000     5918.17   0.00000  0.00000    26
percpu_counter                 16   4096  256  396.22 1539.%     1.965     5984.66   0.00000  0.00000    26
percpu_counter                 16   4096  256  403.50 1547.%     2.373     5503.98   0.00000  0.00000    26
percpu_counter                 16   4096  256  388.54 1547.%     2.047     6100.05   0.00000  0.00000    25
percpu_counter                 16   4096  256  397.43 1540.%     2.096     6010.45   0.00000  0.00000    26
percpu_counter                 16   4096  256  398.81 1543.%     2.134     5677.78   0.00000  0.00000    26
percpu_counter                 16   4096  256  399.85 1548.%     1.980     5805.21   0.00000  0.00000    26
percpu_counter                 16   4096  256  394.70 1551.%     2.021     5960.13   0.00000  0.00000    25
percpu_counter                 16   4096  256  396.64 1543.%     2.132     5901.40   0.00000  0.00000    26
percpu_counter                 16   4096  256  390.70 1550.%     1.906     5972.05   0.00000  0.00000    25
percpu_counter                 16   4096  256  401.98 1547.%     2.049     5839.44   0.00000  0.00000    26


Random Reads
atomic_long_t                  16   4096  256  100.79 1230.%     0.351        3.82   0.00000  0.00000     8
atomic_long_t                  16   4096  256  112.61 1342.%     0.367       13.42   0.00000  0.00000     8
atomic_long_t                  16   4096  256  111.16 1354.%     0.503      450.22   0.00000  0.00000     8
atomic_long_t                  16   4096  256  112.79 1333.%     0.366        4.73   0.00000  0.00000     8
atomic_long_t                  16   4096  256  114.61 1322.%     0.375       12.74   0.00000  0.00000     9
atomic_long_t                  16   4096  256  111.18 1350.%     0.368       19.89   0.00000  0.00000     8
atomic_long_t                  16   4096  256  112.14 1344.%     0.395      143.76   0.00000  0.00000     8
atomic_long_t                  16   4096  256  112.41 1346.%     0.374       31.40   0.00000  0.00000     8
atomic_long_t                  16   4096  256  112.89 1353.%     0.372       12.64   0.00000  0.00000     8
atomic_long_t                  16   4096  256  112.40 1354.%     0.365       10.88   0.00000  0.00000     8
percpu_counter                 16   4096  256  112.68 1341.%     0.439      263.93   0.00000  0.00000     8
percpu_counter                 16   4096  256  109.77 1356.%     0.372       35.58   0.00000  0.00000     8
percpu_counter                 16   4096  256  114.18 1339.%     0.371       29.47   0.00000  0.00000     9
percpu_counter                 16   4096  256  112.61 1339.%     0.430      241.78   0.00000  0.00000     8
percpu_counter                 16   4096  256  111.09 1343.%     0.372       46.84   0.00000  0.00000     8
percpu_counter                 16   4096  256  111.98 1355.%     0.358       16.41   0.00000  0.00000     8
percpu_counter                 16   4096  256  112.55 1348.%     0.393      121.30   0.00000  0.00000     8
percpu_counter                 16   4096  256  114.84 1324.%     0.398      112.03   0.00000  0.00000     9
percpu_counter                 16   4096  256  111.45 1353.%     0.368       22.09   0.00000  0.00000     8
percpu_counter                 16   4096  256  112.21 1352.%     0.405      150.62   0.00000  0.00000     8

Sequential Writes
atomic_long_t                  16   4096  256   28.72 350.0%   103.526     3087.83   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.47 350.1%   107.563     4127.54   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.10 346.6%   108.709     2767.21   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.24 345.1%   106.619     3025.58   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.35 358.4%   110.779     3844.84   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.14 349.9%   109.956     3580.34   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.23 349.6%   110.011     2770.21   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.46 355.4%   108.701     2694.87   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.14 346.3%   108.114     3461.73   0.00000  0.00000     8
atomic_long_t                  16   4096  256   28.49 348.8%   109.625     3160.93   0.00000  0.00000     8
percpu_counter                 16   4096  256   27.92 344.3%   109.420     3497.14   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.23 343.2%   110.146     3279.80   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.35 345.6%   110.027     3527.68   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.17 348.9%   107.143     3735.24   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.07 333.8%   107.581     2442.35   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.36 343.2%   106.625     2740.56   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.33 343.6%   107.201     3029.49   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.28 339.9%   107.849     3100.73   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.27 344.8%   108.008     3753.06   0.00000  0.00000     8
percpu_counter                 16   4096  256   28.54 354.1%   108.254     2831.67   0.00000  0.00000     8

Random Writes
atomic_long_t                  16   4096  256    2.47 64.55%     5.690     1295.08   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.45 63.52%     6.021     1386.22   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.47 63.87%     5.621      912.91   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.45 64.34%     6.361     1444.26   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.47 64.04%     5.793     1307.02   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.47 64.14%     5.979     1690.00   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.49 64.63%     5.993     1820.59   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.48 64.98%     6.400     1829.93   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.44 64.05%     5.763     1631.14   0.00000  0.00000     4
atomic_long_t                  16   4096  256    2.53 66.19%     5.728      919.34   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.43 63.69%     5.927      851.10   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.42 62.73%     5.997     1371.13   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.46 64.17%     6.223     1808.24   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.46 64.04%     5.897     1410.98   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.47 63.98%     5.829     1197.30   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.48 64.15%     5.660     1079.85   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.47 63.81%     6.041     1401.02   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.48 64.88%     6.078     1218.58   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.49 64.90%     6.148     1468.47   0.00000  0.00000     4
percpu_counter                 16   4096  256    2.47 65.64%     5.724     1148.28   0.00000  0.00000     4


[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

[parent not found: <1145543970.5872.38.camel@openx2.frec.bull.fr>]

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
       [not found]                                   ` <1145543970.5872.38.camel@openx2.frec.bull.fr>
@ 2006-04-21 11:17                                     ` Laurent Vivier
  0 siblings, 0 replies; 49+ messages in thread
From: Laurent Vivier @ 2006-04-21 11:17 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Arjan van de Ven, Ravikiran G Thirumalai, Andrew Morton,
	Takashi Sato, linux-kernel, ext2-devel, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 1905 bytes --]

Le jeu 20/04/2006 à 16:39, Laurent Vivier a écrit : 
> The functions added by my patch are following and as they are atomic
> (one machine instruction) they are not measurable and don't appears in
> oprofile.
> 
> atomic_long_add
> atomic_long_read
> atomic_long_set
> atomic_long_inc

I think, as these commands are atomic/inlined we should measure the time
of the functions modified by the patches.

The functions modified by the patch are:

ext3_free_blocks_sb
ext3_has_free_blocks
ext3_new_block
ext3_put_super
ext3_fill_super
ext3_fill_super
ext3_free_inode
find_group_dir
find_group_orlov
ext3_new_inode
ext3_group_add

If we make a "grep" on tiobench oprofile.out, we have:

atomic_t:

26919     0.0119  vmlinux                  vmlinux                  ext3_new_block
2195     9.7e-04  vmlinux                  vmlinux                  ext3_free_blocks_sb
1192     5.2e-04  vmlinux                  vmlinux                  ext3_has_free_blocks
189      8.3e-05  vmlinux                  vmlinux                  ext3_new_inode
40       1.8e-05  vmlinux                  vmlinux                  ext3_free_inode
2        8.8e-07  vmlinux                  vmlinux                  find_group_orlov

percpu_counter:

16290     0.0067  vmlinux                  vmlinux                  ext3_new_block
2075     8.5e-04  vmlinux                  vmlinux                  ext3_free_blocks_sb
428      1.8e-04  vmlinux                  vmlinux                  ext3_has_free_blocks
162      6.7e-05  vmlinux                  vmlinux                  ext3_new_inode
25       1.0e-05  vmlinux                  vmlinux                  ext3_free_inode

As we can see, using atomic_long_t is slower than percpu_counter, so ...
forget my patch.

Regards,
Laurent 

-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-10  9:11                 ` [Ext2-devel] " Laurent Vivier
  2006-04-10  8:24                   ` Andrew Morton
@ 2006-04-10 16:57                   ` Mingming Cao
  2006-04-10 19:06                     ` Mingming Cao
  1 sibling, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-04-10 16:57 UTC (permalink / raw)
  To: Laurent Vivier
  Cc: Andrew Morton, Takashi Sato, linux-kernel, ext2-devel,
	linux-fsdevel

On Mon, 2006-04-10 at 11:11 +0200, Laurent Vivier wrote:
> Le jeu 30/03/2006 à 03:54, Andrew Morton a écrit :
> > Mingming Cao <cmm@us.ibm.com> wrote:
> > >
> > > The things need to be done to complete this work is the issue with
> > >  current percpu counter, which could not handle u32 type count well. 
> > 
> > I'm surprised there's much of a problem here.  It is a 32-bit value, so it
> > should mainly be a matter of treating the return value from
> > percpu_counter_read() as unsigned long.
> > 
> > However a stickier problem is when dealing with a filesystem which has,
> > say, 0xffff_ff00 blocks.  Because percpu counters are approximate, and a
> > counter which really has a value of 0xffff_feee might return 0x00000123. 
> > What do we do then?
> > 
> > Of course the simple option is to nuke the percpu counters in ext3 and use
> > atomic_long_t (which is signed, so appropriate treat-it-as-unsigned code
> > would be needed).  I doubt if the percpu counters in ext3 are gaining us
> > much.
> 
> I tried to make something in this way.
> Does the attached patch look like the thing you though about ?
> 

I tried the other way -- I am trying to keep the percpu counter in use
in ext2/3 as much as possible.  I proposed a fix for percpu counter to
deal with the possible "overflow" (i.e., a counter really has a value of
0xffff_feee and after updating one local counter it turns into 0x00000123).
Will send the proposed patch out for review and comments soon.

Mingming


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-10 16:57                   ` Mingming Cao
@ 2006-04-10 19:06                     ` Mingming Cao
  2006-04-11  7:07                       ` Laurent Vivier
  0 siblings, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-04-10 19:06 UTC (permalink / raw)
  To: Laurent Vivier, kiran
  Cc: Andrew Morton, Takashi Sato, linux-kernel, ext2-devel,
	linux-fsdevel

On Mon, 2006-04-10 at 09:57 -0700, Mingming Cao wrote:
> On Mon, 2006-04-10 at 11:11 +0200, Laurent Vivier wrote:
> > Le jeu 30/03/2006 à 03:54, Andrew Morton a écrit :
> > > Mingming Cao <cmm@us.ibm.com> wrote:
> > > >
> > > > The things need to be done to complete this work is the issue with
> > > >  current percpu counter, which could not handle u32 type count well. 
> > > 
> > > I'm surprised there's much of a problem here.  It is a 32-bit value, so it
> > > should mainly be a matter of treating the return value from
> > > percpu_counter_read() as unsigned long.
> > > 
> > > However a stickier problem is when dealing with a filesystem which has,
> > > say, 0xffff_ff00 blocks.  Because percpu counters are approximate, and a
> > > counter which really has a value of 0xffff_feee might return 0x00000123. 
> > > What do we do then?
> > > 
> > > Of course the simple option is to nuke the percpu counters in ext3 and use
> > > atomic_long_t (which is signed, so appropriate treat-it-as-unsigned code
> > > would be needed).  I doubt if the percpu counters in ext3 are gaining us
> > > much.
> > 
> > I tried to make something in this way.
> > Does the attached patch look like the thing you though about ?
> > 
> 
Hi Laurent,

Just looked at your patch, shouldn't we use atomic_long_add() instead of
atomic_long_set() to replace percpu_counter_mod()?

> I tried the other way -- I am trying to keep the percpu counter in use
> in ext2/3 as much as possible.  I proposed a fix for percpu counter to
> deal with the possible "overflow" (i.e, a counter really has a value of
> 0xfff_feee and after updating one local counter it truens 0x00000123).
> Will send the proposed patch out for review and comments soon.
> 

Anyway, I am not against the atomic way. Just thought there must be
reasons where we use percpu counters -- the cache pollution on smp
machine is certainly a concern if we use atomic instead, so I  tried to
fix percpu counter first.

I think my fix for percpu counter should work, and the changes don't
affect other users of current percpu counters (vfs and network).  Kiran,
Andrew, please review it (posted in another separate thread). If not,
then I guess we have to use an atomic counter -- this is a performance vs
capacity kind of trade-off.

But both methods don't support 64 bit ext3 block number on 32 bit
machine...I am not happy with this but can't think of a way to fix this
without taking a global lock:(


Mingming
> -------------------------------------------------------
> This SF.Net email is sponsored by xPML, a groundbreaking scripting language
> that extends applications into web and mobile media. Attend the live webcast
> and join the prime developer group breaking into this new coding territory!
> http://sel.as-us.falkag.net/sel?cmd=lnk&kid=110944&bid=241720&dat=121642
> _______________________________________________
> Ext2-devel mailing list
> Ext2-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/ext2-devel


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-10 19:06                     ` Mingming Cao
@ 2006-04-11  7:07                       ` Laurent Vivier
  2006-04-14 17:23                         ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 49+ messages in thread
From: Laurent Vivier @ 2006-04-11  7:07 UTC (permalink / raw)
  To: Mingming Cao
  Cc: kiran, Andrew Morton, Takashi Sato, linux-kernel, ext2-devel,
	linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2038 bytes --]

Le lun 10/04/2006 à 21:06, Mingming Cao a écrit :
> On Mon, 2006-04-10 at 09:57 -0700, Mingming Cao wrote:
> > On Mon, 2006-04-10 at 11:11 +0200, Laurent Vivier wrote:
[...]
> Hi Laurent,
> 
> Just looked at your patch, shouldn't we use atomic_long_add() instead of
> atomic_long_set() to replace percpu_counter_mod()?

Yes, thank you.

> > I tried the other way -- I am trying to keep the percpu counter in use
> > in ext2/3 as much as possible.  I proposed a fix for percpu counter to
> > deal with the possible "overflow" (i.e, a counter really has a value of
> > 0xfff_feee and after updating one local counter it truens 0x00000123).
> > Will send the proposed patch out for review and comments soon.
> > 
> 
> Anyway, I am not against the atomic way. Just thought there must be
> reasons where we use percpu counters -- the cache pollution on smp
> machine is certainly a concern if we use atomic instead, so I  tried to
> fix percpu counter first.
> 
> I think my fix for percpu counter should work, and the changes doesn't
> affect other users of current percpu counters(vfs and network).  Kiran,
> Andrew, please review it (posted in another seperate thread). If not,
> then I guess we have to use atomic counter -- this is performance vs
> capacity kind of trade off.

I made some tests with iozone on 2 CPU hyperthreaded computer (= 4 CPUs,
Bull Express 5800 120 Lh), and it seems atomic_t is faster than
"percpu_counter". I'll try to make some tests on IBM x440 (8 CPUs, 16 if
hyperthreaded) with iozone and sysbench.
Moreover, I think percpu_counter uses a lot of memory...

> But both methods don't support 64 bit ext3 block number on 32 bit
> machine...I am not happy with this but can't think of a way to fix this
> without taking a global lock:(

Anyway, we can't have a 64bit addressing space on a 32bit machine, so I
think, for the moment, it's not a problem.

Regards,
Laurent
-- 
Laurent Vivier
Bull, Architect of an Open World (TM)
http://www.bullopensource.org/ext4

[-- Attachment #2: Ceci est une partie de message numériquement signée. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [Ext2-devel] Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-04-11  7:07                       ` Laurent Vivier
@ 2006-04-14 17:23                         ` Ravikiran G Thirumalai
  0 siblings, 0 replies; 49+ messages in thread
From: Ravikiran G Thirumalai @ 2006-04-14 17:23 UTC (permalink / raw)
  To: Laurent Vivier
  Cc: Mingming Cao, Andrew Morton, Takashi Sato, linux-kernel,
	ext2-devel, linux-fsdevel

Hi Laurent,

On Tue, Apr 11, 2006 at 09:07:39AM +0200, Laurent Vivier wrote:
> ...  
> I made some tests with iozone on 2 CPU hyperthreaded computer (= 4 CPUs,
> Bull Express 5800 120 Lh), and it seems atomic_t is faster than
> "percpu_counter". I'll try to make some tests on IBM x440 (8 CPUs, 16 if
> hyperthreaded) with iozone and sysbench.
> Moreover, I think percpu_counter uses a lot of memory...

Was this just one iozone thread doing io?  What was the performance
difference?  Please let me know what kind of test you are doing, and I can 
run the same on an IBM x460 with 16 cores here.

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30  1:38             ` [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB Mingming Cao
  2006-03-30  1:54               ` Andrew Morton
@ 2006-03-30 17:36               ` Andreas Dilger
  2006-03-30 19:01                 ` Mingming Cao
  2006-03-30 17:40               ` Andreas Dilger
  2006-05-26  5:00               ` [PATCH 0/2]Define ext3 in-kernel filesystem block types and extend " Mingming Cao
  3 siblings, 1 reply; 49+ messages in thread
From: Andreas Dilger @ 2006-03-30 17:36 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

On Mar 29, 2006  17:38 -0800, Mingming Cao wrote:
> There are places in ext3 code to use "int" to represent block numbers in
> kernel(not on-disk). This seems the "only" reason that why we can only
> have 8TB ext3 rather than 16TB.  Most times it just a bug with no
> particular reason why not use unsigned 32 bit value, so the fix is easy.
> 
> However, it is not so straightforward fix for the ext3 block allocation
> code, as ext3_new_block() returns a block number, and "-1" to indicating
> block allocation failure. Ext3 block reservation code, called by
> ext3_new_block(), thus also use "int" for block numbers in some places.

What might make the code a lot clearer, easier to audit, and easier to
fix in the future is to declare new types for fs block offsets and group
block offsets.  Something like "ext3_fsblk" and "ext3_grblk".  That way,
we can declare ext3_fsblk as "unsigned long" and "ext3_grblk" as "unsigned
int", and we could optionally change ext3_fsblk to be "unsigned long long"
later to support 64-bit filesystems without having to re-patch all of the
code.

It would be more clear what type of block offset a function is handling
(fs-wide or group-relative).  If we wanted to be able to overload the
block number with an error code we could use ERR_PTR and PTR_ERR like
macros, and just restrict the filesystem to 2^32 - 1024 blocks until we
extend it to 64 bits.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30 17:36               ` Andreas Dilger
@ 2006-03-30 19:01                 ` Mingming Cao
  0 siblings, 0 replies; 49+ messages in thread
From: Mingming Cao @ 2006-03-30 19:01 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

On Thu, 2006-03-30 at 10:36 -0700, Andreas Dilger wrote:
> On Mar 29, 2006  17:38 -0800, Mingming Cao wrote:
> > There are places in ext3 code to use "int" to represent block numbers in
> > kernel(not on-disk). This seems the "only" reason that why we can only
> > have 8TB ext3 rather than 16TB.  Most times it just a bug with no
> > particular reason why not use unsigned 32 bit value, so the fix is easy.
> > 
> > However, it is not so straightforward fix for the ext3 block allocation
> > code, as ext3_new_block() returns a block number, and "-1" to indicating
> > block allocation failure. Ext3 block reservation code, called by
> > ext3_new_block(), thus also use "int" for block numbers in some places.
> 
Hi Andreas,

> What might make the code a lot clearer, easier to audit, and easier to
> fix in the future is to declare new types for fs block offsets and group
> block offsets.  Something like "ext3_fsblk" and "ext3_grblk".  That way,
> we can declare ext3_fsblk as "unsigned long" and "ext3_grblk" as "unsigned
> int", 

Yep, that makes sense. If we do this, the patch needs more audit, as the
existing code uses "unsigned long" for block numbers in many many
places.

Also I think it might make sense to define "ext3_fileblk" for the logical
block type, as right now many functions use the name "block" in many places
for a file logical block, and it takes some time to determine whether it's a
file logical block or a physical block.

> and we could optionally change ext3_fsblk to be "unsigned long long"
> later to support 64-bit filesystems without having to re-patch all of the
> code.
> 
I do have an untested patch which tries to change all fs-wide block
numbers from "unsigned long" to "sector_t" type as Laurent suggested. He
did this in his 64 bit ext3 block number support patch. I wasn't sure if
we should do this for current 32 bit ext3 or wait until other 64 bit
patches.

Yeah, with the suggestion you made above, it should be easy to
support 64-bit filesystems without going through all the code again.

> It would be more clear what type of block offset a function is handling
> (fs-wide or group-relative). 

Okay, I will add more comments in the function.

>  If we wanted to be able to overload the
> block number with an error code we could use ERR_PTR and PTR_ERR like
> macros, and just restrict the filesystem to 2^32 - 1024 blocks until we
> extend it to 64 bits.
> 
> Cheers, Andreas
> --
> Andreas Dilger
> Principal Software Engineer
> Cluster File Systems, Inc.
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30  1:38             ` [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB Mingming Cao
  2006-03-30  1:54               ` Andrew Morton
  2006-03-30 17:36               ` Andreas Dilger
@ 2006-03-30 17:40               ` Andreas Dilger
  2006-03-30 19:16                 ` Mingming Cao
  2006-05-26  5:00               ` [PATCH 0/2]Define ext3 in-kernel filesystem block types and extend " Mingming Cao
  3 siblings, 1 reply; 49+ messages in thread
From: Andreas Dilger @ 2006-03-30 17:40 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

On Mar 29, 2006  17:38 -0800, Mingming Cao wrote:
> Have verified these two patches on a 64 bit machine with 10TB ext3
> filesystem, fsx runs fine for a few hours. Also testes on 32 bit machine
> with <8TB ext3.

Have you done tests _near_ 8TB with a 32-bit machine, even without these
patches?  In particular, filling up the filesystem to be close to full
so that we really depend on the > 2TB code to work properly?  Also, in
theory with these patches even a 32-bit machine could run > 8TB, right?

There have been sporadic reports of failure for large ext3 filesystems,
and some of them say that 32-bit systems fail and 64-bit systems work.
There is a kernel bugzilla bug open for this, but it was never really
identified what the source of the problem was.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30 17:40               ` Andreas Dilger
@ 2006-03-30 19:16                 ` Mingming Cao
  2006-03-30 19:22                   ` Mingming Cao
  2006-03-31 13:33                   ` Andi Kleen
  0 siblings, 2 replies; 49+ messages in thread
From: Mingming Cao @ 2006-03-30 19:16 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

On Thu, 2006-03-30 at 10:40 -0700, Andreas Dilger wrote:
> On Mar 29, 2006  17:38 -0800, Mingming Cao wrote:
> > Have verified these two patches on a 64 bit machine with 10TB ext3
> > filesystem, fsx runs fine for a few hours. Also testes on 32 bit machine
> > with <8TB ext3.
> 
> Have you done tests _near_ 8TB with a 32-bit machine, even without these
> patches?
No, I haven't. The >8TB storage is right now attached to a 64 bit machine, but
we should be able to move it to a 32 bit machine.

>   In particular, filling up the filesystem to be close to full
> so that we really depend on the > 2TB code to work properly?

I made a kernel patch that allows a file to specify which block group its
blocks should be allocated from (using an ioctl to set the goal
allocation block group). I set the goal block group to fall somewhere
>8TB, and did dd tests on that file. I verified with debugfs that the
allocated block numbers are beyond 2**31.

Also, before running the fsx tests, I created many directories (32768 at
most:) and verified that one directory's inode is located in a block group
in the >8TB space. So when we run fsx tests on files under that directory,
we are creating/testing files >8TB.

BTW, do you think this ioctl is useful in general for other users? I
attached the patch here.

I also plan to hack the inode allocation code to force all files'
inodes to be placed in block groups >8TB, so that we can do full
filesystem tests there.


>   Also, in
> theory with these patches even a 32-bit machine could run > 8TB, right?
> 
> There have been sporadic reports of failure for large ext3 filesystems,
> and some of them say that 32-bit systems fail and 64-bit systems work.
> There is a kernel bugzilla bug open for this, but it was never really
> identified what the source of the problem was.
> 

Sure, I will verify that on my 32 bit machine with >8TB.

> Cheers, Andreas
> --
> Andreas Dilger
> Principal Software Engineer
> Cluster File Systems, Inc.
> 

Thanks,

Mingming


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30 19:16                 ` Mingming Cao
@ 2006-03-30 19:22                   ` Mingming Cao
  2006-03-31  6:42                     ` Andreas Dilger
  2006-03-31 13:33                   ` Andi Kleen
  1 sibling, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-03-30 19:22 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

On Thu, 2006-03-30 at 11:16 -0800, Mingming Cao wrote:
> On Thu, 2006-03-30 at 10:40 -0700, Andreas Dilger wrote:
> > On Mar 29, 2006  17:38 -0800, Mingming Cao wrote:
> > > Have verified these two patches on a 64 bit machine with 10TB ext3
> > > filesystem, fsx runs fine for a few hours. Also testes on 32 bit machine
> > > with <8TB ext3.
> > 
> > Have you done tests _near_ 8TB with a 32-bit machine, even without these
> > patches?
> No I haven't. The >8TB right now is attached to a 64 bit machine, but we
> should able to move it to a 32 bit machine.
> 
> >   In particular, filling up the filesystem to be close to full
> > so that we really depend on the > 2TB code to work properly?
> 
> I made a kernel patch to allow a file to specify which block group it
> wants it's blocks to allocate from(using ioctl to set the goal
> allocation block group). I set the goal block group falls to somewhere
> >8TB, and did dd tests on that file. Verified this with debugfs, the
> allocated block numbers are beyond 2**31.
> 
> Also before run fsx tests, created many directories (32768 at most:) and
> verified one directory's inode is located in block group >8TB space. So
> when we do fsx test on files under that directory, we are
> creating/testing files >8TB.
> 
> BTW, do you think this ioctl is useful in general for other users? I
> attached the patch here.
> 
---

 linux-2.6.16-ming/fs/ext3/balloc.c          |   24 ++++++++++++++---------
 linux-2.6.16-ming/fs/ext3/ioctl.c           |   29 ++++++++++++++++++++++++++++
 linux-2.6.16-ming/include/linux/ext3_fs.h   |    1 
 linux-2.6.16-ming/include/linux/ext3_fs_i.h |    1 
 4 files changed, 46 insertions(+), 9 deletions(-)

diff -puN fs/ext3/ioctl.c~ext3_set_alloc_blk_group_hack fs/ext3/ioctl.c
--- linux-2.6.16/fs/ext3/ioctl.c~ext3_set_alloc_blk_group_hack	2006-03-28 15:19:58.000000000 -0800
+++ linux-2.6.16-ming/fs/ext3/ioctl.c	2006-03-28 15:54:14.000000000 -0800
@@ -22,6 +22,7 @@ int ext3_ioctl (struct inode * inode, st
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
+	unsigned int blk_group;
 
 	ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
 
@@ -193,6 +194,34 @@ flags_err:
 		mutex_unlock(&ei->truncate_mutex);
 		return 0;
 	}
+	case EXT3_IOC_SETALLOCBLKGRP: {
+
+		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
+			return -ENOTTY;
+
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EACCES;
+
+		if (get_user(blk_group, (int __user *)arg))
+			return -EFAULT;
+
+		/*
+		 * need to allocate reservation structure for this inode
+		 * before set the window size
+		 */
+		mutex_lock(&ei->truncate_mutex);
+		if (!ei->i_block_alloc_info)
+			ext3_init_block_alloc_info(inode);
+
+		if (ei->i_block_alloc_info){
+			ei->i_block_alloc_info->goal_block_group = blk_group;
+		}
+		mutex_unlock(&ei->truncate_mutex);
+		return 0;
+	}
 	case EXT3_IOC_GROUP_EXTEND: {
 		unsigned long n_blocks_count;
 		struct super_block *sb = inode->i_sb;
diff -puN include/linux/ext3_fs.h~ext3_set_alloc_blk_group_hack include/linux/ext3_fs.h
--- linux-2.6.16/include/linux/ext3_fs.h~ext3_set_alloc_blk_group_hack	2006-03-28 15:42:51.000000000 -0800
+++ linux-2.6.16-ming/include/linux/ext3_fs.h	2006-03-28 15:51:48.000000000 -0800
@@ -238,6 +238,7 @@ struct ext3_new_group_data {
 #endif
 #define EXT3_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT3_IOC_SETRSVSZ		_IOW('f', 6, long)
+#define EXT3_IOC_SETALLOCBLKGRP		_IOW('f', 9, long)
 
 /*
  *  Mount options
diff -puN include/linux/ext3_fs_i.h~ext3_set_alloc_blk_group_hack include/linux/ext3_fs_i.h
--- linux-2.6.16/include/linux/ext3_fs_i.h~ext3_set_alloc_blk_group_hack	2006-03-28 15:43:59.000000000 -0800
+++ linux-2.6.16-ming/include/linux/ext3_fs_i.h	2006-03-28 15:47:54.000000000 -0800
@@ -51,6 +51,7 @@ struct ext3_block_alloc_info {
 	 * allocation when we detect linearly ascending requests.
 	 */
 	__u32                   last_alloc_physical_block;
+	__u32			goal_block_group;
 };
 
 #define rsv_start rsv_window._rsv_start
diff -puN fs/ext3/balloc.c~ext3_set_alloc_blk_group_hack fs/ext3/balloc.c
--- linux-2.6.16/fs/ext3/balloc.c~ext3_set_alloc_blk_group_hack	2006-03-28 15:45:30.000000000 -0800
+++ linux-2.6.16-ming/fs/ext3/balloc.c	2006-03-28 16:03:55.000000000 -0800
@@ -285,6 +285,7 @@ void ext3_init_block_alloc_info(struct i
 		rsv->rsv_alloc_hit = 0;
 		block_i->last_alloc_logical_block = 0;
 		block_i->last_alloc_physical_block = 0;
+		block_i->goal_block_group = 0;
 	}
 	ei->i_block_alloc_info = block_i;
 }
@@ -1263,15 +1264,20 @@ unsigned long ext3_new_blocks(handle_t *
 		*errp = -ENOSPC;
 		goto out;
 	}
-
-	/*
-	 * First, test whether the goal block is free.
-	 */
-	if (goal < le32_to_cpu(es->s_first_data_block) ||
-	    goal >= le32_to_cpu(es->s_blocks_count))
-		goal = le32_to_cpu(es->s_first_data_block);
-	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
-			EXT3_BLOCKS_PER_GROUP(sb);
+	if (block_i->goal_block_group) {
+		group_no = block_i->goal_block_group;
+		goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +                                group_no * EXT3_BLOCKS_PER_GROUP(sb);
+		block_i->goal_block_group = 0;
+	} else {
+		/*
+		 * First, test whether the goal block is free.
+		 */
+		if (goal < le32_to_cpu(es->s_first_data_block) ||
+		    goal >= le32_to_cpu(es->s_blocks_count))
+			goal = le32_to_cpu(es->s_first_data_block);
+		group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
+				EXT3_BLOCKS_PER_GROUP(sb);
+	}
 	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
 	if (!gdp)
 		goto io_error;

_



^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30 19:22                   ` Mingming Cao
@ 2006-03-31  6:42                     ` Andreas Dilger
  0 siblings, 0 replies; 49+ messages in thread
From: Andreas Dilger @ 2006-03-31  6:42 UTC (permalink / raw)
  To: Mingming Cao
  Cc: Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

On Mar 30, 2006  11:22 -0800, Mingming Cao wrote:
> I made a kernel patch to allow a file to specify which block group it
> wants it's blocks to allocate from(using ioctl to set the goal
> allocation block group). I set the goal block group falls to somewhere
> >8TB, and did dd tests on that file. Verified this with debugfs, the
> allocated block numbers are beyond 2**31.
> 
> Also before run fsx tests, created many directories (32768 at most:) and
> verified one directory's inode is located in block group >8TB space. So
> when we do fsx test on files under that directory, we are
> creating/testing files >8TB.

While useful, I don't think it is critical.  As you mention, it is possible
to do this by creating a lot of directories, though it might be tedious
(need over 16k directories for a 2TB filesystem, 64k for an 8TB fs).

Also, since this increases the allocation for each inode's reservation from
16 bytes to 20 (really 32 because it is in a slab), it might have a small
performance hit.

If it was available under some sort of compile-time configuration option
it might make sense for developers.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30 19:16                 ` Mingming Cao
  2006-03-30 19:22                   ` Mingming Cao
@ 2006-03-31 13:33                   ` Andi Kleen
  2006-04-01  6:50                     ` Nathan Scott
  1 sibling, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2006-03-31 13:33 UTC (permalink / raw)
  To: cmm
  Cc: Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

Mingming Cao <cmm@us.ibm.com> writes:

> On Thu, 2006-03-30 at 10:40 -0700, Andreas Dilger wrote:
> > On Mar 29, 2006  17:38 -0800, Mingming Cao wrote:
> > > Have verified these two patches on a 64 bit machine with 10TB ext3
> > > filesystem, fsx runs fine for a few hours. Also testes on 32 bit machine
> > > with <8TB ext3.
> > 
> > Have you done tests _near_ 8TB with a 32-bit machine, even without these
> > patches?
> No I haven't. The >8TB right now is attached to a 64 bit machine, but we
> should able to move it to a 32 bit machine.

If you use XFS or JFS as backing fs you can use a holey loop device
to simulate it.  When I tried this last time JFS worked better for me.
XFS doesn't seem to like that many extents as will be created by 
mkfs.ext2.

-Andi

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB
  2006-03-31 13:33                   ` Andi Kleen
@ 2006-04-01  6:50                     ` Nathan Scott
  0 siblings, 0 replies; 49+ messages in thread
From: Nathan Scott @ 2006-04-01  6:50 UTC (permalink / raw)
  To: Andi Kleen
  Cc: cmm, Andrew Morton, Takashi Sato, Laurent Vivier, linux-kernel,
	ext2-devel, linux-fsdevel

Hi Andi,

On Fri, Mar 31, 2006 at 03:33:24PM +0200, Andi Kleen wrote:
> Mingming Cao <cmm@us.ibm.com> writes:
> > > Have you done tests _near_ 8TB with a 32-bit machine, even without these
> > > patches?
> > No I haven't. The >8TB right now is attached to a 64 bit machine, but we
> > should able to move it to a 32 bit machine.
> 
> If you use XFS or JFS as backing fs you can use a holey loop device
> to simulate it.  When I tried this last time JFS worked better for me.
> XFS doesn't seem to like that many extents as will be created by 
> mkfs.ext2.

Mainline has this issue resolved now (very recently, post-.16).

This (loopback on a local file) technique will get you up to 16TB
for 32 bit platforms, where you hit the unsigned long page->index
limit (but it sounds like that's fine for the testing you're doing).

A related technique we've used in the past in testing XFS on large
devices (we've successfully tested in petabyte ranges using this,
on 64 bit systems of course) is to write a tool that modifies the
values in the ondisk data structures managing the "lower" areas of
the device to say "all the space here is used", which then forces
new allocations to be done in the "higher" parts of the device
address space.  Testing then follows this recipe: mkfs-on-loop,
then run the tool, then mount, then run the usual test suites ...
perhaps that's useful here too (I dunno if the ext2/3 format lends
itself to that or not).

cheers.

-- 
Nathan

^ permalink raw reply	[flat|nested] 49+ messages in thread

* [PATCH 0/2]Define ext3 in-kernel filesystem block types and extend ext3 filesystem limit from 8TB to 16TB
  2006-03-30  1:38             ` [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB Mingming Cao
                                 ` (2 preceding siblings ...)
  2006-03-30 17:40               ` Andreas Dilger
@ 2006-05-26  5:00               ` Mingming Cao
  2006-05-26 18:08                 ` Andrew Morton
  3 siblings, 1 reply; 49+ messages in thread
From: Mingming Cao @ 2006-05-26  5:00 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ext2-devel, linux-fsdevel

Some of the in-kernel ext3 block variables are treated as signed 4-byte
int type, which limits the ext3 filesystem to 8TB (based on a 4k block size).
While trying to fix them, I found the ext3 code quite confusing: some
blocks are filesystem-wide blocks, while some are group-relative offsets that
need to be signed values (as -1 has a special meaning). So it seems saner to
define two types of physical blocks: one for filesystem-wide blocks, another
for group-relative blocks.  The following patches clarify these two types of
blocks in the ext3 code, and fix the type bugs which limit the current 32 bit
ext3 filesystem to 8TB.

With this series of patches and the percpu counter data type changes in the mm
tree, we are able to extend the ext3 filesystem limit to 16TB.

This work is also a prerequisite for the recent >32 bit ext3 work, and makes
it a lot easier for the kernel to address 48 bit ext3 blocks: simply
redefine ext3_fsblk_t from unsigned long to sector_t and redefine the format
string for ext3 filesystem blocks correspondingly.

Two RFCs, each with a series of patches, have been posted to the ext2-devel
list and have been reviewed and discussed:
http://marc.theaimsgroup.com/?l=ext2-devel&m=114722190816690&w=2

http://marc.theaimsgroup.com/?l=ext2-devel&m=114784919525942&w=2

The following patches are updated and integrated versions of the patches from
the two RFCs posted previously:

[Patch 1] ext3_fsblk_t, ext3_grpblk_t and type fixes.
	Defines ext3_fsblk_t and ext3_grpblk_t, and the printk format string
	for filesystem-wide blocks.

	This patch classifies all block-group-relative blocks and
	ext3_fsblk_t blocks where they occur in the same function, which
	used to be confusing. It also includes kernel bug fixes for
	filesystem-wide in-kernel block variables. Some filesystem-wide
	blocks are currently treated as int/unsigned int type in the kernel,
	especially in the ext3 block allocation and reservation code.
	This patch fixes those bugs by converting those variables to
	the ext3_fsblk_t (unsigned long) type.

[Patch 2] Convert the ext3 in-kernel filesystem blocks to ext3_fsblk_t.
	Convert all remaining unsigned long type in-kernel filesystem
	blocks to ext3_fsblk_t, and replace the printk format strings
	correspondingly.

The patches were tested on both 32 bit and 64 bit machines, with <8TB and
>8TB ext3 filesystems (using the latest, to-be-released e2fsprogs-1.39). Tests
include overnight fsx, tiobench, dbench and fsstress.

The patches apply to 2.6.17-rc4-mm3, and also to the 2.6.17-rc4 kernel (which
needs the percpu counter changes applied to support >31 bit ext3 free block
counters). The 2.6.17-rc4 version of the percpu counter data type change patch
can be found at:

http://ext2.sourceforge.net/48bitext3/patches/patches-2.6.17-rc4-05242006/percpu_counter_longlong.patch

Signed-Off-By: Mingming Cao <cmm@us.ibm.com>

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 0/2]Define ext3 in-kernel filesystem block types and extend ext3 filesystem limit from 8TB to 16TB
  2006-05-26  5:00               ` [PATCH 0/2]Define ext3 in-kernel filesystem block types and extend " Mingming Cao
@ 2006-05-26 18:08                 ` Andrew Morton
  2006-05-30 17:55                   ` Mingming Cao
  0 siblings, 1 reply; 49+ messages in thread
From: Andrew Morton @ 2006-05-26 18:08 UTC (permalink / raw)
  To: cmm; +Cc: linux-kernel, ext2-devel, linux-fsdevel

Mingming Cao <cmm@us.ibm.com> wrote:
>
> Some of the in-kernel ext3 block variable type are treated as signed 4 bytes
>  int type, thus limited ext3 filesystem to 8TB (4kblock size based). While
>  trying to fix them, it seems quite confusing in the ext3 code where some
>  blocks are filesystem-wide blocks, some are group relative offsets that need
>  to be signed value (as -1 has special meaning). So it seem saner to define two
>  types of physical blocks: one is filesystem wide blocks, another is
>  group-relative blocks.  The following patches clarify these two types of
>  blocks in the ext3 code, and fix the type bugs which limit current 32 bit ext3
>  filesystem limit to 8TB.
> 
>  With this series of patches and the percpu counter data type changes in the mm
>  tree, we are able to extend exts filesystem limit to 16TB.

Did you look at the `gcc -W' output before and after these patches are
applied?  That would have found the bug which the previous version
of these patches introduced.

We already get a pile of `warning: comparison between signed and unsigned'
warnings which should be checked, too..


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [PATCH 0/2]Define ext3 in-kernel filesystem block types and extend ext3 filesystem limit from 8TB to 16TB
  2006-05-26 18:08                 ` Andrew Morton
@ 2006-05-30 17:55                   ` Mingming Cao
  0 siblings, 0 replies; 49+ messages in thread
From: Mingming Cao @ 2006-05-30 17:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, ext2-devel, linux-fsdevel

On Fri, 2006-05-26 at 11:08 -0700, Andrew Morton wrote:
> Mingming Cao <cmm@us.ibm.com> wrote:
> >
> > Some of the in-kernel ext3 block variable type are treated as signed 4 bytes
> >  int type, thus limited ext3 filesystem to 8TB (4kblock size based). While
> >  trying to fix them, it seems quite confusing in the ext3 code where some
> >  blocks are filesystem-wide blocks, some are group relative offsets that need
> >  to be signed value (as -1 has special meaning). So it seem saner to define two
> >  types of physical blocks: one is filesystem wide blocks, another is
> >  group-relative blocks.  The following patches clarify these two types of
> >  blocks in the ext3 code, and fix the type bugs which limit current 32 bit ext3
> >  filesystem limit to 8TB.
> > 
> >  With this series of patches and the percpu counter data type changes in the mm
> >  tree, we are able to extend exts filesystem limit to 16TB.
> 
> Did you look at the `gcc -W' output before and after these patches are
> applied?  That would have found the bug which the previous version
> of these patches introduced.
> 
Sorry for the delay, was out for the past holiday.

Yes, I did use gcc -Wall -Wextra.  I was pretty careful about it this time.

> We already get a pile of `warning: comparison between signed and unsigned'
> warnings which should be checked, too..
> 

Yes, indeed.

Mingming


^ permalink raw reply	[flat|nested] 49+ messages in thread

* [RFC][PATCH 1/2]ext3 block allocation/reservation fixes to support 2**32 block numbers
  2006-03-29  9:13           ` Laurent Vivier
       [not found]             ` <1143657317.4045.12.camel@dyn9047017067.beaverton.ibm.com>
  2006-03-30  1:38             ` [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB Mingming Cao
@ 2006-03-30  1:39             ` Mingming Cao
  2006-03-30  1:39             ` [RFC][PATCH 2/2]Other ext3 in-kernel block number type fix " Mingming Cao
  3 siblings, 0 replies; 49+ messages in thread
From: Mingming Cao @ 2006-03-30  1:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Takashi Sato, Laurent Vivier, linux-kernel, ext2-devel,
	linux-fsdevel

This patch fixes the places in the ext3 block allocation code that
treat block numbers as int type because some places rely on a block
number of "-1" to indicate allocation failure.

A block allocation failure in ext3_new_blocks() is already reported
through the returned error anyway, so there is no need to keep the block
number returned from ext3_new_blocks() as "int" type. We can still keep
the allocated block returned from ext3_try_to_allocate_with_rsv() as
"int", as it is a block-group-relative value, so 4 bytes is enough for now.

patch against 2.6.16-mm2.

Signed-Off-By: Mingming Cao <cmm@us.ibm.com>

---

 linux-2.6.16-ming/fs/ext3/balloc.c        |   67 +++++++++++++++---------------
 linux-2.6.16-ming/fs/ext3/xattr.c         |    6 +-
 linux-2.6.16-ming/include/linux/ext3_fs.h |    4 -
 3 files changed, 40 insertions(+), 37 deletions(-)

diff -puN fs/ext3/balloc.c~ext3_rsv_int-fix fs/ext3/balloc.c
--- linux-2.6.16/fs/ext3/balloc.c~ext3_rsv_int-fix	2006-03-29 15:49:41.199815437 -0800
+++ linux-2.6.16-ming/fs/ext3/balloc.c	2006-03-29 15:49:41.211814047 -0800
@@ -223,7 +223,7 @@ void ext3_rsv_window_add(struct super_bl
 {
 	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
 	struct rb_node *node = &rsv->rsv_node;
-	unsigned int start = rsv->rsv_start;
+	unsigned long start = rsv->rsv_start;
 
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
@@ -656,7 +656,8 @@ ext3_try_to_allocate(struct super_block 
 			struct buffer_head *bitmap_bh, int goal,
 			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
-	int group_first_block, start, end;
+	unsigned long group_first_block;
+	int start, end;
 	unsigned long num = 0;
 
 	/* we do allocation within the reservation window if we have a window */
@@ -766,12 +767,13 @@ fail_access:
 static int find_next_reservable_window(
 				struct ext3_reserve_window_node *search_head,
 				struct ext3_reserve_window_node *my_rsv,
-				struct super_block * sb, int start_block,
-				int last_block)
+				struct super_block * sb,
+				unsigned long start_block,
+				unsigned long last_block)
 {
 	struct rb_node *next;
 	struct ext3_reserve_window_node *rsv, *prev;
-	int cur;
+	unsigned long cur;
 	int size = my_rsv->rsv_goal_size;
 
 	/* TODO: make the start of the reservation window byte-aligned */
@@ -889,8 +891,8 @@ static int alloc_new_reservation(struct 
 		unsigned int group, struct buffer_head *bitmap_bh)
 {
 	struct ext3_reserve_window_node *search_head;
-	int group_first_block, group_end_block, start_block;
-	int first_free_block;
+	unsigned long group_first_block, group_end_block, start_block;
+	unsigned long first_free_block;
 	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
 	unsigned long size;
 	int ret;
@@ -1200,16 +1202,17 @@ int ext3_should_retry_alloc(struct super
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext3_new_blocks(handle_t *handle, struct inode *inode,
+unsigned long ext3_new_blocks(handle_t *handle, struct inode *inode,
 			unsigned long goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
 	int group_no;
 	int goal_group;
-	int ret_block;
+	int group_target_blk;
+	int group_allocated_blk;
+	unsigned long ret_block;
 	int bgi;			/* blockgroup iteration index */
-	int target_block;
 	int fatal = 0, err;
 	int performed_allocation = 0;
 	int free_blocks;
@@ -1285,16 +1288,17 @@ retry:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+		group_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, ret_block, my_rsv, &num, &fatal);
+		group_allocated_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					 group_no, bitmap_bh,
+					group_target_blk, my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (group_allocated_blk >= 0)
 			goto allocated;
 	}
 
@@ -1327,11 +1331,12 @@ retry:
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
+		group_allocated_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no,
 					bitmap_bh, -1, my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0) 
+		if (group_allocated_blk >= 0)
 			goto allocated;
 	}
 	/*
@@ -1360,18 +1365,19 @@ allocated:
 	if (fatal)
 		goto out;
 
-	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
+	ret_block = group_allocated_blk + group_no *
+				EXT3_BLOCKS_PER_GROUP(sb)
 				+ le32_to_cpu(es->s_first_data_block);
 
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) ||
-	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group))
 		ext3_error(sb, "ext3_new_block",
 			    "Allocating block in system zone - "
-			    "blocks from %u, length %lu", target_block, num);
+			    "blocks from %lu, length %lu", ret_block, num);
 
 	performed_allocation = 1;
 
@@ -1380,7 +1386,7 @@ allocated:
 		struct buffer_head *debug_bh;
 
 		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, target_block);
+		debug_bh = sb_find_get_block(sb, ret_block);
 		if (debug_bh) {
 			BUFFER_TRACE(debug_bh, "state when allocated");
 			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1393,24 +1399,21 @@ allocated:
 		int i;
 
 		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(ret_block,
+			if (ext3_test_bit(group_allocated_blk,
 					bh2jh(bitmap_bh)->b_committed_data)) {
 				printk("%s: block was unexpectedly set in "
 					"b_committed_data\n", __FUNCTION__);
 			}
 		}
 	}
-	ext3_debug("found bit %d\n", ret_block);
+	ext3_debug("found bit %d\n", group_allocated_blk);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	jbd_unlock_bh_state(bitmap_bh);
 #endif
 
-	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
-	ret_block = target_block;
-
 	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
-			    "block(%d) >= blocks count(%d) - "
+			    "block(%lu) >= blocks count(%d) - "
 			    "block_group = %d, es == %p ", ret_block,
 			le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto out;
@@ -1421,8 +1424,8 @@ allocated:
 	 * list of some description.  We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
-			ret_block, goal_hits, goal_attempts);
+	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
+		ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
 	gdp->bg_free_blocks_count =
@@ -1461,7 +1464,7 @@ out:
 	return 0;
 }
 
-int ext3_new_block(handle_t *handle, struct inode *inode,
+unsigned long ext3_new_block(handle_t *handle, struct inode *inode,
 			unsigned long goal, int *errp)
 {
 	unsigned long count = 1;
diff -puN fs/ext3/xattr.c~ext3_rsv_int-fix fs/ext3/xattr.c
--- linux-2.6.16/fs/ext3/xattr.c~ext3_rsv_int-fix	2006-03-29 15:49:41.202815089 -0800
+++ linux-2.6.16-ming/fs/ext3/xattr.c	2006-03-29 15:49:41.213813815 -0800
@@ -792,14 +792,14 @@ inserted:
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			int goal = le32_to_cpu(
+			unsigned long goal = le32_to_cpu(
 					EXT3_SB(sb)->s_es->s_first_data_block) +
 				EXT3_I(inode)->i_block_group *
 				EXT3_BLOCKS_PER_GROUP(sb);
-			int block = ext3_new_block(handle, inode, goal, &error);
+			unsigned long block = ext3_new_block(handle, inode, goal, &error);
 			if (error)
 				goto cleanup;
-			ea_idebug(inode, "creating block %d", block);
+			ea_idebug(inode, "creating block %lu", block);
 
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
diff -puN include/linux/ext3_fs.h~ext3_rsv_int-fix include/linux/ext3_fs.h
--- linux-2.6.16/include/linux/ext3_fs.h~ext3_rsv_int-fix	2006-03-29 15:49:41.205814742 -0800
+++ linux-2.6.16-ming/include/linux/ext3_fs.h	2006-03-29 15:49:41.214813699 -0800
@@ -732,8 +732,8 @@ struct dir_private_info {
 /* balloc.c */
 extern int ext3_bg_has_super(struct super_block *sb, int group);
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
-extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
-extern int ext3_new_blocks (handle_t *, struct inode *, unsigned long,
+extern unsigned long ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+extern unsigned long ext3_new_blocks (handle_t *, struct inode *, unsigned long,
 			unsigned long *, int *);
 extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
 			      unsigned long);

_



^ permalink raw reply	[flat|nested] 49+ messages in thread

* [RFC][PATCH 2/2]Other ext3 in-kernel block number type fix to support 2**32 block numbers
  2006-03-29  9:13           ` Laurent Vivier
                               ` (2 preceding siblings ...)
  2006-03-30  1:39             ` [RFC][PATCH 1/2]ext3 block allocation/reservation fixes to support 2**32 block numbers Mingming Cao
@ 2006-03-30  1:39             ` Mingming Cao
  3 siblings, 0 replies; 49+ messages in thread
From: Mingming Cao @ 2006-03-30  1:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Takashi Sato, Laurent Vivier, linux-kernel, ext2-devel,
	linux-fsdevel

This trivial patch fixes other places in the ext3 code (outside the block
allocation part) by replacing "int" type filesystem block numbers with
"unsigned long".


Signed-Off-By: Mingming Cao <cmm@us.ibm.com>

---

 linux-2.6.16-ming/fs/ext3/balloc.c        |    4 ++--
 linux-2.6.16-ming/fs/ext3/ialloc.c        |    2 +-
 linux-2.6.16-ming/fs/ext3/inode.c         |    2 +-
 linux-2.6.16-ming/fs/ext3/resize.c        |    4 ++--
 linux-2.6.16-ming/fs/ext3/xattr.c         |   16 ++++++++--------
 linux-2.6.16-ming/include/linux/ext3_fs.h |    2 +-
 6 files changed, 15 insertions(+), 15 deletions(-)

diff -puN fs/ext3/balloc.c~ext3_32bit_kernel_fix fs/ext3/balloc.c
--- linux-2.6.16/fs/ext3/balloc.c~ext3_32bit_kernel_fix	2006-03-24 21:32:32.000000000 -0800
+++ linux-2.6.16-ming/fs/ext3/balloc.c	2006-03-27 15:47:17.344404203 -0800
@@ -496,7 +496,7 @@ void ext3_free_blocks(handle_t *handle, 
 			unsigned long block, unsigned long count)
 {
 	struct super_block * sb;
-	int dquot_freed_blocks;
+	unsigned long dquot_freed_blocks;
 
 	sb = inode->i_sb;
 	if (!sb) {
@@ -1166,7 +1166,7 @@ out:
 
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-	int free_blocks, root_blocks;
+	unsigned long free_blocks, root_blocks;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
diff -puN fs/ext3/ialloc.c~ext3_32bit_kernel_fix fs/ext3/ialloc.c
--- linux-2.6.16/fs/ext3/ialloc.c~ext3_32bit_kernel_fix	2006-03-24 21:32:32.000000000 -0800
+++ linux-2.6.16-ming/fs/ext3/ialloc.c	2006-03-24 21:32:32.000000000 -0800
@@ -262,7 +262,7 @@ static int find_group_orlov(struct super
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
 	int freei, avefreei;
-	int freeb, avefreeb;
+	unsigned long freeb, avefreeb;
 	int blocks_per_dir, ndirs;
 	int max_debt, max_dirs, min_blocks, min_inodes;
 	int group = -1, i;
diff -puN fs/ext3/inode.c~ext3_32bit_kernel_fix fs/ext3/inode.c
--- linux-2.6.16/fs/ext3/inode.c~ext3_32bit_kernel_fix	2006-03-24 21:32:32.000000000 -0800
+++ linux-2.6.16-ming/fs/ext3/inode.c	2006-03-24 21:32:32.000000000 -0800
@@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(st
  * still needs to be revoked.
  */
 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, int blocknr)
+			struct buffer_head *bh, unsigned long blocknr)
 {
 	int err;
 
diff -puN include/linux/ext3_fs.h~ext3_32bit_kernel_fix include/linux/ext3_fs.h
--- linux-2.6.16/include/linux/ext3_fs.h~ext3_32bit_kernel_fix	2006-03-24 21:32:32.000000000 -0800
+++ linux-2.6.16-ming/include/linux/ext3_fs.h	2006-03-24 21:32:32.000000000 -0800
@@ -775,7 +775,7 @@ extern unsigned long ext3_count_free (st
 
 
 /* inode.c */
-int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, unsigned long);
 struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
 struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
diff -puN fs/ext3/resize.c~ext3_32bit_kernel_fix fs/ext3/resize.c
--- linux-2.6.16/fs/ext3/resize.c~ext3_32bit_kernel_fix	2006-03-24 21:32:32.000000000 -0800
+++ linux-2.6.16-ming/fs/ext3/resize.c	2006-03-24 21:32:32.000000000 -0800
@@ -990,10 +990,10 @@ int ext3_group_extend(struct super_block
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freed blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff -puN fs/ext3/xattr.c~ext3_32bit_kernel_fix fs/ext3/xattr.c
--- linux-2.6.16/fs/ext3/xattr.c~ext3_32bit_kernel_fix	2006-03-24 21:32:32.000000000 -0800
+++ linux-2.6.16-ming/fs/ext3/xattr.c	2006-03-24 21:32:32.000000000 -0800
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode
 	error = -ENODATA;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh)
 		goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inod
 	error = 0;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inod
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inod
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block %d", inode->i_ino,
+				"inode %ld: bad block %u", inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -847,7 +847,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block %d", inode->i_ino,
+		   "inode %ld: bad block %u", inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1076,14 +1076,14 @@ ext3_xattr_delete_inode(handle_t *handle
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block %d read error", inode->i_ino,
+			"inode %ld: block %u read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %ld: bad block %u", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}

_



^ permalink raw reply	[flat|nested] 49+ messages in thread

end of thread, other threads:[~2006-05-30 17:55 UTC | newest]

Thread overview: 49+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2006-03-25 13:33 [Ext2-devel] [PATCH 2/2] ext2/3: Support2^32-1blocks(e2fsprogs) sho
2006-03-26 22:37 ` Badari Pulavarty
2006-03-27  4:17   ` Takashi Sato
2006-03-27 18:45 ` Mingming Cao
2006-03-27 21:10   ` Andrew Morton
2006-03-27 22:58     ` Ravikiran G Thirumalai
2006-03-28  7:15       ` Laurent Vivier
2006-03-28  8:02         ` Ravikiran G Thirumalai
2006-03-28 10:34           ` Laurent Vivier
2006-03-28 18:01         ` Mingming Cao
2006-03-29  9:13           ` Laurent Vivier
     [not found]             ` <1143657317.4045.12.camel@dyn9047017067.beaverton.ibm.com>
2006-03-29 20:00               ` Ravikiran G Thirumalai
2006-03-29 20:38                 ` Mingming Cao
2006-03-30  8:41                   ` Ravikiran G Thirumalai
2006-03-30  1:38             ` [RFC][PATCH 0/2]Extend ext3 filesystem limit from 8TB to 16TB Mingming Cao
2006-03-30  1:54               ` Andrew Morton
2006-03-31 22:42                 ` Mingming Cao
2006-04-02 20:13                   ` Mingming Cao
2006-04-10  9:11                 ` [Ext2-devel] " Laurent Vivier
2006-04-10  8:24                   ` Andrew Morton
2006-04-13 15:26                     ` Laurent Vivier
2006-04-17 21:07                       ` Ravikiran G Thirumalai
2006-04-17 21:09                         ` Arjan van de Ven
2006-04-17 21:32                           ` Ravikiran G Thirumalai
2006-04-18  7:14                             ` Laurent Vivier
2006-04-18  7:30                               ` Arjan van de Ven
2006-04-18 10:57                                 ` Laurent Vivier
2006-04-18 19:08                                   ` Ravikiran G Thirumalai
2006-04-18 14:09                                 ` Laurent Vivier
2006-04-18 21:01                                 ` Mingming Cao
2006-04-20 11:28                                   ` Laurent Vivier
     [not found]                                   ` <1145543970.5872.38.camel@openx2.frec.bull.fr>
2006-04-21 11:17                                     ` Laurent Vivier
2006-04-10 16:57                   ` Mingming Cao
2006-04-10 19:06                     ` Mingming Cao
2006-04-11  7:07                       ` Laurent Vivier
2006-04-14 17:23                         ` Ravikiran G Thirumalai
2006-03-30 17:36               ` Andreas Dilger
2006-03-30 19:01                 ` Mingming Cao
2006-03-30 17:40               ` Andreas Dilger
2006-03-30 19:16                 ` Mingming Cao
2006-03-30 19:22                   ` Mingming Cao
2006-03-31  6:42                     ` Andreas Dilger
2006-03-31 13:33                   ` Andi Kleen
2006-04-01  6:50                     ` Nathan Scott
2006-05-26  5:00               ` [PATCH 0/2]Define ext3 in-kernel filesystem block types and extend " Mingming Cao
2006-05-26 18:08                 ` Andrew Morton
2006-05-30 17:55                   ` Mingming Cao
2006-03-30  1:39             ` [RFC][PATCH 1/2]ext3 block allocation/reservation fixes to support 2**32 block numbers Mingming Cao
2006-03-30  1:39             ` [RFC][PATCH 2/2]Other ext3 in-kernel block number type fix " Mingming Cao

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.