RE: [PATCH 00/23] per device dirty throttling -v9

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: spamtrap@knobisoft.de
Cc: linux-kernel@vger.kernel.org
Subject: RE: [PATCH 00/23] per device dirty throttling -v9
Date: Thu, 23 Aug 2007 19:41:59 +0200	[thread overview]
Message-ID: <1187890919.6114.411.camel@twins> (raw)
In-Reply-To: <137637.20782.qm@web32614.mail.mud.yahoo.com>


[-- Attachment #1.1: Type: text/plain, Size: 1048 bytes --]

On Thu, 2007-08-23 at 08:59 -0700, Martin Knoblauch wrote:
> --- Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > On Thu, 2007-08-16 at 05:49 -0700, Martin Knoblauch wrote:
> > 
> > > Peter,
> > > 
> > >  any chance to get a rollup against 2.6.22-stable?
> > > 
> > >  The 2.6.23 series may not be usable for me due to the
> > > nosharedcache changes for NFS (the new default will massively
> > > disturb the user-space automounter).
> > 
> > I'll see what I can do, bit busy with other stuff atm, hopefully
> > after
> > the weekend.
> > 
> Hi Peter,
> 
>  any progress on a version against 2.6.22.5? I have seen the very
> positive report from Jeffrey W. Baker and would really love to test
> your patch. But as I said, anything newer than 2.6.22.x might not be an
> option due to the NFS changes.

mindless port, seems to compile and boot on my test box ymmv.

I think .5 should not present anything other than trivial rejects if
anything. But I'm not keeping -stable in my git remotes so I can't say
for sure.

[-- Attachment #1.2: bdi-rollup-v9-v2.6.22.patch --]
[-- Type: text/x-patch, Size: 69879 bytes --]

Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -237,10 +237,8 @@ static void nfs_end_page_writeback(struc
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, WRITE);
-		congestion_end(WRITE);
-	}
 }
 
 /*
@@ -466,6 +464,7 @@ nfs_mark_request_commit(struct nfs_page 
 	set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 	spin_unlock(&nfsi->req_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 
@@ -552,6 +551,8 @@ static void nfs_cancel_commit_list(struc
 	while(!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		nfs_list_remove_request(req);
 		clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 		nfs_inode_remove_request(req);
@@ -1207,6 +1208,8 @@ nfs_commit_list(struct inode *inode, str
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		nfs_clear_page_writeback(req);
 	}
 	return -ENOMEM;
@@ -1232,6 +1235,8 @@ static void nfs_commit_done(struct rpc_t
 		nfs_list_remove_request(req);
 		clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 
 		dprintk("NFS: commit (%s/%Ld %d@%Ld)",
 			req->wb_context->dentry->d_inode->i_sb->s_id,
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -8,6 +8,9 @@
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
+#include <linux/percpu_counter.h>
+#include <linux/log2.h>
+#include <linux/proportions.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -24,6 +27,14 @@ enum bdi_state {
 
 typedef int (congested_fn)(void *, int);
 
+enum bdi_stat_item {
+	BDI_RECLAIMABLE,
+	BDI_WRITEBACK,
+	NR_BDI_STAT_ITEMS
+};
+
+#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
 struct backing_dev_info {
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
@@ -32,8 +43,90 @@ struct backing_dev_info {
 	void *congested_data;	/* Pointer to aux data for congested func */
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
 	void *unplug_io_data;
+
+	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
+
+	struct prop_local_percpu completions;
+	int dirty_exceeded;
 };
 
+int bdi_init(struct backing_dev_info *bdi);
+void bdi_destroy(struct backing_dev_info *bdi);
+
+static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item, s64 amount)
+{
+	__percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+}
+
+static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	__mod_bdi_stat(bdi, item, 1);
+}
+
+static inline void inc_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__inc_bdi_stat(bdi, item);
+	local_irq_restore(flags);
+}
+
+static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	__mod_bdi_stat(bdi, item, -1);
+}
+
+static inline void dec_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__dec_bdi_stat(bdi, item);
+	local_irq_restore(flags);
+}
+
+static inline s64 bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+}
+
+static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+}
+
+static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	s64 sum;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sum = __bdi_stat_sum(bdi, item);
+	local_irq_restore(flags);
+
+	return sum;
+}
+
+/*
+ * maximal error of a stat counter.
+ */
+static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+{
+#ifdef CONFIG_SMP
+	return nr_cpu_ids * BDI_STAT_BATCH;
+#else
+	return 1;
+#endif
+}
 
 /*
  * Flags in backing_dev_info::capability
@@ -94,7 +187,6 @@ void clear_bdi_congested(struct backing_
 void set_bdi_congested(struct backing_dev_info *bdi, int rw);
 long congestion_wait(int rw, long timeout);
 long congestion_wait_interruptible(int rw, long timeout);
-void congestion_end(int rw);
 
 #define bdi_cap_writeback_dirty(bdi) \
 	(!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK))
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -5,6 +5,41 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 
+int bdi_init(struct backing_dev_info *bdi)
+{
+	int i, j;
+	int err;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[i]);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	int i;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -70,16 +105,3 @@ long congestion_wait_interruptible(int r
 	return ret;
 }
 EXPORT_SYMBOL(congestion_wait_interruptible);
-
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);
Index: linux-2.6/fs/ext2/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext2/balloc.c
+++ linux-2.6/fs/ext2/balloc.c
@@ -124,7 +124,7 @@ static int reserve_blocks(struct super_b
 			return 0;
 	}
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -count);
+	percpu_counter_sub(&sbi->s_freeblocks_counter, count);
 	sb->s_dirt = 1;
 	return count;
 }
@@ -134,7 +134,7 @@ static void release_blocks(struct super_
 	if (count) {
 		struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-		percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+		percpu_counter_add(&sbi->s_freeblocks_counter, count);
 		sb->s_dirt = 1;
 	}
 }
Index: linux-2.6/fs/ext2/ialloc.c
===================================================================
--- linux-2.6.orig/fs/ext2/ialloc.c
+++ linux-2.6/fs/ext2/ialloc.c
@@ -542,7 +542,7 @@ got:
 		goto fail;
 	}
 
-	percpu_counter_mod(&sbi->s_freeinodes_counter, -1);
+	percpu_counter_add(&sbi->s_freeinodes_counter, -1);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
 
Index: linux-2.6/fs/ext3/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext3/balloc.c
+++ linux-2.6/fs/ext3/balloc.c
@@ -570,7 +570,7 @@ do_more:
 		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
 			group_freed);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1633,7 +1633,7 @@ allocated:
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext3_journal_dirty_metadata(handle, gdp_bh);
Index: linux-2.6/fs/ext3/resize.c
===================================================================
--- linux-2.6.orig/fs/ext3/resize.c
+++ linux-2.6/fs/ext3/resize.c
@@ -884,9 +884,9 @@ int ext3_group_add(struct super_block *s
 		input->reserved_blocks);
 
 	/* Update the free space counts */
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	percpu_counter_add(&sbi->s_freeblocks_counter,
 			   input->free_blocks_count);
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT3_INODES_PER_GROUP(sb));
 
 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
Index: linux-2.6/fs/ext4/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext4/balloc.c
+++ linux-2.6/fs/ext4/balloc.c
@@ -587,7 +587,7 @@ do_more:
 		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
 			group_freed);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1647,7 +1647,7 @@ allocated:
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext4_journal_dirty_metadata(handle, gdp_bh);
Index: linux-2.6/fs/ext4/resize.c
===================================================================
--- linux-2.6.orig/fs/ext4/resize.c
+++ linux-2.6/fs/ext4/resize.c
@@ -893,9 +893,9 @@ int ext4_group_add(struct super_block *s
 		input->reserved_blocks);
 
 	/* Update the free space counts */
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
+	percpu_counter_add(&sbi->s_freeblocks_counter,
 			   input->free_blocks_count);
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
+	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
 	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
Index: linux-2.6/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.orig/include/linux/percpu_counter.h
+++ linux-2.6/include/linux/percpu_counter.h
@@ -26,20 +26,43 @@ struct percpu_counter {
 #define FBC_BATCH	(NR_CPUS*4)
 #endif
 
-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
 	spin_lock_init(&fbc->lock);
 	fbc->count = amount;
 	fbc->counters = alloc_percpu(s32);
+	if (!fbc->counters)
+		return -ENOMEM;
+	return 0;
 }
 
+int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
 	free_percpu(fbc->counters);
 }
 
-void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
-s64 percpu_counter_sum(struct percpu_counter *fbc);
+void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
+s64 __percpu_counter_sum(struct percpu_counter *fbc);
+
+static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
+{
+	__percpu_counter_add(fbc, amount, FBC_BATCH);
+}
+
+static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
+{
+	s64 ret = __percpu_counter_sum(fbc);
+	return ret < 0 ? 0 : ret;
+}
+
+static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+{
+	return __percpu_counter_sum(fbc);
+}
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
@@ -67,17 +90,28 @@ struct percpu_counter {
 	s64 count;
 };
 
-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
 	fbc->count = amount;
+	return 0;
 }
 
+#define percpu_counter_init_irq percpu_counter_init
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
 }
 
+static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
+{
+	fbc->count = amount;
+}
+
+#define __percpu_counter_add(fbc, amount, batch) \
+	percpu_counter_add(fbc, amount)
+
 static inline void
-percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
+percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
 	preempt_disable();
 	fbc->count += amount;
@@ -94,21 +128,31 @@ static inline s64 percpu_counter_read_po
 	return fbc->count;
 }
 
-static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
 	return percpu_counter_read_positive(fbc);
 }
 
+static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+{
+	return percpu_counter_read(fbc);
+}
+
 #endif	/* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
 {
-	percpu_counter_mod(fbc, 1);
+	percpu_counter_add(fbc, 1);
 }
 
 static inline void percpu_counter_dec(struct percpu_counter *fbc)
 {
-	percpu_counter_mod(fbc, -1);
+	percpu_counter_add(fbc, -1);
+}
+
+static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
+{
+	percpu_counter_add(fbc, -amount);
 }
 
 #endif /* _LINUX_PERCPU_COUNTER_H */
Index: linux-2.6/lib/percpu_counter.c
===================================================================
--- linux-2.6.orig/lib/percpu_counter.c
+++ linux-2.6/lib/percpu_counter.c
@@ -5,15 +5,41 @@
 #include <linux/percpu_counter.h>
 #include <linux/module.h>
 
-void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
+void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
-	long count;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	for_each_possible_cpu(cpu) {
+		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+		*pcount = 0;
+	}
+	fbc->count = amount;
+	spin_unlock(&fbc->lock);
+}
+EXPORT_SYMBOL(percpu_counter_set);
+
+static struct lock_class_key percpu_counter_irqsafe;
+
+int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)
+{
+	int err;
+
+	err = percpu_counter_init(fbc, amount);
+	if (!err)
+		lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
+	return err;
+}
+
+void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+{
+	s64 count;
 	s32 *pcount;
 	int cpu = get_cpu();
 
 	pcount = per_cpu_ptr(fbc->counters, cpu);
 	count = *pcount + amount;
-	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+	if (count >= batch || count <= -batch) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
 		*pcount = 0;
@@ -23,13 +49,13 @@ void percpu_counter_mod(struct percpu_co
 	}
 	put_cpu();
 }
-EXPORT_SYMBOL(percpu_counter_mod);
+EXPORT_SYMBOL(__percpu_counter_add);
 
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -41,6 +67,6 @@ s64 percpu_counter_sum(struct percpu_cou
 		ret += *pcount;
 	}
 	spin_unlock(&fbc->lock);
-	return ret < 0 ? 0 : ret;
+	return ret;
 }
-EXPORT_SYMBOL(percpu_counter_sum);
+EXPORT_SYMBOL(__percpu_counter_sum);
Index: linux-2.6/fs/ext3/super.c
===================================================================
--- linux-2.6.orig/fs/ext3/super.c
+++ linux-2.6/fs/ext3/super.c
@@ -1406,6 +1406,7 @@ static int ext3_fill_super (struct super
 	int i;
 	int needs_recovery;
 	__le32 features;
+	int err;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -1665,12 +1666,16 @@ static int ext3_fill_super (struct super
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	percpu_counter_init(&sbi->s_freeblocks_counter,
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 		ext3_count_free_blocks(sb));
-	percpu_counter_init(&sbi->s_freeinodes_counter,
+	err |= percpu_counter_init(&sbi->s_freeinodes_counter,
 		ext3_count_free_inodes(sb));
-	percpu_counter_init(&sbi->s_dirs_counter,
+	err |= percpu_counter_init(&sbi->s_dirs_counter,
 		ext3_count_dirs(sb));
+	if (err) {
+		printk(KERN_ERR "EXT3-fs: insufficient memory\n");
+		goto failed_mount3;
+	}
 
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
@@ -2448,12 +2453,12 @@ static int ext3_statfs (struct dentry * 
 	buf->f_type = EXT3_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
-	buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
-	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT3_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
Index: linux-2.6/fs/ext4/super.c
===================================================================
--- linux-2.6.orig/fs/ext4/super.c
+++ linux-2.6/fs/ext4/super.c
@@ -1465,6 +1465,7 @@ static int ext4_fill_super (struct super
 	int needs_recovery;
 	__le32 features;
 	__u64 blocks_count;
+	int err;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -1737,12 +1738,16 @@ static int ext4_fill_super (struct super
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	percpu_counter_init(&sbi->s_freeblocks_counter,
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 		ext4_count_free_blocks(sb));
-	percpu_counter_init(&sbi->s_freeinodes_counter,
+	err |= percpu_counter_init(&sbi->s_freeinodes_counter,
 		ext4_count_free_inodes(sb));
-	percpu_counter_init(&sbi->s_dirs_counter,
+	err |= percpu_counter_init(&sbi->s_dirs_counter,
 		ext4_count_dirs(sb));
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+		goto failed_mount3;
+	}
 
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
@@ -2523,12 +2528,12 @@ static int ext4_statfs (struct dentry * 
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - overhead;
-	buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
-	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT4_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -98,7 +98,7 @@ struct file *get_empty_filp(void)
 		 * percpu_counters are inaccurate.  Do an expensive check before
 		 * we go and fail.
 		 */
-		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
+		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
 			goto over;
 	}
 
Index: linux-2.6/fs/ext2/super.c
===================================================================
--- linux-2.6.orig/fs/ext2/super.c
+++ linux-2.6/fs/ext2/super.c
@@ -652,6 +652,7 @@ static int ext2_fill_super(struct super_
 	int db_count;
 	int i, j;
 	__le32 features;
+	int err;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -907,12 +908,16 @@ static int ext2_fill_super(struct super_
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	percpu_counter_init(&sbi->s_freeblocks_counter,
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 				ext2_count_free_blocks(sb));
-	percpu_counter_init(&sbi->s_freeinodes_counter,
+	err |= percpu_counter_init(&sbi->s_freeinodes_counter,
 				ext2_count_free_inodes(sb));
-	percpu_counter_init(&sbi->s_dirs_counter,
+	err |= percpu_counter_init(&sbi->s_dirs_counter,
 				ext2_count_dirs(sb));
+	if (err) {
+		printk(KERN_ERR "EXT2-fs: insufficient memory\n");
+		goto failed_mount3;
+	}
 	/*
 	 * set up enough so that it can read an inode
 	 */
Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c
+++ linux-2.6/block/ll_rw_blk.c
@@ -1783,6 +1783,7 @@ static void blk_release_queue(struct kob
 
 	blk_trace_shutdown(q);
 
+	bdi_destroy(&q->backing_dev_info);
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -1835,6 +1836,7 @@ static struct kobj_type queue_ktype;
 
 request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
+	int err;
 	request_queue_t *q;
 
 	q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
@@ -1842,15 +1844,20 @@ request_queue_t *blk_alloc_queue_node(gf
 		return NULL;
 
 	memset(q, 0, sizeof(*q));
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+	err = bdi_init(&q->backing_dev_info);
+	if (err) {
+		kmem_cache_free(requestq_cachep, q);
+		return NULL;
+	}
+
 	init_timer(&q->unplug_timer);
 
 	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
 	q->kobj.ktype = &queue_ktype;
 	kobject_init(&q->kobj);
 
-	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
-	q->backing_dev_info.unplug_io_data = q;
-
 	mutex_init(&q->sysfs_lock);
 
 	return q;
@@ -3984,6 +3991,73 @@ static ssize_t queue_max_hw_sectors_show
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
 
+static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
+{
+	unsigned long long nr_reclaimable =
+		bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE);
+
+	return sprintf(page, "%llu\n",
+			nr_reclaimable >> (PAGE_CACHE_SHIFT - 10));
+}
+
+static ssize_t queue_nr_writeback_show(struct request_queue *q, char *page)
+{
+	unsigned long long nr_writeback =
+		bdi_stat(&q->backing_dev_info, BDI_WRITEBACK);
+
+	return sprintf(page, "%llu\n",
+			nr_writeback >> (PAGE_CACHE_SHIFT - 10));
+}
+
+extern void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator);
+
+static ssize_t queue_nr_cache_ratio_show(struct request_queue *q, char *page)
+{
+	long scale, div;
+
+	bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+	scale *= 1024;
+	scale /= div;
+
+	return sprintf(page, "%ld\n", scale);
+}
+
+static ssize_t queue_nr_cache_num_show(struct request_queue *q, char *page)
+{
+	long scale, div;
+
+	bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+
+	return sprintf(page, "%ld\n", scale);
+}
+
+static ssize_t queue_nr_cache_denom_show(struct request_queue *q, char *page)
+{
+	long scale, div;
+
+	bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+
+	return sprintf(page, "%ld\n", div);
+}
+
+extern void
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		struct backing_dev_info *bdi);
+
+static ssize_t queue_nr_cache_size_show(struct request_queue *q, char *page)
+{
+	long background, dirty, bdi_dirty;
+	get_dirty_limits(&background, &dirty, &bdi_dirty, &q->backing_dev_info);
+	return sprintf(page, "%ld\n", bdi_dirty);
+}
+
+static ssize_t queue_nr_cache_total_show(struct request_queue *q, char *page)
+{
+	long background, dirty, bdi_dirty;
+	get_dirty_limits(&background, &dirty, &bdi_dirty, &q->backing_dev_info);
+	return sprintf(page, "%ld\n", dirty);
+}
 
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -4008,6 +4082,41 @@ static struct queue_sysfs_entry queue_ma
 	.show = queue_max_hw_sectors_show,
 };
 
+static struct queue_sysfs_entry queue_reclaimable_entry = {
+	.attr = {.name = "reclaimable_kb", .mode = S_IRUGO },
+	.show = queue_nr_reclaimable_show,
+};
+
+static struct queue_sysfs_entry queue_writeback_entry = {
+	.attr = {.name = "writeback_kb", .mode = S_IRUGO },
+	.show = queue_nr_writeback_show,
+};
+
+static struct queue_sysfs_entry queue_cache_ratio_entry = {
+	.attr = {.name = "cache_ratio", .mode = S_IRUGO },
+	.show = queue_nr_cache_ratio_show,
+};
+
+static struct queue_sysfs_entry queue_cache_num_entry = {
+	.attr = {.name = "cache_num", .mode = S_IRUGO },
+	.show = queue_nr_cache_num_show,
+};
+
+static struct queue_sysfs_entry queue_cache_denom_entry = {
+	.attr = {.name = "cache_denom", .mode = S_IRUGO },
+	.show = queue_nr_cache_denom_show,
+};
+
+static struct queue_sysfs_entry queue_cache_size_entry = {
+	.attr = {.name = "cache_size", .mode = S_IRUGO },
+	.show = queue_nr_cache_size_show,
+};
+
+static struct queue_sysfs_entry queue_cache_total_entry = {
+	.attr = {.name = "cache_total", .mode = S_IRUGO },
+	.show = queue_nr_cache_total_show,
+};
+
 static struct queue_sysfs_entry queue_iosched_entry = {
 	.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
 	.show = elv_iosched_show,
@@ -4019,6 +4128,13 @@ static struct attribute *default_attrs[]
 	&queue_ra_entry.attr,
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
+	&queue_reclaimable_entry.attr,
+	&queue_writeback_entry.attr,
+	&queue_cache_ratio_entry.attr,
+	&queue_cache_num_entry.attr,
+	&queue_cache_denom_entry.attr,
+	&queue_cache_size_entry.attr,
+	&queue_cache_total_entry.attr,
 	&queue_iosched_entry.attr,
 	NULL,
 };
Index: linux-2.6/drivers/block/rd.c
===================================================================
--- linux-2.6.orig/drivers/block/rd.c
+++ linux-2.6/drivers/block/rd.c
@@ -411,6 +411,9 @@ static void __exit rd_cleanup(void)
 		blk_cleanup_queue(rd_queue[i]);
 	}
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+	bdi_destroy(&rd_file_backing_dev_info);
+	bdi_destroy(&rd_backing_dev_info);
 }
 
 /*
@@ -419,7 +422,19 @@ static void __exit rd_cleanup(void)
 static int __init rd_init(void)
 {
 	int i;
-	int err = -ENOMEM;
+	int err;
+
+	err = bdi_init(&rd_backing_dev_info);
+	if (err)
+		goto out2;
+
+	err = bdi_init(&rd_file_backing_dev_info);
+	if (err) {
+		bdi_destroy(&rd_backing_dev_info);
+		goto out2;
+	}
+
+	err = -ENOMEM;
 
 	if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
 			(rd_blocksize & (rd_blocksize-1))) {
@@ -473,6 +488,9 @@ out:
 		put_disk(rd_disks[i]);
 		blk_cleanup_queue(rd_queue[i]);
 	}
+	bdi_destroy(&rd_backing_dev_info);
+	bdi_destroy(&rd_file_backing_dev_info);
+out2:
 	return err;
 }
 
Index: linux-2.6/drivers/char/mem.c
===================================================================
--- linux-2.6.orig/drivers/char/mem.c
+++ linux-2.6/drivers/char/mem.c
@@ -977,6 +977,11 @@ static struct class *mem_class;
 static int __init chr_dev_init(void)
 {
 	int i;
+	int err;
+
+	err = bdi_init(&zero_bdi);
+	if (err)
+		return err;
 
 	if (register_chrdev(MEM_MAJOR,"mem",&memory_fops))
 		printk("unable to get major %d for memory devs\n", MEM_MAJOR);
Index: linux-2.6/fs/char_dev.c
===================================================================
--- linux-2.6.orig/fs/char_dev.c
+++ linux-2.6/fs/char_dev.c
@@ -546,6 +546,7 @@ static struct kobject *base_probe(dev_t 
 void __init chrdev_init(void)
 {
 	cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
+	bdi_init(&directly_mappable_cdev_bdi);
 }
 
 
Index: linux-2.6/fs/fuse/inode.c
===================================================================
--- linux-2.6.orig/fs/fuse/inode.c
+++ linux-2.6/fs/fuse/inode.c
@@ -401,6 +401,7 @@ static int fuse_show_options(struct seq_
 static struct fuse_conn *new_conn(void)
 {
 	struct fuse_conn *fc;
+	int err;
 
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (fc) {
@@ -416,10 +417,17 @@ static struct fuse_conn *new_conn(void)
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
+		err = bdi_init(&fc->bdi);
+		if (err) {
+			kfree(fc);
+			fc = NULL;
+			goto out;
+		}
 		fc->reqctr = 0;
 		fc->blocked = 1;
 		get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 	}
+out:
 	return fc;
 }
 
@@ -429,6 +437,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
+		bdi_destroy(&fc->bdi);
 		kfree(fc);
 	}
 }
Index: linux-2.6/fs/nfs/client.c
===================================================================
--- linux-2.6.orig/fs/nfs/client.c
+++ linux-2.6/fs/nfs/client.c
@@ -658,6 +658,7 @@ static void nfs_server_set_fsinfo(struct
 	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
 	if (server->wsize > max_rpc_payload)
@@ -708,6 +709,10 @@ static int nfs_probe_fsinfo(struct nfs_s
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, &fsinfo);
+	error = bdi_init(&server->backing_dev_info);
+	if (error)
+		goto out_error;
+
 
 	/* Get some general file system info */
 	if (server->namelen == 0) {
@@ -787,6 +792,7 @@ void nfs_free_server(struct nfs_server *
 	nfs_put_client(server->nfs_client);
 
 	nfs_free_iostats(server->io_stats);
+	bdi_destroy(&server->backing_dev_info);
 	kfree(server);
 	nfs_release_automount_timer();
 	dprintk("<-- nfs_free_server()\n");
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c
+++ linux-2.6/fs/hugetlbfs/inode.c
@@ -802,11 +802,15 @@ static int __init init_hugetlbfs_fs(void
 	int error;
 	struct vfsmount *vfsmount;
 
+	error = bdi_init(&hugetlbfs_backing_dev_info);
+	if (error)
+		return error;
+
 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
 					sizeof(struct hugetlbfs_inode_info),
 					0, 0, init_once, NULL);
 	if (hugetlbfs_inode_cachep == NULL)
-		return -ENOMEM;
+		goto out2;
 
 	error = register_filesystem(&hugetlbfs_fs_type);
 	if (error)
@@ -824,6 +828,8 @@ static int __init init_hugetlbfs_fs(void
  out:
 	if (error)
 		kmem_cache_destroy(hugetlbfs_inode_cachep);
+ out2:
+	bdi_destroy(&hugetlbfs_backing_dev_info);
 	return error;
 }
 
@@ -831,6 +837,7 @@ static void __exit exit_hugetlbfs_fs(voi
 {
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
 	unregister_filesystem(&hugetlbfs_fs_type);
+	bdi_destroy(&hugetlbfs_backing_dev_info);
 }
 
 module_init(init_hugetlbfs_fs)
Index: linux-2.6/fs/ocfs2/dlm/dlmfs.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/dlm/dlmfs.c
+++ linux-2.6/fs/ocfs2/dlm/dlmfs.c
@@ -588,13 +588,17 @@ static int __init init_dlmfs_fs(void)
 
 	dlmfs_print_version();
 
+	status = bdi_init(&dlmfs_backing_dev_info);
+	if (status)
+		return status;
+
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 				sizeof(struct dlmfs_inode_private),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 					SLAB_MEM_SPREAD),
 				dlmfs_init_once, NULL);
 	if (!dlmfs_inode_cache)
-		return -ENOMEM;
+		goto bail;
 	cleanup_inode = 1;
 
 	user_dlm_worker = create_singlethread_workqueue("user_dlm");
@@ -611,6 +615,7 @@ bail:
 			kmem_cache_destroy(dlmfs_inode_cache);
 		if (cleanup_worker)
 			destroy_workqueue(user_dlm_worker);
+		bdi_destroy(&dlmfs_backing_dev_info);
 	} else
 		printk("OCFS2 User DLM kernel interface loaded\n");
 	return status;
@@ -624,6 +629,8 @@ static void __exit exit_dlmfs_fs(void)
 	destroy_workqueue(user_dlm_worker);
 
 	kmem_cache_destroy(dlmfs_inode_cache);
+
+	bdi_destroy(&dlmfs_backing_dev_info);
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/configfs/configfs_internal.h
===================================================================
--- linux-2.6.orig/fs/configfs/configfs_internal.h
+++ linux-2.6/fs/configfs/configfs_internal.h
@@ -55,6 +55,8 @@ extern int configfs_is_root(struct confi
 
 extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
 extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern int configfs_inode_init(void);
+extern void configfs_inode_exit(void);
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c
+++ linux-2.6/fs/configfs/inode.c
@@ -256,4 +256,12 @@ void configfs_hash_and_remove(struct den
 	mutex_unlock(&dir->d_inode->i_mutex);
 }
 
+int __init configfs_inode_init(void)
+{
+	return bdi_init(&configfs_backing_dev_info);
+}
 
+void __exit configfs_inode_exit(void)
+{
+	bdi_destroy(&configfs_backing_dev_info);
+}
Index: linux-2.6/fs/configfs/mount.c
===================================================================
--- linux-2.6.orig/fs/configfs/mount.c
+++ linux-2.6/fs/configfs/mount.c
@@ -154,8 +154,16 @@ static int __init configfs_init(void)
 		subsystem_unregister(&config_subsys);
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
+		goto out;
 	}
 
+	err = configfs_inode_init();
+	if (err) {
+		unregister_filesystem(&configfs_fs_type);
+		subsystem_unregister(&config_subsys);
+		kmem_cache_destroy(configfs_dir_cachep);
+		configfs_dir_cachep = NULL;
+	}
 out:
 	return err;
 }
@@ -166,6 +174,7 @@ static void __exit configfs_exit(void)
 	subsystem_unregister(&config_subsys);
 	kmem_cache_destroy(configfs_dir_cachep);
 	configfs_dir_cachep = NULL;
+	configfs_inode_exit();
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/ramfs/inode.c
===================================================================
--- linux-2.6.orig/fs/ramfs/inode.c
+++ linux-2.6/fs/ramfs/inode.c
@@ -222,7 +222,17 @@ module_exit(exit_ramfs_fs)
 
 int __init init_rootfs(void)
 {
-	return register_filesystem(&rootfs_fs_type);
+	int err;
+
+	err = bdi_init(&ramfs_backing_dev_info);
+	if (err)
+		return err;
+
+	err = register_filesystem(&rootfs_fs_type);
+	if (err)
+		bdi_destroy(&ramfs_backing_dev_info);
+
+	return err;
 }
 
 MODULE_LICENSE("GPL");
Index: linux-2.6/fs/sysfs/inode.c
===================================================================
--- linux-2.6.orig/fs/sysfs/inode.c
+++ linux-2.6/fs/sysfs/inode.c
@@ -44,6 +44,11 @@ void sysfs_delete_inode(struct inode *in
 	return generic_delete_inode(inode);
 }
 
+int __init sysfs_inode_init(void)
+{
+	return bdi_init(&sysfs_backing_dev_info);
+}
+
 int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 {
 	struct inode * inode = dentry->d_inode;
Index: linux-2.6/fs/sysfs/mount.c
===================================================================
--- linux-2.6.orig/fs/sysfs/mount.c
+++ linux-2.6/fs/sysfs/mount.c
@@ -98,6 +98,10 @@ int __init sysfs_init(void)
 	if (!sysfs_dir_cachep)
 		goto out;
 
+	err = sysfs_inode_init();
+	if (err)
+		goto out_err;
+
 	err = register_filesystem(&sysfs_fs_type);
 	if (!err) {
 		sysfs_mount = kern_mount(&sysfs_fs_type);
Index: linux-2.6/fs/sysfs/sysfs.h
===================================================================
--- linux-2.6.orig/fs/sysfs/sysfs.h
+++ linux-2.6/fs/sysfs/sysfs.h
@@ -17,6 +17,7 @@ extern struct kmem_cache *sysfs_dir_cach
 extern void sysfs_delete_inode(struct inode *inode);
 extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
 extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern int sysfs_inode_init(void);
 
 extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *);
 extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -2490,6 +2490,10 @@ static int __init init_tmpfs(void)
 {
 	int error;
 
+	error = bdi_init(&shmem_backing_dev_info);
+	if (error)
+		goto out4;
+
 	error = init_inodecache();
 	if (error)
 		goto out3;
@@ -2514,6 +2518,8 @@ out1:
 out2:
 	destroy_inodecache();
 out3:
+	bdi_destroy(&shmem_backing_dev_info);
+out4:
 	shm_mnt = ERR_PTR(error);
 	return error;
 }
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -505,6 +505,10 @@ void __init swap_setup(void)
 {
 	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 
+#ifdef CONFIG_SWAP
+	bdi_init(swapper_space.backing_dev_info);
+#endif
+
 	/* Use a smaller cluster for small-memory machines */
 	if (megs < 16)
 		page_cluster = 2;
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c
+++ linux-2.6/mm/readahead.c
@@ -75,6 +75,12 @@ static inline void ra_off(struct file_ra
 	return;
 }
 
+static int __init readahead_init(void)
+{
+	return bdi_init(&default_backing_dev_info);
+}
+subsys_initcall(readahead_init);
+
 /*
  * Set the initial window size, round to next power of 2 and square
  * for small size, x 4 for medium, and x 2 for large
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -726,6 +726,8 @@ int __set_page_dirty_buffers(struct page
 	if (page->mapping) {	/* Race with truncate? */
 		if (mapping_cap_account_dirty(mapping)) {
 			__inc_zone_page_state(page, NR_FILE_DIRTY);
+			__inc_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			task_io_account_write(PAGE_CACHE_SIZE);
 		}
 		radix_tree_tag_set(&mapping->page_tree,
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+static struct prop_descriptor vm_dirties;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+	unsigned long dirty_total;
+
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_ratio = vm_dirty_ratio;
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
+		prop_change_shift(&vm_dirties, shift);
+	}
+	return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	__prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+static inline void task_dirty_inc(struct task_struct *tsk)
+{
+	prop_inc_single(&vm_dirties, &tsk->dirties);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (bdi_cap_writeback_dirty(bdi)) {
+		prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+	} else {
+		*numerator = 0;
+		*denominator = 1;
+	}
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	long avail_dirty;
+
+	avail_dirty = dirty -
+		(global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS));
+
+	if (avail_dirty < 0)
+		avail_dirty = 0;
+
+	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		bdi_stat(bdi, BDI_WRITEBACK);
+
+	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+static inline void task_dirties_fraction(struct task_struct *tsk,
+		long *numerator, long *denominator)
+{
+	prop_fraction_single(&vm_dirties, &tsk->dirties,
+				numerator, denominator);
+}
+
+/*
+ * scale the dirty limit
+ *
+ * task specific dirty limit:
+ *
+ *   dirty -= (dirty/8) * p_{t}
+ */
+void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+{
+	long numerator, denominator;
+	long dirty = *pdirty;
+	long long inv = dirty >> 3;
+
+	task_dirties_fraction(tsk, &numerator, &denominator);
+	inv *= numerator;
+	do_div(inv, denominator);
+
+	dirty -= inv;
+	if (dirty < *pdirty/2)
+		dirty = *pdirty/2;
+
+	*pdirty = dirty;
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -157,9 +291,9 @@ static unsigned long determine_dirtyable
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-static void
-get_dirty_limits(long *pbackground, long *pdirty,
-					struct address_space *mapping)
+void
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		long long bdi_dirty = dirty;
+		long numerator, denominator;
+
+		/*
+		 * Calculate this BDI's share of the dirty ratio.
+		 */
+		bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+		bdi_dirty *= numerator;
+		do_div(bdi_dirty, denominator);
+
+		*pbdi_dirty = bdi_dirty;
+		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+		task_dirty_limit(current, pbdi_dirty);
+	}
 }
 
 /*
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct a
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 				break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,16 +390,37 @@ static void balance_dirty_pages(struct a
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					 	&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-						break;
+
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+				       &bdi_thresh, bdi);
+
+			/*
+			 * In order to avoid the stacked BDI deadlock we need
+			 * to ensure we accurately count the 'dirty' pages when
+			 * the threshold is low.
+			 *
+			 * Otherwise it would be possible to get thresh+n pages
+			 * reported dirty, even though there are thresh-m pages
+			 * actually dirty; with m+n sitting in the percpu
+			 * deltas.
+			 */
+			if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+				bdi_nr_reclaimable =
+					bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+				bdi_nr_writeback =
+					bdi_stat_sum(bdi, BDI_WRITEBACK);
+			} else {
+				bdi_nr_reclaimable =
+					bdi_stat(bdi, BDI_RECLAIMABLE);
+				bdi_nr_writeback =
+					bdi_stat(bdi, BDI_WRITEBACK);
+			}
+
+			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+				break;
+
 			pages_written += write_chunk - wbc.nr_to_write;
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
@@ -254,9 +428,9 @@ static void balance_dirty_pages(struct a
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-		<= dirty_thresh && dirty_exceeded)
-			dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
@@ -270,7 +444,9 @@ static void balance_dirty_pages(struct a
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	     (!laptop_mode && (nr_reclaimable > background_thresh)))
+			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+					  + global_page_state(NR_UNSTABLE_NFS)
+					  > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +482,7 @@ void balance_dirty_pages_ratelimited_nr(
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
@@ -342,7 +518,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
 	}
 
         for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -377,7 +553,7 @@ static void background_writeout(unsigned
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
@@ -582,9 +758,15 @@ static struct notifier_block __cpuinitda
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
+	prop_descriptor_init(&vm_dirties, shift);
 }
 
 /**
@@ -828,6 +1010,8 @@ int __set_page_dirty_nobuffers(struct pa
 			BUG_ON(mapping2 != mapping);
 			if (mapping_cap_account_dirty(mapping)) {
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				__inc_bdi_stat(mapping->backing_dev_info,
+						BDI_RECLAIMABLE);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
 			radix_tree_tag_set(&mapping->page_tree,
@@ -860,7 +1044,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-int fastcall set_page_dirty(struct page *page)
+static int __set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
@@ -878,6 +1062,14 @@ int fastcall set_page_dirty(struct page 
 	}
 	return 0;
 }
+
+int fastcall set_page_dirty(struct page *page)
+{
+	int ret = __set_page_dirty(page);
+	if (ret)
+		task_dirty_inc(current);
+	return ret;
+}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*
@@ -954,6 +1146,8 @@ int clear_page_dirty_for_io(struct page 
 			set_page_dirty(page);
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
+			dec_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			return 1;
 		}
 		return 0;
@@ -968,14 +1162,20 @@ int test_clear_page_writeback(struct pag
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret)
+		if (ret) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi)) {
+				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
+		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
@@ -989,14 +1189,18 @@ int test_set_page_writeback(struct page 
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret)
+		if (!ret) {
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi))
+				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -72,6 +72,8 @@ void cancel_dirty_page(struct page *page
 		struct address_space *mapping = page->mapping;
 		if (mapping && mapping_cap_account_dirty(mapping)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
+			dec_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			if (account_size)
 				task_io_account_cancelled_write(account_size);
 		}
Index: linux-2.6/lib/proportions.c
===================================================================
--- /dev/null
+++ linux-2.6/lib/proportions.c
@@ -0,0 +1,385 @@
+/*
+ * FLoating proportions
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Description:
+ *
+ * The floating proportion is a time derivative with an exponentially decaying
+ * history:
+ *
+ *   p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
+ *
+ * Where j is an element from {prop_local}, x_{j} is j's number of events,
+ * and i the time period over which the differential is taken. So d/dt_{-i} is
+ * the differential over the i-th last period.
+ *
+ * The decaying history gives smooth transitions. The time differential carries
+ * the notion of speed.
+ *
+ * The denominator is 2^(1+i) because we want the series to be normalised, ie.
+ *
+ *   \Sum_{i=0} 1/2^(1+i) = 1
+ *
+ * Further more, if we measure time (t) in the same events as x; so that:
+ *
+ *   t = \Sum_{j} x_{j}
+ *
+ * we get that:
+ *
+ *   \Sum_{j} p_{j} = 1
+ *
+ * Writing this in an iterative fashion we get (dropping the 'd's):
+ *
+ *   if (++x_{j}, ++t > period)
+ *     t /= 2;
+ *     for_each (j)
+ *       x_{j} /= 2;
+ *
+ * so that:
+ *
+ *   p_{j} = x_{j} / t;
+ *
+ * We optimize away the '/= 2' for the global time delta by noting that:
+ *
+ *   if (++t > period) t /= 2:
+ *
+ * Can be approximated by:
+ *
+ *   period/2 + (++t % period/2)
+ *
+ * [ Furthermore, when we choose period to be 2^n it can be written in terms of
+ *   binary operations and wraparound artefacts disappear. ]
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ *   c = t / (period/2)
+ *
+ * [ Its monotonic increasing property can be applied to mitigate the wrap-
+ *   around issue. ]
+ *
+ * This allows us to do away with the loop over all prop_locals on each period
+ * expiration. By remembering the period count under which it was last accessed
+ * as c_{j}, we can obtain the number of 'missed' cycles from:
+ *
+ *   c - c_{j}
+ *
+ * We can then lazily catch up to the global period count every time we are
+ * going to use x_{j}, by doing:
+ *
+ *   x_{j} /= 2^(c - c_{j}), c_{j} = c
+ */
+
+#include <linux/proportions.h>
+#include <linux/rcupdate.h>
+
+/*
+ * Limit the time part in order to ensure there are some bits left for the
+ * cycle counter.
+ */
+#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
+
+int prop_descriptor_init(struct prop_descriptor *pd, int shift)
+{
+	int err;
+
+	if (shift > PROP_MAX_SHIFT)
+		shift = PROP_MAX_SHIFT;
+
+	pd->index = 0;
+	pd->pg[0].shift = shift;
+	mutex_init(&pd->mutex);
+	err = percpu_counter_init_irq(&pd->pg[0].events, 0);
+	if (err)
+		goto out;
+
+	err = percpu_counter_init_irq(&pd->pg[1].events, 0);
+	if (err)
+		percpu_counter_destroy(&pd->pg[0].events);
+
+out:
+	return err;
+}
+
+/*
+ * We have two copies, and flip between them to make it seem like an atomic
+ * update. The update is not really atomic wrt the events counter, but
+ * it is internally consistent with the bit layout depending on shift.
+ *
+ * We copy the events count, move the bits around and flip the index.
+ */
+void prop_change_shift(struct prop_descriptor *pd, int shift)
+{
+	int index;
+	int offset;
+	u64 events;
+	unsigned long flags;
+
+	if (shift > PROP_MAX_SHIFT)
+		shift = PROP_MAX_SHIFT;
+
+	mutex_lock(&pd->mutex);
+
+	index = pd->index ^ 1;
+	offset = pd->pg[pd->index].shift - shift;
+	if (!offset)
+		goto out;
+
+	pd->pg[index].shift = shift;
+
+	local_irq_save(flags);
+	events = percpu_counter_sum(&pd->pg[pd->index].events);
+	if (offset < 0)
+		events <<= -offset;
+	else
+		events >>= offset;
+	percpu_counter_set(&pd->pg[index].events, events);
+
+	/*
+	 * ensure the new pg is fully written before the switch
+	 */
+	smp_wmb();
+	pd->index = index;
+	local_irq_restore(flags);
+
+	synchronize_rcu();
+
+out:
+	mutex_unlock(&pd->mutex);
+}
+
+/*
+ * wrap the access to the data in an rcu_read_lock() section;
+ * this is used to track the active references.
+ */
+static struct prop_global *prop_get_global(struct prop_descriptor *pd)
+{
+	int index;
+
+	rcu_read_lock();
+	index = pd->index;
+	/*
+	 * match the wmb from vcd_flip()
+	 */
+	smp_rmb();
+	return &pd->pg[index];
+}
+
+static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
+{
+	rcu_read_unlock();
+}
+
+static void
+prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
+{
+	int offset = *pl_shift - new_shift;
+
+	if (!offset)
+		return;
+
+	if (offset < 0)
+		*pl_period <<= -offset;
+	else
+		*pl_period >>= offset;
+
+	*pl_shift = new_shift;
+}
+
+/*
+ * PERCPU
+ */
+
+int prop_local_init_percpu(struct prop_local_percpu *pl)
+{
+	spin_lock_init(&pl->lock);
+	pl->shift = 0;
+	pl->period = 0;
+	return percpu_counter_init_irq(&pl->events, 0);
+}
+
+void prop_local_destroy_percpu(struct prop_local_percpu *pl)
+{
+	percpu_counter_destroy(&pl->events);
+}
+
+/*
+ * Catch up with missed period expirations.
+ *
+ *   until (c_{j} == c)
+ *     x_{j} -= x_{j}/2;
+ *     c_{j}++;
+ */
+static
+void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
+{
+	unsigned long period = 1UL << (pg->shift - 1);
+	unsigned long period_mask = ~(period - 1);
+	unsigned long global_period;
+	unsigned long flags;
+
+	global_period = percpu_counter_read(&pg->events);
+	global_period &= period_mask;
+
+	/*
+	 * Fast path - check if the local and global period count still match
+	 * outside of the lock.
+	 */
+	if (pl->period == global_period)
+		return;
+
+	spin_lock_irqsave(&pl->lock, flags);
+	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+	period = 1UL << (pg->shift - 1);
+	/*
+	 * For each missed period, we half the local counter.
+	 * basically:
+	 *   pl->events >> (global_period - pl->period);
+	 *
+	 * but since the distributed nature of percpu counters make division
+	 * rather hard, use a regular subtraction loop. This is safe, because
+	 * the events will only every be incremented, hence the subtraction
+	 * can never result in a negative number.
+	 */
+	while (pl->period != global_period) {
+		unsigned long val = percpu_counter_read(&pl->events);
+		unsigned long half = (val + 1) >> 1;
+
+		/*
+		 * Half of zero won't be much less, break out.
+		 * This limits the loop to shift iterations, even
+		 * if we missed a million.
+		 */
+		if (!val)
+			break;
+
+		percpu_counter_add(&pl->events, -half);
+		pl->period += period;
+	}
+	pl->period = global_period;
+	spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ *   ++x_{j}, ++t
+ */
+void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
+{
+	struct prop_global *pg = prop_get_global(pd);
+
+	prop_norm_percpu(pg, pl);
+	percpu_counter_add(&pl->events, 1);
+	percpu_counter_add(&pg->events, 1);
+	prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain an fraction of this proportion
+ *
+ *   p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_percpu(struct prop_descriptor *pd,
+		struct prop_local_percpu *pl,
+		long *numerator, long *denominator)
+{
+	struct prop_global *pg = prop_get_global(pd);
+	unsigned long period_2 = 1UL << (pg->shift - 1);
+	unsigned long counter_mask = period_2 - 1;
+	unsigned long global_count;
+
+	prop_norm_percpu(pg, pl);
+	*numerator = percpu_counter_read_positive(&pl->events);
+
+	global_count = percpu_counter_read(&pg->events);
+	*denominator = period_2 + (global_count & counter_mask);
+
+	prop_put_global(pd, pg);
+}
+
+/*
+ * SINGLE
+ */
+
+int prop_local_init_single(struct prop_local_single *pl)
+{
+	spin_lock_init(&pl->lock);
+	pl->shift = 0;
+	pl->period = 0;
+	pl->events = 0;
+	return 0;
+}
+
+void prop_local_destroy_single(struct prop_local_single *pl)
+{
+}
+
+/*
+ * Catch up with missed period expirations.
+ */
+static
+void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
+{
+	unsigned long period = 1UL << (pg->shift - 1);
+	unsigned long period_mask = ~(period - 1);
+	unsigned long global_period;
+	unsigned long flags;
+
+	global_period = percpu_counter_read(&pg->events);
+	global_period &= period_mask;
+
+	/*
+	 * Fast path - check if the local and global period count still match
+	 * outside of the lock.
+	 */
+	if (pl->period == global_period)
+		return;
+
+	spin_lock_irqsave(&pl->lock, flags);
+	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+	/*
+	 * For each missed period, we half the local counter.
+	 */
+	period = (global_period - pl->period) >> (pg->shift - 1);
+	if (likely(period < BITS_PER_LONG))
+		pl->events >>= period;
+	else
+		pl->events = 0;
+	pl->period = global_period;
+	spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ *   ++x_{j}, ++t
+ */
+void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
+{
+	struct prop_global *pg = prop_get_global(pd);
+
+	prop_norm_single(pg, pl);
+	pl->events++;
+	percpu_counter_add(&pg->events, 1);
+	prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain an fraction of this proportion
+ *
+ *   p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_single(struct prop_descriptor *pd,
+	       	struct prop_local_single *pl,
+		long *numerator, long *denominator)
+{
+	struct prop_global *pg = prop_get_global(pd);
+	unsigned long period_2 = 1UL << (pg->shift - 1);
+	unsigned long counter_mask = period_2 - 1;
+	unsigned long global_count;
+
+	prop_norm_single(pg, pl);
+	*numerator = pl->events;
+
+	global_count = percpu_counter_read(&pg->events);
+	*denominator = period_2 + (global_count & counter_mask);
+
+	prop_put_global(pd, pg);
+}
Index: linux-2.6/include/linux/proportions.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/proportions.h
@@ -0,0 +1,119 @@
+/*
+ * FLoating proportions
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_PROPORTIONS_H
+#define _LINUX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+struct prop_global {
+	/*
+	 * The period over which we differentiate
+	 *
+	 *   period = 2^shift
+	 */
+	int shift;
+	/*
+	 * The total event counter aka 'time'.
+	 *
+	 * Treated as an unsigned long; the lower 'shift - 1' bits are the
+	 * counter bits, the remaining upper bits the period counter.
+	 */
+	struct percpu_counter events;
+};
+
+/*
+ * global proportion descriptor
+ *
+ * this is needed to consitently flip prop_global structures.
+ */
+struct prop_descriptor {
+	int index;
+	struct prop_global pg[2];
+	struct mutex mutex;		/* serialize the prop_global switch */
+};
+
+int prop_descriptor_init(struct prop_descriptor *pd, int shift);
+void prop_change_shift(struct prop_descriptor *pd, int new_shift);
+
+/*
+ * ----- PERCPU ------
+ */
+
+struct prop_local_percpu {
+	/*
+	 * the local events counter
+	 */
+	struct percpu_counter events;
+
+	/*
+	 * snapshot of the last seen global state
+	 */
+	int shift;
+	unsigned long period;
+	spinlock_t lock;		/* protect the snapshot state */
+};
+
+int prop_local_init_percpu(struct prop_local_percpu *pl);
+void prop_local_destroy_percpu(struct prop_local_percpu *pl);
+void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl);
+void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl,
+		long *numerator, long *denominator);
+
+static inline
+void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__prop_inc_percpu(pd, pl);
+	local_irq_restore(flags);
+}
+
+/*
+ * ----- SINGLE ------
+ */
+
+struct prop_local_single {
+	/*
+	 * the local events counter
+	 */
+	unsigned long events;
+
+	/*
+	 * snapshot of the last seen global state
+	 * and a lock protecting this state
+	 */
+	int shift;
+	unsigned long period;
+	spinlock_t lock;		/* protect the snapshot state */
+};
+
+#define INIT_PROP_LOCAL_SINGLE(name)			\
+{	.lock = __SPIN_LOCK_UNLOCKED(name.lock),	\
+}
+
+int prop_local_init_single(struct prop_local_single *pl);
+void prop_local_destroy_single(struct prop_local_single *pl);
+void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl);
+void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl,
+		long *numerator, long *denominator);
+
+static inline
+void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__prop_inc_single(pd, pl);
+	local_irq_restore(flags);
+}
+
+#endif /* _LINUX_PROPORTIONS_H */
Index: linux-2.6/lib/Makefile
===================================================================
--- linux-2.6.orig/lib/Makefile
+++ linux-2.6/lib/Makefile
@@ -5,7 +5,8 @@
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o
+	 sha1.o irq_regs.o reciprocal_div.o \
+	 proportions.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -160,6 +160,10 @@ extern ctl_table inotify_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+extern int dirty_ratio_handler(ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 
 /* The default sysctl tables: */
 
@@ -675,7 +679,7 @@ static ctl_table vm_table[] = {
 		.data		= &vm_dirty_ratio,
 		.maxlen		= sizeof(vm_dirty_ratio),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &dirty_ratio_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -83,6 +83,7 @@ struct sched_param {
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
 #include <linux/task_io_accounting.h>
+#include <linux/proportions.h>
 
 #include <asm/processor.h>
 
@@ -1076,6 +1077,7 @@ struct task_struct {
 #ifdef CONFIG_FAULT_INJECTION
 	int make_it_fail;
 #endif
+	struct prop_local_single dirties;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -106,6 +106,7 @@ static struct kmem_cache *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
+	prop_local_destroy_single(&tsk->dirties);
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	free_task_struct(tsk);
@@ -162,6 +163,7 @@ static struct task_struct *dup_task_stru
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
+	int err;
 
 	prepare_to_copy(orig);
 
@@ -177,6 +179,14 @@ static struct task_struct *dup_task_stru
 
 	*tsk = *orig;
 	tsk->stack = ti;
+
+	err = prop_local_init_single(&tsk->dirties);
+	if (err) {
+		free_thread_info(ti);
+		free_task_struct(tsk);
+		return NULL;
+	}
+
 	setup_thread_stack(tsk, orig);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h
+++ linux-2.6/include/linux/init_task.h
@@ -167,6 +167,7 @@ extern struct group_info init_groups;
 		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
 		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
 	},								\
+	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 }

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

next prev parent reply	other threads:[~2007-08-23 17:42 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-08-16 12:49 [PATCH 00/23] per device dirty throttling -v9 Martin Knoblauch
2007-08-16 12:55 ` Peter Zijlstra
2007-08-16 13:21   ` Martin Knoblauch
2007-08-23 15:59   ` Martin Knoblauch
2007-08-23 17:41     ` Peter Zijlstra [this message]
2007-08-24 10:47       ` Martin Knoblauch
2007-09-03 15:20       ` RFC: [PATCH] Small patch on top of " Martin Knoblauch
  -- strict thread matches above, loose matches on Subject: below --
2007-08-16  7:45 [PATCH 00/23] " Peter Zijlstra
2007-08-16  7:45 ` Peter Zijlstra
2007-08-16 21:29 ` Christoph Lameter
2007-08-16 21:29   ` Christoph Lameter
2007-08-17  7:19   ` Peter Zijlstra
2007-08-17 20:37     ` Christoph Lameter
2007-08-17 20:37       ` Christoph Lameter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1187890919.6114.411.camel@twins \
    --to=a.p.zijlstra@chello.nl \
    --cc=linux-kernel@vger.kernel.org \
    --cc=spamtrap@knobisoft.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.