From: Andrew Morton <akpm@zip.com.au>
To: Linus Torvalds <torvalds@transmeta.com>
Cc: lkml <linux-kernel@vger.kernel.org>
Subject: [patch 8/15] pdflush exclusion infrastructure
Date: Sun, 19 May 2002 12:41:13 -0700
Message-ID: <3CE7FFD9.AAFE2D47@zip.com.au>
Collision avoidance for pdflush threads.
Turns the request_queue-based `unsigned long ra_pages' into a structure
which contains ra_pages as well as a longword.
That longword is used to record the fact that a pdflush thread is
currently writing something back against this request_queue.
Avoids the situation where several pdflush threads are sleeping on the
same request_queue.
This patch provides only the infrastructure for the pdflush exclusion;
that infrastructure is put to use in pdflush-single.patch.
=====================================
--- 2.5.16/drivers/block/blkpg.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/drivers/block/blkpg.c Sun May 19 11:49:48 2002
@@ -35,6 +35,7 @@
#include <linux/blkpg.h>
#include <linux/genhd.h>
#include <linux/module.h> /* for EXPORT_SYMBOL */
+#include <linux/backing-dev.h>
#include <asm/uaccess.h>
@@ -219,7 +220,7 @@ int blk_ioctl(struct block_device *bdev,
unsigned short usval;
kdev_t dev = to_kdev_t(bdev->bd_dev);
int holder;
- unsigned long *ra_pages;
+ struct backing_dev_info *bdi;
intval = block_ioctl(bdev, cmd, arg);
if (intval != -ENOTTY)
@@ -241,20 +242,20 @@ int blk_ioctl(struct block_device *bdev,
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
- ra_pages = blk_get_ra_pages(bdev);
- if (ra_pages == NULL)
+ bdi = blk_get_backing_dev_info(bdev);
+ if (bdi == NULL)
return -ENOTTY;
- *ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
+ bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
return 0;
case BLKRAGET:
case BLKFRAGET:
if (!arg)
return -EINVAL;
- ra_pages = blk_get_ra_pages(bdev);
- if (ra_pages == NULL)
+ bdi = blk_get_backing_dev_info(bdev);
+ if (bdi == NULL)
return -ENOTTY;
- return put_user((*ra_pages * PAGE_CACHE_SIZE) / 512,
+ return put_user((bdi->ra_pages * PAGE_CACHE_SIZE) / 512,
(long *)arg);
case BLKSECTGET:
--- 2.5.16/drivers/block/ll_rw_blk.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/drivers/block/ll_rw_blk.c Sun May 19 11:49:48 2002
@@ -27,6 +27,7 @@
#include <linux/completion.h>
#include <linux/compiler.h>
#include <scsi/scsi.h>
+#include <linux/backing-dev.h>
#include <asm/system.h>
#include <asm/io.h>
@@ -100,21 +101,21 @@ inline request_queue_t *blk_get_queue(kd
}
/**
- * blk_get_ra_pages - get the address of a queue's readahead tunable
+ * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
* @dev: device
*
* Locates the passed device's request queue and returns the address of its
- * readahead setting.
+ * backing_dev_info
*
* Will return NULL if the request queue cannot be located.
*/
-unsigned long *blk_get_ra_pages(struct block_device *bdev)
+struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
- unsigned long *ret = NULL;
+ struct backing_dev_info *ret = NULL;
request_queue_t *q = blk_get_queue(to_kdev_t(bdev->bd_dev));
if (q)
- ret = &q->ra_pages;
+ ret = &q->backing_dev_info;
return ret;
}
@@ -153,7 +154,8 @@ void blk_queue_make_request(request_queu
q->max_phys_segments = MAX_PHYS_SEGMENTS;
q->max_hw_segments = MAX_HW_SEGMENTS;
q->make_request_fn = mfn;
- q->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ q->backing_dev_info.state = 0;
blk_queue_max_sectors(q, MAX_SECTORS);
blk_queue_hardsect_size(q, 512);
--- 2.5.16/fs/block_dev.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/fs/block_dev.c Sun May 19 11:49:48 2002
@@ -331,7 +331,7 @@ struct block_device *bdget(dev_t dev)
inode->i_bdev = new_bdev;
inode->i_data.a_ops = &def_blk_aops;
inode->i_data.gfp_mask = GFP_USER;
- inode->i_data.ra_pages = &default_ra_pages;
+ inode->i_data.backing_dev_info = &default_backing_dev_info;
spin_lock(&bdev_lock);
bdev = bdfind(dev, head);
if (!bdev) {
@@ -594,11 +594,12 @@ static int do_open(struct block_device *
}
}
}
- if (bdev->bd_inode->i_data.ra_pages == &default_ra_pages) {
- unsigned long *ra_pages = blk_get_ra_pages(bdev);
- if (ra_pages == NULL)
- ra_pages = &default_ra_pages;
- inode->i_data.ra_pages = ra_pages;
+ if (bdev->bd_inode->i_data.backing_dev_info ==
+ &default_backing_dev_info) {
+ struct backing_dev_info *bdi = blk_get_backing_dev_info(bdev);
+ if (bdi == NULL)
+ bdi = &default_backing_dev_info;
+ inode->i_data.backing_dev_info = bdi;
}
if (bdev->bd_op->open) {
ret = bdev->bd_op->open(inode, file);
@@ -624,7 +625,7 @@ static int do_open(struct block_device *
out2:
if (!bdev->bd_openers) {
bdev->bd_op = NULL;
- bdev->bd_inode->i_data.ra_pages = &default_ra_pages;
+ bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
if (bdev != bdev->bd_contains) {
blkdev_put(bdev->bd_contains, BDEV_RAW);
bdev->bd_contains = NULL;
@@ -698,7 +699,7 @@ int blkdev_put(struct block_device *bdev
__MOD_DEC_USE_COUNT(bdev->bd_op->owner);
if (!bdev->bd_openers) {
bdev->bd_op = NULL;
- bdev->bd_inode->i_data.ra_pages = &default_ra_pages;
+ bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
if (bdev != bdev->bd_contains) {
blkdev_put(bdev->bd_contains, BDEV_RAW);
bdev->bd_contains = NULL;
--- 2.5.16/fs/inode.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/fs/inode.c Sun May 19 12:02:58 2002
@@ -12,6 +12,7 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* New inode.c implementation.
@@ -83,6 +84,8 @@ static struct inode *alloc_inode(struct
inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
inode->i_sb = sb;
inode->i_dev = sb->s_dev;
inode->i_blkbits = sb->s_blocksize_bits;
@@ -100,16 +103,17 @@ static struct inode *alloc_inode(struct
inode->i_pipe = NULL;
inode->i_bdev = NULL;
inode->i_cdev = NULL;
- inode->i_data.a_ops = &empty_aops;
- inode->i_data.host = inode;
- inode->i_data.gfp_mask = GFP_HIGHUSER;
- inode->i_data.dirtied_when = 0;
- inode->i_mapping = &inode->i_data;
- inode->i_data.ra_pages = &default_ra_pages;
- inode->i_data.assoc_mapping = NULL;
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
if (sb->s_bdev)
- inode->i_data.ra_pages = sb->s_bdev->bd_inode->i_mapping->ra_pages;
+ inode->i_data.backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
}
return inode;
}
--- 2.5.16/fs/open.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/fs/open.c Sun May 19 11:49:48 2002
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/tty.h>
#include <linux/iobuf.h>
+#include <linux/backing-dev.h>
#include <asm/uaccess.h>
@@ -632,7 +633,7 @@ struct file *dentry_open(struct dentry *
goto cleanup_file;
}
- f->f_ra.ra_pages = *inode->i_mapping->ra_pages;
+ f->f_ra.ra_pages = inode->i_mapping->backing_dev_info->ra_pages;
f->f_dentry = dentry;
f->f_vfsmnt = mnt;
f->f_pos = 0;
--- 2.5.16/fs/ntfs/super.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/fs/ntfs/super.c Sun May 19 11:49:48 2002
@@ -26,6 +26,7 @@
#include <linux/locks.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h> /* For bdev_hardsect_size(). */
+#include <linux/backing-dev.h>
#include "ntfs.h"
#include "sysctl.h"
@@ -1519,8 +1520,8 @@ static int ntfs_fill_super(struct super_
vol->mftbmp_mapping.assoc_mapping = NULL;
vol->mftbmp_mapping.dirtied_when = 0;
vol->mftbmp_mapping.gfp_mask = GFP_HIGHUSER;
- vol->mftbmp_mapping.ra_pages =
- sb->s_bdev->bd_inode->i_mapping->ra_pages;
+ vol->mftbmp_mapping.backing_dev_info =
+ sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
/*
* Default is group and other don't have any access to files or
--- 2.5.16/include/linux/blkdev.h~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/include/linux/blkdev.h Sun May 19 11:52:27 2002
@@ -7,6 +7,7 @@
#include <linux/tqueue.h>
#include <linux/list.h>
#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
#include <asm/scatterlist.h>
@@ -162,11 +163,7 @@ struct request_queue
make_request_fn *make_request_fn;
prep_rq_fn *prep_rq_fn;
- /*
- * The VM-level readahead tunable for this device. In
- * units of PAGE_CACHE_SIZE pages.
- */
- unsigned long ra_pages;
+ struct backing_dev_info backing_dev_info;
/*
* The queue owner gets to use this for whatever they like.
@@ -328,7 +325,7 @@ extern void blk_queue_hardsect_size(requ
extern void blk_queue_segment_boundary(request_queue_t *q, unsigned long);
extern void blk_queue_assign_lock(request_queue_t *q, spinlock_t *);
extern void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn);
-extern unsigned long *blk_get_ra_pages(struct block_device *bdev);
+extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
--- 2.5.16/include/linux/fs.h~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/include/linux/fs.h Sun May 19 12:02:58 2002
@@ -305,6 +305,7 @@ struct address_space_operations {
int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
};
+struct backing_dev_info;
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
@@ -320,7 +321,7 @@ struct address_space {
spinlock_t i_shared_lock; /* and spinlock protecting it */
unsigned long dirtied_when; /* jiffies of first page dirtying */
int gfp_mask; /* how to allocate the pages */
- unsigned long *ra_pages; /* device readahead */
+ struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
--- 2.5.16/include/linux/mm.h~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/include/linux/mm.h Sun May 19 11:52:27 2002
@@ -454,7 +454,6 @@ void do_page_cache_readahead(struct file
void page_cache_readahead(struct file *file, unsigned long offset);
void page_cache_readaround(struct file *file, unsigned long offset);
void handle_ra_thrashing(struct file *file);
-extern unsigned long default_ra_pages;
/* vma is the first one with address < vma->vm_end,
* and even address < vma->vm_start. Have to extend vma. */
--- 2.5.16/mm/readahead.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/mm/readahead.c Sun May 19 11:49:48 2002
@@ -11,8 +11,12 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
-unsigned long default_ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+struct backing_dev_info default_backing_dev_info = {
+ ra_pages: (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
+ state: 0,
+};
/*
* Return max readahead size for this inode in number-of-pages.
--- /dev/null Thu Aug 30 13:30:55 2001
+++ 2.5.16-akpm/include/linux/backing-dev.h Sun May 19 11:49:48 2002
@@ -0,0 +1,30 @@
+/*
+ * include/linux/backing-dev.h
+ *
+ * low-level device information and state which is propagated up through
+ * to high-level code.
+ */
+
+#ifndef _LINUX_BACKING_DEV_H
+#define _LINUX_BACKING_DEV_H
+
+/*
+ * Bits in backing_dev_info.state
+ */
+enum bdi_state {
+ BDI_pdflush, /* A pdflush thread is working this device */
+ BDI_unused, /* Available bits start here */
+};
+
+struct backing_dev_info {
+ unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
+ unsigned long state; /* Always use atomic bitops on this */
+};
+
+extern struct backing_dev_info default_backing_dev_info;
+
+int writeback_acquire(struct backing_dev_info *bdi);
+int writeback_in_progress(struct backing_dev_info *bdi);
+void writeback_release(struct backing_dev_info *bdi);
+
+#endif /* _LINUX_BACKING_DEV_H */
--- 2.5.16/fs/fs-writeback.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/fs/fs-writeback.c Sun May 19 12:02:58 2002
@@ -19,6 +19,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/**
* __mark_inode_dirty - internal function
@@ -508,3 +509,40 @@ int generic_osync_inode(struct inode *in
return err;
}
+
+/**
+ * writeback_acquire: attempt to get exclusive writeback access to a device
+ * @bdi: the device's backing_dev_info structure
+ *
+ * It is a waste of resources to have more than one pdflush thread blocked on
+ * a single request queue. Exclusion at the request_queue level is obtained
+ * via a flag in the request_queue's backing_dev_info.state.
+ *
+ * Non-request_queue-backed address_spaces will share default_backing_dev_info,
+ * unless they implement their own. That sharing is somewhat inefficient, as
+ * it may prevent concurrent writeback against multiple devices.
+ */
+int writeback_acquire(struct backing_dev_info *bdi)
+{
+ return !test_and_set_bit(BDI_pdflush, &bdi->state);
+}
+
+/**
+ * writeback_in_progress: determine whether there is writeback in progress
+ * against a backing device.
+ * @bdi: the device's backing_dev_info structure.
+ */
+int writeback_in_progress(struct backing_dev_info *bdi)
+{
+ return test_bit(BDI_pdflush, &bdi->state);
+}
+
+/**
+ * writeback_release: relinquish exclusive writeback access against a device.
+ * @bdi: the device's backing_dev_info structure
+ */
+void writeback_release(struct backing_dev_info *bdi)
+{
+ BUG_ON(!writeback_in_progress(bdi));
+ clear_bit(BDI_pdflush, &bdi->state);
+}
--- 2.5.16/mm/page-writeback.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/mm/page-writeback.c Sun May 19 12:02:58 2002
@@ -166,6 +166,7 @@ int pdflush_flush(unsigned long nr_pages
* to perform their I/O against a large file.
*/
static int wb_writeback_jifs = 5 * HZ;
+static struct timer_list wb_timer;
/*
* Periodic writeback of "old" data.
@@ -206,16 +207,11 @@ static void wb_kupdate(unsigned long arg
yield();
}
run_task_queue(&tq_disk);
+ mod_timer(&wb_timer, jiffies + wb_writeback_jifs);
}
-/*
- * The writeback timer, for kupdate-style functionality
- */
-static struct timer_list wb_timer;
-
static void wb_timer_fn(unsigned long unused)
{
- mod_timer(&wb_timer, jiffies + wb_writeback_jifs);
pdflush_operation(wb_kupdate, 0);
}
--- 2.5.16/mm/pdflush.c~pdflush-exclusion Sun May 19 11:49:48 2002
+++ 2.5.16-akpm/mm/pdflush.c Sun May 19 11:49:48 2002
@@ -103,6 +103,7 @@ static int __pdflush(struct pdflush_work
preempt_disable();
spin_lock_irq(&pdflush_lock);
nr_pdflush_threads++;
+// printk("pdflush %d [%d] starts\n", nr_pdflush_threads, current->pid);
for ( ; ; ) {
struct pdflush_work *pdf;
@@ -124,7 +125,7 @@ static int __pdflush(struct pdflush_work
if (jiffies - last_empty_jifs > 1 * HZ) {
/* unlocked list_empty() test is OK here */
if (list_empty(&pdflush_list)) {
- /* unlocked nr_pdflush_threads test is OK here */
+ /* unlocked test is OK here */
if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
start_one_pdflush_thread();
}
@@ -147,6 +148,7 @@ static int __pdflush(struct pdflush_work
}
}
nr_pdflush_threads--;
+// printk("pdflush %d [%d] ends\n", nr_pdflush_threads, current->pid);
spin_unlock_irq(&pdflush_lock);
preempt_enable();
return 0;
-
reply other threads:[~2002-05-19 19:40 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=3CE7FFD9.AAFE2D47@zip.com.au \
--to=akpm@zip.com.au \
--cc=linux-kernel@vger.kernel.org \
--cc=torvalds@transmeta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.