* direct-to-BIO for O_DIRECT
@ 2002-07-08 3:19 Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
` (2 more replies)
0 siblings, 3 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-08 3:19 UTC (permalink / raw)
To: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
the kiovec layer. It's followed by a patch which converts the raw
driver to use the O_DIRECT engine.
CPU utilisation is about the same as the kiovec-based implementation.
Read and write bandwidth are the same too, for 128k chunks. But with
one megabyte chunks, this implementation is 20% faster at writing.
I assume this is because the kiobuf-based implementation has to stop
and wait for each 128k chunk, whereas this code streams the entire
request, regardless of its size.
This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
margin to widen on higher-end hardware which likes to have more
requests in flight.
Question is: what do we want to do with this sucker? These are the
remaining users of kiovecs:
drivers/md/lvm-snap.c
drivers/media/video/video-buf.c
drivers/mtd/devices/blkmtd.c
drivers/scsi/sg.c
the video and mtd drivers seem to be fairly easy to de-kiobufize.
I'm aware of one proprietary driver which uses kiobufs. XFS uses
kiobufs a little bit - just to map the pages.
So with a bit of effort and maintainer-irritation, we can extract
the kiobuf layer from the kernel.
Do we want to do that?
fs/Makefile | 2
fs/block_dev.c | 7
fs/buffer.c | 2
fs/direct-io.c | 491 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext2/inode.c | 7
include/linux/buffer_head.h | 2
include/linux/fs.h | 11
mm/filemap.c | 64 ++---
8 files changed, 543 insertions(+), 43 deletions(-)
--- /dev/null Thu Aug 30 13:30:55 2001
+++ 2.5.25-akpm/fs/direct-io.c Sun Jul 7 19:40:20 2002
@@ -0,0 +1,491 @@
+/*
+ * mm/direct-io.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * O_DIRECT
+ *
+ * 04Jul2002 akpm@zip.com.au
+ * Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/buffer_head.h>
+#include <linux/rwsem.h>
+#include <asm/atomic.h>
+
+/*
+ * The largest-sized BIO which this code will assemble, in bytes. Set this
+ * to PAGE_SIZE if your drivers are broken.
+ */
+#define DIO_BIO_MAX_SIZE BIO_MAX_SIZE
+
+/*
+ * How many user pages to map in one call to get_user_pages(). This determines
+ * the size of a structure on the stack.
+ */
+#define DIO_PAGES 64
+
+struct dio {
+ /* BIO submission state */
+ struct bio *bio; /* bio under assembly */
+ struct bio_vec *bvec; /* current bvec in that bio */
+ struct inode *inode;
+ int rw;
+ sector_t block_in_file; /* changes */
+ sector_t final_block_in_request;/* doesn't change */
+ unsigned first_block_in_page; /* doesn't change */
+ int boundary; /* prev block is at a boundary */
+ int reap_counter; /* rate limit reaping */
+ get_block_t *get_block;
+ sector_t last_block_in_bio;
+
+ /* Page fetching state */
+ int curr_page; /* changes */
+ int total_pages; /* doesn't change */
+ unsigned long curr_user_address;/* changes */
+
+ /* Page queue */
+ struct page *pages[DIO_PAGES];
+ unsigned head;
+ unsigned tail;
+
+ /* BIO completion state */
+ atomic_t bio_count;
+ spinlock_t bio_list_lock;
+ struct bio *bio_list; /* singly linked via bi_private */
+ wait_queue_head_t wait_q;
+};
+
+/*
+ * How many pages are in the queue?
+ */
+static inline unsigned dio_pages_present(struct dio *dio)
+{
+ return dio->head - dio->tail;
+}
+
+/*
+ * Go grab and pin some userspace pages. Typically we'll get 64 at a time.
+ */
+static int dio_refill_pages(struct dio *dio)
+{
+ int ret;
+ int nr_pages;
+
+ nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+ ret = get_user_pages(
+ current, /* Task for fault accounting */
+ current->mm, /* whose pages? */
+ dio->curr_user_address, /* Where from? */
+ nr_pages, /* How many pages? */
+ dio->rw == READ, /* Write to memory? */
+ 0, /* force (?) */
+ &dio->pages[0],
+ NULL); /* vmas */
+
+ if (ret >= 0) {
+ dio->curr_user_address += ret * PAGE_SIZE;
+ dio->curr_page += ret;
+ dio->head = 0;
+ dio->tail = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * Get another userspace page. Returns an ERR_PTR on error. Pages are
+ * buffered inside the dio so that we can call get_user_pages() against a
+ * decent number of pages, less frequently. To provide nicer use of the
+ * L1 cache.
+ */
+static struct page *dio_get_page(struct dio *dio)
+{
+ if (dio_pages_present(dio) == 0) {
+ int ret;
+
+ ret = dio_refill_pages(dio);
+ if (ret) {
+ printk("%s: dio_refill_pages returns %d\n",
+ __FUNCTION__, ret);
+ return ERR_PTR(ret);
+ }
+ BUG_ON(dio_pages_present(dio) == 0);
+ }
+ return dio->pages[dio->head++];
+}
+
+/*
+ * The BIO completion handler simply queues the BIO up for the process-context
+ * handler.
+ *
+ * During I/O bi_private points at the dio. After I/O, bi_private is used to
+ * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ */
+static void dio_bio_end_io(struct bio *bio)
+{
+ struct dio *dio = bio->bi_private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ bio->bi_private = dio->bio_list;
+ dio->bio_list = bio;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ wake_up(&dio->wait_q);
+}
+
+static int
+dio_bio_alloc(struct dio *dio, struct block_device *bdev,
+ sector_t first_sector, int nr_vecs)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, nr_vecs);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ bio->bi_bdev = bdev;
+ bio->bi_vcnt = nr_vecs;
+ bio->bi_idx = 0;
+ bio->bi_size = 0;
+ bio->bi_sector = first_sector;
+ bio->bi_io_vec[0].bv_page = NULL;
+ bio->bi_end_io = dio_bio_end_io;
+
+ dio->bio = bio;
+ dio->bvec = NULL; /* debug */
+ return 0;
+}
+
+static void dio_bio_submit(struct dio *dio)
+{
+ struct bio *bio = dio->bio;
+
+ bio->bi_vcnt = bio->bi_idx;
+ bio->bi_idx = 0;
+ bio->bi_private = dio;
+ atomic_inc(&dio->bio_count);
+ submit_bio(dio->rw, bio);
+
+ dio->bio = NULL;
+ dio->bvec = NULL;
+}
+
+/*
+ * Release any resources in case of a failure
+ */
+static void dio_cleanup(struct dio *dio)
+{
+ while (dio_pages_present(dio))
+ page_cache_release(dio_get_page(dio));
+}
+
+/*
+ * Wait for the next BIO to complete. Remove it and return it.
+ */
+static struct bio *dio_await_one(struct dio *dio)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ unsigned long flags;
+ struct bio *bio;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ while (dio->bio_list == NULL) {
+ add_wait_queue(&dio->wait_q, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (dio->bio_list == NULL) {
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ blk_run_queues();
+ schedule();
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&dio->wait_q, &wait);
+ }
+ bio = dio->bio_list;
+ dio->bio_list = bio->bi_private;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ return bio;
+}
+
+/*
+ * Process one completed BIO. No locks are held.
+ */
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int page_no;
+ int ret = 0;
+
+ for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+ struct page *page = bvec[page_no].bv_page;
+
+ if (!uptodate) {
+ if (ret == 0)
+ ret = -EIO;
+ }
+
+ if (dio->rw == READ)
+ set_page_dirty(page);
+ page_cache_release(page);
+ }
+ atomic_dec(&dio->bio_count);
+ bio_put(bio);
+ return ret;
+}
+
+/*
+ * Wait on and process all in-flight BIOs.
+ */
+static int dio_await_completion(struct dio *dio)
+{
+ int ret = 0;
+ while (atomic_read(&dio->bio_count)) {
+ struct bio *bio = dio_await_one(dio);
+ int ret2;
+
+ ret2 = dio_bio_complete(dio, bio);
+ if (ret == 0)
+ ret = ret2;
+ }
+ return ret;
+}
+
+/*
+ * A really large O_DIRECT read or write can generate a lot of BIOs. So
+ * to keep the memory consumption sane we periodically reap any completed BIOs
+ * during the BIO generation phase.
+ *
+ * This also helps to limit the peak amount of pinned userspace memory.
+ */
+static int dio_bio_reap(struct dio *dio)
+{
+ int ret = 0;
+
+ if (dio->reap_counter++ >= 64) {
+ while (dio->bio_list) {
+ unsigned long flags;
+ struct bio *bio;
+ int ret2;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ bio = dio->bio_list;
+ dio->bio_list = bio->bi_private;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ ret2 = dio_bio_complete(dio, bio);
+ if (ret == 0)
+ ret = ret2;
+ }
+ dio->reap_counter = 0;
+ }
+ return ret;
+}
+
+/*
+ * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
+ */
+int do_direct_IO(struct dio *dio)
+{
+ struct inode * const inode = dio->inode;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ struct page *page;
+ unsigned block_in_page;
+ int ret;
+
+ /* The I/O can start at any block offset within the first page */
+ block_in_page = dio->first_block_in_page;
+
+ while (dio->block_in_file < dio->final_block_in_request) {
+ int new_page; /* Need to insert this page into the BIO? */
+
+ page = dio_get_page(dio);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
+ new_page = 1;
+ for ( ; block_in_page < blocks_per_page; block_in_page++) {
+ struct buffer_head map_bh;
+ struct bio *bio;
+
+ map_bh.b_state = 0;
+ ret = (*dio->get_block)(inode, dio->block_in_file,
+ &map_bh, dio->rw == WRITE);
+ if (ret) {
+ printk("%s: get_block returns %d\n",
+ __FUNCTION__, ret);
+ goto fail_release;
+ }
+ /* blockdevs do not set buffer_new */
+ if (buffer_new(&map_bh))
+ unmap_underlying_metadata(map_bh.b_bdev,
+ map_bh.b_blocknr);
+ if (!buffer_mapped(&map_bh)) {
+ ret = -EINVAL; /* A hole */
+ goto fail_release;
+ }
+ if (dio->bio) {
+ if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
+ dio->boundary ||
+ dio->last_block_in_bio !=
+ map_bh.b_blocknr - 1) {
+ dio_bio_submit(dio);
+ dio->boundary = 0;
+ }
+ }
+ if (dio->bio == NULL) {
+ ret = dio_bio_reap(dio);
+ if (ret)
+ goto fail_release;
+ ret = dio_bio_alloc(dio, map_bh.b_bdev,
+ map_bh.b_blocknr << (blkbits - 9),
+ DIO_BIO_MAX_SIZE / PAGE_SIZE);
+ if (ret)
+ goto fail_release;
+ new_page = 1;
+ dio->boundary = 0;
+ }
+
+ bio = dio->bio;
+ if (new_page) {
+ dio->bvec = &bio->bi_io_vec[bio->bi_idx];
+ page_cache_get(page);
+ dio->bvec->bv_page = page;
+ dio->bvec->bv_len = 0;
+ dio->bvec->bv_offset = block_in_page*blocksize;
+ bio->bi_idx++;
+ }
+ new_page = 0;
+ dio->bvec->bv_len += blocksize;
+ bio->bi_size += blocksize;
+ dio->last_block_in_bio = map_bh.b_blocknr;
+ dio->boundary = buffer_boundary(&map_bh);
+
+ dio->block_in_file++;
+ if (dio->block_in_file >= dio->final_block_in_request)
+ break;
+ }
+ block_in_page = 0;
+ page_cache_release(page);
+ }
+ ret = 0;
+ goto out;
+fail_release:
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+struct dio *g_dio;
+
+int
+generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
+ size_t count, get_block_t get_block)
+{
+ const unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
+ const unsigned long user_addr = (unsigned long)buf;
+ int ret = 0;
+ int ret2;
+ struct dio dio;
+ size_t bytes;
+
+ /* Check the memory alignment. Blocks cannot straddle pages */
+ if ((user_addr & blocksize_mask) || (count & blocksize_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ g_dio = &dio;
+
+ /* BIO submission state */
+ dio.bio = NULL;
+ dio.bvec = NULL;
+ dio.inode = inode;
+ dio.rw = rw;
+ dio.block_in_file = offset >> inode->i_blkbits;
+ dio.final_block_in_request = (offset + count) >> inode->i_blkbits;
+
+ /* Index into the first page of the first block */
+ dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1))
+ >> inode->i_blkbits;
+ dio.boundary = 0;
+ dio.reap_counter = 0;
+ dio.get_block = get_block;
+ dio.last_block_in_bio = -1;
+
+ /* Page fetching state */
+ dio.curr_page = 0;
+ bytes = count;
+ dio.total_pages = 0;
+ if (offset & PAGE_SIZE) {
+ dio.total_pages++;
+ bytes -= PAGE_SIZE - (offset & ~(PAGE_SIZE - 1));
+ }
+ dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ dio.curr_user_address = user_addr;
+
+ /* Page queue */
+ dio.head = 0;
+ dio.tail = 0;
+
+ /* BIO completion state */
+ atomic_set(&dio.bio_count, 0);
+ spin_lock_init(&dio.bio_list_lock);
+ dio.bio_list = NULL;
+ init_waitqueue_head(&dio.wait_q);
+
+ down_read(&current->mm->mmap_sem);
+ ret = do_direct_IO(&dio);
+ up_read(&current->mm->mmap_sem);
+
+ if (dio.bio)
+ dio_bio_submit(&dio);
+ if (ret)
+ dio_cleanup(&dio);
+ ret2 = dio_await_completion(&dio);
+ if (ret == 0)
+ ret = ret2;
+ if (ret == 0)
+ ret = count - ((dio.final_block_in_request -
+ dio.block_in_file) << inode->i_blkbits);
+out:
+ return ret;
+}
+
+ssize_t
+generic_file_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
+{
+ struct address_space *mapping = inode->i_mapping;
+ unsigned blocksize_mask;
+ ssize_t retval;
+
+ blocksize_mask = (1 << inode->i_blkbits) - 1;
+ if ((offset & blocksize_mask) || (count & blocksize_mask)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (mapping->nrpages) {
+ retval = filemap_fdatawrite(mapping);
+ if (retval == 0)
+ retval = filemap_fdatawait(mapping);
+ if (retval)
+ goto out;
+ }
+ retval = mapping->a_ops->direct_IO(rw, inode, buf, offset, count);
+out:
+ return retval;
+}
--- 2.5.25/include/linux/fs.h~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/include/linux/fs.h Sun Jul 7 19:35:39 2002
@@ -303,8 +303,8 @@ struct address_space_operations {
int (*bmap)(struct address_space *, long);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
-#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
- int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+ int (*direct_IO)(int, struct inode *, char *buf,
+ loff_t offset, size_t count);
};
struct backing_dev_info;
@@ -1128,7 +1128,7 @@ extern int check_disk_change(kdev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_inode_pages2(struct address_space *);
+extern void invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
@@ -1233,6 +1233,11 @@ extern int file_read_actor(read_descript
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+ssize_t generic_file_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count);
+int generic_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count, get_block_t *get_block);
+
extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
--- 2.5.25/include/linux/buffer_head.h~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/include/linux/buffer_head.h Sun Jul 7 19:35:39 2002
@@ -182,8 +182,6 @@ int block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
-int generic_direct_IO(int, struct inode *, struct kiobuf *,
- unsigned long, int, get_block_t *);
int file_fsync(struct file *, struct dentry *, int);
#define OSYNC_METADATA (1<<0)
--- 2.5.25/fs/buffer.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/buffer.c Sun Jul 7 19:35:39 2002
@@ -2298,6 +2298,7 @@ sector_t generic_block_bmap(struct addre
return tmp.b_blocknr;
}
+#if 0
int generic_direct_IO(int rw, struct inode *inode,
struct kiobuf *iobuf, unsigned long blocknr,
int blocksize, get_block_t *get_block)
@@ -2344,6 +2345,7 @@ int generic_direct_IO(int rw, struct ino
out:
return retval;
}
+#endif
/*
* Start I/O on a physical range of kernel memory, defined by a vector
--- 2.5.25/mm/filemap.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/mm/filemap.c Sun Jul 7 19:35:39 2002
@@ -413,7 +413,7 @@ static int invalidate_list_pages2(struct
* free the pages because they're mapped.
* @mapping: the address_space which pages we want to invalidate
*/
-void invalidate_inode_pages2(struct address_space * mapping)
+void invalidate_inode_pages2(struct address_space *mapping)
{
int unlocked;
@@ -1101,6 +1101,7 @@ no_cached_page:
UPDATE_ATIME(inode);
}
+#if 0
static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
{
ssize_t retval;
@@ -1181,6 +1182,7 @@ static ssize_t generic_file_direct_IO(in
out:
return retval;
}
+#endif
int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
@@ -1208,15 +1210,36 @@ int file_read_actor(read_descriptor_t *
* This is the "read()" routine for all filesystems
* that can use the page cache directly.
*/
-ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+ssize_t
+generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
ssize_t retval;
if ((ssize_t) count < 0)
return -EINVAL;
- if (filp->f_flags & O_DIRECT)
- goto o_direct;
+ if (filp->f_flags & O_DIRECT) {
+ loff_t pos = *ppos, size;
+ struct address_space *mapping;
+ struct inode *inode;
+
+ mapping = filp->f_dentry->d_inode->i_mapping;
+ inode = mapping->host;
+ retval = 0;
+ if (!count)
+ goto out; /* skip atime */
+ size = inode->i_size;
+ if (pos < size) {
+ if (pos + count > size)
+ count = size - pos;
+ retval = generic_file_direct_IO(READ, inode,
+ buf, pos, count);
+ if (retval > 0)
+ *ppos = pos + retval;
+ }
+ UPDATE_ATIME(filp->f_dentry->d_inode);
+ goto out;
+ }
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
@@ -1229,36 +1252,14 @@ ssize_t generic_file_read(struct file *
desc.count = count;
desc.buf = buf;
desc.error = 0;
- do_generic_file_read(filp, ppos, &desc, file_read_actor);
-
+ do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval = desc.written;
if (!retval)
retval = desc.error;
}
}
- out:
+out:
return retval;
-
- o_direct:
- {
- loff_t pos = *ppos, size;
- struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
- struct inode *inode = mapping->host;
-
- retval = 0;
- if (!count)
- goto out; /* skip atime */
- size = inode->i_size;
- if (pos < size) {
- if (pos + count > size)
- count = size - pos;
- retval = generic_file_direct_IO(READ, filp, buf, count, pos);
- if (retval > 0)
- *ppos = pos + retval;
- }
- UPDATE_ATIME(filp->f_dentry->d_inode);
- goto out;
- }
}
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
@@ -2199,8 +2200,8 @@ generic_file_write(struct file *file, co
}
if (unlikely(file->f_flags & O_DIRECT)) {
- written = generic_file_direct_IO(WRITE, file,
- (char *) buf, count, pos);
+ written = generic_file_direct_IO(WRITE, inode,
+ (char *)buf, pos, count);
if (written > 0) {
loff_t end = pos + written;
if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
@@ -2208,7 +2209,8 @@ generic_file_write(struct file *file, co
mark_inode_dirty(inode);
}
*ppos = end;
- invalidate_inode_pages2(mapping);
+ if (mapping->nrpages)
+ invalidate_inode_pages2(mapping);
}
/*
* Sync the fs metadata but not the minor inode changes and
--- 2.5.25/fs/ext2/inode.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/ext2/inode.c Sun Jul 7 19:35:39 2002
@@ -607,11 +607,10 @@ static int ext2_bmap(struct address_spac
}
static int
-ext2_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
- unsigned long blocknr, int blocksize)
+ext2_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
{
- return generic_direct_IO(rw, inode, iobuf, blocknr,
- blocksize, ext2_get_block);
+ return generic_direct_IO(rw, inode, buf, offset, count, ext2_get_block);
}
static int
--- 2.5.25/fs/Makefile~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/Makefile Sun Jul 7 19:35:39 2002
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o devices.o f
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
- fs-writeback.o mpage.o
+ fs-writeback.o mpage.o direct-io.o
ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
--- 2.5.25/fs/block_dev.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/block_dev.c Sun Jul 7 19:35:39 2002
@@ -105,9 +105,12 @@ static int blkdev_get_block(struct inode
return 0;
}
-static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+static int
+blkdev_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
{
- return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block);
+ return generic_direct_IO(rw, inode, buf, offset,
+ count, blkdev_get_block);
}
static int blkdev_writepage(struct page * page)
-
raw.c | 136 ++++++++++++------------------------------------------------------
1 files changed, 26 insertions(+), 110 deletions(-)
--- 2.5.25/drivers/char/raw.c~raw-use-generic Sun Jul 7 19:35:44 2002
+++ 2.5.25-akpm/drivers/char/raw.c Sun Jul 7 19:58:33 2002
@@ -8,8 +8,8 @@
* device are used to bind the other minor numbers to block devices.
*/
+#include <linux/init.h>
#include <linux/fs.h>
-#include <linux/iobuf.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/raw.h>
@@ -86,12 +86,6 @@ int raw_open(struct inode *inode, struct
return 0;
}
- if (!filp->f_iobuf) {
- err = alloc_kiovec(1, &filp->f_iobuf);
- if (err)
- return err;
- }
-
down(&raw_devices[minor].mutex);
/*
* No, it is a normal raw device. All we need to do on open is
@@ -256,124 +250,46 @@ int raw_ctl_ioctl(struct inode *inode,
return err;
}
-
-
-ssize_t raw_read(struct file *filp, char * buf,
- size_t size, loff_t *offp)
+ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp)
{
return rw_raw_dev(READ, filp, buf, size, offp);
}
-ssize_t raw_write(struct file *filp, const char *buf,
- size_t size, loff_t *offp)
+ssize_t raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{
return rw_raw_dev(WRITE, filp, (char *) buf, size, offp);
}
-#define SECTOR_BITS 9
-#define SECTOR_SIZE (1U << SECTOR_BITS)
-#define SECTOR_MASK (SECTOR_SIZE - 1)
-
-ssize_t rw_raw_dev(int rw, struct file *filp, char *buf,
- size_t size, loff_t *offp)
+ssize_t
+rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
{
- struct kiobuf * iobuf;
- int new_iobuf;
- int err = 0;
- unsigned long blocks;
- size_t transferred;
- int iosize;
- int minor;
- kdev_t dev;
- unsigned long limit;
- int sector_size, sector_bits, sector_mask;
- sector_t blocknr;
struct block_device *bdev;
-
- /*
- * First, a few checks on device size limits
- */
+ struct inode *inode;
+ int minor;
+ ssize_t ret = 0;
minor = minor(filp->f_dentry->d_inode->i_rdev);
-
- new_iobuf = 0;
- iobuf = filp->f_iobuf;
- if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
- /*
- * A parallel read/write is using the preallocated iobuf
- * so just run slow and allocate a new one.
- */
- err = alloc_kiovec(1, &iobuf);
- if (err)
- goto out;
- new_iobuf = 1;
- }
-
bdev = raw_devices[minor].binding;
- dev = to_kdev_t(bdev->bd_dev);
- sector_size = raw_devices[minor].sector_size;
- sector_bits = raw_devices[minor].sector_bits;
- sector_mask = sector_size - 1;
-
- limit = bdev->bd_inode->i_size >> sector_bits;
- if (!limit)
- limit = INT_MAX;
- dprintk ("rw_raw_dev: dev %d:%d (+%d)\n",
- major(dev), minor(dev), limit);
-
- err = -EINVAL;
- if ((*offp & sector_mask) || (size & sector_mask))
- goto out_free;
- err = 0;
- if (size)
- err = -ENXIO;
- if ((*offp >> sector_bits) >= limit)
- goto out_free;
-
- transferred = 0;
- blocknr = *offp >> sector_bits;
- while (size > 0) {
- blocks = size >> sector_bits;
- if (blocks > limit - blocknr)
- blocks = limit - blocknr;
- if (!blocks)
- break;
-
- iosize = blocks << sector_bits;
+ inode = bdev->bd_inode;
- err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
- if (err)
- break;
-
- err = brw_kiovec(rw, 1, &iobuf, raw_devices[minor].binding, &blocknr, sector_size);
-
- if (rw == READ && err > 0)
- mark_dirty_kiobuf(iobuf, err);
-
- if (err >= 0) {
- transferred += err;
- size -= err;
- buf += err;
- }
-
- blocknr += blocks;
-
- unmap_kiobuf(iobuf);
-
- if (err != iosize)
- break;
+ if (size == 0)
+ goto out;
+ if (size < 0) {
+ ret = -EINVAL;
+ goto out;
}
-
- if (transferred) {
- *offp += transferred;
- err = transferred;
+ if (*offp >= inode->i_size) {
+ ret = -ENXIO;
+ goto out;
}
+ if (size + *offp > inode->i_size)
+ size = inode->i_size - *offp;
- out_free:
- if (!new_iobuf)
- clear_bit(0, &filp->f_iobuf_lock);
- else
- free_kiovec(1, &iobuf);
- out:
- return err;
+ ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
+ if (ret > 0)
+ *offp += ret;
+ if (inode->i_mapping->nrpages)
+ invalidate_inode_pages2(inode->i_mapping);
+out:
+ return ret;
}
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
@ 2002-07-08 3:30 ` Lincoln Dale
2002-07-08 7:44 ` Ingo Oeser
2002-07-11 2:25 ` Lincoln Dale
2 siblings, 0 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-08 3:30 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
At 08:19 PM 7/07/2002 -0700, Andrew Morton wrote:
>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer. It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.
>
>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks. But with
>one megabyte chunks, this implementation is 20% faster at writing.
>
>I assume this is because the kiobuf-based implementation has to stop
>and wait for each 128k chunk, whereas this code streams the entire
>request, regardless of its size.
>
>This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.
i'll have a go at benchmark-testing these.
now have even bigger hardware than before: 2 x 2gbit/s FC HBAs in multiple
dual-processor (Dual P3 Xeon 550MHz 2M L2 cache and Dual P3 Xeon 833MHz
256K L2 cache) boxen, 8 x 15K RPM FC, 28 x 10K RPM SCSI.
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
@ 2002-07-08 7:44 ` Ingo Oeser
2002-07-11 2:25 ` Lincoln Dale
2 siblings, 0 replies; 17+ messages in thread
From: Ingo Oeser @ 2002-07-08 7:44 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
On Sun, Jul 07, 2002 at 08:19:33PM -0700, Andrew Morton wrote:
> Question is: what do we want to do with this sucker? These are the
> remaining users of kiovecs:
>
> drivers/md/lvm-snap.c
> drivers/media/video/video-buf.c
> drivers/mtd/devices/blkmtd.c
> drivers/scsi/sg.c
>
> the video and mtd drivers seems to be fairly easy to de-kiobufize.
> I'm aware of one proprietary driver which uses kiobufs. XFS uses
> kiobufs a little bit - just to map the pages.
It would be nice if we could just map a set of user pages to a scatterlist.
Developers of mass transfer devices (video grabbers, dsp devices, sg and
many others) would just LOVE you for this ;-)
Block devices are the common case worth optimizing for, but character
devices just need to reimplement most of this, if they want the same
optimizations. Some devices need mass transfers and are NOT blockdevices.
Linux supports only one class of them properly: NICs.
Please consider supporting them better for 2.5 in stuff similar to BIOs
and DMA to/from user pages.
Thanks & Regards
Ingo Oeser
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
2002-07-08 7:44 ` Ingo Oeser
@ 2002-07-11 2:25 ` Lincoln Dale
2002-07-11 3:24 ` Andrew Morton
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
2 siblings, 2 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-11 2:25 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
At 08:19 PM 7/07/2002 -0700, Andrew Morton wrote:
>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer. It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.
>
>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks. But with
>one megabyte chunks, this implementation is 20% faster at writing.
..
>This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.
sorry for the delay.
upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
FC 2300 HBA
driver isn't part of the standard kernel, and i had to update it to reflect the
io_request_lock -> host->host_lock, kdev_t and kbuild changes. urgh, pain
pain pain.
in the process, i discovered some races in their driver, so fixed them also.
the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
to Jens, Andrew & co for the changeover.
the results:
2.4.19pre8aa2 (with lockmeter and profile=2)
normal 167772160 blocks of 512 bytes in 778 seconds (105.27
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 430 seconds (190.47
mbyte/sec), CPUs ~55% idle
/dev/rawN 20480 blocks of 4194304 bytes in 463 seconds (176.86
mbyte/sec), CPUs ~62% idle
2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
0x80000000 and
your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
below)
normal 167772160 blocks of 512 bytes in 607 seconds (134.81
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
mbyte/sec), CPUs ~93% idle
/dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
mbyte/sec), CPUs ~92% idle
2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
normal 167772160 blocks of 512 bytes in 615 seconds (133.06
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 421 seconds (194.37
mbyte/sec), CPUs ~92% idle
/dev/rawN 20480 blocks of 4194304 bytes in 421 seconds (194.35
mbyte/sec), CPUs ~92% idle
its a little hard to tell CPU load difference between direct-to-BIO versus
non-direct-to-BIO,
but clearly performance was at 100% of 2gbit/s Fibre Channel with
direct-to-bio; i've never
seen it sustain exactly 100% throughout a test before.
it was interesting to watch the test of 2.4.19pre8aa2 versus both 2.5.25
tests; whether it is a
change in the linux scheduler or some other artifact, all "worker" threads
(1 thread per disk)
completed at almost exactly the same time on 2.5.25 kernels.
in contrast, the benchmark on 2.4.19pre8aa2 had some disks complete their
work up to half
a minute prior to the last thread finishing -- clearly there was some
degree of "unfairness"
between threads that has since been addressed.
i'll see about getting dual 2gbit/s FC HBAs working now; my FC MultiPathing
configuration
is having a bad hair day today and i'm not physically near the test host in
question to
replace a physical fibre cable reporting errors.
details of how the test was conducted --
test host:
- dual P3 Xeon (733MHz), 2GB PC133 SDRAM (no HIGHMEM defined)
- single QLogic FC 2300 HBA operating at 2gbit/s in a 64/66 PCI slot
test:
- benchmark consisted of sequential read requests in parallel across
8 x 18G 15K RPM FC disks across the first 10GB of each disk
(why use "sequential reads" you ask? because its generally consistent --
i'm not measuring any i/o re-ordering/elevator behaviour, nor am
i measuring the speed of any disk-shelf controller cache or
disk-spindle seek speed. i'm purely measuring how fast data can
move from the storage subsystem to userspace).
- benchmark-test considered complete when all disks have gone idle.
- benchmark program is multithreaded, one thread per device
- each test run twice with machine rebooted in-between to ensure
repeatability
block sizes:
- for normal, test used 20971520 blocks of 512 bytes (10GB) on each disk
- for O_DIRECT, test used 2560 blocks of 4194304 bytes (10GB) on each disk
- for /dev/rawN, test used 2560 blocks of 4194304 bytes (10GB) on each disk
oops report #1: (virgin 2.5.25)
oops occurs on attempting to issue a read() on a O_DIRECT device.
this was corrected with Andrew's patch of:
Oops: 0000
CPU: 0
EIP: 0010:[<801c4e11>] Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010296
eax: 00000080 ebx: 00000000 ecx: f6e83b20 edx: f3e79c00
esi: f3e79cc0 edi: 00010000 ebp: f6e83b20 esp: f393bdcc
ds: 0018 es: 0018 ss: 0018
Stack: 8013e856 820fcde0 00000010 000000c0 2aca6000 00000000
f3e79cc0 00070000
00000070 801c4fac f6e83b20 f6e83b20 8013edbd 00000000
f6e83b20 00000010
00000010 00000000 00000000 00000010 00000001 80127acb
f56e9ae0 f54691e0
Call Trace: [<8013e856>] [<801c4fac>] [<8013edbd>] [<80127acb>]
[<8013e118>]
[<8013e05f>] [<801269de>] [<80126af8>] [<80140113>]
[<801400a0>] [<8012a9c7>]
[<8012abad>] [<8011404b>] [<8013a738>] [<8013a8ea>] [<80108a0b>]
Code: 8b 43 0c c1 ef 09 8b 50 38 8b 40 34 0f ac d0 09 89 c6 85 f6
>>EIP; 801c4e11 <generic_make_request+11/130> <=====
Trace; 8013e856 <bio_alloc+e6/1a0>
Trace; 801c4fac <submit_bio+5c/70>
Trace; 8013edbd <ll_rw_kio+1ad/210>
Trace; 80127acb <handle_mm_fault+6b/e0>
Trace; 8013e118 <brw_kiovec+a8/100>
Trace; 8013e05f <generic_direct_IO+ef/100>
Trace; 801269de <get_user_pages+ee/150>
Trace; 80126af8 <map_user_kiobuf+b8/100>
Trace; 80140113 <blkdev_direct_IO+23/30>
Trace; 801400a0 <blkdev_get_block+0/50>
Trace; 8012a9c7 <generic_file_direct_IO+167/1e0>
Trace; 8012abad <generic_file_read+ed/130>
Trace; 8011404b <schedule+33b/3a0>
Trace; 8013a738 <vfs_read+98/110>
Trace; 8013a8ea <sys_read+2a/40>
Trace; 80108a0b <syscall_call+7/b>
Code; 801c4e11 <generic_make_request+11/130>
00000000 <_EIP>:
Code; 801c4e11 <generic_make_request+11/130> <=====
0: 8b 43 0c mov 0xc(%ebx),%eax <=====
Code; 801c4e14 <generic_make_request+14/130>
3: c1 ef 09 shr $0x9,%edi
Code; 801c4e17 <generic_make_request+17/130>
6: 8b 50 38 mov 0x38(%eax),%edx
Code; 801c4e1a <generic_make_request+1a/130>
9: 8b 40 34 mov 0x34(%eax),%eax
Code; 801c4e1d <generic_make_request+1d/130>
c: 0f ac d0 09 shrd $0x9,%edx,%eax
Code; 801c4e21 <generic_make_request+21/130>
10: 89 c6 mov %eax,%esi
Code; 801c4e23 <generic_make_request+23/130>
12: 85 f6 test %esi,%esi
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 2:25 ` Lincoln Dale
@ 2002-07-11 3:24 ` Andrew Morton
2002-07-11 3:25 ` Lincoln Dale
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
1 sibling, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2002-07-11 3:24 UTC (permalink / raw)
To: Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
Lincoln Dale wrote:
>
> ...
> sorry for the delay.
Is cool. Thanks for doing this.
> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
> FC 2300 HBA
> driver isn't part of the standard kernel, and i had to update it to reflect the
> io_request_lock -> host->host_lock, kdev_t and kbuild changes. urgh, pain
> pain pain.
> in the process, i discovered some races in their driver, so fixed them also.
>
> the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
> to Jens, Andrew & co for the changeover.
>
> the results:
> 2.4.19pre8aa2 (with lockmeter and profile=2)
> normal 167772160 blocks of 512 bytes in 778 seconds (105.27
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 430 seconds (190.47
> mbyte/sec), CPUs ~55% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 463 seconds (176.86
> mbyte/sec), CPUs ~62% idle
>
> 2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> 0x80000000 and
> your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> below)
> normal 167772160 blocks of 512 bytes in 607 seconds (134.81
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
> mbyte/sec), CPUs ~93% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
> mbyte/sec), CPUs ~92% idle
The 30% improvement in pagecache-buffered reads is somewhat unexpected.
The blockdevs are not using multipage BIOs - they're still using
buffer_head-based I/O for both reads and writes. Are you sure that
the 2.4 QLogic driver is using block-highmem?
> 2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
> normal 167772160 blocks of 512 bytes in 615 seconds (133.06
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 421 seconds (194.37
> mbyte/sec), CPUs ~92% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 421 seconds (194.35
> mbyte/sec), CPUs ~92% idle
OK, so there's nothing there at all really (or there may be. Hard
to tell when the interface has saturated).
But on my lowly scsi disks I was seeing no change in read bandwidth
either. Only writes benefitted for some reason. Can you do
some write testing as well? If you test writes through the pagecache,
use ext2 and not direct-to-blockdev please - that'll take the multipage
BIOs, buffer_head-bypass route. Plain old read and write of /dev/XdYY
isn't very optimised at all.
Thanks.
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 3:24 ` Andrew Morton
@ 2002-07-11 3:25 ` Lincoln Dale
[not found] ` <3D2CFF48.9EFF9C59@zip.com.au>
0 siblings, 1 reply; 17+ messages in thread
From: Lincoln Dale @ 2002-07-11 3:25 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
At 08:24 PM 10/07/2002 -0700, Andrew Morton wrote:
> > 2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> > 0x80000000 and
> > your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> > below)
> > normal 167772160 blocks of 512 bytes in 607 seconds (134.81
> > mbyte/sec), CPUs 0% idle
> > O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
> > mbyte/sec), CPUs ~93% idle
> > /dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
> > mbyte/sec), CPUs ~92% idle
>
>The 30% improvement in pagecache-buffered reads is somewhat unexpected.
>The blockdevs are not using multipage BIOs - they're still using
>buffer_head-based I/O for both reads and writes. Are you sure that
>the 2.4 QLogic driver is using block-highmem?
pretty sure -- there's no highmem in the system: :-)
(i.e. i changed PAGE_OFFSET in order to prevent there being any highmem).
[root@mel-stglab-host1 root]# cat /proc/meminfo
MemTotal: 1945680 kB
MemFree: 1853812 kB
MemShared: 0 kB
Cached: 29536 kB
SwapCached: 2520 kB
Active: 32336 kB
Inactive: 8336 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 1945680 kB
LowFree: 1853812 kB
SwapTotal: 2047992 kB
SwapFree: 2037268 kB
Dirty: 1396 kB
Writeback: 0 kB
>OK, so there's nothing there at all really (or there may be. Hard
>to tell when the interface has saturated).
>
>But on my lowly scsi disks I was seeing no change in read bandwidth
>either. Only writes benefitted for some reason. Can you do
>some write testing as well? If you test writes through the pagecache,
>use ext2 and not direct-to-blockdev please - that'll take the multipage
>BIOs, buffer_head-bypass route. Plain old read and write of /dev/XdYY
>isn't very optimised at all.
will do.
do you have any other preferences --
- ext2 or ext3?
- if ext3, change the journalling mode?
- i/o to a single large file or multiple files per spindle?
i can also add combinations of read/write & seeking also.
what kind of file-size should i be using?
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 2:25 ` Lincoln Dale
2002-07-11 3:24 ` Andrew Morton
@ 2002-07-11 19:52 ` Jesse Barnes
2002-07-11 23:40 ` Lincoln Dale
1 sibling, 1 reply; 17+ messages in thread
From: Jesse Barnes @ 2002-07-11 19:52 UTC (permalink / raw)
To: Lincoln Dale; +Cc: lkml
On Thu, Jul 11, 2002 at 12:25:03PM +1000, Lincoln Dale wrote:
> sorry for the delay.
> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the
> QLogic FC 2300 HBA driver isn't part of the standard kernel, and i
> had to update it to reflect the io_request_lock -> host->host_lock,
> kdev_t and kbuild changes. urgh, pain pain pain. in the process, i
> discovered some races in their driver, so fixed them also.
So you ported the qla2x00 driver forward to 2.5? Would it be possible
to post that driver? Not having it has held up some testing I'd like
to do...
Thanks,
Jesse
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
@ 2002-07-11 23:40 ` Lincoln Dale
0 siblings, 0 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-11 23:40 UTC (permalink / raw)
To: Jesse Barnes; +Cc: lkml
At 12:52 PM 11/07/2002 -0700, Jesse Barnes wrote:
>On Thu, Jul 11, 2002 at 12:25:03PM +1000, Lincoln Dale wrote:
> > sorry for the delay.
> > upgrading from 2.4.19 to 2.5.25 took longer than expected, since the
> > QLogic FC 2300 HBA driver isn't part of the standard kernel, and i
> > had to update it to reflect the io_request_lock -> host->host_lock,
> > kdev_t and kbuild changes. urgh, pain pain pain. in the process, i
> > discovered some races in their driver, so fixed them also.
>
>So you ported the qla2x00 driver forward to 2.5? Would it be possible
>to post that driver? Not having it has held up some testing I'd like
>to do...
these are the changes to the qla2x00 6.1 beta 2 driver, as downloadable
from the QLogic web-site.
there were also some changes required to the makefiles to get this working
with linux-2.5 kbuild infrastructure.
the hacks i did there are awful and i'm not prepared to put my name against
those bad hacks just yet. :-)
===
diff -urN base/listops.h 2.5.25/listops.h
--- base/listops.h Tue Apr 16 05:15:40 2002
+++ 2.5.25/listops.h Fri Jul 12 09:29:45 2002
@@ -324,9 +324,9 @@
return;
}
- spin_lock_irqsave(&io_request_lock, flags);
+ spin_lock_irqsave(ha->host->host_lock, flags);
qla2x00_callback(ha, sp->cmd);
- spin_unlock_irqrestore(&io_request_lock, flags);
+ spin_unlock_irqrestore(ha->host->host_lock, flags);
}
/**************************************************************************
diff -urN base/qla2x00.c 2.5.25/qla2x00.c
--- base/qla2x00.c Wed Jul 10 18:32:25 2002
+++ 2.5.25/qla2x00.c Fri Jul 12 09:29:51 2002
@@ -532,10 +532,11 @@
static int recoveryTime = MAX_RECOVERYTIME;
static int failbackTime = MAX_FAILBACKTIME;
#endif /* end of MPIO_SUPPORT */
-#ifdef MODULE
+
static char *ql2xopts = NULL;
static int ql2xmaxqdepth = 0;
+#ifdef MODULE
/* insmod qla2100 ql2xopts=verbose" */
MODULE_PARM(ql2xopts, "s");
MODULE_PARM(ql2xmaxqdepth, "i");
@@ -552,7 +553,6 @@
MODULE_LICENSE("GPL");
#endif
-#include "listops.h"
#include "qla_fo.cfg"
@@ -564,6 +564,7 @@
static char dummy_buffer[60] = "Please don't add commas in your insmod
command!!\n";
#endif
+#include "listops.h"
#if QLA2100_LIPTEST
static int qla2x00_lip = 0;
@@ -1459,10 +1460,6 @@
ENTER("qla2x00_detect");
-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
-
#ifdef MODULE
DEBUG2(printk("DEBUG: qla2x00_set_info starts at address = %p\n",
qla2x00_set_info);)
@@ -1497,9 +1494,6 @@
if (!pci_present()) {
printk("scsi: PCI not present\n");
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
return 0;
} /* end of !pci_present() */
@@ -1542,9 +1536,6 @@
continue;
}
*/
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
if ((host =
scsi_register(
@@ -1609,9 +1600,6 @@
"scsi%d: [ERROR] Failed to allocate "
"memory for adapter\n",host->host_no);
qla2x00_mem_free(ha);
-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
continue;
}
@@ -1654,10 +1642,6 @@
ha->list_lock = SPIN_LOCK_UNLOCKED;
-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
-
if (qla2x00_initialize_adapter(ha) &&
!(ha->device_flags & DFLG_NO_CABLE)) {
@@ -1706,8 +1690,7 @@
ha->fabricid[SIMPLE_NAME_SERVER].in_use = TRUE;
#if NEW_EH_CODE
-
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
#endif
/* Register our resources with Linux */
@@ -1719,7 +1702,7 @@
qla2x00_mem_free(ha);
scsi_unregister(host);
#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
#endif
continue;
}
@@ -1741,7 +1724,7 @@
spin_unlock_irqrestore(&ha->hardware_lock, flags);
#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
#endif
#if MPIO_SUPPORT
@@ -1805,10 +1788,6 @@
}
} /* end of FOR */
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
-
LEAVE("qla2x00_detect");
return num_hosts;
@@ -2217,7 +2196,7 @@
ha = (scsi_qla_host_t *) host->hostdata;
cmd->scsi_done = fn;
- spin_unlock(&io_request_lock);
+ spin_unlock(host->host_lock);
/* Allocate a command packet from the "sp" pool.
* If we cant get back one then let scsi layer
@@ -2227,7 +2206,7 @@
printk(KERN_WARNING
"queuecommand: Couldn't allocate memory "
"for sp - retried.\n");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
LEAVE("qla2x00_queuecommand");
return(1);
@@ -2309,14 +2288,14 @@
(int)ha->host_no,t,l);)
CMD_RESULT(cmd) = DID_NO_CONNECT << 16;
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
__sp_put(ha, sp);
return(0);
}
if (l >= ha->max_luns) {
CMD_RESULT(cmd) = DID_NO_CONNECT << 16;
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
__sp_put(ha, sp);
LEAVE("qla2x00_queuecommand");
return(0);
@@ -2379,7 +2358,7 @@
tasklet_schedule(&ha->run_qla_task);
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}
@@ -2427,7 +2406,7 @@
qla2x00_extend_timeout(sp->cmd ,60);
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
} else {
sp->flags &= ~SRB_BUSY; /* v5.21b16 */
@@ -2449,7 +2428,7 @@
add_to_scsi_retry_queue(ha,sp);
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}
@@ -2462,7 +2441,7 @@
COMTRACE('c')
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}
@@ -2526,10 +2505,10 @@
break;
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(2*HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} while (time_before_eq(jiffies, max_wait_time));
@@ -2811,7 +2790,7 @@
sp_get(ha,sp);
spin_unlock_irqrestore(&ha->hardware_lock, flags);
- spin_unlock(&io_request_lock);
+ spin_unlock(host->host_lock);
if (qla2x00_abort_command(ha, sp)) {
DEBUG2(printk("qla2xxx_eh_abort:
abort_command "
@@ -2825,7 +2804,7 @@
}
sp_put(ha,sp);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
spin_lock_irqsave(&ha->hardware_lock, flags);
/*
@@ -2862,15 +2841,15 @@
*/
if ((which_ha & BIT_0) && (!list_empty(&ha->done_queue))) {
DEBUG3(printk("qla2xxx_eh_abort: calling done for ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
}
if ((which_ha & BIT_1) && (!list_empty(&vis_ha->done_queue))) {
DEBUG3(printk("qla2xxx_eh_abort: calling done for
vis_ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
qla2x00_done(vis_ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
}
DEBUG(printk("qla2xxx_eh_abort: Exiting. return_status=0x%x.\n",
@@ -2975,22 +2954,22 @@
ha->cfg_active || ha->loop_state != LOOP_READY)) {
clear_bit(DEVICE_RESET_NEEDED, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_device_reset(ha, t) != 0) {
return_status = FAILED;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
/*
* Wait a while for the loop to come back. Return SUCCESS
* for the kernel to try again.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
return_status = SUCCESS;
}
@@ -3010,9 +2989,9 @@
DEBUG3(printk("qla2xxx_eh_device_reset: calling "
"done for ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}
DRIVER_UNLOCK
@@ -3114,22 +3093,22 @@
ha->cfg_active || ha->loop_state != LOOP_READY)) {
clear_bit(LOOP_RESET_NEEDED, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_loop_reset(ha) != 0) {
return_status = FAILED;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
/*
* Wait a while for the loop to come back. Return SUCCESS
* for the kernel to try again.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
return_status = SUCCESS;
}
@@ -3147,9 +3126,9 @@
if (!list_empty(&ha->done_queue)) {
DEBUG3(printk("qla2xxx_eh_bus_reset: calling done for
ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}
DEBUG2_3(printk("qla2xxx_eh_bus_reset: exiting. status=0x%x.\n",
@@ -3272,7 +3251,7 @@
if (!(test_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags))) {
set_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_abort_isp(ha, 1)) {
/* failed. try later */
@@ -3292,27 +3271,27 @@
return_status = SUCCESS;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
clear_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
} else {
/*
* Already active. Sleep a while then return SUCCESS for
* kernel to retry the IO.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
return_status = SUCCESS;
}
if (!list_empty(&ha->done_queue)) {
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}
DRIVER_UNLOCK
@@ -3595,9 +3574,9 @@
tasklet_schedule(&ha->run_qla_task);
if (found) {
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
qla2x00_restart_queues(vis_ha, TRUE);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
printk(KERN_INFO
"qla2x00_abort: Couldn't Abort command = %p\n", cmd);
@@ -3851,12 +3830,12 @@
* mid-level code can expect completions
momentitarily.
*/
#if NEW_EH_CODE
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
if (qla2x00_abort_isp(ha, 0)) {
/* failed. try later */
set_bit(ISP_ABORT_NEEDED, &ha->dpc_flags);
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
#else
set_bit(ISP_ABORT_NEEDED, &ha->dpc_flags);
@@ -3874,9 +3853,9 @@
DEBUG3(printk("qla2x00_reset: going to call restart_queues. "
"jiffies=%lx.\n", jiffies);)
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
qla2x00_restart_queues(ha,TRUE);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
DRIVER_UNLOCK
COMTRACE('r')
@@ -3946,7 +3925,7 @@
qla2x00_stats.irqhba = ha;
/* Prevent concurrent access to adapters register */
- /* spin_lock_irqsave(&io_request_lock, cpu_flags);*/
+ /* spin_lock_irqsave(host->host_lock, cpu_flags);*/
reg = ha->iobase;
@@ -3998,7 +3977,7 @@
if (!list_empty(&ha->done_queue))
tasklet_schedule(&ha->run_qla_task);
- /* spin_unlock_irqrestore(&io_request_lock, cpu_flags);*/
+ /* spin_unlock_irqrestore(host->host_lock, cpu_flags);*/
/* Wakeup the DPC routine */
if ((!ha->flags.mbox_busy &&
@@ -4179,7 +4158,7 @@
QLA2100_DPC_LOCK(ha);
- /* spin_lock_irqsave(&io_request_lock, ha->cpu_flags);*/
+ /* spin_lock_irqsave(host->host_lock, ha->cpu_flags);*/
ha->dpc_active = 1;
/* Determine what action is necessary */
@@ -4477,7 +4456,7 @@
if (!list_empty(&ha->done_queue))
tasklet_schedule(&ha->run_qla_task);
- /* spin_unlock_irqrestore(&io_request_lock, ha->cpu_flags);*/
+ /* spin_unlock_irqrestore(host->host_lock, ha->cpu_flags);*/
ha->dpc_active = 0;
@@ -4778,9 +4757,9 @@
/* Call the mid-level driver interrupt handler */
#if 0
- spin_lock_irqsave(&io_request_lock, flags);
+ spin_lock_irqsave(host->host_lock, flags);
qla2x00_callback(ha,cmd);
- spin_unlock_irqrestore(&io_request_lock, flags);
+ spin_unlock_irqrestore(host->host_lock, flags);
#else
sp_put(ha, sp);
@@ -15846,7 +15825,7 @@
printk(KERN_INFO
"qla2x00_apidev: open MAJOR number = %d, "
"MINOR number = %d\n",
- MAJOR(inode->i_rdev), MINOR(inode->i_rdev));
+ major(inode->i_rdev), minor(inode->i_rdev));
return 0;
}
@@ -15902,7 +15881,8 @@
APIDEV_NODE, apidev_major);)
proc_mknod(APIDEV_NODE, 0777+S_IFCHR, host->hostt->proc_dir,
- (kdev_t)MKDEV(apidev_major,0));
+ (kdev_t)mk_kdev(apidev_major,0));
+
return 0;
}
diff -urN base/qla2x00.h 2.5.25/qla2x00.h
--- base/qla2x00.h Tue Apr 16 05:15:40 2002
+++ 2.5.25/qla2x00.h Fri Jul 12 09:29:51 2002
@@ -2682,10 +2682,8 @@
present: 0, /* number of 7xxx's present */\
unchecked_isa_dma: 0, /* no memory DMA restrictions */\
use_clustering: ENABLE_CLUSTERING, \
- use_new_eh_code: 1, \
max_sectors: 512, \
- highmem_io: 1, \
- emulated: 0 \
+ highmem_io: 1 \
}
#else /* KERNEL_VERSION < 2.5.7 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,8)
diff -urN base/qla2x00_ioctl.c 2.5.25/qla2x00_ioctl.c
--- base/qla2x00_ioctl.c Tue Apr 16 05:15:40 2002
+++ 2.5.25/qla2x00_ioctl.c Fri Jul 12 09:29:51 2002
@@ -2509,14 +2509,14 @@
ha->host_no);)
/* get spin lock for this operation */
- spin_lock_irqsave(&io_request_lock, ha->cpu_flags);
+ spin_lock_irqsave(ha->host->host_lock, ha->cpu_flags);
qla2x00_queuecommand(pscsi_cmd, (void *) qla2x00_scsi_pt_done);
ha->ioctl->cmpl_timer.expires = jiffies + ha->ioctl->ioctl_tov * HZ;
add_timer(&ha->ioctl->cmpl_timer);
- spin_unlock_irqrestore(&io_request_lock, ha->cpu_flags);
+ spin_unlock_irqrestore(ha->host->host_lock, ha->cpu_flags);
down(&ha->ioctl->cmpl_sem);
del_timer(&ha->ioctl->cmpl_timer);
===
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* ext2 performance in 2.5.25 versus 2.4.19pre8aa2
[not found] ` <3D2CFF48.9EFF9C59@zip.com.au>
@ 2002-07-14 12:22 ` Lincoln Dale
2002-07-15 5:30 ` Andrew Morton
2002-07-15 16:30 ` Benjamin LaHaise
0 siblings, 2 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-14 12:22 UTC (permalink / raw)
To: Andrew Morton, Benjamin LaHaise, Andrea Arcangeli,
Stephen C. Tweedie, Linus Torvalds, Steve Lord
Cc: linux-kernel
Andrew Morton wanted me to do some benchmarking of large files on ext2
filesystems rather than the usual block-device testing
i've had some time to do this, here are the results.
one-line summary is that some results are better, some are worse; CPU usage
is better in 2.5.25, but throughput is sometimes
worse.
Summary:
========
Test #1: create a single large (12GB) file on each disk.
use 12288 blocks of 1048576 bytes each on each of 8 disks.
2.5.25 Wrote 98304MB across 8 files using 96k blocks of 1M in
579.935968s (169.51 MB/sec), 44717usec, ~90% cpu
2.4.19pre8aa2 Wrote 98304MB across 8 files using 96k blocks of 1M in
607.542648s (161.81 MB/sec), 46684usec, ~88% cpu
Test #2: read back from large (12GB) files on each disk.
use 4k reads across 3 million blocks on each of 8 disks:
2.5.25 Read 98304MB across 8 files using 24m blocks of 4k in 508.925829s
(193.16 MB/sec), 158usec mean, ~61% cpu
2.4.19pre8aa2 Read 98304MB across 8 files using 24m blocks of 4k in
526.866882s (186.58 MB/sec), 157usec mean, ~88% cpu
Test #3: same test as #2, but using "nocopy" hack to see if copy_to_user
(memory bandwidth) is the bottleneck.
2.5.25 Read 98304MB across 8 files using 24m blocks of 4k in 507.792229s
(193.59 MB/sec), 160usec mean, ~25% cpu
2.4.19pre8aa2 Read 98304MB across 8 files using 24m blocks of 4k in
511.353691s (192.24 MB/sec), 148usec mean, ~50% cpu
test #4: measure read performance when reads are entirely out of the
page-cache.
test first primes page-cache with data and then issues random
reads from that.
working size is 8 x 200mbyte (1.6GB), test is iterated 10 times.
no I/O is recorded on FC switch, so data is served entirely out
of page cache.
2.5.25 Read 16GB across 8 files using 4096M blocks of 4k in 75.304798s
(212.47 MB/sec), 145usec mean, ~81% cpu
2.4.19pre8aa2 Read 16GB across 8 files using 4096M blocks of 4k in
70.526170s (226.87 MB/sec), 134usec mean, 100% cpu
Test #5: same test as #4, but using "nocopy" hack to see if copy_to_user
(memory bandwidth) is the bottleneck.
2.5.25 Read 16GB across 8 files using 4096M blocks of 4k in 61.694199s
(259.34 MB/sec), 119usec mean, ~65% cpu
** since performance wasn't much higher, i rebooted the machine and loaded
** it with "profile=2" and lockmeter. results of that are at the very end
** of this email; looks to me like the scheduler was the culprit.
2.4.19pre8aa2 Read 16GB across 8 files using 4096M blocks of 4k in
55.924164s (286.10MB/sec), 108usec mean, ~80% cpu
Details:
========
machine is Dual P3-Xeon, 733MHz processors. 2GB of PC133 SDRAM.
disks are 8 x 15K RPM 18G FC disks, connected to FC switches via 1 x QLogic
FC 2300 HBA @ 2gbit/s.
FC HBA is in a 64/66 PCI slot.
all tests conducted using current in-house test-tool. it uses a
thread-per-device.
8 x empty ext2 filesystems created and mounted
kernels:
- stock 2.5.25 kernel + PAGE_OFFSET modified to 0x80000000 (no highmem),
QLogic FC 2300 HBA +
Andrew Morton's direct-bio patch [not exercised in these benchmarks]
Test #1
-------
create a single large (12GB) file on each disk:
writing to 8 filesystems, using a write block-size of 1 megabyte in
sequential writes.
12288 blocks (12G) per disk, 8 disks is 96GB total.
./test_disk_performance bs=1m blocks=12288 mode=basic operation=write
/mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed writing 98304 mbytes across 8 devices using 98304 blocks of 1048576
in 579.935968 seconds (169.51 Mbytes/sec), 44717usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 571.327206s using 12k writes of 1M
(21.51MB/sec), 40876usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 574.606073s using 12k writes of 1M
(21.39MB/sec), 36949usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 569.347650s using 12k writes of 1M
(21.58MB/sec), 49047usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 569.929641s using 12k writes of 1M
(21.56MB/sec), 48534usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 579.561403s using 12k writes of 1M
(21.20MB/sec), 28629usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 579.925156s using 12k writes of 1M
(21.19MB/sec), 27282usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 579.160854s using 12k writes of 1M
(21.22MB/sec), 31282usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 578.200229s using 12k writes of 1M
(21.25MB/sec), 30292usec
during test, machine had ~10% idle cpu.
Linux 2.4.19pre8aa2
Completed writing 98304 mbytes across 8 devices using 98304 blocks of 1048576
in 607.542648 seconds (161.81 Mbytes/sec), 46684usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 603.074257s using 12k writes of 1M
(20.38MB/sec), 37131usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 606.433219s using 12k writes of 1M
(20.26MB/sec), 25851usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 603.926881s using 12k writes of 1M
(20.35MB/sec), 43734usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 603.114330s using 12k writes of 1M
(20.37MB/sec), 39455usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 604.618177s using 12k writes of 1M
(20.32MB/sec), 43179usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 597.328666s using 12k writes of 1M
(20.57MB/sec), 40354usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 590.982972s using 12k writes of 1M
(20.79MB/sec), 44772usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 607.630086s using 12k writes of 1M
(20.22MB/sec), 22741usec
during test, machine was ~12% idle cpu.
Test #2
-------
read back from large (12GB) files on each disk sequentially using 4k reads
across 3 million blocks:
./test_disk_performance bs=4k blocks=3m mode=basic operation=read
/mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 508.925829 seconds (193.16 Mbytes/sec), 158usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 505.979550s using 3145728 reads of 4k
(24.29 MB/sec), 160usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 506.537340s using 3145728 reads of 4k
(24.26 MB/sec), 160usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 506.582859s using 3145728 reads of 4k
(24.26 MB/sec), 159usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 507.796716s using 3145728 reads of 4k
(24.20 MB/sec), 152usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 505.965224s using 3145728 reads of 4k
(24.29 MB/sec), 160usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 508.235475s using 3145728 reads of 4k
(24.18 MB/sec), 138usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 508.378988s using 3145728 reads of 4k
(24.17 MB/sec), 137usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 508.925429s using 3145728 reads of 4k
(24.14 MB/sec), 137usec
during test, machine had approximately 39% idle cpu.
performance is fairly close to FC line-rate -- for interests-sake, test #3
(below)
repeats the test, but using the no-copy hack to see if performance
increases as a
result of reducing the number of memory-copies.
Linux 2.4.19pre8aa2
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 526.866882 seconds (186.58 Mbytes/sec), 157usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 496.930413s using 3145728 reads of 4k
(24.73MB/sec), 139usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 497.684209s using 3145728 reads of 4k
(24.69MB/sec), 134usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 500.584528s using 3145728 reads of 4k
(24.55MB/sec), 112usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 526.866829s using 3145728 reads of 4k
(23.32MB/sec), 575usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 497.065359s using 3145728 reads of 4k
(24.72MB/sec), 137usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 499.433604s using 3145728 reads of 4k
(24.60MB/sec), 121usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 506.116496s using 3145728 reads of 4k
(24.28MB/sec), 81usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 514.755508s using 3145728 reads of 4k
(23.87MB/sec), 80usec
during test, machine had approximately 12% idle cpu.
Test #3
-------
read back from large (12GB) files on each disk sequentially using 4k reads
across 3 million blocks
in order to determine if memory-bandwidth / front-side-bus was the
bottleneck, the kernel was patched with the
bogus "nocopy" read_file_actor hack.
the benchmark as Test #2 was used
./test_disk_performance bs=4k blocks=3m mode=nocopy operation=read
/mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 507.792229 seconds (193.59 Mbytes/sec), 160usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 507.622831s using 3145728
reads of 4k (24.21 MB/sec), 160usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 507.543491s using 3145728
reads of 4k (24.21 MB/sec), 159usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 507.219204s using 3145728
reads of 4k (24.23 MB/sec), 160usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 507.346622s using 3145728
reads of 4k (24.22 MB/sec), 160usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 507.739317s using 3145728
reads of 4k (24.20 MB/sec), 160usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 507.706553s using 3145728
reads of 4k (24.20 MB/sec), 160usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 507.791357s using 3145728
reads of 4k (24.20 MB/sec), 161usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 507.791288s using 3145728
reads of 4k (24.20 MB/sec), 160usec
during this test, the machine had ~75% idle cpu and was saturating 2gbit/s FC.
memory-bandwidth / front-side-bus (copy_to_user()) weren't the bottleneck.
the bottleneck in this test was certainly the 2gbit/s FC HBA.
Linux 2.4.19pre8aa2
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 511.353691 seconds (192.24 Mbytes/sec), 148usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 501.421399s using 3145728 reads of 4k
(24.51MB/sec), 122usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 500.688465s using 3145728 reads of 4k
(24.54MB/sec), 128usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 499.800663s using 3145728 reads of 4k
(24.59MB/sec), 133usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 505.030670s using 3145728 reads of 4k
(24.33MB/sec), 95usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 492.732146s using 3145728 reads of 4k
(24.94MB/sec), 156usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 495.600828s using 3145728 reads of 4k
(24.79MB/sec), 151usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 504.890322s using 3145728 reads of 4k
(24.34MB/sec), 101usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 511.353661s using 3145728 reads of 4k
(24.03MB/sec), 80usec
during test, machine cpu was ~50% idle.
Test #4
-------
this test was constructed to show read performance when reads are entirely
out of the page-cache.
randomly read back from a relatively small (200 mbyte) portion of each 12GB
file on each disk spindle. total working size is
8 x 200mbyte (1.6GB), which fits into the page-cache.
firstly, "prime" the page-cache:
./test_disk_performance bs=4k blocks=50k mode=basic operation=read
/mnt/scrap-sd*/bigfile
secondly, randomly seek-once-per-block for 50k blocks of 4k into the file
(i.e. working-set is 200mbyte on each file).
iterate the test 10 times.
./test_disk_performance bs=4k blocks=50k mode=basic operation=read
seek=random iterations=10 /mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 75.304798 seconds (212.47 Mbytes/sec), 145usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 75.304670s using 512K reads of
4k (26.56 MB/sec), 144usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 75.201499s using 512K reads of
4k (26.60 MB/sec), 144usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 75.260114s using 512K reads of
4k (26.57 MB/sec), 144usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 75.287700s using 512K reads of
4k (26.56 MB/sec), 146usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 75.298464s using 512K reads of
4k (26.56 MB/sec), 144usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 75.203990s using 512K reads of
4k (26.59 MB/sec), 144usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 75.204889s using 512K reads of
4k (26.59 MB/sec), 145usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 74.981378s using 512K reads of
4k (26.67 MB/sec), 144usec
during this test, there was zero activity on the FC switching infrastructure,
so all i/o was served from the page-cache.
cpu during the test was ~19% idle.
since there was so much idle time, my guess is that the system had hit its
peak
memory-bandwidth. test #5 (below) proves that this is the case.
Linux 2.4.19pre8aa2
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 70.526170 seconds (226.87 Mbytes/sec), 134usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 70.321070s using 512K reads of 4k
(28.44 MB/sec), 135usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 69.913954s using 512K reads of 4k
(28.61 MB/sec), 135usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 70.449511s using 512K reads of 4k
(28.39 MB/sec), 133usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 70.467109s using 512K reads of 4k
(28.38 MB/sec), 134usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 70.491946s using 512K reads of 4k
(28.37 MB/sec), 133usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 70.350087s using 512K reads of 4k
(28.43 MB/sec), 133usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 70.496071s using 512K reads of 4k
(28.37 MB/sec), 134usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 70.526164s using 512K reads of 4k
(28.36 MB/sec), 134usec
cpu was 0% idle during test.
Test #5
-------
randomly read back from a relatively small (200 mbyte) portion of each 12GB
file on each disk spindle. total working size is
8 x 200mbyte (1.6GB), which fits into the page-cache.
in order to determine if memory-bandwidth / front-side-bus was the
bottleneck, the 2.5.25 kernel was patched with the
bogus "nocopy" read_file_actor hack.
the same methodology in test #4 was used:
prime the page-cache with
./test_disk_performance bs=4k blocks=50k mode=basic operation=read
/mnt/scrap-sd*/bigfile
secondly, randomly seek-once-per-block for 50k blocks of 4k into the file
(i.e. working-set is 200mbyte on each file).
iterate the test 10 times.
./test_disk_performance bs=4k blocks=50k mode=nocopy operation=read
seek=random iterations=10 /mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 61.694199 seconds (259.34 Mbytes/sec), 119usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 61.639063s using 512K reads of
4k (32.45 MB/sec), 119usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 61.684894s using 512K reads of
4k (32.42 MB/sec), 119usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 61.693891s using 512K reads of
4k (32.42 MB/sec), 119usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 61.647535s using 512K reads of
4k (32.44 MB/sec), 119usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 61.653856s using 512K reads of
4k (32.44 MB/sec), 119usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 61.642686s using 512K reads of
4k (32.45 MB/sec), 119usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 61.638954s using 512K reads of
4k (32.45 MB/sec), 119usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 61.639063s using 512K reads of
4k (32.45 MB/sec), 119usec
this time, cpu was ~35% idle, yet performance didn't improve significantly.
because of this, i compiled a kernel with lockmeter forward-ported to 2.5.25
and rebooted with "profile=2".
Linux 2.4.19pre8aa2
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 55.924164 seconds (286.10 Mbytes/sec), 108usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 55.919840s using 512K reads of 4k
(35.77 MB/sec), 107usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 55.921253s using 512K reads of 4k
(35.76 MB/sec), 108usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 55.923975s using 512K reads of 4k
(35.76 MB/sec), 108usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 55.900788s using 512K reads of 4k
(35.78 MB/sec), 108usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 55.910989s using 512K reads of 4k
(35.77 MB/sec), 108usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 55.903394s using 512K reads of 4k
(35.78 MB/sec), 107usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 55.868038s using 512K reads of 4k
(35.80 MB/sec), 108usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 55.895071s using 512K reads of 4k
(35.78 MB/sec), 108usec
during test, cpu was ~20% idle.
Test #5 of 2.5.25 kernel with lockmeter & profile=2
---------------------------------------------------
using profile=2 and lockmeter on test #5:
given performance wasn't significantly higher (it should have been), i
rebooted the machine, and loaded 2.5.25 with
"profile=2" and patched in lockmeter.
the same benchmark in test #5 was executed as follows:
readprofile -r; ./lockstat/lockstat on; ./lockstat/lockstat reset;
./test_disk_performance3 bs=4k blocks=50k
mode=nocopy operation=read seek=random iterations=10
/mnt/scrap-sd*/bigfile; readprofile -v | sort -n -k4 | tail -20;
./lockstat/lockstat print
the results were very odd -- i've removed all locks with 0% contention.
the contention seems to be in the scheduler:
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of
4096 in 91.014923 seconds (175.80 Mbytes/sec),
175usec mean
#0 (/mnt/scrap-sde/bigfile) 2000MB in 91.014743s using 512000
reads of 4096 bytes (21.97 Mbytes/sec), 139usec
#1 (/mnt/scrap-sdf/bigfile) 2000MB in 90.945358s using 512000
reads of 4096 bytes (21.99 Mbytes/sec), 178usec
#2 (/mnt/scrap-sdg/bigfile) 2000MB in 90.430662s using 512000
reads of 4096 bytes (22.12 Mbytes/sec), 174usec
#3 (/mnt/scrap-sdh/bigfile) 2000MB in 90.474412s using 512000
reads of 4096 bytes (22.11 Mbytes/sec), 174usec
#4 (/mnt/scrap-sdi/bigfile) 2000MB in 90.443735s using 512000
reads of 4096 bytes (22.11 Mbytes/sec), 174usec
#5 (/mnt/scrap-sdj/bigfile) 2000MB in 90.494653s using 512000
reads of 4096 bytes (22.10 Mbytes/sec), 174usec
#6 (/mnt/scrap-sdk/bigfile) 2000MB in 90.488251s using 512000
reads of 4096 bytes (22.10 Mbytes/sec), 174usec
#7 (/mnt/scrap-sdl/bigfile) 2000MB in 90.620444s using 512000
reads of 4096 bytes (22.07 Mbytes/sec), 174usec
80108938 resume_userspace 59 3.6875
8011b540 sys_gettimeofday 729 4.5563
80108650 handle_signal 1963 5.5767
801087b0 do_signal 1046 5.9432
80107a20 sys_rt_sigsuspend 2058 6.1250
80107dd0 sys_sigreturn 1984 6.5263
80108974 syscall_call 73 6.6364
8010a3f0 math_state_restore 450 7.0312
8010f140 restore_i387 3040 7.0370
8011bc40 do_softirq 1473 7.0817
8010ef10 save_i387 4281 7.6446
80120ab0 get_signal_to_deliver 5523 10.7871
80113ba0 schedule 10230 10.8369
8010ed80 restore_fpu 475 14.8438
80109418 device_not_available 787 16.3958
8010897f syscall_exit 203 18.4545
8010de50 do_gettimeofday 7115 49.4097
80120440 send_sig_info 9599 54.5398
80108948 system_call 5549 126.1136
80106d30 default_idle 52095 813.9844
lockstat results -- all contended locks shown:
___________________________________________________________________________________________
System: Linux mel-stglab-host1 2.5.25 #12 SMP Sun Jul 14 19:25:43 EST 2002 i686
Total counts
All (32) CPUs
Start time: Sun Jul 14 20:06:07 2002
End time: Sun Jul 14 20:11:46 2002
Delta Time: 338.84 sec.
Hash table slots in use: 432.
Global read lock slots in use: 999.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -
SPINLOCKS HOLD WAIT
UTIL CON MEAN( MAX ) MEAN( MAX )(% CPU) TOTAL NOWAIT SPIN
RJECT NAME
1.2% 0.7us(3768us) 1.7us( 286us)(0.02%) 88382474 98.8% 1.1%
0.02% *TOTAL*
0.03% 0.20% 1.6us( 7.5us) 2.6us( 12us)(0.00%) 61102 99.8%
0.20% 0% [0xf7b1c008]
0.03% 0.11% 3.2us( 7.5us) 2.1us( 4.9us)(0.00%) 30551 99.9%
0.11% 0% qla2x00_done+0x1e8
0.00% 0.30% 0.1us( 0.6us) 2.7us( 12us)(0.00%) 30551 99.7%
0.30% 0% qla2x00_queuecommand+0x484
0.01% 0.03% 0.2us( 1.1us) 0.9us( 2.2us)(0.00%) 244002 100%
0.03% 0% [0xf7b1e35c]
0.00% 0.02% 0.1us( 0.6us) 0.9us( 1.2us)(0.00%) 30551 100%
0.02% 0% qla2x00_callback+0x54
0.00% 0.02% 0.2us( 0.7us) 1.1us( 1.9us)(0.00%) 30145 100%
0.02% 0% qla2x00_done+0x34
0.00% 0.03% 0.2us( 1.1us) 0.7us( 0.9us)(0.00%) 30551 100%
0.03% 0% qla2x00_get_new_sp+0x18
0.00% 0.02% 0.2us( 1.1us) 1.2us( 1.9us)(0.00%) 61102 100%
0.02% 0% qla2x00_next+0x20
0.00% 0.06% 0.1us( 0.9us) 0.8us( 1.6us)(0.00%) 30551 100%
0.06% 0% qla2x00_next+0x10c
0.00% 0.05% 0.1us( 0.5us) 0.9us( 1.7us)(0.00%) 30551 100%
0.05% 0% qla2x00_queuecommand+0x424
0.00% 0.03% 0.2us( 0.9us) 1.1us( 2.2us)(0.00%) 30551 100%
0.03% 0% qla2x00_status_entry+0x830
0.06% 0.06% 3.4us( 11us) 2.8us( 5.3us)(0.00%) 60704 100%
0.06% 0% [0xf7b1e388]
0.02% 0.10% 2.2us( 8.0us) 3.0us( 5.3us)(0.00%) 30551 100%
0.10% 0% qla2x00_64bit_start_scsi+0x15c
0.00% 0% 0.3us(
0.4us) 0us 8 100% 0% 0%
qla2x00_cmd_timeout+0x190
0.04% 0.02% 4.7us( 11us) 2.2us( 3.0us)(0.00%) 30145 100%
0.02% 0% qla2x00_intr_handler+0x40
0.26% 0.04% 0.1us( 1.0us) 0.6us( 1.5us)(0.00%) 8927899 100%
0.04% 0% blk_plug_lock
0.00% 0.01% 0.1us( 0.8us) 0.3us( 0.4us)(0.00%) 28901 100%
0.01% 0% blk_plug_device+0x1c
0.26% 0.04% 0.1us( 1.0us) 0.6us( 1.5us)(0.00%) 8870097 100%
0.04% 0% blk_run_queues+0x18
0.00% 1.0% 0.1us( 0.5us) 0.6us( 1.1us)(0.00%) 28901
99.0% 1.0% 0% generic_unplug_device+0x2c
0.03% 0.05% 0.5us( 9.2us) 1.2us( 3.0us)(0.00%) 158705 100%
0.05% 0% contig_page_data+0xb4
0.00% 0.12% 0.2us( 3.0us) 1.3us( 2.0us)(0.00%) 27794 99.9%
0.12% 0% __free_pages_ok+0x174
0.02% 0.03% 0.6us( 9.2us) 1.1us( 3.0us)(0.00%) 130911 100%
0.03% 0% rmqueue+0x28
0.01% 0.03% 1.5us( 44us) 5.6us( 12us)(0.00%) 23127 100%
0.03% 0% dcache_lock
0.00% 0.04% 0.3us( 6.4us) 4.0us( 6.5us)(0.00%) 7053 100%
0.04% 0% dput+0x30
0.01% 0.04% 3.8us( 44us) 7.1us( 12us)(0.00%) 7114 100%
0.04% 0% path_lookup+0xd8
0.00% 0.01% 0.3us( 28us) 0.3us( 0.3us)(0.00%) 13515 100%
0.01% 0% files_lock
0.00% 0.02% 0.2us( 28us) 0.3us( 0.3us)(0.00%) 4489 100%
0.02% 0% __fput+0x70
0.18% 0.02% 1.7us( 4.8us) 1.7us( 3.3us)(0.00%) 367354 100%
0.02% 0% ioapic_lock
0.18% 0.02% 1.7us( 4.8us) 1.7us( 3.3us)(0.00%) 367354 100%
0.02% 0% set_ioapic_affinity+0x20
0.02% 2.7% 9.3us( 306us) 14us( 286us)(0.00%) 7358
97.3% 2.7% 0% kernel_flag
0.00% 0.28% 0.4us( 4.1us) 15us( 15us)(0.00%) 351 99.7%
0.28% 0% de_put+0x2c
0.00% 0.88% 21us( 97us) 15us( 21us)(0.00%) 342 99.1%
0.88% 0% ext3_dirty_inode+0x2c
0.00% 0.50% 2.0us( 70us) 24us( 24us)(0.00%) 202 99.5%
0.50% 0% ext3_get_block_handle+0xb8
0.00% 2.9% 4.9us( 7.0us) 146us( 146us)(0.00%) 35
97.1% 2.9% 0% ext3_write_super+0x24
0.00% 27.0% 52us( 306us) 29us( 286us)(0.00%) 226 73.0%
27.0% 0% schedule+0x394
0.00% 0.79% 0.7us( 1.2us) 9.2us( 14us)(0.00%) 379 99.2%
0.79% 0% sem_exit+0x24
0.00% 0.83% 2.4us( 21us) 5.3us( 8.5us)(0.00%) 845 99.2%
0.83% 0% sys_ioctl+0x44
0.00% 12.0% 6.7us( 36us) 4.2us( 12us)(0.00%) 502 88.0%
12.0% 0% tty_read+0xc4
0.01% 3.2% 8.8us( 53us) 6.2us( 25us)(0.00%) 2004
96.8% 3.2% 0% tty_write+0x1f4
0.02% 0.01% 0.2us( 37us) 3.3us( 16us)(0.00%) 247810 100%
0.01% 0% pagemap_lru_lock
0.01% 0.00% 0.4us( 36us) 14us( 16us)(0.00%) 102714 100%
0.00% 0% activate_page+0xc
0.01% 0.01% 0.1us( 31us) 0.6us( 1.0us)(0.00%) 124041 100%
0.01% 0% lru_cache_add+0x1c
0.00% 0.01% 0.2us( 30us) 0.9us( 0.9us)(0.00%) 21044 100%
0.01% 0% lru_cache_del+0xc
0.00% 83.2% 0.6us( 2.2us) 1.0us( 2.4us)(0.00%) 7652 16.8%
83.2% 0% runqueues
0.00% 83.2% 0.6us( 2.2us) 1.0us( 2.4us)(0.00%) 7652 16.8%
83.2% 0% load_balance+0x13c
0.00% 42.6% 0.8us( 4.2us) 1.0us( 3.0us)(0.00%) 15467 57.4%
42.6% 0% runqueues+0x9a0
0.00% 84.3% 0.4us( 1.9us) 1.0us( 3.0us)(0.00%) 7815 15.7%
84.3% 0% load_balance+0x188
0.05% 0.02% 0.3us( 4.0us) 0.8us( 2.5us)(0.00%) 560160 100%
0.02% 0% timerlist_lock
0.01% 0.04% 0.2us( 1.6us) 0.8us( 1.4us)(0.00%) 93886 100%
0.04% 0% add_timer+0x10
0.00% 0.05% 0.1us( 0.7us) 0.8us( 2.5us)(0.00%) 61444 100%
0.05% 0% del_timer+0x14
0.00% 0.02% 0.2us( 0.9us) 0.8us( 1.3us)(0.00%) 32154 100%
0.02% 0% del_timer_sync+0x1c
0.00% 0.46% 0.5us( 1.6us) 0.6us( 0.7us)(0.00%) 647 99.5%
0.46% 0% mod_timer+0x18
0.03% 0.01% 0.3us( 4.0us) 0.8us( 1.2us)(0.00%) 339146 100%
0.01% 0% timer_bh+0xd4
0.00% 0.01% 0.1us( 0.8us) 0.9us( 1.0us)(0.00%) 32883 100%
0.01% 0% timer_bh+0x274
0.04% 0.36% 3.8us( 12us) 3.2us( 14us)(0.00%) 31764 99.6%
0.36% 0% __make_request+0x70
0.07% 0.10% 7.5us( 44us) 2.5us( 4.5us)(0.00%) 30805 100%
0.10% 0% __scsi_end_request+0x20
0.01% 0.00% 0.6us( 6.9us) 0.5us( 0.8us)(0.00%) 81580 100%
0.00% 0% __wake_up+0x20
0.00% 0.29% 7.7us( 68us) 4.8us( 4.8us)(0.00%) 348 99.7%
0.29% 0% ahc_linux_isr+0x28
0.03% 1.3% 3.2us( 12us) 1.5us( 9.2us)(0.00%) 28915
98.7% 1.3% 0% generic_unplug_device+0x14
0.31% 1.2% 0.8us( 3.7us) 0us 1321702
98.8% 0% 1.2% load_balance+0x120
0.00% 0.59% 0.2us( 0.8us) 1.1us( 1.8us)(0.00%) 1525 99.4%
0.59% 0% n_tty_chars_in_buffer+0x18
0.00% 0.40% 0.1us( 0.5us) 0.6us( 0.6us)(0.00%) 501 99.6%
0.40% 0% read_chan+0x538
0.00% 0.60% 0.1us( 0.2us) 0.9us( 1.2us)(0.00%) 501 99.4%
0.60% 0% read_chan+0x59c
0.00% 0.02% 0.1us( 0.7us) 0.8us( 1.2us)(0.00%) 28530 100%
0.02% 0% remove_wait_queue+0x10
5.2% 3.0% 1.5us( 13us) 1.1us( 3.2us)(0.00%) 11473908
97.0% 3.0% 0% schedule+0x8c
0.02% 1.6% 0.2us( 5.3us) 1.0us( 2.6us)(0.00%) 308882
98.4% 1.6% 0% scheduler_tick+0x10c
0.08% 0.88% 0.8us( 4.7us) 1.0us( 2.2us)(0.00%) 369477 99.1%
0.88% 0% scheduler_tick+0x80
0.00% 0.45% 0.2us( 13us) 2.7us( 7.7us)(0.00%) 30819 99.5%
0.45% 0% scsi_dispatch_cmd+0x138
0.00% 0.21% 0.2us( 1.0us) 2.4us( 5.6us)(0.00%) 30819 99.8%
0.21% 0% scsi_finish_command+0x18
0.00% 0.22% 0.5us( 3.2us) 2.7us( 6.4us)(0.00%) 30819 99.8%
0.22% 0% scsi_queue_next_request+0x18
0.00% 0.08% 0.3us( 4.0us) 5.1us( 18us)(0.00%) 30819 100%
0.08% 0% scsi_request_fn+0x3bc
5.2% 0.00% 2.0us( 12us) 0.7us( 1.0us)(0.00%) 8750674 100%
0.00% 0% send_sig_info+0x4c
0.00% 0.93% 0.8us( 1.9us) 0.2us( 0.2us)(0.00%) 108 99.1%
0.93% 0% sys_sched_yield+0x38
1.6% 7.3% 0.6us( 4.8us) 2.0us( 6.7us)(0.01%) 8758669
92.7% 7.3% 0% try_to_wake_up+0x40
0.00% 0.26% 0.7us( 2.0us) 0.3us( 0.3us)(0.00%) 379 99.7%
0.26% 0% wake_up_forked_process+0x30
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - -
RWLOCK READS HOLD MAX RDR BUSY PERIOD WAIT
UTIL CON MEAN RDRS MEAN( MAX ) MEAN( MAX )( %CPU) TOTAL
NOWAIT SPIN NAME
0.14% 4.7us( 13us)(0.00%) 60230957
99.9% 0.14% *TOTAL*
7.3% 0.00% 2.8us 2 2.8us( 245us) 3.1us(
9.0us)(0.00%) 8751481 100% 0.00% tasklist_lock
0.26% 2.6us( 2.6us)(0.00%) 379
99.7% 0.26% exit_notify+0x18
11.1% 2.6us( 2.6us)(0.00%) 9
88.9% 11.1% sig_exit+0x90
1.4% 0.44% 0.2us 2 0.3us( 3.3us) 4.7us( 13us)(0.00%) 19266661
99.6% 0.44% xtime_lock
0.44% 4.7us( 13us)(0.00%) 19266661
99.6% 0.44% do_gettimeofday+0x14
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - -
RWLOCK WRITES HOLD WAIT (ALL) WAIT (WW)
UTIL CON MEAN( MAX ) MEAN( MAX )( %CPU) MEAN( MAX ) TOTAL
NOWAIT SPIN( WW ) NAME
0.13% 0.7us( 230us) 1.0us( 9.0us)(0.00%) 0.3us( 2.9us) 10490234
99.9% 0.09%(0.05%) *TOTAL*
0.00% 7.8% 0.5us( 3.0us) 1.8us( 9.0us)(0.00%) 0.7us( 2.9us) 1224
92.2% 6.7%( 1.1%) tasklist_lock
0.00% 7.7% 0.7us( 1.3us) 1.6us( 7.4us)(0.00%) 0.3us( 0.7us) 379
92.3% 6.3%( 1.3%) do_fork+0x588
0% 7.1% 2.0us( 3.1us)(0.00%) 1.5us( 2.9us) 379
92.9% 5.8%( 1.3%) exit_notify+0x1c4
0.00% 4.6% 0.1us( 0.1us) 1.4us( 1.7us)(0.00%) 0.6us( 0.6us) 87
95.4% 3.4%( 1.1%) pid_base_iput+0x18
0.00% 9.5% 0.8us( 3.0us) 1.8us( 9.0us)(0.00%) 0.2us( 0.6us) 379
90.5% 8.7%(0.79%) unhash_process+0x14
1.0% 2.1% 5.2us( 18us) 1.0us( 2.3us)(0.00%) 0.3us( 1.2us) 678292
97.9% 1.3%(0.74%) xtime_lock
0.04% 2.1% 0.4us( 4.8us) 1.0us( 2.3us)(0.00%) 0.3us( 1.2us) 339146
97.9% 1.3%(0.81%) timer_bh+0xc
1.0% 2.0% 10us( 18us) 0.9us( 2.2us)(0.00%) 0.3us( 0.9us) 339146
98.0% 1.3%(0.68%) timer_interrupt+0x10
1.1% 0% 0.4us(
230us) 0us 0us 9604614 100% 0%( 0%)
do_generic_file_read+0x8c
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-14 12:22 ` ext2 performance in 2.5.25 versus 2.4.19pre8aa2 Lincoln Dale
@ 2002-07-15 5:30 ` Andrew Morton
2002-07-15 6:06 ` Lincoln Dale
2002-07-17 19:22 ` Daniel Phillips
2002-07-15 16:30 ` Benjamin LaHaise
1 sibling, 2 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-15 5:30 UTC (permalink / raw)
To: Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
Lincoln Dale wrote:
>
> Andrew Morton wanted me to do some benchmarking of large files on ext2
> filesystems rather than the usual block-device testing
> i've had some time to do this, here are the results.
>
> one-line summary is that some results are better, some are worse; CPU usage
> is better in 2.5.25, but throughput is sometimes
> worse.
Well thanks for doing this. All rather strange though.
- You should definitely be seeing reduced CPU on writes through the
pagecache. A whole pile of gunk has disappeared from there.
Here's what I get with 4x4gig files on 4xIDE disks:
for i in hde5 hdg5 hdi5 hdk5
do
/usr/src/ext3/tools/write-and-fsync -m 4000 -f /mnt/$i/foo &
done
2.4.19-rc1+block_highmem 0.06s user 106.75s system 53% cpu 3:20.94 total
2.5.25 0.03s user 78.37s system 40% cpu 3:14.82 total
2.5.25+some stuff 0.05s user 77.91s system 41% cpu 3:07.70 total
2.5.25+O_DIRECT 0.00s user 6.84s system 3% cpu 2:53.21 total
That's a 25% drop in CPU load for writes in 2.5. Actually more, because
Andre's current 2.5 IDE drivers are using teeny requests and are measurably
slow.
That's how it should be, and it is strange that you're not showing
decreased CPU and increased throughput on writes.
- For reads through the pagecache you're showing good reduction in CPU
and some increase in bandwidth.
When reading the above 4 files in parallel on the IDE setup I show:
for i in hde5 hdg5 hdi5 hdk5
do
time /usr/src/ext3/tools/time-read -b 8192 -h 8192 /mnt/$i/foo &
done
2.5.25: 0.43s user 42.74s system 31% cpu 2:17.87 total
2.4.19-rc1+block-highmem: 0.37s user 54.48s system 40% cpu 2:16.17 total
2.4.19-rc1: 0.63s user 129.21s system 76% cpu 2:49.66 total
A 25% drop in CPU load on buffered reads in 2.5.
Funny thing about your results is the presence of sched_yield(),
especially in the copy-from-pagecache-only load. That test should
peg the CPU at 100% and definitely shouldn't be spending time in
default_idle. So who is calling sched_yield()? I think it has to be
your test app?
Be aware that the sched_yield() behaviour in 2.5 has changed a lot
wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
system, for a start. (This is a SO problem and not a kernel problem,
but it's a lesson).
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 5:30 ` Andrew Morton
@ 2002-07-15 6:06 ` Lincoln Dale
2002-07-15 6:52 ` Andrew Morton
2002-07-15 9:49 ` Andrea Arcangeli
2002-07-17 19:22 ` Daniel Phillips
1 sibling, 2 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-15 6:06 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
>Funny thing about your results is the presence of sched_yield(),
>especially in the copy-from-pagecache-only load. That test should
>peg the CPU at 100% and definitely shouldn't be spending time in
>default_idle. So who is calling sched_yield()? I think it has to be
>your test app?
>
>Be aware that the sched_yield() behaviour in 2.5 has changed a lot
>wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
>system, for a start. (This is a SO problem and not a kernel problem,
>but it's a lesson).
my test app uses pthreads (one thread per disk-worker) and
pthread_cond_wait in the master task to wait for all workers to finish.
i'll switch the app to use clone() and sys_futex instead.
i guess in that case, it's debatable whether it's a kernel problem or not --
pthreads is out there, and if its default behavior is bad, any threaded app
which uses it will also be bad.
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 6:06 ` Lincoln Dale
@ 2002-07-15 6:52 ` Andrew Morton
2002-07-15 9:49 ` Andrea Arcangeli
1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-15 6:52 UTC (permalink / raw)
To: Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
Lincoln Dale wrote:
>
> At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
> >Funny thing about your results is the presence of sched_yield(),
> >especially in the copy-from-pagecache-only load. That test should
> >peg the CPU at 100% and definitely shouldn't be spending time in
> >default_idle. So who is calling sched_yield()? I think it has to be
> >your test app?
> >
> >Be aware that the sched_yield() behaviour in 2.5 has changed a lot
> >wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
> >system, for a start. (This is a SO problem and not a kernel problem,
> >but it's a lesson).
>
> my test app uses pthreads (one thread per disk-worker) and
> pthread_cond_wait in the master task to wait for all workers to finish.
> i'll switch the app to use clone() and sys_futex instead.
OK.
> i guess in that case, it's debatable whether it's a kernel problem or not --
> pthreads is out there, and if its default behavior is bad, any threaded app
> which uses it will also be bad.
Well if your machine is executing a single cycle in default_idle
with that load then there's a bug somewhere.
I took a quick look at glibc-linuxthreads but as usual, my brain
turned to mush and it took seven years off my life.
If you can send me a copy of your test app I'll take a look
at what's going on.
Thanks.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 6:06 ` Lincoln Dale
2002-07-15 6:52 ` Andrew Morton
@ 2002-07-15 9:49 ` Andrea Arcangeli
2002-07-15 10:16 ` Lincoln Dale
2002-07-15 18:08 ` Andrew Morton
1 sibling, 2 replies; 17+ messages in thread
From: Andrea Arcangeli @ 2002-07-15 9:49 UTC (permalink / raw)
To: Lincoln Dale
Cc: Andrew Morton, Benjamin LaHaise, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
On Mon, Jul 15, 2002 at 04:06:21PM +1000, Lincoln Dale wrote:
> At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
> >Funny thing about your results is the presence of sched_yield(),
> >especially in the copy-from-pagecache-only load. That test should
> >peg the CPU at 100% and definitely shouldn't be spending time in
> >default_idle. So who is calling sched_yield()? I think it has to be
> >your test app?
> >
> >Be aware that the sched_yield() behaviour in 2.5 has changed a lot
> >wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
> >system, for a start. (This is a SO problem and not a kernel problem,
> >but it's a lesson).
>
> my test app uses pthreads (one thread per disk-worker) and
> pthread_cond_wait in the master task to wait for all workers to finish.
> i'll switch the app to use clone() and sys_futex instead.
unless you call pthread routines during the workload, pthreads cannot be
the reason for a slowdown.
Also I would suggest Andrew to benchmark 2.4.19rc1aa2 against 2.5
instead of plain rc1 just to be sure to compare apples to apples.
(rc1aa2 should also be faster than pre8aa2)
>BTW, Lincoln, I still have a pending answer for you, about the mmap
slowdown, that's because of reduced readahead mostly, you can tune it
with page-cluster sysctl, it's not only because of the expensive page
>faults that mmap I/O implies. I've some revolutionary idea about
>replacing readahead, not that it matters for your workload that is
>reading physically contiguous though.
Andrea
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 9:49 ` Andrea Arcangeli
@ 2002-07-15 10:16 ` Lincoln Dale
2002-07-15 18:08 ` Andrew Morton
1 sibling, 0 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-15 10:16 UTC (permalink / raw)
To: linux-kernel
At 11:49 AM 15/07/2002 +0200, Andrea Arcangeli wrote:
> > my test app uses pthreads (one thread per disk-worker) and
> > pthread_cond_wait in the master task to wait for all workers to finish.
> > i'll switch the app to use clone() and sys_futex instead.
>
>unless you call pthread routines during the workload, pthreads cannot be
>the reason for a slowdown.
the test app does:
(parent)
for (i=0; i < num_devices; i++) {
err = pthread_create(&(device[i]->thread), NULL, (void
*)run_tests, (void *) i);
..
}
/* wait for all threads to exit */
while (active_threads != 0) {
pthread_mutex_lock(&sync_thread_mutex);
gettimeofday(&now, NULL);
timeout.tv_sec = now.tv_sec + 5;
timeout.tv_nsec = now.tv_usec * 1000;
retcode = 0;
while ((active_threads != 0) && (retcode != ETIMEDOUT)) {
retcode =
pthread_cond_timedwait(&sync_thread_cond, &sync_thread_mutex, &timeout);
}
if (retcode == ETIMEDOUT) {
print_status_update();
}
pthread_mutex_unlock(&sync_thread_mutex);
}
(each worker, when it finishes)
pthread_mutex_lock(&sync_thread_mutex);
active_threads--;
pthread_cond_broadcast(&sync_thread_cond);
pthread_mutex_unlock(&sync_thread_mutex);
pthread_exit(0);
no idea what the pthread_cond_timedwait does under the covers, but i bet
it's bad..
>BTW, Lincol, I still have a pending answer for you, about the mmap
>slowdown, that's because of reduced readahead mostly, you can tune it
>with page-cluster sysctl, it's not only because of the expensive page
>faults that mmap I/O implies. I've some revolutionary idea about
>replacing readahead, not that it matters for your workload that is
>reading physically contigous though.
i only added the mmap() for interests-sake; the intent of my benchmarking
was less-so to stress linux, more-so to stress some storage-networking
plumbing (iSCSI & FC stuff), but its been an interesting series of
experiments nonetheless.
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-14 12:22 ` ext2 performance in 2.5.25 versus 2.4.19pre8aa2 Lincoln Dale
2002-07-15 5:30 ` Andrew Morton
@ 2002-07-15 16:30 ` Benjamin LaHaise
1 sibling, 0 replies; 17+ messages in thread
From: Benjamin LaHaise @ 2002-07-15 16:30 UTC (permalink / raw)
To: Lincoln Dale
Cc: Andrew Morton, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
On Sun, Jul 14, 2002 at 10:22:56PM +1000, Lincoln Dale wrote:
> one-line summary is that some results are better, some are worse; CPU usage
is better in 2.5.25, but throughput is sometimes
> worse.
You might want to rerun your tests on 2.5.25 after redefining HZ to be 100,
or setting HZ to 1000 in the 2.4 kernel.
-ben
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 9:49 ` Andrea Arcangeli
2002-07-15 10:16 ` Lincoln Dale
@ 2002-07-15 18:08 ` Andrew Morton
1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-15 18:08 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Lincoln Dale, Benjamin LaHaise, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
Andrea Arcangeli wrote:
>
> On Mon, Jul 15, 2002 at 04:06:21PM +1000, Lincoln Dale wrote:
> > At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
> > >Funny thing about your results is the presence of sched_yield(),
> > >especially in the copy-from-pagecache-only load. That test should
> > >peg the CPU at 100% and definitely shouldn't be spending time in
> > >default_idle. So who is calling sched_yield()? I think it has to be
> > >your test app?
> > >
> > >Be aware that the sched_yield() behaviour in 2.5 has changed a lot
> > >wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
> > >system, for a start. (This is a SO problem and not a kernel problem,
> > >but it's a lesson).
> >
> > my test app uses pthreads (one thread per disk-worker) and
> > pthread_cond_wait in the master task to wait for all workers to finish.
> > i'll switch the app to use clone() and sys_futex instead.
>
> unless you call pthread routines during the workload, pthreads cannot be
> the reason for a slowdown.
I didn't see the machine spending any time idle when I ran Lincoln's
test so I'm not sure what's going on there. But the pthread thing
is surely the reason why the profiles are showing time in sched_yield().
What I *did* see was 2.5 spending too much time doing pointless work
in readahead (it's in cache already, stop doing that!). And also
generic_file_llseek() bouncing i_sem around like a ping-pong ball.
Fixing those things up bought 10%.
> Also I would suggest Andrew to benchmark 2.4.19rc1aa2 against 2.5
> instead of plain rc1 just to be sure to compare apples to apples.
> (rc1aa2 should also be faster than pre8aa2)
Yes sorry, but I find testing -aa is a bit of a pain. It's such a
big patch, I'd really need to start a new branch for it.
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 5:30 ` Andrew Morton
2002-07-15 6:06 ` Lincoln Dale
@ 2002-07-17 19:22 ` Daniel Phillips
1 sibling, 0 replies; 17+ messages in thread
From: Daniel Phillips @ 2002-07-17 19:22 UTC (permalink / raw)
To: Andrew Morton, Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
On Monday 15 July 2002 07:30, Andrew Morton wrote:
> Lincoln Dale wrote:
> >
> > Andrew Morton wanted me to do some benchmarking of large files on ext2
> > filesystems rather than the usual block-device testing
> > i've had some time to do this, here are the results.
> >
> > one-line summary is that some results are better, some are worse; CPU
> > usage is better in 2.5.25, but throughput is sometimes worse.
>
> Well thanks for doing this. All rather strange though.
One result that seems pretty consistent in these tests is that avoiding the
page cache is good for about 20% overall throughput improvement. Which is
significant, but less than I would have thought if bus bandwidth is the only
major bottleneck. Something in the vfs/filesystem/blockio path is still
eating too much cpu.
Another observation: though only one of the tests hit 100% CPU, total
throughput still shows consistent improvement as a result of reducing CPU.
This should not be, it means there is excessive latency between submission of
requests, that is, the IO pipes are not being kept full.
--
Daniel
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2002-07-17 19:25 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
2002-07-08 7:44 ` Ingo Oeser
2002-07-11 2:25 ` Lincoln Dale
2002-07-11 3:24 ` Andrew Morton
2002-07-11 3:25 ` Lincoln Dale
[not found] ` <3D2CFF48.9EFF9C59@zip.com.au>
2002-07-14 12:22 ` ext2 performance in 2.5.25 versus 2.4.19pre8aa2 Lincoln Dale
2002-07-15 5:30 ` Andrew Morton
2002-07-15 6:06 ` Lincoln Dale
2002-07-15 6:52 ` Andrew Morton
2002-07-15 9:49 ` Andrea Arcangeli
2002-07-15 10:16 ` Lincoln Dale
2002-07-15 18:08 ` Andrew Morton
2002-07-17 19:22 ` Daniel Phillips
2002-07-15 16:30 ` Benjamin LaHaise
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
2002-07-11 23:40 ` Lincoln Dale
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox