* direct-to-BIO for O_DIRECT
@ 2002-07-08 3:19 Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
` (2 more replies)
0 siblings, 3 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-08 3:19 UTC (permalink / raw)
To: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
the kiovec layer. It's followed by a patch which converts the raw
driver to use the O_DIRECT engine.
CPU utilisation is about the same as the kiovec-based implementation.
Read and write bandwidth are the same too, for 128k chunks. But with
one megabyte chunks, this implementation is 20% faster at writing.
I assume this is because the kiobuf-based implementation has to stop
and wait for each 128k chunk, whereas this code streams the entire
request, regardless of its size.
This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
margin to widen on higher-end hardware which likes to have more
requests in flight.
Question is: what do we want to do with this sucker? These are the
remaining users of kiovecs:
drivers/md/lvm-snap.c
drivers/media/video/video-buf.c
drivers/mtd/devices/blkmtd.c
drivers/scsi/sg.c
the video and mtd drivers seem to be fairly easy to de-kiobufize.
I'm aware of one proprietary driver which uses kiobufs. XFS uses
kiobufs a little bit - just to map the pages.
So with a bit of effort and maintainer-irritation, we can extract
the kiobuf layer from the kernel.
Do we want to do that?
fs/Makefile | 2
fs/block_dev.c | 7
fs/buffer.c | 2
fs/direct-io.c | 491 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext2/inode.c | 7
include/linux/buffer_head.h | 2
include/linux/fs.h | 11
mm/filemap.c | 64 ++---
8 files changed, 543 insertions(+), 43 deletions(-)
--- /dev/null Thu Aug 30 13:30:55 2001
+++ 2.5.25-akpm/fs/direct-io.c Sun Jul 7 19:40:20 2002
@@ -0,0 +1,491 @@
+/*
+ * mm/direct-io.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * O_DIRECT
+ *
+ * 04Jul2002 akpm@zip.com.au
+ * Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/buffer_head.h>
+#include <linux/rwsem.h>
+#include <asm/atomic.h>
+
+/*
+ * The largest-sized BIO which this code will assemble, in bytes. Set this
+ * to PAGE_SIZE if your drivers are broken.
+ */
+#define DIO_BIO_MAX_SIZE BIO_MAX_SIZE
+
+/*
+ * How many user pages to map in one call to get_user_pages(). This determines
+ * the size of a structure on the stack.
+ */
+#define DIO_PAGES 64
+
+struct dio {
+ /* BIO submission state */
+ struct bio *bio; /* bio under assembly */
+ struct bio_vec *bvec; /* current bvec in that bio */
+ struct inode *inode;
+ int rw;
+ sector_t block_in_file; /* changes */
+ sector_t final_block_in_request;/* doesn't change */
+ unsigned first_block_in_page; /* doesn't change */
+ int boundary; /* prev block is at a boundary */
+ int reap_counter; /* rate limit reaping */
+ get_block_t *get_block;
+ sector_t last_block_in_bio;
+
+ /* Page fetching state */
+ int curr_page; /* changes */
+ int total_pages; /* doesn't change */
+ unsigned long curr_user_address;/* changes */
+
+ /* Page queue */
+ struct page *pages[DIO_PAGES];
+ unsigned head;
+ unsigned tail;
+
+ /* BIO completion state */
+ atomic_t bio_count;
+ spinlock_t bio_list_lock;
+ struct bio *bio_list; /* singly linked via bi_private */
+ wait_queue_head_t wait_q;
+};
+
+/*
+ * How many pages are in the queue?
+ */
+static inline unsigned dio_pages_present(struct dio *dio)
+{
+ return dio->head - dio->tail;
+}
+
+/*
+ * Go grab and pin some userspace pages. Typically we'll get 64 at a time.
+ */
+static int dio_refill_pages(struct dio *dio)
+{
+ int ret;
+ int nr_pages;
+
+ nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+ ret = get_user_pages(
+ current, /* Task for fault accounting */
+ current->mm, /* whose pages? */
+ dio->curr_user_address, /* Where from? */
+ nr_pages, /* How many pages? */
+ dio->rw == READ, /* Write to memory? */
+ 0, /* force (?) */
+ &dio->pages[0],
+ NULL); /* vmas */
+
+ if (ret >= 0) {
+ dio->curr_user_address += ret * PAGE_SIZE;
+ dio->curr_page += ret;
+ dio->head = 0;
+ dio->tail = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * Get another userspace page. Returns an ERR_PTR on error. Pages are
+ * buffered inside the dio so that we can call get_user_pages() against a
+ * decent number of pages, less frequently. To provide nicer use of the
+ * L1 cache.
+ */
+static struct page *dio_get_page(struct dio *dio)
+{
+ if (dio_pages_present(dio) == 0) {
+ int ret;
+
+ ret = dio_refill_pages(dio);
+ if (ret) {
+ printk("%s: dio_refill_pages returns %d\n",
+ __FUNCTION__, ret);
+ return ERR_PTR(ret);
+ }
+ BUG_ON(dio_pages_present(dio) == 0);
+ }
+ return dio->pages[dio->head++];
+}
+
+/*
+ * The BIO completion handler simply queues the BIO up for the process-context
+ * handler.
+ *
+ * During I/O bi_private points at the dio. After I/O, bi_private is used to
+ * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ */
+static void dio_bio_end_io(struct bio *bio)
+{
+ struct dio *dio = bio->bi_private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ bio->bi_private = dio->bio_list;
+ dio->bio_list = bio;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ wake_up(&dio->wait_q);
+}
+
+static int
+dio_bio_alloc(struct dio *dio, struct block_device *bdev,
+ sector_t first_sector, int nr_vecs)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, nr_vecs);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ bio->bi_bdev = bdev;
+ bio->bi_vcnt = nr_vecs;
+ bio->bi_idx = 0;
+ bio->bi_size = 0;
+ bio->bi_sector = first_sector;
+ bio->bi_io_vec[0].bv_page = NULL;
+ bio->bi_end_io = dio_bio_end_io;
+
+ dio->bio = bio;
+ dio->bvec = NULL; /* debug */
+ return 0;
+}
+
+static void dio_bio_submit(struct dio *dio)
+{
+ struct bio *bio = dio->bio;
+
+ bio->bi_vcnt = bio->bi_idx;
+ bio->bi_idx = 0;
+ bio->bi_private = dio;
+ atomic_inc(&dio->bio_count);
+ submit_bio(dio->rw, bio);
+
+ dio->bio = NULL;
+ dio->bvec = NULL;
+}
+
+/*
+ * Release any resources in case of a failure
+ */
+static void dio_cleanup(struct dio *dio)
+{
+ while (dio_pages_present(dio))
+ page_cache_release(dio_get_page(dio));
+}
+
+/*
+ * Wait for the next BIO to complete. Remove it and return it.
+ */
+static struct bio *dio_await_one(struct dio *dio)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ unsigned long flags;
+ struct bio *bio;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ while (dio->bio_list == NULL) {
+ add_wait_queue(&dio->wait_q, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (dio->bio_list == NULL) {
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ blk_run_queues();
+ schedule();
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&dio->wait_q, &wait);
+ }
+ bio = dio->bio_list;
+ dio->bio_list = bio->bi_private;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ return bio;
+}
+
+/*
+ * Process one completed BIO. No locks are held.
+ */
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int page_no;
+ int ret = 0;
+
+ for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+ struct page *page = bvec[page_no].bv_page;
+
+ if (!uptodate) {
+ if (ret == 0)
+ ret = -EIO;
+ }
+
+ if (dio->rw == READ)
+ set_page_dirty(page);
+ page_cache_release(page);
+ }
+ atomic_dec(&dio->bio_count);
+ bio_put(bio);
+ return ret;
+}
+
+/*
+ * Wait on and process all in-flight BIOs.
+ */
+static int dio_await_completion(struct dio *dio)
+{
+ int ret = 0;
+ while (atomic_read(&dio->bio_count)) {
+ struct bio *bio = dio_await_one(dio);
+ int ret2;
+
+ ret2 = dio_bio_complete(dio, bio);
+ if (ret == 0)
+ ret = ret2;
+ }
+ return ret;
+}
+
+/*
+ * A really large O_DIRECT read or write can generate a lot of BIOs. So
+ * to keep the memory consumption sane we periodically reap any completed BIOs
+ * during the BIO generation phase.
+ *
+ * This also helps to limit the peak amount of pinned userspace memory.
+ */
+static int dio_bio_reap(struct dio *dio)
+{
+ int ret = 0;
+
+ if (dio->reap_counter++ >= 64) {
+ while (dio->bio_list) {
+ unsigned long flags;
+ struct bio *bio;
+ int ret2;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ bio = dio->bio_list;
+ dio->bio_list = bio->bi_private;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ ret2 = dio_bio_complete(dio, bio);
+ if (ret == 0)
+ ret = ret2;
+ }
+ dio->reap_counter = 0;
+ }
+ return ret;
+}
+
+/*
+ * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
+ */
+int do_direct_IO(struct dio *dio)
+{
+ struct inode * const inode = dio->inode;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ struct page *page;
+ unsigned block_in_page;
+ int ret;
+
+ /* The I/O can start at any block offset within the first page */
+ block_in_page = dio->first_block_in_page;
+
+ while (dio->block_in_file < dio->final_block_in_request) {
+ int new_page; /* Need to insert this page into the BIO? */
+
+ page = dio_get_page(dio);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
+ new_page = 1;
+ for ( ; block_in_page < blocks_per_page; block_in_page++) {
+ struct buffer_head map_bh;
+ struct bio *bio;
+
+ map_bh.b_state = 0;
+ ret = (*dio->get_block)(inode, dio->block_in_file,
+ &map_bh, dio->rw == WRITE);
+ if (ret) {
+ printk("%s: get_block returns %d\n",
+ __FUNCTION__, ret);
+ goto fail_release;
+ }
+ /* blockdevs do not set buffer_new */
+ if (buffer_new(&map_bh))
+ unmap_underlying_metadata(map_bh.b_bdev,
+ map_bh.b_blocknr);
+ if (!buffer_mapped(&map_bh)) {
+ ret = -EINVAL; /* A hole */
+ goto fail_release;
+ }
+ if (dio->bio) {
+ if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
+ dio->boundary ||
+ dio->last_block_in_bio !=
+ map_bh.b_blocknr - 1) {
+ dio_bio_submit(dio);
+ dio->boundary = 0;
+ }
+ }
+ if (dio->bio == NULL) {
+ ret = dio_bio_reap(dio);
+ if (ret)
+ goto fail_release;
+ ret = dio_bio_alloc(dio, map_bh.b_bdev,
+ map_bh.b_blocknr << (blkbits - 9),
+ DIO_BIO_MAX_SIZE / PAGE_SIZE);
+ if (ret)
+ goto fail_release;
+ new_page = 1;
+ dio->boundary = 0;
+ }
+
+ bio = dio->bio;
+ if (new_page) {
+ dio->bvec = &bio->bi_io_vec[bio->bi_idx];
+ page_cache_get(page);
+ dio->bvec->bv_page = page;
+ dio->bvec->bv_len = 0;
+ dio->bvec->bv_offset = block_in_page*blocksize;
+ bio->bi_idx++;
+ }
+ new_page = 0;
+ dio->bvec->bv_len += blocksize;
+ bio->bi_size += blocksize;
+ dio->last_block_in_bio = map_bh.b_blocknr;
+ dio->boundary = buffer_boundary(&map_bh);
+
+ dio->block_in_file++;
+ if (dio->block_in_file >= dio->final_block_in_request)
+ break;
+ }
+ block_in_page = 0;
+ page_cache_release(page);
+ }
+ ret = 0;
+ goto out;
+fail_release:
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+struct dio *g_dio;
+
+int
+generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
+ size_t count, get_block_t get_block)
+{
+ const unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
+ const unsigned long user_addr = (unsigned long)buf;
+ int ret = 0;
+ int ret2;
+ struct dio dio;
+ size_t bytes;
+
+ /* Check the memory alignment. Blocks cannot straddle pages */
+ if ((user_addr & blocksize_mask) || (count & blocksize_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ g_dio = &dio;
+
+ /* BIO submission state */
+ dio.bio = NULL;
+ dio.bvec = NULL;
+ dio.inode = inode;
+ dio.rw = rw;
+ dio.block_in_file = offset >> inode->i_blkbits;
+ dio.final_block_in_request = (offset + count) >> inode->i_blkbits;
+
+ /* Index into the first page of the first block */
+ dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1))
+ >> inode->i_blkbits;
+ dio.boundary = 0;
+ dio.reap_counter = 0;
+ dio.get_block = get_block;
+ dio.last_block_in_bio = -1;
+
+ /* Page fetching state */
+ dio.curr_page = 0;
+ bytes = count;
+ dio.total_pages = 0;
+ if (offset & PAGE_SIZE) {
+ dio.total_pages++;
+ bytes -= PAGE_SIZE - (offset & ~(PAGE_SIZE - 1));
+ }
+ dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ dio.curr_user_address = user_addr;
+
+ /* Page queue */
+ dio.head = 0;
+ dio.tail = 0;
+
+ /* BIO completion state */
+ atomic_set(&dio.bio_count, 0);
+ spin_lock_init(&dio.bio_list_lock);
+ dio.bio_list = NULL;
+ init_waitqueue_head(&dio.wait_q);
+
+ down_read(&current->mm->mmap_sem);
+ ret = do_direct_IO(&dio);
+ up_read(&current->mm->mmap_sem);
+
+ if (dio.bio)
+ dio_bio_submit(&dio);
+ if (ret)
+ dio_cleanup(&dio);
+ ret2 = dio_await_completion(&dio);
+ if (ret == 0)
+ ret = ret2;
+ if (ret == 0)
+ ret = count - ((dio.final_block_in_request -
+ dio.block_in_file) << inode->i_blkbits);
+out:
+ return ret;
+}
+
+ssize_t
+generic_file_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
+{
+ struct address_space *mapping = inode->i_mapping;
+ unsigned blocksize_mask;
+ ssize_t retval;
+
+ blocksize_mask = (1 << inode->i_blkbits) - 1;
+ if ((offset & blocksize_mask) || (count & blocksize_mask)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (mapping->nrpages) {
+ retval = filemap_fdatawrite(mapping);
+ if (retval == 0)
+ retval = filemap_fdatawait(mapping);
+ if (retval)
+ goto out;
+ }
+ retval = mapping->a_ops->direct_IO(rw, inode, buf, offset, count);
+out:
+ return retval;
+}
--- 2.5.25/include/linux/fs.h~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/include/linux/fs.h Sun Jul 7 19:35:39 2002
@@ -303,8 +303,8 @@ struct address_space_operations {
int (*bmap)(struct address_space *, long);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
-#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
- int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+ int (*direct_IO)(int, struct inode *, char *buf,
+ loff_t offset, size_t count);
};
struct backing_dev_info;
@@ -1128,7 +1128,7 @@ extern int check_disk_change(kdev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_inode_pages2(struct address_space *);
+extern void invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
@@ -1233,6 +1233,11 @@ extern int file_read_actor(read_descript
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+ssize_t generic_file_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count);
+int generic_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count, get_block_t *get_block);
+
extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
--- 2.5.25/include/linux/buffer_head.h~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/include/linux/buffer_head.h Sun Jul 7 19:35:39 2002
@@ -182,8 +182,6 @@ int block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
-int generic_direct_IO(int, struct inode *, struct kiobuf *,
- unsigned long, int, get_block_t *);
int file_fsync(struct file *, struct dentry *, int);
#define OSYNC_METADATA (1<<0)
--- 2.5.25/fs/buffer.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/buffer.c Sun Jul 7 19:35:39 2002
@@ -2298,6 +2298,7 @@ sector_t generic_block_bmap(struct addre
return tmp.b_blocknr;
}
+#if 0
int generic_direct_IO(int rw, struct inode *inode,
struct kiobuf *iobuf, unsigned long blocknr,
int blocksize, get_block_t *get_block)
@@ -2344,6 +2345,7 @@ int generic_direct_IO(int rw, struct ino
out:
return retval;
}
+#endif
/*
* Start I/O on a physical range of kernel memory, defined by a vector
--- 2.5.25/mm/filemap.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/mm/filemap.c Sun Jul 7 19:35:39 2002
@@ -413,7 +413,7 @@ static int invalidate_list_pages2(struct
* free the pages because they're mapped.
* @mapping: the address_space which pages we want to invalidate
*/
-void invalidate_inode_pages2(struct address_space * mapping)
+void invalidate_inode_pages2(struct address_space *mapping)
{
int unlocked;
@@ -1101,6 +1101,7 @@ no_cached_page:
UPDATE_ATIME(inode);
}
+#if 0
static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
{
ssize_t retval;
@@ -1181,6 +1182,7 @@ static ssize_t generic_file_direct_IO(in
out:
return retval;
}
+#endif
int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
@@ -1208,15 +1210,36 @@ int file_read_actor(read_descriptor_t *
* This is the "read()" routine for all filesystems
* that can use the page cache directly.
*/
-ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+ssize_t
+generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
ssize_t retval;
if ((ssize_t) count < 0)
return -EINVAL;
- if (filp->f_flags & O_DIRECT)
- goto o_direct;
+ if (filp->f_flags & O_DIRECT) {
+ loff_t pos = *ppos, size;
+ struct address_space *mapping;
+ struct inode *inode;
+
+ mapping = filp->f_dentry->d_inode->i_mapping;
+ inode = mapping->host;
+ retval = 0;
+ if (!count)
+ goto out; /* skip atime */
+ size = inode->i_size;
+ if (pos < size) {
+ if (pos + count > size)
+ count = size - pos;
+ retval = generic_file_direct_IO(READ, inode,
+ buf, pos, count);
+ if (retval > 0)
+ *ppos = pos + retval;
+ }
+ UPDATE_ATIME(filp->f_dentry->d_inode);
+ goto out;
+ }
retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
@@ -1229,36 +1252,14 @@ ssize_t generic_file_read(struct file *
desc.count = count;
desc.buf = buf;
desc.error = 0;
- do_generic_file_read(filp, ppos, &desc, file_read_actor);
-
+ do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval = desc.written;
if (!retval)
retval = desc.error;
}
}
- out:
+out:
return retval;
-
- o_direct:
- {
- loff_t pos = *ppos, size;
- struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
- struct inode *inode = mapping->host;
-
- retval = 0;
- if (!count)
- goto out; /* skip atime */
- size = inode->i_size;
- if (pos < size) {
- if (pos + count > size)
- count = size - pos;
- retval = generic_file_direct_IO(READ, filp, buf, count, pos);
- if (retval > 0)
- *ppos = pos + retval;
- }
- UPDATE_ATIME(filp->f_dentry->d_inode);
- goto out;
- }
}
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
@@ -2199,8 +2200,8 @@ generic_file_write(struct file *file, co
}
if (unlikely(file->f_flags & O_DIRECT)) {
- written = generic_file_direct_IO(WRITE, file,
- (char *) buf, count, pos);
+ written = generic_file_direct_IO(WRITE, inode,
+ (char *)buf, pos, count);
if (written > 0) {
loff_t end = pos + written;
if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
@@ -2208,7 +2209,8 @@ generic_file_write(struct file *file, co
mark_inode_dirty(inode);
}
*ppos = end;
- invalidate_inode_pages2(mapping);
+ if (mapping->nrpages)
+ invalidate_inode_pages2(mapping);
}
/*
* Sync the fs metadata but not the minor inode changes and
--- 2.5.25/fs/ext2/inode.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/ext2/inode.c Sun Jul 7 19:35:39 2002
@@ -607,11 +607,10 @@ static int ext2_bmap(struct address_spac
}
static int
-ext2_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
- unsigned long blocknr, int blocksize)
+ext2_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
{
- return generic_direct_IO(rw, inode, iobuf, blocknr,
- blocksize, ext2_get_block);
+ return generic_direct_IO(rw, inode, buf, offset, count, ext2_get_block);
}
static int
--- 2.5.25/fs/Makefile~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/Makefile Sun Jul 7 19:35:39 2002
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o devices.o f
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
- fs-writeback.o mpage.o
+ fs-writeback.o mpage.o direct-io.o
ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
--- 2.5.25/fs/block_dev.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/block_dev.c Sun Jul 7 19:35:39 2002
@@ -105,9 +105,12 @@ static int blkdev_get_block(struct inode
return 0;
}
-static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+static int
+blkdev_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
{
- return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block);
+ return generic_direct_IO(rw, inode, buf, offset,
+ count, blkdev_get_block);
}
static int blkdev_writepage(struct page * page)
-
raw.c | 136 ++++++++++++------------------------------------------------------
1 files changed, 26 insertions(+), 110 deletions(-)
--- 2.5.25/drivers/char/raw.c~raw-use-generic Sun Jul 7 19:35:44 2002
+++ 2.5.25-akpm/drivers/char/raw.c Sun Jul 7 19:58:33 2002
@@ -8,8 +8,8 @@
* device are used to bind the other minor numbers to block devices.
*/
+#include <linux/init.h>
#include <linux/fs.h>
-#include <linux/iobuf.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/raw.h>
@@ -86,12 +86,6 @@ int raw_open(struct inode *inode, struct
return 0;
}
- if (!filp->f_iobuf) {
- err = alloc_kiovec(1, &filp->f_iobuf);
- if (err)
- return err;
- }
-
down(&raw_devices[minor].mutex);
/*
* No, it is a normal raw device. All we need to do on open is
@@ -256,124 +250,46 @@ int raw_ctl_ioctl(struct inode *inode,
return err;
}
-
-
-ssize_t raw_read(struct file *filp, char * buf,
- size_t size, loff_t *offp)
+ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp)
{
return rw_raw_dev(READ, filp, buf, size, offp);
}
-ssize_t raw_write(struct file *filp, const char *buf,
- size_t size, loff_t *offp)
+ssize_t raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{
return rw_raw_dev(WRITE, filp, (char *) buf, size, offp);
}
-#define SECTOR_BITS 9
-#define SECTOR_SIZE (1U << SECTOR_BITS)
-#define SECTOR_MASK (SECTOR_SIZE - 1)
-
-ssize_t rw_raw_dev(int rw, struct file *filp, char *buf,
- size_t size, loff_t *offp)
+ssize_t
+rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
{
- struct kiobuf * iobuf;
- int new_iobuf;
- int err = 0;
- unsigned long blocks;
- size_t transferred;
- int iosize;
- int minor;
- kdev_t dev;
- unsigned long limit;
- int sector_size, sector_bits, sector_mask;
- sector_t blocknr;
struct block_device *bdev;
-
- /*
- * First, a few checks on device size limits
- */
+ struct inode *inode;
+ int minor;
+ ssize_t ret = 0;
minor = minor(filp->f_dentry->d_inode->i_rdev);
-
- new_iobuf = 0;
- iobuf = filp->f_iobuf;
- if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
- /*
- * A parallel read/write is using the preallocated iobuf
- * so just run slow and allocate a new one.
- */
- err = alloc_kiovec(1, &iobuf);
- if (err)
- goto out;
- new_iobuf = 1;
- }
-
bdev = raw_devices[minor].binding;
- dev = to_kdev_t(bdev->bd_dev);
- sector_size = raw_devices[minor].sector_size;
- sector_bits = raw_devices[minor].sector_bits;
- sector_mask = sector_size - 1;
-
- limit = bdev->bd_inode->i_size >> sector_bits;
- if (!limit)
- limit = INT_MAX;
- dprintk ("rw_raw_dev: dev %d:%d (+%d)\n",
- major(dev), minor(dev), limit);
-
- err = -EINVAL;
- if ((*offp & sector_mask) || (size & sector_mask))
- goto out_free;
- err = 0;
- if (size)
- err = -ENXIO;
- if ((*offp >> sector_bits) >= limit)
- goto out_free;
-
- transferred = 0;
- blocknr = *offp >> sector_bits;
- while (size > 0) {
- blocks = size >> sector_bits;
- if (blocks > limit - blocknr)
- blocks = limit - blocknr;
- if (!blocks)
- break;
-
- iosize = blocks << sector_bits;
+ inode = bdev->bd_inode;
- err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
- if (err)
- break;
-
- err = brw_kiovec(rw, 1, &iobuf, raw_devices[minor].binding, &blocknr, sector_size);
-
- if (rw == READ && err > 0)
- mark_dirty_kiobuf(iobuf, err);
-
- if (err >= 0) {
- transferred += err;
- size -= err;
- buf += err;
- }
-
- blocknr += blocks;
-
- unmap_kiobuf(iobuf);
-
- if (err != iosize)
- break;
+ if (size == 0)
+ goto out;
+ if (size < 0) {
+ ret = -EINVAL;
+ goto out;
}
-
- if (transferred) {
- *offp += transferred;
- err = transferred;
+ if (*offp >= inode->i_size) {
+ ret = -ENXIO;
+ goto out;
}
+ if (size + *offp > inode->i_size)
+ size = inode->i_size - *offp;
- out_free:
- if (!new_iobuf)
- clear_bit(0, &filp->f_iobuf_lock);
- else
- free_kiovec(1, &iobuf);
- out:
- return err;
+ ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
+ if (ret > 0)
+ *offp += ret;
+ if (inode->i_mapping->nrpages)
+ invalidate_inode_pages2(inode->i_mapping);
+out:
+ return ret;
}
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
@ 2002-07-08 3:30 ` Lincoln Dale
2002-07-08 7:44 ` Ingo Oeser
2002-07-11 2:25 ` Lincoln Dale
2 siblings, 0 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-08 3:30 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
At 08:19 PM 7/07/2002 -0700, Andrew Morton wrote:
>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer. It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.
>
>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks. But with
>one megabyte chunks, this implementation is 20% faster at writing.
>
>I assume this is because the kiobuf-based implementation has to stop
>and wait for each 128k chunk, whereas this code streams the entire
>request, regardless of its size.
>
>This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.
i'll have a go at benchmark-testing these.
now have even bigger hardware than before: 2 x 2gbit/s FC HBAs in multiple
dual-processor (Dual P3 Xeon 550MHz 2M L2 cache and Dual P3 Xeon 833MHz
256K L2 cache) boxen, 8 x 15K RPM FC, 28 x 10K RPM SCSI.
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
@ 2002-07-08 7:44 ` Ingo Oeser
2002-07-11 2:25 ` Lincoln Dale
2 siblings, 0 replies; 17+ messages in thread
From: Ingo Oeser @ 2002-07-08 7:44 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel
On Sun, Jul 07, 2002 at 08:19:33PM -0700, Andrew Morton wrote:
> Question is: what do we want to do with this sucker? These are the
> remaining users of kiovecs:
>
> drivers/md/lvm-snap.c
> drivers/media/video/video-buf.c
> drivers/mtd/devices/blkmtd.c
> drivers/scsi/sg.c
>
> the video and mtd drivers seems to be fairly easy to de-kiobufize.
> I'm aware of one proprietary driver which uses kiobufs. XFS uses
> kiobufs a little bit - just to map the pages.
It would be nice if we could just map a set of user pages to a scatterlist.
Developers of mass transfer devices (video grabbers, dsp devices, sg and
many others) would just LOVE you for this ;-)
Block devices are the common case worth optimizing for, but character
devices just need to reimplement most of this, if they want the same
optimizations. Some devices need mass transfers and are NOT blockdevices.
Linux supports only one class of them properly: NICs.
Please consider supporting them better for 2.5 in stuff similar to BIOs
and DMA to/from user pages.
Thanks & Regards
Ingo Oeser
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
2002-07-08 7:44 ` Ingo Oeser
@ 2002-07-11 2:25 ` Lincoln Dale
2002-07-11 3:24 ` Andrew Morton
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
2 siblings, 2 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-11 2:25 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
At 08:19 PM 7/07/2002 -0700, Andrew Morton wrote:
>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer. It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.
>
>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks. But with
>one megabyte chunks, this implementation is 20% faster at writing.
..
>This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.
sorry for the delay.
upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
FC 2300 HBA
driver isn't part of the standard kernel, and i had to update it to reflect the
io_request_lock -> host->host_lock, kdev_t and kbuild changes. urgh, pain
pain pain.
in the process, i discovered some races in their driver, so fixed them also.
the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
to Jens, Andrew & co for the changeover.
the results:
2.4.19pre8aa2 (with lockmeter and profile=2)
normal 167772160 blocks of 512 bytes in 778 seconds (105.27
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 430 seconds (190.47
mbyte/sec), CPUs ~55% idle
/dev/rawN 20480 blocks of 4194304 bytes in 463 seconds (176.86
mbyte/sec), CPUs ~62% idle
2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
0x80000000 and
your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
below)
normal 167772160 blocks of 512 bytes in 607 seconds (134.81
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
mbyte/sec), CPUs ~93% idle
/dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
mbyte/sec), CPUs ~92% idle
2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
normal 167772160 blocks of 512 bytes in 615 seconds (133.06
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 421 seconds (194.37
mbyte/sec), CPUs ~92% idle
/dev/rawN 20480 blocks of 4194304 bytes in 421 seconds (194.35
mbyte/sec), CPUs ~92% idle
its a little hard to tell CPU load difference between direct-to-BIO versus
non-direct-to-BIO,
but clearly performance was at 100% of 2gbit/s Fibre Channel with
direct-to-bio; i've never
seen it sustain exactly 100% throughout a test before.
it was interesting to watch the test of 2.4.19pre8aa2 versus both 2.5.25
tests; whether it is a
change in the linux scheduler or some other artifact, all "worker" threads
(1 thread per disk)
completed at almost exactly the same time on 2.5.25 kernels.
in contrast, the benchmark on 2.4.19pre8aa2 had some disks complete their
work up to half
a minute prior to the last thread finishing -- clearly there was some
degree of "unfairness"
between threads that has since been addressed.
i'll see about getting dual 2gbit/s FC HBAs working now; my FC MultiPathing
configuration
is having a bad hair day today and i'm not physically near the test host in
question to
replace a physical fibre cable reporting errors.
details of how the test was conducted --
test host:
- dual P3 Xeon (733MHz), 2GB PC133 SDRAM (no HIGHMEM defined)
- single QLogic FC 2300 HBA operating at 2gbit/s in a 64/66 PCI slot
test:
- benchmark consisted of sequential read requests in parallel across
8 x 18G 15K RPM FC disks across the first 10GB of each disk
(why use "sequential reads" you ask? because its generally consistent --
i'm not measuring any i/o re-ordering/elevator behaviour, nor am
i measuring the speed of any disk-shelf controller cache or
disk-spindle seek speed. i'm purely measuring how fast data can
move from the storage subsystem to userspace).
- benchmark-test considered complete when all disks have gone idle.
- benchmark program is multithreaded, one thread per device
- each test run twice with machine rebooted in-between to ensure
repeatability
block sizes:
- for normal, test used 20971520 blocks of 512 bytes (10GB) on each disk
- for O_DIRECT, test used 2560 blocks of 4194304 bytes (10GB) on each disk
- for /dev/rawN, test used 2560 blocks of 4194304 bytes (10GB) on each disk
oops report #1: (virgin 2.5.25)
oops occurs on attempting to issue a read() on a O_DIRECT device.
this was corrected with Andrew's patch of:
Oops: 0000
CPU: 0
EIP: 0010:[<801c4e11>] Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010296
eax: 00000080 ebx: 00000000 ecx: f6e83b20 edx: f3e79c00
esi: f3e79cc0 edi: 00010000 ebp: f6e83b20 esp: f393bdcc
ds: 0018 es: 0018 ss: 0018
Stack: 8013e856 820fcde0 00000010 000000c0 2aca6000 00000000
f3e79cc0 00070000
00000070 801c4fac f6e83b20 f6e83b20 8013edbd 00000000
f6e83b20 00000010
00000010 00000000 00000000 00000010 00000001 80127acb
f56e9ae0 f54691e0
Call Trace: [<8013e856>] [<801c4fac>] [<8013edbd>] [<80127acb>]
[<8013e118>]
[<8013e05f>] [<801269de>] [<80126af8>] [<80140113>]
[<801400a0>] [<8012a9c7>]
[<8012abad>] [<8011404b>] [<8013a738>] [<8013a8ea>] [<80108a0b>]
Code: 8b 43 0c c1 ef 09 8b 50 38 8b 40 34 0f ac d0 09 89 c6 85 f6
>>EIP; 801c4e11 <generic_make_request+11/130> <=====
Trace; 8013e856 <bio_alloc+e6/1a0>
Trace; 801c4fac <submit_bio+5c/70>
Trace; 8013edbd <ll_rw_kio+1ad/210>
Trace; 80127acb <handle_mm_fault+6b/e0>
Trace; 8013e118 <brw_kiovec+a8/100>
Trace; 8013e05f <generic_direct_IO+ef/100>
Trace; 801269de <get_user_pages+ee/150>
Trace; 80126af8 <map_user_kiobuf+b8/100>
Trace; 80140113 <blkdev_direct_IO+23/30>
Trace; 801400a0 <blkdev_get_block+0/50>
Trace; 8012a9c7 <generic_file_direct_IO+167/1e0>
Trace; 8012abad <generic_file_read+ed/130>
Trace; 8011404b <schedule+33b/3a0>
Trace; 8013a738 <vfs_read+98/110>
Trace; 8013a8ea <sys_read+2a/40>
Trace; 80108a0b <syscall_call+7/b>
Code; 801c4e11 <generic_make_request+11/130>
00000000 <_EIP>:
Code; 801c4e11 <generic_make_request+11/130> <=====
0: 8b 43 0c mov 0xc(%ebx),%eax <=====
Code; 801c4e14 <generic_make_request+14/130>
3: c1 ef 09 shr $0x9,%edi
Code; 801c4e17 <generic_make_request+17/130>
6: 8b 50 38 mov 0x38(%eax),%edx
Code; 801c4e1a <generic_make_request+1a/130>
9: 8b 40 34 mov 0x34(%eax),%eax
Code; 801c4e1d <generic_make_request+1d/130>
c: 0f ac d0 09 shrd $0x9,%edx,%eax
Code; 801c4e21 <generic_make_request+21/130>
10: 89 c6 mov %eax,%esi
Code; 801c4e23 <generic_make_request+23/130>
12: 85 f6 test %esi,%esi
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 2:25 ` Lincoln Dale
@ 2002-07-11 3:24 ` Andrew Morton
2002-07-11 3:25 ` Lincoln Dale
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
1 sibling, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2002-07-11 3:24 UTC (permalink / raw)
To: Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
Lincoln Dale wrote:
>
> ...
> sorry for the delay.
Is cool. Thanks for doing this.
> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
> FC 2300 HBA
> driver isn't part of the standard kernel, and i had to update it to reflect the
> io_request_lock -> host->host_lock, kdev_t and kbuild changes. urgh, pain
> pain pain.
> in the process, i discovered some races in their driver, so fixed them also.
>
> the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
> to Jens, Andrew & co for the changeover.
>
> the results:
> 2.4.19pre8aa2 (with lockmeter and profile=2)
> normal 167772160 blocks of 512 bytes in 778 seconds (105.27
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 430 seconds (190.47
> mbyte/sec), CPUs ~55% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 463 seconds (176.86
> mbyte/sec), CPUs ~62% idle
>
> 2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> 0x80000000 and
> your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> below)
> normal 167772160 blocks of 512 bytes in 607 seconds (134.81
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
> mbyte/sec), CPUs ~93% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
> mbyte/sec), CPUs ~92% idle
The 30% improvement in pagecache-buffered reads is somewhat unexpected.
The blockdevs are not using multipage BIOs - they're still using
buffer_head-based I/O for both reads and writes. Are you sure that
the 2.4 QLogic driver is using block-highmem?
> 2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
> normal 167772160 blocks of 512 bytes in 615 seconds (133.06
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 421 seconds (194.37
> mbyte/sec), CPUs ~92% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 421 seconds (194.35
> mbyte/sec), CPUs ~92% idle
OK, so there's nothing there at all really (or there may be. Hard
to tell when the interface has saturated).
But on my lowly scsi disks I was seeing no change in read bandwidth
either. Only writes benefitted for some reason. Can you do
some write testing as well? If you test writes through the pagecache,
use ext2 and not direct-to-blockdev please - that'll take the multipage
BIOs, buffer_head-bypass route. Plain old read and write of /dev/XdYY
isn't very optimised at all.
Thanks.
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 3:24 ` Andrew Morton
@ 2002-07-11 3:25 ` Lincoln Dale
[not found] ` <3D2CFF48.9EFF9C59@zip.com.au>
0 siblings, 1 reply; 17+ messages in thread
From: Lincoln Dale @ 2002-07-11 3:25 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, lkml, Steve Lord
At 08:24 PM 10/07/2002 -0700, Andrew Morton wrote:
> > 2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> > 0x80000000 and
> > your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> > below)
> > normal 167772160 blocks of 512 bytes in 607 seconds (134.81
> > mbyte/sec), CPUs 0% idle
> > O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
> > mbyte/sec), CPUs ~93% idle
> > /dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
> > mbyte/sec), CPUs ~92% idle
>
>The 30% improvement in pagecache-buffered reads is somewhat unexpected.
>The blockdevs are not using multipage BIOs - they're still using
>buffer_head-based I/O for both reads and writes. Are you sure that
>the 2.4 QLogic driver is using block-highmem?
pretty sure -- there's no highmem in the system: :-)
(i.e. i changed PAGE_OFFSET in order to prevent there being any highmem).
[root@mel-stglab-host1 root]# cat /proc/meminfo
MemTotal: 1945680 kB
MemFree: 1853812 kB
MemShared: 0 kB
Cached: 29536 kB
SwapCached: 2520 kB
Active: 32336 kB
Inactive: 8336 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 1945680 kB
LowFree: 1853812 kB
SwapTotal: 2047992 kB
SwapFree: 2037268 kB
Dirty: 1396 kB
Writeback: 0 kB
>OK, so there's nothing there at all really (or there may be. Hard
>to tell when the interface has saturated).
>
>But on my lowly scsi disks I was seeing no change in read bandwidth
>either. Only writes benefitted for some reason. Can you do
>some write testing as well? If you test writes through the pagecache,
>use ext2 and not direct-to-blockdev please - that'll take the multipage
>BIOs, buffer_head-bypass route. Plain old read and write of /dev/XdYY
>isn't very optimised at all.
will do.
do you have any other preferences --
- ext2 or ext3?
- if ext3, change the journalling mode?
- i/o to a single large file or multiple files per spindle?
i can also add combinations of read/write & seeking also.
what kind of file-size should i be using?
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 2:25 ` Lincoln Dale
2002-07-11 3:24 ` Andrew Morton
@ 2002-07-11 19:52 ` Jesse Barnes
2002-07-11 23:40 ` Lincoln Dale
1 sibling, 1 reply; 17+ messages in thread
From: Jesse Barnes @ 2002-07-11 19:52 UTC (permalink / raw)
To: Lincoln Dale; +Cc: lkml
On Thu, Jul 11, 2002 at 12:25:03PM +1000, Lincoln Dale wrote:
> sorry for the delay.
> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the
> QLogic FC 2300 HBA driver isn't part of the standard kernel, and i
> had to update it to reflect the io_request_lock -> host->host_lock,
> kdev_t and kbuild changes. urgh, pain pain pain. in the process, i
> discovered some races in their driver, so fixed them also.
So you ported the qla2x00 driver forward to 2.5? Would it be possible
to post that driver? Not having it has held up some testing I'd like
to do...
Thanks,
Jesse
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: direct-to-BIO for O_DIRECT
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
@ 2002-07-11 23:40 ` Lincoln Dale
0 siblings, 0 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-11 23:40 UTC (permalink / raw)
To: Jesse Barnes; +Cc: lkml
At 12:52 PM 11/07/2002 -0700, Jesse Barnes wrote:
>On Thu, Jul 11, 2002 at 12:25:03PM +1000, Lincoln Dale wrote:
> > sorry for the delay.
> > upgrading from 2.4.19 to 2.5.25 took longer than expected, since the
> > QLogic FC 2300 HBA driver isn't part of the standard kernel, and i
> > had to update it to reflect the io_request_lock -> host->host_lock,
> > kdev_t and kbuild changes. urgh, pain pain pain. in the process, i
> > discovered some races in their driver, so fixed them also.
>
>So you ported the qla2x00 driver forward to 2.5? Would it be possible
>to post that driver? Not having it has held up some testing I'd like
>to do...
these are the changes to the qla2x00 6.1 beta 2 driver, as downloadable
from the QLogic web-site.
there were also some changes required to the makefiles to get this working
with linux-2.5 kbuild infrastructure.
the hacks i did there are awful and i'm not prepared to put my name against
those bad hacks just yet. :-)
===
diff -urN base/listops.h 2.5.25/listops.h
--- base/listops.h Tue Apr 16 05:15:40 2002
+++ 2.5.25/listops.h Fri Jul 12 09:29:45 2002
@@ -324,9 +324,9 @@
return;
}
- spin_lock_irqsave(&io_request_lock, flags);
+ spin_lock_irqsave(ha->host->host_lock, flags);
qla2x00_callback(ha, sp->cmd);
- spin_unlock_irqrestore(&io_request_lock, flags);
+ spin_unlock_irqrestore(ha->host->host_lock, flags);
}
/**************************************************************************
diff -urN base/qla2x00.c 2.5.25/qla2x00.c
--- base/qla2x00.c Wed Jul 10 18:32:25 2002
+++ 2.5.25/qla2x00.c Fri Jul 12 09:29:51 2002
@@ -532,10 +532,11 @@
static int recoveryTime = MAX_RECOVERYTIME;
static int failbackTime = MAX_FAILBACKTIME;
#endif /* end of MPIO_SUPPORT */
-#ifdef MODULE
+
static char *ql2xopts = NULL;
static int ql2xmaxqdepth = 0;
+#ifdef MODULE
/* insmod qla2100 ql2xopts=verbose" */
MODULE_PARM(ql2xopts, "s");
MODULE_PARM(ql2xmaxqdepth, "i");
@@ -552,7 +553,6 @@
MODULE_LICENSE("GPL");
#endif
-#include "listops.h"
#include "qla_fo.cfg"
@@ -564,6 +564,7 @@
static char dummy_buffer[60] = "Please don't add commas in your insmod
command!!\n";
#endif
+#include "listops.h"
#if QLA2100_LIPTEST
static int qla2x00_lip = 0;
@@ -1459,10 +1460,6 @@
ENTER("qla2x00_detect");
-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
-
#ifdef MODULE
DEBUG2(printk("DEBUG: qla2x00_set_info starts at address = %p\n",
qla2x00_set_info);)
@@ -1497,9 +1494,6 @@
if (!pci_present()) {
printk("scsi: PCI not present\n");
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
return 0;
} /* end of !pci_present() */
@@ -1542,9 +1536,6 @@
continue;
}
*/
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
if ((host =
scsi_register(
@@ -1609,9 +1600,6 @@
"scsi%d: [ERROR] Failed to allocate "
"memory for adapter\n",host->host_no);
qla2x00_mem_free(ha);
-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
continue;
}
@@ -1654,10 +1642,6 @@
ha->list_lock = SPIN_LOCK_UNLOCKED;
-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
-
if (qla2x00_initialize_adapter(ha) &&
!(ha->device_flags & DFLG_NO_CABLE)) {
@@ -1706,8 +1690,7 @@
ha->fabricid[SIMPLE_NAME_SERVER].in_use = TRUE;
#if NEW_EH_CODE
-
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
#endif
/* Register our resources with Linux */
@@ -1719,7 +1702,7 @@
qla2x00_mem_free(ha);
scsi_unregister(host);
#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
#endif
continue;
}
@@ -1741,7 +1724,7 @@
spin_unlock_irqrestore(&ha->hardware_lock, flags);
#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
#endif
#if MPIO_SUPPORT
@@ -1805,10 +1788,6 @@
}
} /* end of FOR */
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
-
LEAVE("qla2x00_detect");
return num_hosts;
@@ -2217,7 +2196,7 @@
ha = (scsi_qla_host_t *) host->hostdata;
cmd->scsi_done = fn;
- spin_unlock(&io_request_lock);
+ spin_unlock(host->host_lock);
/* Allocate a command packet from the "sp" pool.
* If we cant get back one then let scsi layer
@@ -2227,7 +2206,7 @@
printk(KERN_WARNING
"queuecommand: Couldn't allocate memory "
"for sp - retried.\n");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
LEAVE("qla2x00_queuecommand");
return(1);
@@ -2309,14 +2288,14 @@
(int)ha->host_no,t,l);)
CMD_RESULT(cmd) = DID_NO_CONNECT << 16;
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
__sp_put(ha, sp);
return(0);
}
if (l >= ha->max_luns) {
CMD_RESULT(cmd) = DID_NO_CONNECT << 16;
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
__sp_put(ha, sp);
LEAVE("qla2x00_queuecommand");
return(0);
@@ -2379,7 +2358,7 @@
tasklet_schedule(&ha->run_qla_task);
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}
@@ -2427,7 +2406,7 @@
qla2x00_extend_timeout(sp->cmd ,60);
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
} else {
sp->flags &= ~SRB_BUSY; /* v5.21b16 */
@@ -2449,7 +2428,7 @@
add_to_scsi_retry_queue(ha,sp);
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}
@@ -2462,7 +2441,7 @@
COMTRACE('c')
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}
@@ -2526,10 +2505,10 @@
break;
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(2*HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} while (time_before_eq(jiffies, max_wait_time));
@@ -2811,7 +2790,7 @@
sp_get(ha,sp);
spin_unlock_irqrestore(&ha->hardware_lock, flags);
- spin_unlock(&io_request_lock);
+ spin_unlock(host->host_lock);
if (qla2x00_abort_command(ha, sp)) {
DEBUG2(printk("qla2xxx_eh_abort:
abort_command "
@@ -2825,7 +2804,7 @@
}
sp_put(ha,sp);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
spin_lock_irqsave(&ha->hardware_lock, flags);
/*
@@ -2862,15 +2841,15 @@
*/
if ((which_ha & BIT_0) && (!list_empty(&ha->done_queue))) {
DEBUG3(printk("qla2xxx_eh_abort: calling done for ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
}
if ((which_ha & BIT_1) && (!list_empty(&vis_ha->done_queue))) {
DEBUG3(printk("qla2xxx_eh_abort: calling done for
vis_ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
qla2x00_done(vis_ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
}
DEBUG(printk("qla2xxx_eh_abort: Exiting. return_status=0x%x.\n",
@@ -2975,22 +2954,22 @@
ha->cfg_active || ha->loop_state != LOOP_READY)) {
clear_bit(DEVICE_RESET_NEEDED, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_device_reset(ha, t) != 0) {
return_status = FAILED;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
/*
* Wait a while for the loop to come back. Return SUCCESS
* for the kernel to try again.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
return_status = SUCCESS;
}
@@ -3010,9 +2989,9 @@
DEBUG3(printk("qla2xxx_eh_device_reset: calling "
"done for ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}
DRIVER_UNLOCK
@@ -3114,22 +3093,22 @@
ha->cfg_active || ha->loop_state != LOOP_READY)) {
clear_bit(LOOP_RESET_NEEDED, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_loop_reset(ha) != 0) {
return_status = FAILED;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
/*
* Wait a while for the loop to come back. Return SUCCESS
* for the kernel to try again.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
return_status = SUCCESS;
}
@@ -3147,9 +3126,9 @@
if (!list_empty(&ha->done_queue)) {
DEBUG3(printk("qla2xxx_eh_bus_reset: calling done for
ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}
DEBUG2_3(printk("qla2xxx_eh_bus_reset: exiting. status=0x%x.\n",
@@ -3272,7 +3251,7 @@
if (!(test_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags))) {
set_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_abort_isp(ha, 1)) {
/* failed. try later */
@@ -3292,27 +3271,27 @@
return_status = SUCCESS;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
clear_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
} else {
/*
* Already active. Sleep a while then return SUCCESS for
* kernel to retry the IO.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
return_status = SUCCESS;
}
if (!list_empty(&ha->done_queue)) {
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}
DRIVER_UNLOCK
@@ -3595,9 +3574,9 @@
tasklet_schedule(&ha->run_qla_task);
if (found) {
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
qla2x00_restart_queues(vis_ha, TRUE);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
printk(KERN_INFO
"qla2x00_abort: Couldn't Abort command = %p\n", cmd);
@@ -3851,12 +3830,12 @@
* mid-level code can expect completions
momentitarily.
*/
#if NEW_EH_CODE
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
if (qla2x00_abort_isp(ha, 0)) {
/* failed. try later */
set_bit(ISP_ABORT_NEEDED, &ha->dpc_flags);
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
#else
set_bit(ISP_ABORT_NEEDED, &ha->dpc_flags);
@@ -3874,9 +3853,9 @@
DEBUG3(printk("qla2x00_reset: going to call restart_queues. "
"jiffies=%lx.\n", jiffies);)
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
qla2x00_restart_queues(ha,TRUE);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
DRIVER_UNLOCK
COMTRACE('r')
@@ -3946,7 +3925,7 @@
qla2x00_stats.irqhba = ha;
/* Prevent concurrent access to adapters register */
- /* spin_lock_irqsave(&io_request_lock, cpu_flags);*/
+ /* spin_lock_irqsave(host->host_lock, cpu_flags);*/
reg = ha->iobase;
@@ -3998,7 +3977,7 @@
if (!list_empty(&ha->done_queue))
tasklet_schedule(&ha->run_qla_task);
- /* spin_unlock_irqrestore(&io_request_lock, cpu_flags);*/
+ /* spin_unlock_irqrestore(host->host_lock, cpu_flags);*/
/* Wakeup the DPC routine */
if ((!ha->flags.mbox_busy &&
@@ -4179,7 +4158,7 @@
QLA2100_DPC_LOCK(ha);
- /* spin_lock_irqsave(&io_request_lock, ha->cpu_flags);*/
+ /* spin_lock_irqsave(host->host_lock, ha->cpu_flags);*/
ha->dpc_active = 1;
/* Determine what action is necessary */
@@ -4477,7 +4456,7 @@
if (!list_empty(&ha->done_queue))
tasklet_schedule(&ha->run_qla_task);
- /* spin_unlock_irqrestore(&io_request_lock, ha->cpu_flags);*/
+ /* spin_unlock_irqrestore(host->host_lock, ha->cpu_flags);*/
ha->dpc_active = 0;
@@ -4778,9 +4757,9 @@
/* Call the mid-level driver interrupt handler */
#if 0
- spin_lock_irqsave(&io_request_lock, flags);
+ spin_lock_irqsave(host->host_lock, flags);
qla2x00_callback(ha,cmd);
- spin_unlock_irqrestore(&io_request_lock, flags);
+ spin_unlock_irqrestore(host->host_lock, flags);
#else
sp_put(ha, sp);
@@ -15846,7 +15825,7 @@
printk(KERN_INFO
"qla2x00_apidev: open MAJOR number = %d, "
"MINOR number = %d\n",
- MAJOR(inode->i_rdev), MINOR(inode->i_rdev));
+ major(inode->i_rdev), minor(inode->i_rdev));
return 0;
}
@@ -15902,7 +15881,8 @@
APIDEV_NODE, apidev_major);)
proc_mknod(APIDEV_NODE, 0777+S_IFCHR, host->hostt->proc_dir,
- (kdev_t)MKDEV(apidev_major,0));
+ (kdev_t)mk_kdev(apidev_major,0));
+
return 0;
}
diff -urN base/qla2x00.h 2.5.25/qla2x00.h
--- base/qla2x00.h Tue Apr 16 05:15:40 2002
+++ 2.5.25/qla2x00.h Fri Jul 12 09:29:51 2002
@@ -2682,10 +2682,8 @@
present: 0, /* number of 7xxx's present */\
unchecked_isa_dma: 0, /* no memory DMA restrictions */\
use_clustering: ENABLE_CLUSTERING, \
- use_new_eh_code: 1, \
max_sectors: 512, \
- highmem_io: 1, \
- emulated: 0 \
+ highmem_io: 1 \
}
#else /* KERNEL_VERSION < 2.5.7 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,8)
diff -urN base/qla2x00_ioctl.c 2.5.25/qla2x00_ioctl.c
--- base/qla2x00_ioctl.c Tue Apr 16 05:15:40 2002
+++ 2.5.25/qla2x00_ioctl.c Fri Jul 12 09:29:51 2002
@@ -2509,14 +2509,14 @@
ha->host_no);)
/* get spin lock for this operation */
- spin_lock_irqsave(&io_request_lock, ha->cpu_flags);
+ spin_lock_irqsave(ha->host->host_lock, ha->cpu_flags);
qla2x00_queuecommand(pscsi_cmd, (void *) qla2x00_scsi_pt_done);
ha->ioctl->cmpl_timer.expires = jiffies + ha->ioctl->ioctl_tov * HZ;
add_timer(&ha->ioctl->cmpl_timer);
- spin_unlock_irqrestore(&io_request_lock, ha->cpu_flags);
+ spin_unlock_irqrestore(ha->host->host_lock, ha->cpu_flags);
down(&ha->ioctl->cmpl_sem);
del_timer(&ha->ioctl->cmpl_timer);
===
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* ext2 performance in 2.5.25 versus 2.4.19pre8aa2
[not found] ` <3D2CFF48.9EFF9C59@zip.com.au>
@ 2002-07-14 12:22 ` Lincoln Dale
2002-07-15 5:30 ` Andrew Morton
2002-07-15 16:30 ` Benjamin LaHaise
0 siblings, 2 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-14 12:22 UTC (permalink / raw)
To: Andrew Morton, Benjamin LaHaise, Andrea Arcangeli,
Stephen C. Tweedie, Linus Torvalds, Steve Lord
Cc: linux-kernel
Andrew Morton wanted me to do some benchmarking of large files on ext2
filesystems rather than the usual block-device testing
i've had some time to do this, here are the results.
one-line summary is that some results are better, some are worse; CPU usage
is better in 2.5.25, but throughput is sometimes
worse.
Summary:
========
Test #1: create a single large (12GB) file on each disk.
use 12288 blocks of 1048576 bytes each on each of 8 disks.
2.5.25 Wrote 98304MB across 8 files using 96k blocks of 1M in
579.935968s (169.51 MB/sec), 44717usec, ~90% cpu
2.4.19pre8aa2 Wrote 98304MB across 8 files using 96k blocks of 1M in
607.542648s (161.81 MB/sec), 46684usec, ~88% cpu
Test #2: read back from large (12GB) files on each disk.
use 4k reads across 3 million blocks on each of 8 disks:
2.5.25 Read 98304MB across 8 files using 24m blocks of 4k in 508.925829s
(193.16 MB/sec), 158usec mean, ~61% cpu
2.4.19pre8aa2 Read 98304MB across 8 files using 24m blocks of 4k in
526.866882s (186.58 MB/sec), 157usec mean, ~88% cpu
Test #3: same test as #2, but using "nocopy" hack to see if copy_to_user
(memory bandwidth) is the bottleneck.
2.5.25 Read 98304MB across 8 files using 24m blocks of 4k in 507.792229s
(193.59 MB/sec), 160usec mean, ~25% cpu
2.4.19pre8aa2 Read 98304MB across 8 files using 24m blocks of 4k in
511.353691s (192.24 MB/sec), 148usec mean, ~50% cpu
test #4: measure read performance when reads are entirely out of the
page-cache.
test first primes page-cache with data and then issues random
reads from that.
working size is 8 x 200mbyte (1.6GB), test is iterated 10 times.
no I/O is recorded on FC switch, so data is served entirely out
of page cache.
2.5.25 Read 16GB across 8 files using 4096M blocks of 4k in 75.304798s
(212.47 MB/sec), 145usec mean, ~81% cpu
2.4.19pre8aa2 Read 16GB across 8 files using 4096M blocks of 4k in
70.526170s (226.87 MB/sec), 134usec mean, 100% cpu
Test #5: same test as #4, but using "nocopy" hack to see if copy_to_user
(memory bandwidth) is the bottleneck.
2.5.25 Read 16GB across 8 files using 4096M blocks of 4k in 61.694199s
(259.34 MB/sec), 119usec mean, ~65% cpu
** since performance wasn't much higher, i rebooted the machine and loaded
** it with "profile=2" and lockmeter. results of that are at the very end
** of this email; looks to me like the scheduler was the culprit.
2.4.19pre8aa2 Read 16GB across 8 files using 4096M blocks of 4k in
55.924164s (286.10MB/sec), 108usec mean, ~80% cpu
Details:
========
machine is Dual P3-Xeon, 733MHz processors. 2GB of PC133 SDRAM.
disks are 8 x 15K RPM 18G FC disks, connected to FC switches via 1 x QLogic
FC 2300 HBA @ 2gbit/s.
FC HBA is in a 64/66 PCI slot.
all tests conducted using current in-house test-tool. it uses a
thread-per-device.
8 x empty ext2 filesystems created and mounted
kernels:
- stock 2.5.25 kernel + PAGE_OFFSET modified to 0x80000000 (no highmem),
QLogic FC 2300 HBA +
Andrew Morton's direct-bio patch [not exercised in these benchmarks]
Test #1
-------
create a single large (12GB) file on each disk:
writing to 8 filesystems, using a write block-size of 1 megabyte in
sequential writes.
12288 blocks (12G) per disk, 8 disks is 96GB total.
./test_disk_performance bs=1m blocks=12288 mode=basic operation=write
/mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed writing 98304 mbytes across 8 devices using 98304 blocks of 1048576
in 579.935968 seconds (169.51 Mbytes/sec), 44717usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 571.327206s using 12k writes of 1M
(21.51MB/sec), 40876usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 574.606073s using 12k writes of 1M
(21.39MB/sec), 36949usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 569.347650s using 12k writes of 1M
(21.58MB/sec), 49047usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 569.929641s using 12k writes of 1M
(21.56MB/sec), 48534usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 579.561403s using 12k writes of 1M
(21.20MB/sec), 28629usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 579.925156s using 12k writes of 1M
(21.19MB/sec), 27282usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 579.160854s using 12k writes of 1M
(21.22MB/sec), 31282usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 578.200229s using 12k writes of 1M
(21.25MB/sec), 30292usec
during test, machine had ~10% idle cpu.
Linux 2.4.19pre8aa2
Completed writing 98304 mbytes across 8 devices using 98304 blocks of 1048576
in 607.542648 seconds (161.81 Mbytes/sec), 46684usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 603.074257s using 12k writes of 1M
(20.38MB/sec), 37131usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 606.433219s using 12k writes of 1M
(20.26MB/sec), 25851usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 603.926881s using 12k writes of 1M
(20.35MB/sec), 43734usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 603.114330s using 12k writes of 1M
(20.37MB/sec), 39455usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 604.618177s using 12k writes of 1M
(20.32MB/sec), 43179usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 597.328666s using 12k writes of 1M
(20.57MB/sec), 40354usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 590.982972s using 12k writes of 1M
(20.79MB/sec), 44772usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 607.630086s using 12k writes of 1M
(20.22MB/sec), 22741usec
during test, machine was ~12% idle cpu.
Test #2
-------
read back from large (12GB) files on each disk sequentially using 4k reads
across 3 million blocks:
./test_disk_performance bs=4k blocks=3m mode=basic operation=read
/mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 508.925829 seconds (193.16 Mbytes/sec), 158usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 505.979550s using 3145728 reads of 4k
(24.29 MB/sec), 160usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 506.537340s using 3145728 reads of 4k
(24.26 MB/sec), 160usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 506.582859s using 3145728 reads of 4k
(24.26 MB/sec), 159usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 507.796716s using 3145728 reads of 4k
(24.20 MB/sec), 152usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 505.965224s using 3145728 reads of 4k
(24.29 MB/sec), 160usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 508.235475s using 3145728 reads of 4k
(24.18 MB/sec), 138usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 508.378988s using 3145728 reads of 4k
(24.17 MB/sec), 137usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 508.925429s using 3145728 reads of 4k
(24.14 MB/sec), 137usec
during test, machine had approximately 39% idle cpu.
performance is fairly close to FC line-rate -- for interests-sake, test #3
(below)
repeats the test, but using the no-copy hack to see if performance
increases as a
result of reducing the number of memory-copies.
Linux 2.4.19pre8aa2
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 526.866882 seconds (186.58 Mbytes/sec), 157usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 496.930413s using 3145728 reads of 4k
(24.73MB/sec), 139usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 497.684209s using 3145728 reads of 4k
(24.69MB/sec), 134usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 500.584528s using 3145728 reads of 4k
(24.55MB/sec), 112usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 526.866829s using 3145728 reads of 4k
(23.32MB/sec), 575usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 497.065359s using 3145728 reads of 4k
(24.72MB/sec), 137usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 499.433604s using 3145728 reads of 4k
(24.60MB/sec), 121usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 506.116496s using 3145728 reads of 4k
(24.28MB/sec), 81usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 514.755508s using 3145728 reads of 4k
(23.87MB/sec), 80usec
during test, machine had approximately 12% idle cpu.
Test #3
-------
read back from large (12GB) files on each disk sequentially using 4k reads
across 3 million blocks
in order to determine if memory-bandwidth / front-side-bus was the
bottleneck, the kernel was patched with the
bogus "nocopy" read_file_actor hack.
the benchmark as Test #2 was used
./test_disk_performance bs=4k blocks=3m mode=nocopy operation=read
/mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 507.792229 seconds (193.59 Mbytes/sec), 160usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 507.622831s using 3145728
reads of 4k (24.21 MB/sec), 160usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 507.543491s using 3145728
reads of 4k (24.21 MB/sec), 159usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 507.219204s using 3145728
reads of 4k (24.23 MB/sec), 160usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 507.346622s using 3145728
reads of 4k (24.22 MB/sec), 160usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 507.739317s using 3145728
reads of 4k (24.20 MB/sec), 160usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 507.706553s using 3145728
reads of 4k (24.20 MB/sec), 160usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 507.791357s using 3145728
reads of 4k (24.20 MB/sec), 161usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 507.791288s using 3145728
reads of 4k (24.20 MB/sec), 160usec
during this test, the machine had ~75% idle cpu and was saturating 2gbit/s FC.
memory-bandwidth / front-side-bus (copy_to_user()) weren't the bottleneck.
the bottleneck in this test was certainly the 2gbit/s FC HBA.
Linux 2.4.19pre8aa2
Completed reading 98304 mbytes across 8 devices using 25165824 blocks of 4096
in 511.353691 seconds (192.24 Mbytes/sec), 148usec mean
#0 (/mnt/scrap-sde/bigfile) 12GB in 501.421399s using 3145728 reads of 4k
(24.51MB/sec), 122usec
#1 (/mnt/scrap-sdf/bigfile) 12GB in 500.688465s using 3145728 reads of 4k
(24.54MB/sec), 128usec
#2 (/mnt/scrap-sdg/bigfile) 12GB in 499.800663s using 3145728 reads of 4k
(24.59MB/sec), 133usec
#3 (/mnt/scrap-sdh/bigfile) 12GB in 505.030670s using 3145728 reads of 4k
(24.33MB/sec), 95usec
#4 (/mnt/scrap-sdi/bigfile) 12GB in 492.732146s using 3145728 reads of 4k
(24.94MB/sec), 156usec
#5 (/mnt/scrap-sdj/bigfile) 12GB in 495.600828s using 3145728 reads of 4k
(24.79MB/sec), 151usec
#6 (/mnt/scrap-sdk/bigfile) 12GB in 504.890322s using 3145728 reads of 4k
(24.34MB/sec), 101usec
#7 (/mnt/scrap-sdl/bigfile) 12GB in 511.353661s using 3145728 reads of 4k
(24.03MB/sec), 80usec
during test, machine cpu was ~50% idle.
Test #4
-------
this test was constructed to show read performance when reads are entirely
out of the page-cache.
randomly read back from a relatively small (200 mbyte) portion of each 12GB
file on each disk spindle. total working size is
8 x 200mbyte (1.6GB), which fits into the page-cache.
firstly, "prime" the page-cache:
./test_disk_performance bs=4k blocks=50k mode=basic operation=read
/mnt/scrap-sd*/bigfile
secondly, randomly seek-once-per-block for 50k blocks of 4k into the file
(i.e. working-set is 200mbyte on each file).
iterate the test 10 times.
./test_disk_performance bs=4k blocks=50k mode=basic operation=read
seek=random iterations=10 /mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 75.304798 seconds (212.47 Mbytes/sec), 145usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 75.304670s using 512K reads of
4k (26.56 MB/sec), 144usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 75.201499s using 512K reads of
4k (26.60 MB/sec), 144usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 75.260114s using 512K reads of
4k (26.57 MB/sec), 144usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 75.287700s using 512K reads of
4k (26.56 MB/sec), 146usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 75.298464s using 512K reads of
4k (26.56 MB/sec), 144usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 75.203990s using 512K reads of
4k (26.59 MB/sec), 144usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 75.204889s using 512K reads of
4k (26.59 MB/sec), 145usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 74.981378s using 512K reads of
4k (26.67 MB/sec), 144usec
during this test, there was zero activity on the FC switching infrastructure,
so all i/o was served from the page-cache.
cpu during the test was ~19% idle.
since there was so much idle time, my guess is that the system had hit its
peak
memory-bandwidth. test #5 (below) proves that this is the case.
Linux 2.4.19pre8aa2
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 70.526170 seconds (226.87 Mbytes/sec), 134usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 70.321070s using 512K reads of 4k
(28.44 MB/sec), 135usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 69.913954s using 512K reads of 4k
(28.61 MB/sec), 135usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 70.449511s using 512K reads of 4k
(28.39 MB/sec), 133usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 70.467109s using 512K reads of 4k
(28.38 MB/sec), 134usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 70.491946s using 512K reads of 4k
(28.37 MB/sec), 133usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 70.350087s using 512K reads of 4k
(28.43 MB/sec), 133usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 70.496071s using 512K reads of 4k
(28.37 MB/sec), 134usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 70.526164s using 512K reads of 4k
(28.36 MB/sec), 134usec
cpu was 0% idle during test.
Test #5
-------
randomly read back from a relatively small (200 mbyte) portion of each 12GB
file on each disk spindle. total working size is
8 x 200mbyte (1.6GB), which fits into the page-cache.
in order to determine if memory-bandwidth / front-side-bus was the
bottleneck, the 2.5.25 kernel was patched with the
bogus "nocopy" read_file_actor hack.
the same methodology in test #4 was used:
prime the page-cache with
./test_disk_performance bs=4k blocks=50k mode=basic operation=read
/mnt/scrap-sd*/bigfile
secondly, randomly seek-once-per-block for 50k blocks of 4k into the file
(i.e. working-set is 200mbyte on each file).
iterate the test 10 times.
./test_disk_performance bs=4k blocks=50k mode=nocopy operation=read
seek=random iterations=10 /mnt/scrap-sd*/bigfile
Linux 2.5.25
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 61.694199 seconds (259.34 Mbytes/sec), 119usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 61.639063s using 512K reads of
4k (32.45 MB/sec), 119usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 61.684894s using 512K reads of
4k (32.42 MB/sec), 119usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 61.693891s using 512K reads of
4k (32.42 MB/sec), 119usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 61.647535s using 512K reads of
4k (32.44 MB/sec), 119usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 61.653856s using 512K reads of
4k (32.44 MB/sec), 119usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 61.642686s using 512K reads of
4k (32.45 MB/sec), 119usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 61.638954s using 512K reads of
4k (32.45 MB/sec), 119usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 61.639063s using 512K reads of
4k (32.45 MB/sec), 119usec
this time, cpu was ~35% idle, yet performance didn't improve significantly.
because of this, i compiled a kernel with lockmeter forward-ported to 2.5.25
and rebooted with "profile=2".
Linux 2.4.19pre8aa2
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of 4096
in 55.924164 seconds (286.10 Mbytes/sec), 108usec mean
#0 (/mnt/scrap-sde/bigfile) 2GB in 55.919840s using 512K reads of 4k
(35.77 MB/sec), 107usec
#1 (/mnt/scrap-sdf/bigfile) 2GB in 55.921253s using 512K reads of 4k
(35.76 MB/sec), 108usec
#2 (/mnt/scrap-sdg/bigfile) 2GB in 55.923975s using 512K reads of 4k
(35.76 MB/sec), 108usec
#3 (/mnt/scrap-sdh/bigfile) 2GB in 55.900788s using 512K reads of 4k
(35.78 MB/sec), 108usec
#4 (/mnt/scrap-sdi/bigfile) 2GB in 55.910989s using 512K reads of 4k
(35.77 MB/sec), 108usec
#5 (/mnt/scrap-sdj/bigfile) 2GB in 55.903394s using 512K reads of 4k
(35.78 MB/sec), 107usec
#6 (/mnt/scrap-sdk/bigfile) 2GB in 55.868038s using 512K reads of 4k
(35.80 MB/sec), 108usec
#7 (/mnt/scrap-sdl/bigfile) 2GB in 55.895071s using 512K reads of 4k
(35.78 MB/sec), 108usec
during test, cpu was ~20% idle.
Test #5 of 2.5.25 kernel with lockmeter & profile=2
---------------------------------------------------
using profile=2 and lockmeter on test #5:
given performance wasn't significantly higher (it should have been), i
rebooted the machine, and loaded 2.5.25 with
"profile=2" and patched in lockmeter.
the same benchmark in test #5 was executed as follows:
readprofile -r; ./lockstat/lockstat on; ./lockstat/lockstat reset;
./test_disk_performance3 bs=4k blocks=50k
mode=nocopy operation=read seek=random iterations=10
/mnt/scrap-sd*/bigfile; readprofile -v | sort -n -k4 | tail -20;
./lockstat/lockstat print
the results were very odd -- i've removed all locks with 0% contention.
the contention seems to be in the scheduler:
Completed reading 16000 mbytes across 8 devices using 4096000 blocks of
4096 in 91.014923 seconds (175.80 Mbytes/sec),
175usec mean
#0 (/mnt/scrap-sde/bigfile) 2000MB in 91.014743s using 512000
reads of 4096 bytes (21.97 Mbytes/sec), 139usec
#1 (/mnt/scrap-sdf/bigfile) 2000MB in 90.945358s using 512000
reads of 4096 bytes (21.99 Mbytes/sec), 178usec
#2 (/mnt/scrap-sdg/bigfile) 2000MB in 90.430662s using 512000
reads of 4096 bytes (22.12 Mbytes/sec), 174usec
#3 (/mnt/scrap-sdh/bigfile) 2000MB in 90.474412s using 512000
reads of 4096 bytes (22.11 Mbytes/sec), 174usec
#4 (/mnt/scrap-sdi/bigfile) 2000MB in 90.443735s using 512000
reads of 4096 bytes (22.11 Mbytes/sec), 174usec
#5 (/mnt/scrap-sdj/bigfile) 2000MB in 90.494653s using 512000
reads of 4096 bytes (22.10 Mbytes/sec), 174usec
#6 (/mnt/scrap-sdk/bigfile) 2000MB in 90.488251s using 512000
reads of 4096 bytes (22.10 Mbytes/sec), 174usec
#7 (/mnt/scrap-sdl/bigfile) 2000MB in 90.620444s using 512000
reads of 4096 bytes (22.07 Mbytes/sec), 174usec
80108938 resume_userspace 59 3.6875
8011b540 sys_gettimeofday 729 4.5563
80108650 handle_signal 1963 5.5767
801087b0 do_signal 1046 5.9432
80107a20 sys_rt_sigsuspend 2058 6.1250
80107dd0 sys_sigreturn 1984 6.5263
80108974 syscall_call 73 6.6364
8010a3f0 math_state_restore 450 7.0312
8010f140 restore_i387 3040 7.0370
8011bc40 do_softirq 1473 7.0817
8010ef10 save_i387 4281 7.6446
80120ab0 get_signal_to_deliver 5523 10.7871
80113ba0 schedule 10230 10.8369
8010ed80 restore_fpu 475 14.8438
80109418 device_not_available 787 16.3958
8010897f syscall_exit 203 18.4545
8010de50 do_gettimeofday 7115 49.4097
80120440 send_sig_info 9599 54.5398
80108948 system_call 5549 126.1136
80106d30 default_idle 52095 813.9844
lockstat results -- all contended locks shown:
___________________________________________________________________________________________
System: Linux mel-stglab-host1 2.5.25 #12 SMP Sun Jul 14 19:25:43 EST 2002 i686
Total counts
All (32) CPUs
Start time: Sun Jul 14 20:06:07 2002
End time: Sun Jul 14 20:11:46 2002
Delta Time: 338.84 sec.
Hash table slots in use: 432.
Global read lock slots in use: 999.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -
SPINLOCKS HOLD WAIT
UTIL CON MEAN( MAX ) MEAN( MAX )(% CPU) TOTAL NOWAIT SPIN
RJECT NAME
1.2% 0.7us(3768us) 1.7us( 286us)(0.02%) 88382474 98.8% 1.1%
0.02% *TOTAL*
0.03% 0.20% 1.6us( 7.5us) 2.6us( 12us)(0.00%) 61102 99.8%
0.20% 0% [0xf7b1c008]
0.03% 0.11% 3.2us( 7.5us) 2.1us( 4.9us)(0.00%) 30551 99.9%
0.11% 0% qla2x00_done+0x1e8
0.00% 0.30% 0.1us( 0.6us) 2.7us( 12us)(0.00%) 30551 99.7%
0.30% 0% qla2x00_queuecommand+0x484
0.01% 0.03% 0.2us( 1.1us) 0.9us( 2.2us)(0.00%) 244002 100%
0.03% 0% [0xf7b1e35c]
0.00% 0.02% 0.1us( 0.6us) 0.9us( 1.2us)(0.00%) 30551 100%
0.02% 0% qla2x00_callback+0x54
0.00% 0.02% 0.2us( 0.7us) 1.1us( 1.9us)(0.00%) 30145 100%
0.02% 0% qla2x00_done+0x34
0.00% 0.03% 0.2us( 1.1us) 0.7us( 0.9us)(0.00%) 30551 100%
0.03% 0% qla2x00_get_new_sp+0x18
0.00% 0.02% 0.2us( 1.1us) 1.2us( 1.9us)(0.00%) 61102 100%
0.02% 0% qla2x00_next+0x20
0.00% 0.06% 0.1us( 0.9us) 0.8us( 1.6us)(0.00%) 30551 100%
0.06% 0% qla2x00_next+0x10c
0.00% 0.05% 0.1us( 0.5us) 0.9us( 1.7us)(0.00%) 30551 100%
0.05% 0% qla2x00_queuecommand+0x424
0.00% 0.03% 0.2us( 0.9us) 1.1us( 2.2us)(0.00%) 30551 100%
0.03% 0% qla2x00_status_entry+0x830
0.06% 0.06% 3.4us( 11us) 2.8us( 5.3us)(0.00%) 60704 100%
0.06% 0% [0xf7b1e388]
0.02% 0.10% 2.2us( 8.0us) 3.0us( 5.3us)(0.00%) 30551 100%
0.10% 0% qla2x00_64bit_start_scsi+0x15c
0.00% 0% 0.3us(
0.4us) 0us 8 100% 0% 0%
qla2x00_cmd_timeout+0x190
0.04% 0.02% 4.7us( 11us) 2.2us( 3.0us)(0.00%) 30145 100%
0.02% 0% qla2x00_intr_handler+0x40
0.26% 0.04% 0.1us( 1.0us) 0.6us( 1.5us)(0.00%) 8927899 100%
0.04% 0% blk_plug_lock
0.00% 0.01% 0.1us( 0.8us) 0.3us( 0.4us)(0.00%) 28901 100%
0.01% 0% blk_plug_device+0x1c
0.26% 0.04% 0.1us( 1.0us) 0.6us( 1.5us)(0.00%) 8870097 100%
0.04% 0% blk_run_queues+0x18
0.00% 1.0% 0.1us( 0.5us) 0.6us( 1.1us)(0.00%) 28901
99.0% 1.0% 0% generic_unplug_device+0x2c
0.03% 0.05% 0.5us( 9.2us) 1.2us( 3.0us)(0.00%) 158705 100%
0.05% 0% contig_page_data+0xb4
0.00% 0.12% 0.2us( 3.0us) 1.3us( 2.0us)(0.00%) 27794 99.9%
0.12% 0% __free_pages_ok+0x174
0.02% 0.03% 0.6us( 9.2us) 1.1us( 3.0us)(0.00%) 130911 100%
0.03% 0% rmqueue+0x28
0.01% 0.03% 1.5us( 44us) 5.6us( 12us)(0.00%) 23127 100%
0.03% 0% dcache_lock
0.00% 0.04% 0.3us( 6.4us) 4.0us( 6.5us)(0.00%) 7053 100%
0.04% 0% dput+0x30
0.01% 0.04% 3.8us( 44us) 7.1us( 12us)(0.00%) 7114 100%
0.04% 0% path_lookup+0xd8
0.00% 0.01% 0.3us( 28us) 0.3us( 0.3us)(0.00%) 13515 100%
0.01% 0% files_lock
0.00% 0.02% 0.2us( 28us) 0.3us( 0.3us)(0.00%) 4489 100%
0.02% 0% __fput+0x70
0.18% 0.02% 1.7us( 4.8us) 1.7us( 3.3us)(0.00%) 367354 100%
0.02% 0% ioapic_lock
0.18% 0.02% 1.7us( 4.8us) 1.7us( 3.3us)(0.00%) 367354 100%
0.02% 0% set_ioapic_affinity+0x20
0.02% 2.7% 9.3us( 306us) 14us( 286us)(0.00%) 7358
97.3% 2.7% 0% kernel_flag
0.00% 0.28% 0.4us( 4.1us) 15us( 15us)(0.00%) 351 99.7%
0.28% 0% de_put+0x2c
0.00% 0.88% 21us( 97us) 15us( 21us)(0.00%) 342 99.1%
0.88% 0% ext3_dirty_inode+0x2c
0.00% 0.50% 2.0us( 70us) 24us( 24us)(0.00%) 202 99.5%
0.50% 0% ext3_get_block_handle+0xb8
0.00% 2.9% 4.9us( 7.0us) 146us( 146us)(0.00%) 35
97.1% 2.9% 0% ext3_write_super+0x24
0.00% 27.0% 52us( 306us) 29us( 286us)(0.00%) 226 73.0%
27.0% 0% schedule+0x394
0.00% 0.79% 0.7us( 1.2us) 9.2us( 14us)(0.00%) 379 99.2%
0.79% 0% sem_exit+0x24
0.00% 0.83% 2.4us( 21us) 5.3us( 8.5us)(0.00%) 845 99.2%
0.83% 0% sys_ioctl+0x44
0.00% 12.0% 6.7us( 36us) 4.2us( 12us)(0.00%) 502 88.0%
12.0% 0% tty_read+0xc4
0.01% 3.2% 8.8us( 53us) 6.2us( 25us)(0.00%) 2004
96.8% 3.2% 0% tty_write+0x1f4
0.02% 0.01% 0.2us( 37us) 3.3us( 16us)(0.00%) 247810 100%
0.01% 0% pagemap_lru_lock
0.01% 0.00% 0.4us( 36us) 14us( 16us)(0.00%) 102714 100%
0.00% 0% activate_page+0xc
0.01% 0.01% 0.1us( 31us) 0.6us( 1.0us)(0.00%) 124041 100%
0.01% 0% lru_cache_add+0x1c
0.00% 0.01% 0.2us( 30us) 0.9us( 0.9us)(0.00%) 21044 100%
0.01% 0% lru_cache_del+0xc
0.00% 83.2% 0.6us( 2.2us) 1.0us( 2.4us)(0.00%) 7652 16.8%
83.2% 0% runqueues
0.00% 83.2% 0.6us( 2.2us) 1.0us( 2.4us)(0.00%) 7652 16.8%
83.2% 0% load_balance+0x13c
0.00% 42.6% 0.8us( 4.2us) 1.0us( 3.0us)(0.00%) 15467 57.4%
42.6% 0% runqueues+0x9a0
0.00% 84.3% 0.4us( 1.9us) 1.0us( 3.0us)(0.00%) 7815 15.7%
84.3% 0% load_balance+0x188
0.05% 0.02% 0.3us( 4.0us) 0.8us( 2.5us)(0.00%) 560160 100%
0.02% 0% timerlist_lock
0.01% 0.04% 0.2us( 1.6us) 0.8us( 1.4us)(0.00%) 93886 100%
0.04% 0% add_timer+0x10
0.00% 0.05% 0.1us( 0.7us) 0.8us( 2.5us)(0.00%) 61444 100%
0.05% 0% del_timer+0x14
0.00% 0.02% 0.2us( 0.9us) 0.8us( 1.3us)(0.00%) 32154 100%
0.02% 0% del_timer_sync+0x1c
0.00% 0.46% 0.5us( 1.6us) 0.6us( 0.7us)(0.00%) 647 99.5%
0.46% 0% mod_timer+0x18
0.03% 0.01% 0.3us( 4.0us) 0.8us( 1.2us)(0.00%) 339146 100%
0.01% 0% timer_bh+0xd4
0.00% 0.01% 0.1us( 0.8us) 0.9us( 1.0us)(0.00%) 32883 100%
0.01% 0% timer_bh+0x274
0.04% 0.36% 3.8us( 12us) 3.2us( 14us)(0.00%) 31764 99.6%
0.36% 0% __make_request+0x70
0.07% 0.10% 7.5us( 44us) 2.5us( 4.5us)(0.00%) 30805 100%
0.10% 0% __scsi_end_request+0x20
0.01% 0.00% 0.6us( 6.9us) 0.5us( 0.8us)(0.00%) 81580 100%
0.00% 0% __wake_up+0x20
0.00% 0.29% 7.7us( 68us) 4.8us( 4.8us)(0.00%) 348 99.7%
0.29% 0% ahc_linux_isr+0x28
0.03% 1.3% 3.2us( 12us) 1.5us( 9.2us)(0.00%) 28915
98.7% 1.3% 0% generic_unplug_device+0x14
0.31% 1.2% 0.8us( 3.7us) 0us 1321702
98.8% 0% 1.2% load_balance+0x120
0.00% 0.59% 0.2us( 0.8us) 1.1us( 1.8us)(0.00%) 1525 99.4%
0.59% 0% n_tty_chars_in_buffer+0x18
0.00% 0.40% 0.1us( 0.5us) 0.6us( 0.6us)(0.00%) 501 99.6%
0.40% 0% read_chan+0x538
0.00% 0.60% 0.1us( 0.2us) 0.9us( 1.2us)(0.00%) 501 99.4%
0.60% 0% read_chan+0x59c
0.00% 0.02% 0.1us( 0.7us) 0.8us( 1.2us)(0.00%) 28530 100%
0.02% 0% remove_wait_queue+0x10
5.2% 3.0% 1.5us( 13us) 1.1us( 3.2us)(0.00%) 11473908
97.0% 3.0% 0% schedule+0x8c
0.02% 1.6% 0.2us( 5.3us) 1.0us( 2.6us)(0.00%) 308882
98.4% 1.6% 0% scheduler_tick+0x10c
0.08% 0.88% 0.8us( 4.7us) 1.0us( 2.2us)(0.00%) 369477 99.1%
0.88% 0% scheduler_tick+0x80
0.00% 0.45% 0.2us( 13us) 2.7us( 7.7us)(0.00%) 30819 99.5%
0.45% 0% scsi_dispatch_cmd+0x138
0.00% 0.21% 0.2us( 1.0us) 2.4us( 5.6us)(0.00%) 30819 99.8%
0.21% 0% scsi_finish_command+0x18
0.00% 0.22% 0.5us( 3.2us) 2.7us( 6.4us)(0.00%) 30819 99.8%
0.22% 0% scsi_queue_next_request+0x18
0.00% 0.08% 0.3us( 4.0us) 5.1us( 18us)(0.00%) 30819 100%
0.08% 0% scsi_request_fn+0x3bc
5.2% 0.00% 2.0us( 12us) 0.7us( 1.0us)(0.00%) 8750674 100%
0.00% 0% send_sig_info+0x4c
0.00% 0.93% 0.8us( 1.9us) 0.2us( 0.2us)(0.00%) 108 99.1%
0.93% 0% sys_sched_yield+0x38
1.6% 7.3% 0.6us( 4.8us) 2.0us( 6.7us)(0.01%) 8758669
92.7% 7.3% 0% try_to_wake_up+0x40
0.00% 0.26% 0.7us( 2.0us) 0.3us( 0.3us)(0.00%) 379 99.7%
0.26% 0% wake_up_forked_process+0x30
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - -
RWLOCK READS HOLD MAX RDR BUSY PERIOD WAIT
UTIL CON MEAN RDRS MEAN( MAX ) MEAN( MAX )( %CPU) TOTAL
NOWAIT SPIN NAME
0.14% 4.7us( 13us)(0.00%) 60230957
99.9% 0.14% *TOTAL*
7.3% 0.00% 2.8us 2 2.8us( 245us) 3.1us(
9.0us)(0.00%) 8751481 100% 0.00% tasklist_lock
0.26% 2.6us( 2.6us)(0.00%) 379
99.7% 0.26% exit_notify+0x18
11.1% 2.6us( 2.6us)(0.00%) 9
88.9% 11.1% sig_exit+0x90
1.4% 0.44% 0.2us 2 0.3us( 3.3us) 4.7us( 13us)(0.00%) 19266661
99.6% 0.44% xtime_lock
0.44% 4.7us( 13us)(0.00%) 19266661
99.6% 0.44% do_gettimeofday+0x14
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - -
RWLOCK WRITES HOLD WAIT (ALL) WAIT (WW)
UTIL CON MEAN( MAX ) MEAN( MAX )( %CPU) MEAN( MAX ) TOTAL
NOWAIT SPIN( WW ) NAME
0.13% 0.7us( 230us) 1.0us( 9.0us)(0.00%) 0.3us( 2.9us) 10490234
99.9% 0.09%(0.05%) *TOTAL*
0.00% 7.8% 0.5us( 3.0us) 1.8us( 9.0us)(0.00%) 0.7us( 2.9us) 1224
92.2% 6.7%( 1.1%) tasklist_lock
0.00% 7.7% 0.7us( 1.3us) 1.6us( 7.4us)(0.00%) 0.3us( 0.7us) 379
92.3% 6.3%( 1.3%) do_fork+0x588
0% 7.1% 2.0us( 3.1us)(0.00%) 1.5us( 2.9us) 379
92.9% 5.8%( 1.3%) exit_notify+0x1c4
0.00% 4.6% 0.1us( 0.1us) 1.4us( 1.7us)(0.00%) 0.6us( 0.6us) 87
95.4% 3.4%( 1.1%) pid_base_iput+0x18
0.00% 9.5% 0.8us( 3.0us) 1.8us( 9.0us)(0.00%) 0.2us( 0.6us) 379
90.5% 8.7%(0.79%) unhash_process+0x14
1.0% 2.1% 5.2us( 18us) 1.0us( 2.3us)(0.00%) 0.3us( 1.2us) 678292
97.9% 1.3%(0.74%) xtime_lock
0.04% 2.1% 0.4us( 4.8us) 1.0us( 2.3us)(0.00%) 0.3us( 1.2us) 339146
97.9% 1.3%(0.81%) timer_bh+0xc
1.0% 2.0% 10us( 18us) 0.9us( 2.2us)(0.00%) 0.3us( 0.9us) 339146
98.0% 1.3%(0.68%) timer_interrupt+0x10
1.1% 0% 0.4us(
230us) 0us 0us 9604614 100% 0%( 0%)
do_generic_file_read+0x8c
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-14 12:22 ` ext2 performance in 2.5.25 versus 2.4.19pre8aa2 Lincoln Dale
@ 2002-07-15 5:30 ` Andrew Morton
2002-07-15 6:06 ` Lincoln Dale
2002-07-17 19:22 ` Daniel Phillips
2002-07-15 16:30 ` Benjamin LaHaise
1 sibling, 2 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-15 5:30 UTC (permalink / raw)
To: Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
Lincoln Dale wrote:
>
> Andrew Morton wanted me to do some benchmarking of large files on ext2
> filesystems rather than the usual block-device testing
> i've had some time to do this, here are the results.
>
> one-line summary is that some results are better, some are worse; CPU usage
> is better in 2.5.25, but throughput is sometimes
> worse.
Well thanks for doing this. All rather strange though.
- You should definitely be seeing reduced CPU on writes through the
pagecache. A whole pile of gunk has disappeared from there.
Here's what I get with 4x4gig files on 4xIDE disks:
for i in hde5 hdg5 hdi5 hdk5
do
/usr/src/ext3/tools/write-and-fsync -m 4000 -f /mnt/$i/foo &
done
2.4.19-rc1+block_highmem 0.06s user 106.75s system 53% cpu 3:20.94 total
2.5.25 0.03s user 78.37s system 40% cpu 3:14.82 total
2.5.25+some stuff 0.05s user 77.91s system 41% cpu 3:07.70 total
2.5.25+O_DIRECT 0.00s user 6.84s system 3% cpu 2:53.21 total
That's a 25% drop in CPU load for writes in 2.5. Actually more, because
Andre's current 2.5 IDE drivers are using teeny requests and are measurably
slow.
That's how it should be, and it is strange that you're not showing
decreased CPU and increased throughput on writes.
- For reads through the pagecache you're showing good reduction in CPU
and some increase in bandwidth.
When reading the above 4 files in parallel on the IDE setup I show:
for i in hde5 hdg5 hdi5 hdk5
do
time /usr/src/ext3/tools/time-read -b 8192 -h 8192 /mnt/$i/foo &
done
2.5.25: 0.43s user 42.74s system 31% cpu 2:17.87 total
2.4.19-rc1+block-highmem: 0.37s user 54.48s system 40% cpu 2:16.17 total
2.4.19-rc1: 0.63s user 129.21s system 76% cpu 2:49.66 total
A 25% drop in CPU load on buffered reads in 2.5.
Funny thing about your results is the presence of sched_yield(),
especially in the copy-from-pagecache-only load. That test should
peg the CPU at 100% and definitely shouldn't be spending time in
default_idle. So who is calling sched_yield()? I think it has to be
your test app?
Be aware that the sched_yield() behaviour in 2.5 has changed a lot
wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
system, for a start. (This is a SO problem and not a kernel problem,
but it's a lesson).
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 5:30 ` Andrew Morton
@ 2002-07-15 6:06 ` Lincoln Dale
2002-07-15 6:52 ` Andrew Morton
2002-07-15 9:49 ` Andrea Arcangeli
2002-07-17 19:22 ` Daniel Phillips
1 sibling, 2 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-15 6:06 UTC (permalink / raw)
To: Andrew Morton
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
>Funny thing about your results is the presence of sched_yield(),
>especially in the copy-from-pagecache-only load. That test should
>peg the CPU at 100% and definitely shouldn't be spending time in
>default_idle. So who is calling sched_yield()? I think it has to be
>your test app?
>
>Be aware that the sched_yield() behaviour in 2.5 has changed a lot
>wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
>system, for a start. (This is a SO problem and not a kernel problem,
>but it's a lesson).
my test app uses pthreads (one thread per disk-worker) and
pthread_cond_wait in the master task to wait for all workers to finish.
i'll switch the app to use clone() and sys_futex instead.
i guess in that case, it's debatable whether it's a kernel problem or not --
pthreads is out there, and if its default behavior is bad, any threaded app
which uses it will also be bad.
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 6:06 ` Lincoln Dale
@ 2002-07-15 6:52 ` Andrew Morton
2002-07-15 9:49 ` Andrea Arcangeli
1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-15 6:52 UTC (permalink / raw)
To: Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
Lincoln Dale wrote:
>
> At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
> >Funny thing about your results is the presence of sched_yield(),
> >especially in the copy-from-pagecache-only load. That test should
> >peg the CPU at 100% and definitely shouldn't be spending time in
> >default_idle. So who is calling sched_yield()? I think it has to be
> >your test app?
> >
> >Be aware that the sched_yield() behaviour in 2.5 has changed a lot
> >wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
> >system, for a start. (This is a SO problem and not a kernel problem,
> >but it's a lesson).
>
> my test app uses pthreads (one thread per disk-worker) and
> pthread_cond_wait in the master task to wait for all workers to finish.
> i'll switch the app to use clone() and sys_futex instead.
OK.
> i guess in that case, it's debatable whether it's a kernel problem or not --
> pthreads is out there, and if its default behavior is bad, any threaded app
> which uses it will also be bad.
Well if your machine is executing a single cycle in default_idle
with that load then there's a bug somewhere.
I took a quick look at glibc-linuxthreads but as usual, my brain
turned to mush and it took seven years off my life.
If you can send me a copy of your test app I'll take a look
at what's going on.
Thanks.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 6:06 ` Lincoln Dale
2002-07-15 6:52 ` Andrew Morton
@ 2002-07-15 9:49 ` Andrea Arcangeli
2002-07-15 10:16 ` Lincoln Dale
2002-07-15 18:08 ` Andrew Morton
1 sibling, 2 replies; 17+ messages in thread
From: Andrea Arcangeli @ 2002-07-15 9:49 UTC (permalink / raw)
To: Lincoln Dale
Cc: Andrew Morton, Benjamin LaHaise, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
On Mon, Jul 15, 2002 at 04:06:21PM +1000, Lincoln Dale wrote:
> At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
> >Funny thing about your results is the presence of sched_yield(),
> >especially in the copy-from-pagecache-only load. That test should
> >peg the CPU at 100% and definitely shouldn't be spending time in
> >default_idle. So who is calling sched_yield()? I think it has to be
> >your test app?
> >
> >Be aware that the sched_yield() behaviour in 2.5 has changed a lot
> >wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
> >system, for a start. (This is a SO problem and not a kernel problem,
> >but it's a lesson).
>
> my test app uses pthreads (one thread per disk-worker) and
> pthread_cond_wait in the master task to wait for all workers to finish.
> i'll switch the app to use clone() and sys_futex instead.
unless you call pthread routines during the workload, pthreads cannot be
the reason for a slowdown.
Also I would suggest Andrew to benchmark 2.4.19rc1aa2 against 2.5
instead of plain rc1 just to be sure to compare apples to apples.
(rc1aa2 should also be faster than pre8aa2)
>BTW, Lincoln, I still have a pending answer for you, about the mmap
slowdown, that's because of reduced readahead mostly, you can tune it
with page-cluster sysctl, it's not only because of the expensive page
>faults that mmap I/O implies. I've some revolutionary idea about
>replacing readahead, not that it matters for your workload that is
>reading physically contiguous though.
Andrea
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 9:49 ` Andrea Arcangeli
@ 2002-07-15 10:16 ` Lincoln Dale
2002-07-15 18:08 ` Andrew Morton
1 sibling, 0 replies; 17+ messages in thread
From: Lincoln Dale @ 2002-07-15 10:16 UTC (permalink / raw)
To: linux-kernel
At 11:49 AM 15/07/2002 +0200, Andrea Arcangeli wrote:
> > my test app uses pthreads (one thread per disk-worker) and
> > pthread_cond_wait in the master task to wait for all workers to finish.
> > i'll switch the app to use clone() and sys_futex instead.
>
>unless you call pthread routines during the workload, pthreads cannot be
>the reason for a slowdown.
the test app does:
(parent)
for (i=0; i < num_devices; i++) {
err = pthread_create(&(device[i]->thread), NULL, (void
*)run_tests, (void *) i);
..
}
/* wait for all threads to exit */
while (active_threads != 0) {
pthread_mutex_lock(&sync_thread_mutex);
gettimeofday(&now, NULL);
timeout.tv_sec = now.tv_sec + 5;
timeout.tv_nsec = now.tv_usec * 1000;
retcode = 0;
while ((active_threads != 0) && (retcode != ETIMEDOUT)) {
retcode =
pthread_cond_timedwait(&sync_thread_cond, &sync_thread_mutex, &timeout);
}
if (retcode == ETIMEDOUT) {
print_status_update();
}
pthread_mutex_unlock(&sync_thread_mutex);
}
(each worker, when it finishes)
pthread_mutex_lock(&sync_thread_mutex);
active_threads--;
pthread_cond_broadcast(&sync_thread_cond);
pthread_mutex_unlock(&sync_thread_mutex);
pthread_exit(0);
no idea what the pthread_cond_timedwait does under the covers, but i bet
it's bad..
>BTW, Lincol, I still have a pending answer for you, about the mmap
>slowdown, that's because of reduced readahead mostly, you can tune it
>with page-cluster sysctl, it's not only because of the expensive page
>faults that mmap I/O implies. I've some revolutionary idea about
>replacing readahead, not that it matters for your workload that is
>reading physically contigous though.
i only added the mmap() for interests-sake; the intent of my benchmarking
was less-so to stress linux, more-so to stress some storage-networking
plumbing (iSCSI & FC stuff), but its been an interesting series of
experiments nonetheless.
cheers,
lincoln.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-14 12:22 ` ext2 performance in 2.5.25 versus 2.4.19pre8aa2 Lincoln Dale
2002-07-15 5:30 ` Andrew Morton
@ 2002-07-15 16:30 ` Benjamin LaHaise
1 sibling, 0 replies; 17+ messages in thread
From: Benjamin LaHaise @ 2002-07-15 16:30 UTC (permalink / raw)
To: Lincoln Dale
Cc: Andrew Morton, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
On Sun, Jul 14, 2002 at 10:22:56PM +1000, Lincoln Dale wrote:
> one-line summary is that some results are better, some are worse; CPU usage
is better in 2.5.25, but throughput is sometimes
> worse.
You might want to rerun your tests on 2.5.25 after redefining HZ to be 100,
or setting HZ to 1000 in the 2.4 kernel.
-ben
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 9:49 ` Andrea Arcangeli
2002-07-15 10:16 ` Lincoln Dale
@ 2002-07-15 18:08 ` Andrew Morton
1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2002-07-15 18:08 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Lincoln Dale, Benjamin LaHaise, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
Andrea Arcangeli wrote:
>
> On Mon, Jul 15, 2002 at 04:06:21PM +1000, Lincoln Dale wrote:
> > At 10:30 PM 14/07/2002 -0700, Andrew Morton wrote:
> > >Funny thing about your results is the presence of sched_yield(),
> > >especially in the copy-from-pagecache-only load. That test should
> > >peg the CPU at 100% and definitely shouldn't be spending time in
> > >default_idle. So who is calling sched_yield()? I think it has to be
> > >your test app?
> > >
> > >Be aware that the sched_yield() behaviour in 2.5 has changed a lot
> > >wrt 2.4. It has made StarOffice 5.2 completely unusable on a non-idle
> > >system, for a start. (This is a SO problem and not a kernel problem,
> > >but it's a lesson).
> >
> > my test app uses pthreads (one thread per disk-worker) and
> > pthread_cond_wait in the master task to wait for all workers to finish.
> > i'll switch the app to use clone() and sys_futex instead.
>
> unless you call pthread routines during the workload, pthreads cannot be
> the reason for a slowdown.
I didn't see the machine spending any time idle when I ran Lincoln's
test so I'm not sure what's going on there. But the pthread thing
is surely the reason why the profiles are showing time in sched_yield().
What I *did* see was 2.5 spending too much time doing pointless work
in readahead (it's in cache already, stop doing that!). And also
generic_file_llseek() bouncing i_sem around like a ping-pong ball.
Fixing those things up bought 10%.
> Also I would suggest Andrew to benchmark 2.4.19rc1aa2 against 2.5
> instead of plain rc1 just to be sure to compare apples to apples.
> (rc1aa2 should also be faster than pre8aa2)
Yes sorry, but I find testing -aa is a bit of a pain. It's such a
big patch, I'd really need to start a new branch for it.
-
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: ext2 performance in 2.5.25 versus 2.4.19pre8aa2
2002-07-15 5:30 ` Andrew Morton
2002-07-15 6:06 ` Lincoln Dale
@ 2002-07-17 19:22 ` Daniel Phillips
1 sibling, 0 replies; 17+ messages in thread
From: Daniel Phillips @ 2002-07-17 19:22 UTC (permalink / raw)
To: Andrew Morton, Lincoln Dale
Cc: Benjamin LaHaise, Andrea Arcangeli, Stephen C. Tweedie,
Linus Torvalds, Steve Lord, linux-kernel
On Monday 15 July 2002 07:30, Andrew Morton wrote:
> Lincoln Dale wrote:
> >
> > Andrew Morton wanted me to do some benchmarking of large files on ext2
> > filesystems rather than the usual block-device testing
> > i've had some time to do this, here are the results.
> >
> > one-line summary is that some results are better, some are worse; CPU
> > usage is better in 2.5.25, but throughput is sometimes worse.
>
> Well thanks for doing this. All rather strange though.
One result that seems pretty consistent in these tests is that avoiding the
page cache is good for about 20% overall throughput improvement. Which is
significant, but less than I would have thought if bus bandwidth is the only
major bottleneck. Something in the vfs/filesystem/blockio path is still
eating too much cpu.
Another observation: though only one of the tests hit 100% CPU, total
throughput still shows consistent improvement as a result of reducing CPU.
This should not be, it means there is excessive latency between submission of
requests, that is, the IO pipes are not being kept full.
--
Daniel
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2002-07-17 19:25 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-07-08 3:19 direct-to-BIO for O_DIRECT Andrew Morton
2002-07-08 3:30 ` Lincoln Dale
2002-07-08 7:44 ` Ingo Oeser
2002-07-11 2:25 ` Lincoln Dale
2002-07-11 3:24 ` Andrew Morton
2002-07-11 3:25 ` Lincoln Dale
[not found] ` <3D2CFF48.9EFF9C59@zip.com.au>
2002-07-14 12:22 ` ext2 performance in 2.5.25 versus 2.4.19pre8aa2 Lincoln Dale
2002-07-15 5:30 ` Andrew Morton
2002-07-15 6:06 ` Lincoln Dale
2002-07-15 6:52 ` Andrew Morton
2002-07-15 9:49 ` Andrea Arcangeli
2002-07-15 10:16 ` Lincoln Dale
2002-07-15 18:08 ` Andrew Morton
2002-07-17 19:22 ` Daniel Phillips
2002-07-15 16:30 ` Benjamin LaHaise
2002-07-11 19:52 ` direct-to-BIO for O_DIRECT Jesse Barnes
2002-07-11 23:40 ` Lincoln Dale
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox