* [PATCH] Squashfs: add asynchronous read support
@ 2013-12-16 5:30 Chanho Min
2013-12-17 7:27 ` Minchan Kim
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Chanho Min @ 2013-12-16 5:30 UTC (permalink / raw)
To: Phillip Lougher
Cc: linux-kernel, HyoJun Im, gunho.lee, Minchan Kim, Chanho Min
This patch removes synchronous wait for the up-to-date of buffer in the
file system level. Instead all operations after submit_bh are moved into
the End-of-IO handler and its associated workqueue. It decompresses/copies
data into pages and unlocks them asynchronously.
This patch enhances the performance of Squashfs in most cases.
Especially, large file reading is improved significantly.
dd read test:
- ARM cortex-a9 1GHz, 2 cores, eMMC 4.5 HS200 mode.
- dd if=file1 of=/dev/null bs=64k
Before
58707718 bytes (56.0MB) copied, 1.393653 seconds, 40.2MB/s
After
58707718 bytes (56.0MB) copied, 0.942413 seconds, 59.4MB/s
Signed-off-by: Chanho Min <chanho.min@lge.com>
---
fs/squashfs/Kconfig | 9 ++
fs/squashfs/block.c | 262 +++++++++++++++++++++++++++++++++++++++++++++
fs/squashfs/file_direct.c | 8 +-
fs/squashfs/page_actor.c | 3 +-
fs/squashfs/page_actor.h | 3 +-
fs/squashfs/squashfs.h | 2 +
6 files changed, 284 insertions(+), 3 deletions(-)
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index b6fa865..284aa5a 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -51,6 +51,15 @@ config SQUASHFS_FILE_DIRECT
it eliminates a memcpy and it also removes the lock contention
on the single buffer.
+config SQUASHFS_READ_DATA_ASYNC
+ bool "Read and decompress data asynchronously"
+ depends on SQUASHFS_FILE_DIRECT
+ help
+ By default Squashfs read data synchronously by block (default 128k).
+ This option removes such a synchronous wait in the file system level.
+ All works after submit IO do at the End-of-IO handler asynchronously.
+ This enhances the performance of Squashfs in most cases, especially,
+ large file reading.
endchoice
choice
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 0cea9b9..1517ca3 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -212,3 +212,265 @@ read_failure:
kfree(bh);
return -EIO;
}
+
+#ifdef CONFIG_SQUASHFS_READ_DATA_ASYNC
+
+struct squashfs_end_io_assoc {
+ int offset;
+ int b_count;
+ int compressed;
+ int length;
+ struct squashfs_page_actor *p_actor;
+ struct buffer_head **__bh;
+ struct squashfs_sb_info *msblk;
+ struct work_struct read_work;
+};
+
+static int squashfs_copy_page(struct squashfs_sb_info *msblk,
+ struct buffer_head **bh, int b, int offset, int length,
+ struct squashfs_page_actor *output)
+{
+ /*
+ * Block is uncompressed.
+ */
+ int in, pg_offset = 0, avail = 0, bytes, k = 0;
+ void *data = squashfs_first_page(output);
+ for (bytes = length; k < b; k++) {
+ in = min(bytes, msblk->devblksize - offset);
+ bytes -= in;
+ while (in) {
+ if (pg_offset == PAGE_CACHE_SIZE) {
+ data = squashfs_next_page(output);
+ pg_offset = 0;
+ }
+ avail = min_t(int, in, PAGE_CACHE_SIZE -
+ pg_offset);
+ memcpy(data + pg_offset, bh[k]->b_data + offset,
+ avail);
+ in -= avail;
+ pg_offset += avail;
+ offset += avail;
+ }
+ offset = 0;
+ put_bh(bh[k]);
+ }
+ squashfs_finish_page(output);
+ return length;
+}
+
+/*
+ * This is executed in workqueue for squashfs_read_data_async().
+ * - pages come decompressed/copied and unlocked asynchronously.
+ */
+static void squashfs_buffer_read_async(struct squashfs_end_io_assoc *io_assoc)
+{
+ struct squashfs_sb_info *msblk = io_assoc->msblk;
+ struct squashfs_page_actor *actor = io_assoc->p_actor;
+ struct page **page = actor->page;
+ int pages = actor->pages;
+ struct page *target_page = actor->target_page;
+ int i, length, bytes = 0;
+ void *pageaddr;
+
+ if (io_assoc->compressed) {
+ length = squashfs_decompress(msblk, io_assoc->__bh,
+ io_assoc->b_count, io_assoc->offset,
+ io_assoc->length, actor);
+ if (length < 0) {
+ ERROR("squashfs_read_data failed to read block\n");
+ goto read_failure;
+ }
+ } else
+ length = squashfs_copy_page(msblk, io_assoc->__bh,
+ io_assoc->b_count, io_assoc->offset,
+ io_assoc->length, actor);
+
+ /* Last page may have trailing bytes not filled */
+ bytes = length % PAGE_CACHE_SIZE;
+ if (bytes) {
+ pageaddr = kmap_atomic(page[pages - 1]);
+ memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+ kunmap_atomic(pageaddr);
+ }
+
+ /* Mark pages as uptodate, unlock and release */
+ for (i = 0; i < pages; i++) {
+ flush_dcache_page(page[i]);
+ SetPageUptodate(page[i]);
+ unlock_page(page[i]);
+ if (page[i] != target_page)
+ page_cache_release(page[i]);
+ }
+
+ kfree(io_assoc->__bh);
+ kfree(actor);
+ kfree(page);
+ kfree(io_assoc);
+ return;
+
+read_failure:
+ /* Decompression failed, mark pages as errored. Target_page is
+ * dealt with by the caller
+ */
+ for (i = 0; i < pages; i++) {
+ if (page[i] == NULL || page[i] == target_page)
+ continue;
+ flush_dcache_page(page[i]);
+ SetPageError(page[i]);
+ unlock_page(page[i]);
+ page_cache_release(page[i]);
+ }
+
+ kfree(io_assoc->__bh);
+ kfree(actor);
+ kfree(page);
+ kfree(io_assoc);
+ return;
+}
+
+static void squashfs_async_work(struct work_struct *work)
+{
+ struct squashfs_end_io_assoc *io_assoc =
+ container_of(work, struct squashfs_end_io_assoc, read_work);
+
+ squashfs_buffer_read_async(io_assoc);
+}
+
+/*
+ * squashfs_buffer_end_io: update buffer and check if all buffers of array
+ * are updated then invoke the wq for async read.
+ */
+static void squashfs_buffer_end_io(struct buffer_head *bh, int uptodate)
+{
+ int i;
+ struct squashfs_end_io_assoc *io_assoc = bh->b_private;
+
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ } else {
+ /* This happens, due to failed READA attempts. */
+ clear_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+
+ BUG_ON(!io_assoc);
+
+ /* Check if all buffers are uptodate */
+ for (i = 0; i < io_assoc->b_count; i++)
+ if (!buffer_uptodate(io_assoc->__bh[i]))
+ return;
+
+ schedule_work(&io_assoc->read_work);
+}
+
+/*
+ * squashfs_ll_r_block: low-level access to block devices for squashfs.
+ * @nr: number of &struct buffer_heads in the array
+ * @bhs: array of pointers to &struct buffer_head
+ *
+ * squashfs_ll_r_block sets b_end_io to the squashfs specific completion handler
+ * that marks the buffer up-to-date and invokes workqueue for decompression
+ * and uptodate of pages if needed.
+ */
+static void squashfs_ll_r_block(int nr, struct buffer_head *bhs[])
+{
+ int i, s_nr = 0;
+ struct squashfs_end_io_assoc *io_assoc = NULL;
+
+ for (i = 0; i < nr; i++) {
+ struct buffer_head *bh = bhs[i];
+ io_assoc = bh->b_private;
+ if (!trylock_buffer(bh))
+ continue;
+ if (!buffer_uptodate(bh)) {
+ bh->b_end_io = squashfs_buffer_end_io;
+ get_bh(bh);
+ s_nr++;
+ submit_bh(READ, bh);
+ continue;
+ }
+ unlock_buffer(bh);
+ }
+ /*
+ * All buffers are uptodate, but no submit_bh is occurred.
+ * Then try to unlock pages directly.
+ */
+ if (nr && !s_nr && io_assoc)
+ squashfs_buffer_read_async(io_assoc);
+}
+
+/*
+ * Read and datablock asynchronously. same as squashfs_read_data(),
+ * except it doesn't block until a buffer comes unlocked.
+ * the work after submit IO do at the End-Of-Handle and the associated wq.
+ */
+int squashfs_read_data_async(struct super_block *sb, u64 index, int length,
+ struct squashfs_page_actor *output)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ struct buffer_head **bh;
+ int offset = index & ((1 << msblk->devblksize_log2) - 1);
+ u64 cur_index = index >> msblk->devblksize_log2;
+ int bytes, compressed, b = 0, k = 0;
+ struct squashfs_end_io_assoc *io_assoc;
+
+ bh = kcalloc(((output->length + msblk->devblksize - 1)
+ >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
+ if (bh == NULL)
+ return -ENOMEM;
+
+ if (!length)
+ return -EINVAL;
+
+ bytes = -offset;
+
+ compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+ length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+
+ TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
+ index, compressed ? "" : "un", length, output->length);
+
+ if (length < 0 || length > output->length ||
+ (index + length) > msblk->bytes_used)
+ goto read_failure;
+
+ io_assoc = kmalloc(sizeof(struct squashfs_end_io_assoc), GFP_KERNEL);
+ if (io_assoc == NULL)
+ return -ENOMEM;
+
+ io_assoc->offset = offset;
+ io_assoc->p_actor = output;
+ io_assoc->compressed = compressed;
+ io_assoc->__bh = bh;
+ io_assoc->length = length;
+ io_assoc->msblk = msblk;
+
+ INIT_WORK(&io_assoc->read_work, squashfs_async_work);
+
+ for (b = 0; bytes < length; b++, cur_index++) {
+ bh[b] = sb_getblk(sb, cur_index);
+ if (bh[b] == NULL)
+ goto block_release;
+ bytes += msblk->devblksize;
+ if (!buffer_locked(bh[b]))
+ bh[b]->b_private = io_assoc;
+ }
+ io_assoc->b_count = b;
+
+ /* make sure io_assoc is updated before submit IO */
+ mb();
+ squashfs_ll_r_block(b, bh);
+ return length;
+
+block_release:
+ for (; k < b; k++)
+ put_bh(bh[k]);
+
+read_failure:
+ ERROR("squashfs_read_data failed to read block 0x%llx\n",
+ (unsigned long long) index);
+ kfree(bh);
+ return -EIO;
+}
+#endif
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 62a0de6..610ca17 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -52,7 +52,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
* Create a "page actor" which will kmap and kunmap the
* page cache pages appropriately within the decompressor
*/
- actor = squashfs_page_actor_init_special(page, pages, 0);
+ actor = squashfs_page_actor_init_special(page, pages, target_page, 0);
if (actor == NULL)
goto out;
@@ -91,6 +91,11 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
}
/* Decompress directly into the page cache buffers */
+#ifdef CONFIG_SQUASHFS_READ_DATA_ASYNC
+ squashfs_read_data_async(inode->i_sb, block, bsize, actor);
+
+ return 0;
+#else
res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
if (res < 0)
goto mark_errored;
@@ -116,6 +121,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
kfree(page);
return 0;
+#endif
mark_errored:
/* Decompression failed, mark pages as errored. Target_page is
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 5a1c11f..2d2d7ac 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -81,7 +81,7 @@ static void direct_finish_page(struct squashfs_page_actor *actor)
}
struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
- int pages, int length)
+ int pages, struct page *target_page, int length)
{
struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
@@ -91,6 +91,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
actor->length = length ? : pages * PAGE_CACHE_SIZE;
actor->page = page;
actor->pages = pages;
+ actor->target_page = target_page;
actor->next_page = 0;
actor->pageaddr = NULL;
actor->squashfs_first_page = direct_first_page;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 26dd820..b50d982 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -58,13 +58,14 @@ struct squashfs_page_actor {
void *(*squashfs_next_page)(struct squashfs_page_actor *);
void (*squashfs_finish_page)(struct squashfs_page_actor *);
int pages;
+ struct page *target_page;
int length;
int next_page;
};
extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
- **, int, int);
+ **, int, struct page *, int);
static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
{
return actor->squashfs_first_page(actor);
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 9e1bb79..39e95af 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -30,6 +30,8 @@
/* block.c */
extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
struct squashfs_page_actor *);
+extern int squashfs_read_data_async(struct super_block *, u64, int,
+ struct squashfs_page_actor *);
/* cache.c */
extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] Squashfs: add asynchronous read support
2013-12-16 5:30 [PATCH] Squashfs: add asynchronous read support Chanho Min
@ 2013-12-17 7:27 ` Minchan Kim
2013-12-18 4:29 ` Re : " Chanho Min
2013-12-23 5:03 ` Phillip Lougher
2 siblings, 0 replies; 9+ messages in thread
From: Minchan Kim @ 2013-12-17 7:27 UTC (permalink / raw)
To: Chanho Min; +Cc: Phillip Lougher, linux-kernel, HyoJun Im, gunho.lee
Hello Chanho,
On Mon, Dec 16, 2013 at 02:30:26PM +0900, Chanho Min wrote:
> This patch removes synchronous wait for the up-to-date of buffer in the
> file system level. Instead all operations after submit_bh are moved into
> the End-of-IO handler and its associated workeque. It decompresses/copies
> data into pages and unlock them asynchronously.
>
> This patch enhances the performance of Squashfs in most cases.
> Especially, large file reading is improved significantly.
>
> dd read test:
>
> - ARM cortex-a9 1GHz, 2 cores, eMMC 4.5 HS200 mode.
> - dd if=file1 of=/dev/null bs=64k
>
> Before
> 58707718 bytes (56.0MB) copied, 1.393653 seconds, 40.2MB/s
>
> After
> 58707718 bytes (56.0MB) copied, 0.942413 seconds, 59.4MB/s
It's really nice!
I did test it on x86 with USB stick and ARM with eMMC on my Nexus 4.
In my experiments, I couldn't see much gain like yours on either system,
and it even regressed in the bs=32k test, maybe due to the workqueue
allocation/scheduling of work per I/O.
Your test is rather special or what I am missing?
Before that, I'd like to know fundamental reason why your implementation
for asynchronous read enhance. At a first glance, I thought it's caused
by readahead from MM layer but when I read code, I found I was wrong.
MM's readahead logic works based on PageReadahead marker but squashfs
invalidates by grab_cache_page_nowait so it wouldn't work as we expected.
Another possibility is block I/O merging in block layder by plugging logic,
which was what I tried a few month ago although implementation was really
bad. But it wouldn't work with your patch because do_generic_file_read
will unplug block layer by lock_page without merging enough I/O.
So, what do you think is the real actuator of the enhancement in your experiment?
Then, I could investigate why I can't get a benefit.
Thanks for looking this.
>
> Signed-off-by: Chanho Min <chanho.min@lge.com>
> ---
> fs/squashfs/Kconfig | 9 ++
> fs/squashfs/block.c | 262 +++++++++++++++++++++++++++++++++++++++++++++
> fs/squashfs/file_direct.c | 8 +-
> fs/squashfs/page_actor.c | 3 +-
> fs/squashfs/page_actor.h | 3 +-
> fs/squashfs/squashfs.h | 2 +
> 6 files changed, 284 insertions(+), 3 deletions(-)
>
> diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
> index b6fa865..284aa5a 100644
> --- a/fs/squashfs/Kconfig
> +++ b/fs/squashfs/Kconfig
> @@ -51,6 +51,15 @@ config SQUASHFS_FILE_DIRECT
> it eliminates a memcpy and it also removes the lock contention
> on the single buffer.
>
> +config SQUASHFS_READ_DATA_ASYNC
> + bool "Read and decompress data asynchronously"
> + depends on SQUASHFS_FILE_DIRECT
> + help
> + By default Squashfs read data synchronously by block (default 128k).
> + This option removes such a synchronous wait in the file system level.
> + All works after submit IO do at the End-of-IO handler asynchronously.
> + This enhances the performance of Squashfs in most cases, especially,
> + large file reading.
> endchoice
>
> choice
> diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
> index 0cea9b9..1517ca3 100644
> --- a/fs/squashfs/block.c
> +++ b/fs/squashfs/block.c
> @@ -212,3 +212,265 @@ read_failure:
> kfree(bh);
> return -EIO;
> }
> +
> +#ifdef CONFIG_SQUASHFS_READ_DATA_ASYNC
> +
> +struct squashfs_end_io_assoc {
> + int offset;
> + int b_count;
> + int compressed;
> + int length;
> + struct squashfs_page_actor *p_actor;
> + struct buffer_head **__bh;
> + struct squashfs_sb_info *msblk;
> + struct work_struct read_work;
> +};
> +
> +static int squashfs_copy_page(struct squashfs_sb_info *msblk,
> + struct buffer_head **bh, int b, int offset, int length,
> + struct squashfs_page_actor *output)
> +{
> + /*
> + * Block is uncompressed.
> + */
> + int in, pg_offset = 0, avail = 0, bytes, k = 0;
> + void *data = squashfs_first_page(output);
> + for (bytes = length; k < b; k++) {
> + in = min(bytes, msblk->devblksize - offset);
> + bytes -= in;
> + while (in) {
> + if (pg_offset == PAGE_CACHE_SIZE) {
> + data = squashfs_next_page(output);
> + pg_offset = 0;
> + }
> + avail = min_t(int, in, PAGE_CACHE_SIZE -
> + pg_offset);
> + memcpy(data + pg_offset, bh[k]->b_data + offset,
> + avail);
> + in -= avail;
> + pg_offset += avail;
> + offset += avail;
> + }
> + offset = 0;
> + put_bh(bh[k]);
> + }
> + squashfs_finish_page(output);
> + return length;
> +}
> +
> +/*
> + * This is executed in workqueue for squashfs_read_data_async().
> + * - pages come decompressed/copied and unlocked asynchronously.
> + */
> +static void squashfs_buffer_read_async(struct squashfs_end_io_assoc *io_assoc)
> +{
> + struct squashfs_sb_info *msblk = io_assoc->msblk;
> + struct squashfs_page_actor *actor = io_assoc->p_actor;
> + struct page **page = actor->page;
> + int pages = actor->pages;
> + struct page *target_page = actor->target_page;
> + int i, length, bytes = 0;
> + void *pageaddr;
> +
> + if (io_assoc->compressed) {
> + length = squashfs_decompress(msblk, io_assoc->__bh,
> + io_assoc->b_count, io_assoc->offset,
> + io_assoc->length, actor);
> + if (length < 0) {
> + ERROR("squashfs_read_data failed to read block\n");
> + goto read_failure;
> + }
> + } else
> + length = squashfs_copy_page(msblk, io_assoc->__bh,
> + io_assoc->b_count, io_assoc->offset,
> + io_assoc->length, actor);
> +
> + /* Last page may have trailing bytes not filled */
> + bytes = length % PAGE_CACHE_SIZE;
> + if (bytes) {
> + pageaddr = kmap_atomic(page[pages - 1]);
> + memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
> + kunmap_atomic(pageaddr);
> + }
> +
> + /* Mark pages as uptodate, unlock and release */
> + for (i = 0; i < pages; i++) {
> + flush_dcache_page(page[i]);
> + SetPageUptodate(page[i]);
> + unlock_page(page[i]);
> + if (page[i] != target_page)
> + page_cache_release(page[i]);
> + }
> +
> + kfree(io_assoc->__bh);
> + kfree(actor);
> + kfree(page);
> + kfree(io_assoc);
> + return;
> +
> +read_failure:
> + /* Decompression failed, mark pages as errored. Target_page is
> + * dealt with by the caller
> + */
> + for (i = 0; i < pages; i++) {
> + if (page[i] == NULL || page[i] == target_page)
> + continue;
> + flush_dcache_page(page[i]);
> + SetPageError(page[i]);
> + unlock_page(page[i]);
> + page_cache_release(page[i]);
> + }
> +
> + kfree(io_assoc->__bh);
> + kfree(actor);
> + kfree(page);
> + kfree(io_assoc);
> + return;
> +}
> +
> +static void squashfs_async_work(struct work_struct *work)
> +{
> + struct squashfs_end_io_assoc *io_assoc =
> + container_of(work, struct squashfs_end_io_assoc, read_work);
> +
> + squashfs_buffer_read_async(io_assoc);
> +}
> +
> +/*
> + * squashfs_buffer_end_io: update buffer and check if all buffers of array
> + * are updated then invoke the wq for async read.
> + */
> +static void squashfs_buffer_end_io(struct buffer_head *bh, int uptodate)
> +{
> + int i;
> + struct squashfs_end_io_assoc *io_assoc = bh->b_private;
> +
> + if (uptodate) {
> + set_buffer_uptodate(bh);
> + } else {
> + /* This happens, due to failed READA attempts. */
> + clear_buffer_uptodate(bh);
> + }
> + unlock_buffer(bh);
> + put_bh(bh);
> +
> + BUG_ON(!io_assoc);
> +
> + /* Check if all buffers are uptodate */
> + for (i = 0; i < io_assoc->b_count; i++)
> + if (!buffer_uptodate(io_assoc->__bh[i]))
> + return;
> +
> + schedule_work(&io_assoc->read_work);
> +}
> +
> +/*
> + * squashfs_ll_r_block: low-level access to block devices for squashfs.
> + * @nr: number of &struct buffer_heads in the array
> + * @bhs: array of pointers to &struct buffer_head
> + *
> + * squashfs_ll_r_block sets b_end_io to the squashfs specific completion handler
> + * that marks the buffer up-to-date and invokes workqueue for decompression
> + * and uptodate of pages if needed.
> + */
> +static void squashfs_ll_r_block(int nr, struct buffer_head *bhs[])
> +{
> + int i, s_nr = 0;
> + struct squashfs_end_io_assoc *io_assoc = NULL;
> +
> + for (i = 0; i < nr; i++) {
> + struct buffer_head *bh = bhs[i];
> + io_assoc = bh->b_private;
> + if (!trylock_buffer(bh))
> + continue;
> + if (!buffer_uptodate(bh)) {
> + bh->b_end_io = squashfs_buffer_end_io;
> + get_bh(bh);
> + s_nr++;
> + submit_bh(READ, bh);
> + continue;
> + }
> + unlock_buffer(bh);
> + }
> + /*
> + * All buffers are uptodate, but no submit_bh is occurred.
> + * Then try to unlock pages directly.
> + */
> + if (nr && !s_nr && io_assoc)
> + squashfs_buffer_read_async(io_assoc);
> +}
> +
> +/*
> + * Read and datablock asynchronously. same as squashfs_read_data(),
> + * except it doesn't block until a buffer comes unlocked.
> + * the work after submit IO do at the End-Of-Handle and the associated wq.
> + */
> +int squashfs_read_data_async(struct super_block *sb, u64 index, int length,
> + struct squashfs_page_actor *output)
> +{
> + struct squashfs_sb_info *msblk = sb->s_fs_info;
> + struct buffer_head **bh;
> + int offset = index & ((1 << msblk->devblksize_log2) - 1);
> + u64 cur_index = index >> msblk->devblksize_log2;
> + int bytes, compressed, b = 0, k = 0;
> + struct squashfs_end_io_assoc *io_assoc;
> +
> + bh = kcalloc(((output->length + msblk->devblksize - 1)
> + >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
> + if (bh == NULL)
> + return -ENOMEM;
> +
> + if (!length)
> + return -EINVAL;
> +
> + bytes = -offset;
> +
> + compressed = SQUASHFS_COMPRESSED_BLOCK(length);
> + length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
> +
> + TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
> + index, compressed ? "" : "un", length, output->length);
> +
> + if (length < 0 || length > output->length ||
> + (index + length) > msblk->bytes_used)
> + goto read_failure;
> +
> + io_assoc = kmalloc(sizeof(struct squashfs_end_io_assoc), GFP_KERNEL);
> + if (io_assoc == NULL)
> + return -ENOMEM;
> +
> + io_assoc->offset = offset;
> + io_assoc->p_actor = output;
> + io_assoc->compressed = compressed;
> + io_assoc->__bh = bh;
> + io_assoc->length = length;
> + io_assoc->msblk = msblk;
> +
> + INIT_WORK(&io_assoc->read_work, squashfs_async_work);
> +
> + for (b = 0; bytes < length; b++, cur_index++) {
> + bh[b] = sb_getblk(sb, cur_index);
> + if (bh[b] == NULL)
> + goto block_release;
> + bytes += msblk->devblksize;
> + if (!buffer_locked(bh[b]))
> + bh[b]->b_private = io_assoc;
> + }
> + io_assoc->b_count = b;
> +
> + /* make sure io_assoc is updated before submit IO */
> + mb();
> + squashfs_ll_r_block(b, bh);
> + return length;
> +
> +block_release:
> + for (; k < b; k++)
> + put_bh(bh[k]);
> +
> +read_failure:
> + ERROR("squashfs_read_data failed to read block 0x%llx\n",
> + (unsigned long long) index);
> + kfree(bh);
> + return -EIO;
> +}
> +#endif
> diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
> index 62a0de6..610ca17 100644
> --- a/fs/squashfs/file_direct.c
> +++ b/fs/squashfs/file_direct.c
> @@ -52,7 +52,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
> * Create a "page actor" which will kmap and kunmap the
> * page cache pages appropriately within the decompressor
> */
> - actor = squashfs_page_actor_init_special(page, pages, 0);
> + actor = squashfs_page_actor_init_special(page, pages, target_page, 0);
> if (actor == NULL)
> goto out;
>
> @@ -91,6 +91,11 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
> }
>
> /* Decompress directly into the page cache buffers */
> +#ifdef CONFIG_SQUASHFS_READ_DATA_ASYNC
> + squashfs_read_data_async(inode->i_sb, block, bsize, actor);
> +
> + return 0;
> +#else
> res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
> if (res < 0)
> goto mark_errored;
> @@ -116,6 +121,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
> kfree(page);
>
> return 0;
> +#endif
>
> mark_errored:
> /* Decompression failed, mark pages as errored. Target_page is
> diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
> index 5a1c11f..2d2d7ac 100644
> --- a/fs/squashfs/page_actor.c
> +++ b/fs/squashfs/page_actor.c
> @@ -81,7 +81,7 @@ static void direct_finish_page(struct squashfs_page_actor *actor)
> }
>
> struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
> - int pages, int length)
> + int pages, struct page *target_page, int length)
> {
> struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
>
> @@ -91,6 +91,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
> actor->length = length ? : pages * PAGE_CACHE_SIZE;
> actor->page = page;
> actor->pages = pages;
> + actor->target_page = target_page;
> actor->next_page = 0;
> actor->pageaddr = NULL;
> actor->squashfs_first_page = direct_first_page;
> diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
> index 26dd820..b50d982 100644
> --- a/fs/squashfs/page_actor.h
> +++ b/fs/squashfs/page_actor.h
> @@ -58,13 +58,14 @@ struct squashfs_page_actor {
> void *(*squashfs_next_page)(struct squashfs_page_actor *);
> void (*squashfs_finish_page)(struct squashfs_page_actor *);
> int pages;
> + struct page *target_page;
> int length;
> int next_page;
> };
>
> extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
> extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
> - **, int, int);
> + **, int, struct page *, int);
> static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
> {
> return actor->squashfs_first_page(actor);
> diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
> index 9e1bb79..39e95af 100644
> --- a/fs/squashfs/squashfs.h
> +++ b/fs/squashfs/squashfs.h
> @@ -30,6 +30,8 @@
> /* block.c */
> extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
> struct squashfs_page_actor *);
> +extern int squashfs_read_data_async(struct super_block *, u64, int,
> + struct squashfs_page_actor *);
>
> /* cache.c */
> extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Kind regards,
Minchan Kim
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re : Re: [PATCH] Squashfs: add asynchronous read support
2013-12-16 5:30 [PATCH] Squashfs: add asynchronous read support Chanho Min
2013-12-17 7:27 ` Minchan Kim
@ 2013-12-18 4:29 ` Chanho Min
2013-12-18 5:24 ` Minchan Kim
2013-12-21 2:05 ` Chanho Min
2013-12-23 5:03 ` Phillip Lougher
2 siblings, 2 replies; 9+ messages in thread
From: Chanho Min @ 2013-12-18 4:29 UTC (permalink / raw)
To: Minchan Kim
Cc: Phillip Lougher, linux-kernel, 임효준,
이건호
> I did test it on x86 with USB stick and ARM with eMMC on my Nexus 4.
> In experiment, I couldn't see much gain like you both system and even it
> was regressed at bs=32k test, maybe workqueue allocation/schedule of work
> per I/O.
> Your test is rather special or what I am missing?
Can you specify your test result on ARM with eMMC.
> Before that, I'd like to know fundamental reason why your implementation
> for asynchronous read enhance. At a first glance, I thought it's caused by
> readahead from MM layer but when I read code, I found I was wrong.
> MM's readahead logic works based on PageReadahead marker but squashfs
> invalidates by grab_cache_page_nowait so it wouldn't work as we expected.
>
> Another possibility is block I/O merging in block layder by plugging logic,
> which was what I tried a few month ago although implementation was really
> bad. But it wouldn't work with your patch because do_generic_file_read
> will unplug block layer by lock_page without merging enough I/O.
>
> So, what do you think real actuator for enhance your experiment?
> Then, I could investigate why I can't get a benefit.
Currently, squashfs adds requests to the block device queue synchronously and
waits for completion. mmc takes these requests one by one and pushes them to the
host driver, but this allows mmc to be idle frequently. This patch adds block
requests asynchronously without waiting for completion, so mmcqd can fetch many
requests from the block layer at a time. As a result, mmcqd stays busy and uses
more of the mmc bandwidth. For testing, I added two count variables in
mmc_queue_thread as below and ran the same dd transfer.
static int mmc_queue_thread(void *d)
{
..
do {
if (req || mq->mqrq_prev->req) {
fetch++;
} else {
idle++;
}
} while (1);
..
}
without patch:
fetch: 920, idle: 460
with patch
fetch: 918, idle: 40
Thanks
Chanho.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: Re : Re: [PATCH] Squashfs: add asynchronous read support
2013-12-18 4:29 ` Re : " Chanho Min
@ 2013-12-18 5:24 ` Minchan Kim
2013-12-21 2:05 ` Chanho Min
1 sibling, 0 replies; 9+ messages in thread
From: Minchan Kim @ 2013-12-18 5:24 UTC (permalink / raw)
To: Chanho Min
Cc: Phillip Lougher, linux-kernel, 임효준,
이건호
Hello,
Please don't break thread.
You should reply to my mail instead of your original post.
On Wed, Dec 18, 2013 at 01:29:37PM +0900, Chanho Min wrote:
>
> > I did test it on x86 with USB stick and ARM with eMMC on my Nexus 4.
> > In experiment, I couldn't see much gain like you both system and even it
> > was regressed at bs=32k test, maybe workqueue allocation/schedule of work
> > per I/O.
> > Your test is rather special or what I am missing?
> Can you specify your test result on ARM with eMMC.
Sure.
before after
32K 3.6M 3.4M
64K 6.3M 8.2M
128K 11.4M 11.7M
160K 13.6M 13.8M
256K 19.8M 19M
288K 21.3M 20.8M
>
> > Before that, I'd like to know fundamental reason why your implementation
> > for asynchronous read enhance. At a first glance, I thought it's caused by
> > readahead from MM layer but when I read code, I found I was wrong.
> > MM's readahead logic works based on PageReadahead marker but squashfs
> > invalidates by grab_cache_page_nowait so it wouldn't work as we expected.
> >
> > Another possibility is block I/O merging in block layder by plugging logic,
> > which was what I tried a few month ago although implementation was really
> > bad. But it wouldn't work with your patch because do_generic_file_read
> > will unplug block layer by lock_page without merging enough I/O.
> >
> > So, what do you think real actuator for enhance your experiment?
> > Then, I could investigate why I can't get a benefit.
> Currently, squashfs adds request to the block device queue synchronously with
> wait for competion. mmc takes this request one by one and push them to host driver,
> But it allows mmc to be idle frequently. This patch allows to add block requset
> asynchrously without wait for competion, mmcqd can fetch a lot of request from block
> at a time. As a result, mmcqd get busy and use a more bandwidth of mmc.
> For test, I added two count variables in mmc_queue_thread as bellows
> and tested same dd transfer.
>
> static int mmc_queue_thread(void *d)
> {
> ..
> do {
> if (req || mq->mqrq_prev->req) {
> fetch++;
> } else {
> idle++;
> }
> } while (1);
> ..
> }
>
> without patch:
> fetch: 920, idle: 460
>
> with patch
> fetch: 918, idle: 40
That result isn't what I want to know.
What I want to know is why the upper layer issues more I/O per second.
For example, you read 32K so MM layer will prepare 8 pages to read in but
at issuing at a first page, squashfs make 32 pages and fill the page cache
if we assume you use 128K compression so MM layer's already prepared 7 page
would be freed without further I/O and do_generic_file_read will wait for
completion by lock_page without further I/O queueing. It's not surprising.
One of page freed is a READA marked page so readahead couldn't work.
If readahead works, it would be just by luck. Actually, by simulation
64K dd, I found readahead logic would be triggered but it's just by luck
and it's not intended, I think.
If first issued I/O complete, squashfs decompress the I/O with 128K pages
so all 4 iteration(128K/32K) would be hit in page cache.
If all 128K hit in page cache, mm layer start to issue next I/O and
repeat above logic until you ends up reading all file size.
So my opinion is that the upper layer wouldn't issue more I/O logically.
If it worked, it's not what we expect but side-effect.
That's why I'd like to know what's your thought for increasing IOPS.
Please, could you say your thought why IOPS increased, not a result
on low level driver?
Anyway, in my opinion, we should take care of the MM layer's readahead to
enhance sequential I/O. For that, we should use the buffer pages passed by MM
instead of freeing them and allocating new pages in squashfs.
IMHO, it would be better to implement squashfs_readpages but my insight
is very weak so I guess Phillip will give more good idea/insight about
the issue.
Thanks!
>
> Thanks
> Chanho.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Kind regards,
Minchan Kim
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: Re: Re : Re: [PATCH] Squashfs: add asynchronous read support
2013-12-18 4:29 ` Re : " Chanho Min
2013-12-18 5:24 ` Minchan Kim
@ 2013-12-21 2:05 ` Chanho Min
2013-12-23 0:38 ` Minchan Kim
2013-12-23 3:03 ` Re : " Chanho Min
1 sibling, 2 replies; 9+ messages in thread
From: Chanho Min @ 2013-12-21 2:05 UTC (permalink / raw)
To: Minchan Kim
Cc: Phillip Lougher, linux-kernel, 임효준,
이건호
> Please don't break thread.
> You should reply to my mail instead of your original post.
Sorry, it seems to be an issue with my mailer. I'm trying to fix it.
> It's a result which isn't what I want to know.
> What I wnat to know is why upper layer issues more I/O per second.
> For example, you read 32K so MM layer will prepare 8 pages to read in but
> at issuing at a first page, squashfs make 32 pages and fill the page cache
> if we assume you use 128K compression so MM layer's already prepared 7
> page
> would be freed without further I/O and do_generic_file_read will wait for
> completion by lock_page without further I/O queueing. It's not suprising.
> One of page freed is a READA marked page so readahead couldn't work.
> If readahead works, it would be just by luck. Actually, by simulation
> 64K dd, I found readahead logic would be triggered but it's just by luck
> and it's not intended, I think.
MM layer's readahead pages would not be freed immediately.
Squashfs can use them via grab_cache_page_nowait, and the READA-marked page is available.
Intentional or not, readahead works pretty well. I checked in experiment.
> If first issued I/O complete, squashfs decompress the I/O with 128K pages
> so all 4 iteration(128K/32K) would be hit in page cache.
> If all 128K hit in page cache, mm layer start to issue next I/O and
> repeat above logic until you ends up reading all file size.
> So my opition is that upper layer wouldn't issue more I/O logically.
> If it worked, it's not what we expect but side-effect.
>
> That's why I'd like to know what's your thought for increasing IOPS.
> Please, could you say your thought why IOPS increased, not a result
> on low level driver?
It is because readahead can work asynchronously in the background.
Suppose that you read a large file by 128k partially and contiguously
like "dd bs=128k". Two IOs can be issued per 128k reading,
First IO is for intended pages, second IO is for readahead.
If the first IO hits in the cache thanks to previous readahead, no wait for IO
completion is needed, because the intended page is up-to-date already.
But, current squashfs waits for second IO's completion unnecessarily.
That is one of the reasons why we should move marking pages up-to-date
into the asynchronous path, as my patch does.
> Anyway, in my opinion, we should take care of MM layer's readahead for
> enhance sequential I/O. For it, we should use buffer pages passed by MM
> instead of freeing them and allocating new pages in squashfs.
> IMHO, it would be better to implement squashfs_readpages but my insight
> is very weak so I guess Phillip will give more good idea/insight about
> the issue.
That's a good point. Also, I think my patch is another way which can be implemented
without significant impact on current implementation and I wait for Phillip's comment.
Thanks
Chanho
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: Re: Re : Re: [PATCH] Squashfs: add asynchronous read support
2013-12-21 2:05 ` Chanho Min
@ 2013-12-23 0:38 ` Minchan Kim
2013-12-23 3:03 ` Re : " Chanho Min
1 sibling, 0 replies; 9+ messages in thread
From: Minchan Kim @ 2013-12-23 0:38 UTC (permalink / raw)
To: Chanho Min
Cc: Phillip Lougher, linux-kernel, 임효준,
이건호
On Sat, Dec 21, 2013 at 11:05:51AM +0900, Chanho Min wrote:
>
> > Please don't break thread.
> > You should reply to my mail instead of your original post.
> Sorry, It seems to be my mailer issue. I'm trying to fix it.
>
> > It's a result which isn't what I want to know.
> > What I wnat to know is why upper layer issues more I/O per second.
> > For example, you read 32K so MM layer will prepare 8 pages to read in but
> > at issuing at a first page, squashfs make 32 pages and fill the page cache
> > if we assume you use 128K compression so MM layer's already prepared 7
> > page
> > would be freed without further I/O and do_generic_file_read will wait for
> > completion by lock_page without further I/O queueing. It's not suprising.
> > One of page freed is a READA marked page so readahead couldn't work.
> > If readahead works, it would be just by luck. Actually, by simulation
> > 64K dd, I found readahead logic would be triggered but it's just by luck
> > and it's not intended, I think.
> MM layer's readahead pages would not be freed immediately.
> Squashfs can use them by grab_cache_page_nowait and READA marked page is available.
> Intentional or not, readahead works pretty well. I checked in experiment.
read_pages
for(page_idx ...) {
if (!add_to_page_cache_lru)) { <-- 1)
mapping->a_ops->readpage(filp, page)
squashfs_readpage
for (i ...) { 2) Here, 31 pages are inserted into page cache
grab_cache_page_nowait <------/
add_to_page_cache_lru
}
}
/*
* 1) will fail with EEXIST because of 2), so every page other than the first page
* in the list would be freed
*/
page_cache_release(page)
}
If you see ReadAhead works, it is just by luck as I told you.
Please simulate it with 64K dd.
>
> > If first issued I/O complete, squashfs decompress the I/O with 128K pages
> > so all 4 iteration(128K/32K) would be hit in page cache.
> > If all 128K hit in page cache, mm layer start to issue next I/O and
> > repeat above logic until you ends up reading all file size.
> > So my opition is that upper layer wouldn't issue more I/O logically.
> > If it worked, it's not what we expect but side-effect.
> >
> > That's why I'd like to know what's your thought for increasing IOPS.
> > Please, could you say your thought why IOPS increased, not a result
> > on low level driver?
> It is because readahead can works asynchronously in background.
> Suppose that you read a large file by 128k partially and contiguously
> like "dd bs=128k". Two IOs can be issued per 128k reading,
> First IO is for intended pages, second IO is for readahead.
> If first IO hit in cache thank to previous readahead, no wait for IO completion
> is needed, because intended page is up-to-date already.
> But, current squashfs waits for second IO's completion unnecessarily.
> That is one of reason that we should move page's up-to-date
> to the asynchronous area like my patch.
I understand it but your patch doesn't make it.
>
> > Anyway, in my opinion, we should take care of MM layer's readahead for
> > enhance sequential I/O. For it, we should use buffer pages passed by MM
> > instead of freeing them and allocating new pages in squashfs.
> > IMHO, it would be better to implement squashfs_readpages but my insight
> > is very weak so I guess Phillip will give more good idea/insight about
> > the issue.
> That's a good point. Also, I think my patch is another way which can be implemented
> without significant impact on current implementation and I wait for Phillip's comment.
>
> Thanks
> Chanho
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Kind regards,
Minchan Kim
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re : Re: Re: Re : Re: [PATCH] Squashfs: add asynchronous read support
2013-12-21 2:05 ` Chanho Min
2013-12-23 0:38 ` Minchan Kim
@ 2013-12-23 3:03 ` Chanho Min
2013-12-23 5:04 ` Minchan Kim
1 sibling, 1 reply; 9+ messages in thread
From: Chanho Min @ 2013-12-23 3:03 UTC (permalink / raw)
To: Minchan Kim
Cc: Phillip Lougher, linux-kernel, 임효준,
이건호
> read_pages
> for(page_idx ...) {
> if (!add_to_page_cache_lru)) { <-- 1)
> mapping->a_ops->readpage(filp, page)
> squashfs_readpage
> for (i ...) { 2) Here, 31 pages are inserted into page cache
> grab_cahe_page_nowait <------/
> add_to_page_cache_lru
> }
> }
> /*
> * 1) will be failed with EEXIST by 2) so every pages other than first page
> * in list would be freed
> */
> page_cache_release(page)
> }
>
> If you see ReadAhead works, it is just by luck as I told you.
> Please simulate it with 64K dd.
You're right. This luck happened frequently with 128k dd in my test.
> I understand it but your patch doesn't make it.
>
I think my patch can make it if readahead works normally or luckily.
Thanks a lot!
Chanho,
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] Squashfs: add asynchronous read support
2013-12-16 5:30 [PATCH] Squashfs: add asynchronous read support Chanho Min
2013-12-17 7:27 ` Minchan Kim
2013-12-18 4:29 ` Re : " Chanho Min
@ 2013-12-23 5:03 ` Phillip Lougher
2 siblings, 0 replies; 9+ messages in thread
From: Phillip Lougher @ 2013-12-23 5:03 UTC (permalink / raw)
To: Chanho Min
Cc: Phillip Lougher, linux-kernel, HyoJun Im, gunho.lee, Minchan Kim
On 16/12/13 05:30, Chanho Min wrote:
> This patch removes synchronous wait for the up-to-date of buffer in the
> file system level. Instead all operations after submit_bh are moved into
> the End-of-IO handler and its associated workeque. It decompresses/copies
> data into pages and unlock them asynchronously.
>
> This patch enhances the performance of Squashfs in most cases.
> Especially, large file reading is improved significantly.
Hi,
The following is a summary of the results from a set of
comprehensive tests of the asynchronous patch against the current
synchronous Squashfs readpage implementation.
The following tables should be fairly self-explanatory, but,
the testing methodology was:
Generate a series of Squashfs filesystems, with block size
1024K, 512K, 256K, 128K and 64K.
Then for each filesystem
Run "dd if=/mnt/file of=/dev/null bs=X"
Where X is 4K, 8K, 16K, 32K, 64K, 128K, 256K, 512K, and 1024K
For each dd, run it against six different Squashfs modules,
configured with the following different options:
1. SQUASHFS_READ_DATA_ASYNC selected, SQUASHFS_DECOMP_SINGLE selected
i.e. Async patch and single threaded decompression
== Asyn Single in following tables
2. SQUASHFS_READ_DATA_ASYNC *not* selected, SQUASHFS_DECOMP_SINGLE selected
i.e. No Async patch and single threaded decompression
== No Asyn Single in following tables
3. SQUASHFS_READ_DATA_ASYNC selected, SQUASHFS_DECOMP_MULTI selected
i.e. Async patch and multi-threaded decompression
== Asyn Multi in following tables
4. SQUASHFS_READ_DATA_ASYNC *not* selected, SQUASHFS_DECOMP_MULTI selected
i.e. No Async patch and multi-threaded decompression
== No Asyn Multi in following tables
5. SQUASHFS_READ_DATA_ASYNC selected, SQUASHFS_DECOMP_MULTI_PERCPU selected
i.e. Async patch and percpu multi-threaded decompression
== Asyn Percpu in following tables
6. SQUASHFS_READ_DATA_ASYNC *not* selected, SQUASHFS_DECOMP_MULTI_PERCPU selected
i.e. No Async patch and percpu multi-threaded decompression
== No Asyn Percpu in following tables
The figures in the following tables are the MB/s reported by dd.
The tests were performed on a KVM guest with 4 cores and 4Gb of
memory, running on a core i5 based host.
The Squashfs filesystem was on "/dev/hdb".
/mnt/file is a 3Gb file, average compression 22% (635 Mb)
Squashfs: gzip filesystem 1024K blocks
Asyn No Asyn Asyn No Asyn Asyn No Asyn
Single Single Multi Multi Percpu Percpu
-------------------------------------------------------
4K: 89.4 97.5 89.9 98.1 90.6 99.1
8K: 89.9 99.0 89.7 99.4 90.3 99.4
16K: 90.6 99.8 90.8 100 90.2 97.0
32K: 90.3 98.7 90.3 98.0 89.9 101
64K: 90.3 97.6 90.2 97.1 90.1 99.7
128K: 90.4 98.6 90.2 97.6 90.7 98.5
256K: 89.7 96.9 89.8 99.2 90.2 101
512K: 89.7 98.9 90.8 98.1 89.4 97.8
1024K: 89.3 98.0 89.6 98.6 88.7 96.4
Squashfs: gzip filesystem 512K blocks
Asyn No Asyn Asyn No Asyn Asyn No Asyn
Single Single Multi Multi Percpu Percpu
-------------------------------------------------------
4K: 68.5 94.9 67.6 99.0 68.9 97.0
8K: 69.3 101 68.9 94.3 69.0 97.2
16K: 68.9 98.6 69.4 98.9 68.8 98.0
32K: 68.6 96.5 69.4 98.9 69.4 108
64K: 68.7 92.9 69.7 101 68.8 98.2
128K: 67.4 102 68.7 90.3 69.4 100
256K: 68.7 95.1 68.2 99.7 68.5 97.7
512K: 69.9 114 82.0 104 74.2 94.4
1024K: 71.6 105 79.2 105 69.1 98.0
Squashfs: gzip filesystem 256K blocks
Asyn No Asyn Asyn No Asyn Asyn No Asyn
Single Single Multi Multi Percpu Percpu
-------------------------------------------------------
4K: 53.6 92.2 54.6 87.5 53.7 82.1
8K: 53.5 87.3 53.5 85.0 53.5 85.7
16K: 53.1 89.0 53.8 95.7 53.5 91.1
32K: 54.0 95.9 53.8 98.7 53.9 85.3
64K: 53.7 86.9 53.4 103 53.4 86.3
128K: 53.2 94.4 53.6 100 53.7 97.9
256K: 55.5 101 53.0 94.1 53.3 87.0
512K: 53.1 93.0 53.4 87.7 53.2 89.8
1024K: 53.2 91.4 52.7 91.3 53.0 95.4
A couple of points about the above can be noticed:
1. With a Squashfs block size of 256K and greater, Squashfs
readpage() does its own readahead. This means the asynchronous
readpage is never called multiply (to run in parallel), because
there is never any more work to do after the first readpage().
The above results therefore reflect the basic performance of
the asynchronous readpage implementation versus the
synchronous readpage implementation.
2. It can be seen in all cases the asynchronous readpage implementation
performs worse than the synchronous readpage implementation.
This varies from 10% worse (1024K blocks) to a huge 44% worse
(256K blocks).
In other words, the smaller the filesystem block size, the more
readpage calls, and the greater performance hit.
Squashfs: gzip filesystem 128K blocks
Asyn No Asyn Asyn No Asyn Asyn No Asyn
Single Single Multi Multi Percpu Percpu
-------------------------------------------------------
4K: 44.7 49.2 43.0 47.0 42.7 48.9
8K: 43.0 48.6 43.0 48.2 43.0 49.8
16K: 42.9 47.9 43.0 47.5 43.1 47.4
32K: 42.9 47.2 43.0 46.1 42.9 47.1
64K: 50.7 47.0 55.1 53.1 55.2 49.6
128K: 51.1 47.6 54.9 49.2 55.7 48.7
256K: 51.0 49.5 54.7 48.2 57.3 49.3
512K: 53.6 48.2 55.2 47.6 54.9 48.3
1024K: 51.1 47.9 54.7 47.5 54.6 46.5
A couple of points about the above can be noticed:
1. A filesystem block size of 128K means a single readpage call
no longer performs all the readahead. Thus allowing the
possibility of simultaneous asynchronous readpage calls.
2. dd block sizes of 32K and less show worse performance
for the asynchronous readpage implementation, similar to
the previous results.
An ftrace of the kernel shows no parallel async readpage calls
are taking place.
3. Block sizes of 64K and above show readahead taking place, with
parallel asynchronous readpages.
This is confirmed by an ftrace of the kernel.
The performance improvement ranges from 7.9% to 16%, or 1.079x and
1.16x the performance of the synchronous readpage() implementation.
Overall of the above 108 datapoint sets, 15 show an improvement
of between 7.9%-16%, and 93 show a performance decrease of
between 10%-44%. 86% of the performance tests are worse.
These are the not-terribly-good "good" results.
Worse is the fact I have got constant kernel oopses testing the
asynchronous patch (note I received no kernel oopses without the
asynchronous patch). I have included two of the kernel oopses
here. All the kernel oopses are different, but all point to the
fact the asynchronous patch introduces races with the page state
within the page cache/VFS layer.
1. Ooops 1.
BUG: Bad page state in process kworker/0:3 pfn:11f618
page:ffffea00047d8600 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2ea8
page flags: 0x800000000000000a(error|uptodate)
BUG: Bad page state in process kworker/0:3 pfn:5020b
page:ffffea00014082c0 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2ea9
page flags: 0x400000000000000a(error|uptodate)
BUG: Bad page state in process kworker/0:3 pfn:5020a
page:ffffea0001408280 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2eaa
page flags: 0x400000000000000a(error|uptodate)
BUG: Bad page state in process kworker/0:3 pfn:50209
page:ffffea0001408240 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2eab
page flags: 0x400000000000000a(error|uptodate)
BUG: Bad page state in process kworker/0:3 pfn:50208
page:ffffea0001408200 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2eac
page flags: 0x400000000000000a(error|uptodate)
BUG: Bad page state in process kworker/0:3 pfn:50207
page:ffffea00014081c0 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2ead
page flags: 0x400000000000000a(error|uptodate)
BUG: Bad page state in process kworker/0:3 pfn:50205
page:ffffea0001408140 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2eae
page flags: 0x400000000000000a(error|uptodate)
BUG: Bad page state in process kworker/0:3 pfn:50204
page:ffffea0001408100 count:0 mapcount:0 mapping:ffff88003c1081b8 index:0x2eaf
page flags: 0x400000000000000a(error|uptodate)
stack segment: 0000 [#1] SMP
Modules linked in: squashfs [last unloaded: squashfs]
CPU: 0 PID: 2698 Comm: klogd Tainted: G B 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff8800da85cec0 ti: ffff88011a6b6000 task.ti: ffff88011a6b6000
RIP: 0010:[<ffffffff81106026>] [<ffffffff81106026>] __kmalloc_track_caller+0x56/0x130
RSP: 0018:ffff88011a6b7c20 EFLAGS: 00010206
RAX: 0000000000000000 RBX: ffff8800dbbd9700 RCX: 000000000009689f
RDX: 000000000009689e RSI: 0000000000000000 RDI: 00000000000001bf
RBP: 32206365443e313c R08: 0000000000015480 R09: 0000000000000003
R10: ffff8800da85cec0 R11: 0000000000000246 R12: 00000000000106d0
R13: 00000000000001c0 R14: ffffffff81658bb0 R15: ffff88011b001840
FS: 00007fe94d6c36f0(0000) GS:ffff88011fc00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffff600400 CR3: 000000011a60c000 CR4: 00000000000006f0
Stack:
ffff8800dbbd9700 ffff88011a6b7c8f 00000000000004d0 00000000000001c0
0000000000000000 7fffffffffffffff ffffffff81658ae9 ffff8800dbbd9700
00000000000004d0 0000000000000000 ffff88011b001900 000000000000004d
Call Trace:
[<ffffffff81658ae9>] ? __kmalloc_reserve.isra.40+0x29/0x80
[<ffffffff81658bb0>] ? __alloc_skb+0x70/0x280
[<ffffffff81652b77>] ? sock_alloc_send_pskb+0x197/0x3d0
[<ffffffff816f0ce5>] ? unix_dgram_sendmsg+0x165/0x640
[<ffffffff81113aaf>] ? path_lookupat+0x6f/0x790
[<ffffffff8164f8b5>] ? sock_aio_write+0xf5/0x130
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff81107e8c>] ? do_sync_write+0x5c/0x90
[<ffffffff81108575>] ? vfs_write+0x155/0x1c0
[<ffffffff81108b48>] ? SyS_write+0x48/0xa0
[<ffffffff81725352>] ? system_call_fastpath+0x16/0x1b
Code: 89 c7 76 61 49 8b 0f 65 48 03 0c 25 c8 cc 00 00 48 8b 51 08 48 8b 29 48 85 ed 0f 84 a8 00 00 00 49 63 47 20 48 8d 4a 01 4d 8b 07 <48> 8b 5c 05 00 48 89 e8 65 49 0f c7 08 0f 94 c0 84 c0 74 c5 49
RIP [<ffffffff81106026>] __kmalloc_track_caller+0x56/0x130
RSP <ffff88011a6b7c20>
---[ end trace 5e8d5e5a2d98a915 ]---
note: klogd[2698] exited with preempt_count 1
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1495 at fs/buffer.c:1204 __find_get_block+0x1c5/0x220()
VFS: brelse: Trying to free free buffer
Modules linked in: squashfs [last unloaded: squashfs]
CPU: 0 PID: 1495 Comm: jbd2/hda1-8 Tainted: G B D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
0000000000000009 ffffffff8171e362 ffff8800dbb43b38 ffffffff8103ca27
ffff880113bbb270 ffff8800dbb43b88 0000000000000008 ffff880113bbb000
0000000000001000 ffffffff8103ca8c ffffffff819818c8 0000000000000018
Call Trace:
[<ffffffff8171e362>] ? dump_stack+0x41/0x51
[<ffffffff8103ca27>] ? warn_slowpath_common+0x77/0x90
[<ffffffff8103ca8c>] ? warn_slowpath_fmt+0x4c/0x50
[<ffffffff81136b85>] ? __find_get_block+0x1c5/0x220
[<ffffffff811370ae>] ? __getblk+0x9e/0x2e0
[<ffffffff811f2980>] ? jbd2_journal_get_descriptor_buffer+0x30/0x90
[<ffffffff811eaa71>] ? journal_submit_commit_record.isra.10+0x61/0x1c0
[<ffffffff811ebcae>] ? jbd2_journal_commit_transaction+0x10de/0x17d0
[<ffffffff81065d13>] ? try_to_wake_up+0xd3/0x290
[<ffffffff811f0ee5>] ? kjournald2+0xb5/0x250
[<ffffffff8106f980>] ? __wake_up_sync+0x10/0x10
[<ffffffff811f0e30>] ? commit_timeout+0x10/0x10
[<ffffffff8105a226>] ? kthread+0xc6/0xe0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
[<ffffffff817252ac>] ? ret_from_fork+0x7c/0xb0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
---[ end trace 5e8d5e5a2d98a916 ]---
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1495 at fs/buffer.c:1204 __find_get_block+0x1c5/0x220()
VFS: brelse: Trying to free free buffer
Modules linked in: squashfs [last unloaded: squashfs]
CPU: 0 PID: 1495 Comm: jbd2/hda1-8 Tainted: G B D W 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
0000000000000009 ffffffff8171e362 ffff8800dbb43b98 ffffffff8103ca27
ffff880113bbb2d8 ffff8800dbb43be8 0000000000000008 ffff880060e47f70
0000000000001000 ffffffff8103ca8c ffffffff819818c8 0000000000000018
Call Trace:
[<ffffffff8171e362>] ? dump_stack+0x41/0x51
[<ffffffff8103ca27>] ? warn_slowpath_common+0x77/0x90
[<ffffffff8103ca8c>] ? warn_slowpath_fmt+0x4c/0x50
[<ffffffff81136b85>] ? __find_get_block+0x1c5/0x220
[<ffffffff811370ae>] ? __getblk+0x9e/0x2e0
[<ffffffff811f2980>] ? jbd2_journal_get_descriptor_buffer+0x30/0x90
[<ffffffff811eb6aa>] ? jbd2_journal_commit_transaction+0xada/0x17d0
[<ffffffff817209ea>] ? __schedule+0x2aa/0x810
[<ffffffff81065d13>] ? try_to_wake_up+0xd3/0x290
[<ffffffff811f0ee5>] ? kjournald2+0xb5/0x250
[<ffffffff8106f980>] ? __wake_up_sync+0x10/0x10
[<ffffffff811f0e30>] ? commit_timeout+0x10/0x10
[<ffffffff8105a226>] ? kthread+0xc6/0xe0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
[<ffffffff817252ac>] ? ret_from_fork+0x7c/0xb0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
---[ end trace 5e8d5e5a2d98a917 ]---
2. Oops 2
BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
IP: [<ffffffffa05c8ad6>] zlib_uncompress+0xc6/0x190 [squashfs]
PGD 69239067 PUD 89e0e067 PMD 0
Oops: 0000 [#1] SMP
Modules linked in: squashfs [last unloaded: squashfs]
CPU: 1 PID: 4877 Comm: kworker/1:3 Not tainted 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
Workqueue: events squashfs_async_work [squashfs]
task: ffff8800da9f3b10 ti: ffff880074e46000 task.ti: ffff880074e46000
RIP: 0010:[<ffffffffa05c8ad6>] [<ffffffffa05c8ad6>] zlib_uncompress+0xc6/0x190 [squashfs]
RSP: 0018:ffff880074e47d18 EFLAGS: 00010283
RAX: 00000000000002f4 RBX: 0000000000000003 RCX: 0000000000000000
RDX: ffff880000000000 RSI: 0000000000000000 RDI: ffff8800db3bc640
RBP: ffff88011a62e600 R08: 000000000000010c R09: 0000000000000881
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000881
R13: 0000000000000000 R14: 000000000000010c R15: ffff88011a5e0c60
FS: 0000000000000000(0000) GS:ffff88011fc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000028 CR3: 0000000057bd8000 CR4: 00000000000006e0
Stack:
ffff88011a5e08a8 ffff8800db270800 ffff8800db270800 ffff88011a5e08a8
ffff88011a5e08a0 ffff88011a62e600 0000000000000003 000000000000010c
ffffffffa05c83f4 ffff8800db3bc640 ffff8800db3bc640 00000881000127c0
Call Trace:
[<ffffffffa05c83f4>] ? squashfs_decompress+0x64/0xa0 [squashfs]
[<ffffffffa05c4109>] ? squashfs_buffer_read_async+0x69/0x2e0 [squashfs]
[<ffffffff810536d5>] ? process_one_work+0x175/0x3f0
[<ffffffff81054224>] ? worker_thread+0x114/0x3a0
[<ffffffff81054110>] ? manage_workers.isra.26+0x290/0x290
[<ffffffff8105a226>] ? kthread+0xc6/0xe0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
[<ffffffff817252ac>] ? ret_from_fork+0x7c/0xb0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
Code: 8b 47 08 b9 01 00 00 00 85 c0 75 30 41 39 dd 7d 2b 48 8b 44 24 08 49 63 f5 48 8b 74 f5 00 8b 40 08 44 29 f0 44 39 e0 41 0f 4f c4 <4c> 03 76 28 41 29 c4 41 89 47 08 4d 89 37 45 31 f6 41 8b 77 20
RIP [<ffffffffa05c8ad6>] zlib_uncompress+0xc6/0x190 [squashfs]
RSP <ffff880074e47d18>
CR2: 0000000000000028
---[ end trace e66a56dcb454c870 ]---
note: kworker/1:3[4877] exited with preempt_count 1
BUG: unable to handle kernel paging request at ffffffffffffffc8
IP: [<ffffffff8105a85c>] kthread_data+0xc/0x20
PGD 1c0c067 PUD 1c0e067 PMD 0
Oops: 0000 [#2] SMP
Modules linked in: squashfs [last unloaded: squashfs]
CPU: 1 PID: 4877 Comm: kworker/1:3 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff8800da9f3b10 ti: ffff880074e46000 task.ti: ffff880074e46000
RIP: 0010:[<ffffffff8105a85c>] [<ffffffff8105a85c>] kthread_data+0xc/0x20
RSP: 0018:ffff880074e47988 EFLAGS: 00010002
RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff88011fc927c0
RDX: 000000000000b9c7 RSI: 0000000000000001 RDI: ffff8800da9f3b10
RBP: ffff880074e47a98 R08: 8000000000000000 R09: 0000000000000001
R10: 000000000000b885 R11: 0000000000000000 R12: ffff8800da9f3df0
R13: 0000000000000001 R14: ffff8800da9f3b00 R15: ffff8800da9f3b10
FS: 0000000000000000(0000) GS:ffff88011fc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000028 CR3: 0000000057bd8000 CR4: 00000000000006e0
Stack:
ffffffff81054d8d ffff88011fc927c0 ffffffff81720b3d ffff8800da9f3b10
ffff880074e47fd8 00000000000127c0 00000000000127c0 ffff8800da9f3b10
0000000000000001 0000000000000000 0000000000000206 ffff880074e47a68
Call Trace:
[<ffffffff81054d8d>] ? wq_worker_sleeping+0xd/0x80
[<ffffffff81720b3d>] ? __schedule+0x3fd/0x810
[<ffffffff812b22eb>] ? debug_object_active_state+0x12b/0x170
[<ffffffff8103d330>] ? will_become_orphaned_pgrp+0xb0/0xb0
[<ffffffff8103e072>] ? do_exit+0x6a2/0x9f0
[<ffffffff81005a8a>] ? oops_end+0x6a/0x90
[<ffffffff8171b805>] ? no_context+0x25e/0x26a
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff8106d263>] ? idle_balance+0x193/0x1a0
[<ffffffff817243fa>] ? _raw_spin_unlock_irq+0xa/0x10
[<ffffffff81062597>] ? finish_task_switch+0x57/0xa0
[<ffffffff817209ea>] ? __schedule+0x2aa/0x810
[<ffffffff81724be2>] ? page_fault+0x22/0x30
[<ffffffffa05c8ad6>] ? zlib_uncompress+0xc6/0x190 [squashfs]
[<ffffffffa05c8a48>] ? zlib_uncompress+0x38/0x190 [squashfs]
[<ffffffffa05c83f4>] ? squashfs_decompress+0x64/0xa0 [squashfs]
[<ffffffffa05c4109>] ? squashfs_buffer_read_async+0x69/0x2e0 [squashfs]
[<ffffffff810536d5>] ? process_one_work+0x175/0x3f0
[<ffffffff81054224>] ? worker_thread+0x114/0x3a0
[<ffffffff81054110>] ? manage_workers.isra.26+0x290/0x290
[<ffffffff8105a226>] ? kthread+0xc6/0xe0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
[<ffffffff817252ac>] ? ret_from_fork+0x7c/0xb0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
Code: 88 02 00 00 48 8b 40 b8 48 c1 e8 02 83 e0 01 c3 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 8b 87 88 02 00 00 <48> 8b 40 c8 c3 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f
RIP [<ffffffff8105a85c>] kthread_data+0xc/0x20
RSP <ffff880074e47988>
CR2: ffffffffffffffc8
---[ end trace e66a56dcb454c871 ]---
Fixing recursive fault but reboot is needed!
hda: lost interrupt
hda: ide_dma_sff_timer_expiry: DMA status (0x64)
hdc: lost interrupt
ðBUG: spinlock lockup suspected on CPU#1, kworker/1:3/4877
lock: 0xffff88011fc927c0, .magic: dead4ead, .owner: kworker/1:3/4877, .owner_cpu: 1
CPU: 1 PID: 4877 Comm: kworker/1:3 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
0000000088c1cfb8 ffffffff8171e362 ffff88011fc927c0 ffffffff81073169
ffff88011fc927c0 ffff880074e47708 0000000000000000 ffffffff817207de
ffff8800da9f3b10 ffff880074e47fd8 00000000000127c0 00000000000127c0
Call Trace:
[<ffffffff8171e362>] ? dump_stack+0x41/0x51
[<ffffffff81073169>] ? do_raw_spin_lock+0x69/0x140
[<ffffffff817207de>] ? __schedule+0x9e/0x810
[<ffffffff8107d450>] ? wake_up_klogd+0x30/0x40
[<ffffffff8171c3a5>] ? printk+0x54/0x56
[<ffffffff8103e2d9>] ? do_exit+0x909/0x9f0
[<ffffffff81005a8a>] ? oops_end+0x6a/0x90
[<ffffffff8171b805>] ? no_context+0x25e/0x26a
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff81099deb>] ? __sprint_symbol+0x8b/0xe0
[<ffffffff812a4c06>] ? string.isra.5+0x36/0xe0
[<ffffffff812a504c>] ? symbol_string.isra.10+0x5c/0xa0
[<ffffffff81724be2>] ? page_fault+0x22/0x30
[<ffffffff8105a85c>] ? kthread_data+0xc/0x20
[<ffffffff81054d8d>] ? wq_worker_sleeping+0xd/0x80
[<ffffffff81720b3d>] ? __schedule+0x3fd/0x810
[<ffffffff812b22eb>] ? debug_object_active_state+0x12b/0x170
[<ffffffff8103d330>] ? will_become_orphaned_pgrp+0xb0/0xb0
[<ffffffff8103e072>] ? do_exit+0x6a2/0x9f0
[<ffffffff81005a8a>] ? oops_end+0x6a/0x90
[<ffffffff8171b805>] ? no_context+0x25e/0x26a
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff8106d263>] ? idle_balance+0x193/0x1a0
[<ffffffff817243fa>] ? _raw_spin_unlock_irq+0xa/0x10
[<ffffffff81062597>] ? finish_task_switch+0x57/0xa0
[<ffffffff817209ea>] ? __schedule+0x2aa/0x810
[<ffffffff81724be2>] ? page_fault+0x22/0x30
[<ffffffffa05c8ad6>] ? zlib_uncompress+0xc6/0x190 [squashfs]
[<ffffffffa05c8a48>] ? zlib_uncompress+0x38/0x190 [squashfs]
[<ffffffffa05c83f4>] ? squashfs_decompress+0x64/0xa0 [squashfs]
[<ffffffffa05c4109>] ? squashfs_buffer_read_async+0x69/0x2e0 [squashfs]
[<ffffffff810536d5>] ? process_one_work+0x175/0x3f0
[<ffffffff81054224>] ? worker_thread+0x114/0x3a0
[<ffffffff81054110>] ? manage_workers.isra.26+0x290/0x290
[<ffffffff8105a226>] ? kthread+0xc6/0xe0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
[<ffffffff817252ac>] ? ret_from_fork+0x7c/0xb0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
sending NMI to all CPUs:
NMI backtrace for cpu 1
CPU: 1 PID: 4877 Comm: kworker/1:3 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff8800da9f3b10 ti: ffff880074e46000 task.ti: ffff880074e46000
RIP: 0010:[<ffffffff8102f8e7>] [<ffffffff8102f8e7>] flat_send_IPI_mask+0x57/0x80
RSP: 0018:ffff880074e475b8 EFLAGS: 00010046
RAX: 0000000000000c00 RBX: 0000000000000c00 RCX: 0000000000000006
RDX: ffffffff81c1e140 RSI: 0000000000000002 RDI: 0000000000000300
RBP: 0000000000000096 R08: 0000000000000400 R09: 0000000000000267
R10: 0000000000000266 R11: 0000000000000006 R12: 000000000000000f
R13: 0000000000000001 R14: ffffffffffffffc8 R15: ffff8800da9f3b10
FS: 0000000000000000(0000) GS:ffff88011fc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000028 CR3: 0000000057bd8000 CR4: 00000000000006e0
Stack:
0000000200000001 0000000000002710 0000000088c1cfb8 0000000088c1cfb8
ffffffff8102c9b7 ffff88011fc927c0 ffffffff8107316e ffff88011fc927c0
ffff880074e47708 0000000000000000 ffffffff817207de ffff8800da9f3b10
Call Trace:
[<ffffffff8102c9b7>] ? arch_trigger_all_cpu_backtrace+0x47/0x80
[<ffffffff8107316e>] ? do_raw_spin_lock+0x6e/0x140
[<ffffffff817207de>] ? __schedule+0x9e/0x810
[<ffffffff8107d450>] ? wake_up_klogd+0x30/0x40
[<ffffffff8171c3a5>] ? printk+0x54/0x56
[<ffffffff8103e2d9>] ? do_exit+0x909/0x9f0
[<ffffffff81005a8a>] ? oops_end+0x6a/0x90
[<ffffffff8171b805>] ? no_context+0x25e/0x26a
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff81099deb>] ? __sprint_symbol+0x8b/0xe0
[<ffffffff812a4c06>] ? string.isra.5+0x36/0xe0
[<ffffffff812a504c>] ? symbol_string.isra.10+0x5c/0xa0
[<ffffffff81724be2>] ? page_fault+0x22/0x30
[<ffffffff8105a85c>] ? kthread_data+0xc/0x20
[<ffffffff81054d8d>] ? wq_worker_sleeping+0xd/0x80
[<ffffffff81720b3d>] ? __schedule+0x3fd/0x810
[<ffffffff812b22eb>] ? debug_object_active_state+0x12b/0x170
[<ffffffff8103d330>] ? will_become_orphaned_pgrp+0xb0/0xb0
[<ffffffff8103e072>] ? do_exit+0x6a2/0x9f0
[<ffffffff81005a8a>] ? oops_end+0x6a/0x90
[<ffffffff8171b805>] ? no_context+0x25e/0x26a
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff8106d263>] ? idle_balance+0x193/0x1a0
[<ffffffff817243fa>] ? _raw_spin_unlock_irq+0xa/0x10
[<ffffffff81062597>] ? finish_task_switch+0x57/0xa0
[<ffffffff817209ea>] ? __schedule+0x2aa/0x810
[<ffffffff81724be2>] ? page_fault+0x22/0x30
[<ffffffffa05c8ad6>] ? zlib_uncompress+0xc6/0x190 [squashfs]
[<ffffffffa05c8a48>] ? zlib_uncompress+0x38/0x190 [squashfs]
[<ffffffffa05c83f4>] ? squashfs_decompress+0x64/0xa0 [squashfs]
[<ffffffffa05c4109>] ? squashfs_buffer_read_async+0x69/0x2e0 [squashfs]
[<ffffffff810536d5>] ? process_one_work+0x175/0x3f0
[<ffffffff81054224>] ? worker_thread+0x114/0x3a0
[<ffffffff81054110>] ? manage_workers.isra.26+0x290/0x290
[<ffffffff8105a226>] ? kthread+0xc6/0xe0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
[<ffffffff817252ac>] ? ret_from_fork+0x7c/0xb0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
Code: 25 00 b3 5f ff f6 c4 10 75 f2 44 89 e0 c1 e0 18 89 04 25 10 b3 5f ff 89 f0 09 d8 80 cf 04 83 fe 02 0f 44 c3 89 04 25 00 b3 5f ff <55> 9d 48 83 c4 08 5b 5d 41 5c c3 89 74 24 04 ff 92 50 01 00 00
NMI backtrace for cpu 2
INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too long to run: 115.138 msecs
CPU: 2 PID: 0 Comm: swapper/2 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff88011b07c1a0 ti: ffff88011b0fa000 task.ti: ffff88011b0fa000
RIP: 0010:[<ffffffff8100ae3b>] [<ffffffff8100ae3b>] default_idle+0x1b/0xa0
RSP: 0018:ffff88011b0fbf00 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0100000000000000
RDX: 0100000000000000 RSI: 0000000000000000 RDI: ffff88011fd0d280
RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011b0fbfd8
R13: ffff88011b0fbfd8 R14: ffffffff81cd8fd0 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff88011fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffffffffff600400 CR3: 000000011a7e3000 CR4: 00000000000006e0
Stack:
0000000000000000 ffff88011b0fbfd8 ffffffff8107f1a1 ffff88011b0fbfd8
0000000000000000 0000000000000000 0000000000000000 0000000000000000
0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
[<ffffffff8107f1a1>] ? cpu_startup_entry+0x141/0x220
Code: 9c 2d 05 00 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 8b 05 dd 6e cc 00 55 65 8b 2c 25 1c b0 00 00 53 85 c0 7f 17 fb f4 <8b> 05 c7 6e cc 00 65 8b 2c 25 1c b0 00 00 85 c0 7f 3b 5b 5d c3
INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too long to run: 141.182 msecs
NMI backtrace for cpu 0
CPU: 0 PID: 0 Comm: swapper/0 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffffffff81c10480 ti: ffffffff81c00000 task.ti: ffffffff81c00000
RIP: 0010:[<ffffffff8100ae3b>] [<ffffffff8100ae3b>] default_idle+0x1b/0xa0
RSP: 0018:ffffffff81c01f68 EFLAGS: 00000246
RAX: 0000000000000000 RBX: ffffffffffffffff RCX: 0100000000000000
RDX: 0100000000000000 RSI: 0000000000000000 RDI: ffff88011fc0d280
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff81c01fd8
R13: ffffffff81c01fd8 R14: ffffffff81cd8fd0 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff88011fc00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffffffffff600400 CR3: 00000000daa9f000 CR4: 00000000000006f0
Stack:
ffffffffffffffff ffffffff81c01fd8 ffffffff8107f1a1 ffffffff81c01fd8
ffffffffffffffff ffffffff81d82920 ffff88011ffcd540 ffffffff81d892c0
0000000000000000 0000000000000000 ffffffff81cf4d9c ffffffff81cf4849
Call Trace:
[<ffffffff8107f1a1>] ? cpu_startup_entry+0x141/0x220
[<ffffffff81cf4d9c>] ? start_kernel+0x376/0x381
[<ffffffff81cf4849>] ? repair_env_string+0x58/0x58
Code: 9c 2d 05 00 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 8b 05 dd 6e cc 00 55 65 8b 2c 25 1c b0 00 00 53 85 c0 7f 17 fb f4 <8b> 05 c7 6e cc 00 65 8b 2c 25 1c b0 00 00 85 c0 7f 3b 5b 5d c3
INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too long to run: 141.252 msecs
NMI backtrace for cpu 3
CPU: 3 PID: 0 Comm: swapper/3 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff88011b07c830 ti: ffff88011b0fc000 task.ti: ffff88011b0fc000
RIP: 0010:[<ffffffff8100ae3b>] [<ffffffff8100ae3b>] default_idle+0x1b/0xa0
RSP: 0018:ffff88011b0fdf00 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0100000000000000
RDX: 0100000000000000 RSI: 0000000000000000 RDI: ffff88011fd8d280
RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011b0fdfd8
R13: ffff88011b0fdfd8 R14: ffffffff81cd8fd0 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff88011fd80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffffffffff600400 CR3: 00000000dbb8f000 CR4: 00000000000006e0
Stack:
0000000000000000 ffff88011b0fdfd8 ffffffff8107f1a1 ffff88011b0fdfd8
0000000000000000 0000000000000000 0000000000000000 0000000000000000
0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
[<ffffffff8107f1a1>] ? cpu_startup_entry+0x141/0x220
Code: 9c 2d 05 00 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 8b 05 dd 6e cc 00 55 65 8b 2c 25 1c b0 00 00 53 85 c0 7f 17 fb f4 <8b> 05 c7 6e cc 00 65 8b 2c 25 1c b0 00 00 85 c0 7f 3b 5b 5d c3
hda: DMA interrupt recovery
hda: lost interrupt
hda: ide_dma_sff_timer_expiry: DMA status (0x64)
hda: DMA interrupt recovery
hda: lost interrupt
ide-atapi: cmd 0x3 timed out
hdc: lost interrupt
hda: lost interrupt
hda: lost interrupt
BUG: spinlock lockup suspected on CPU#3, syslogd/2719
lock: 0xffff88011fc927c0, .magic: dead4ead, .owner: kworker/1:3/4877, .owner_cpu: 1
CPU: 3 PID: 2719 Comm: syslogd Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
0000000088c1cfb8 ffffffff8171e362 ffff88011fc927c0 ffffffff81073169
ffff88011b36c1a0 00000000000127c0 ffff88011fc927c0 ffffffff81065fc6
0000000000000000 ffff88011b36c1a0 0000000000000000 0000000000000000
Call Trace:
[<ffffffff8171e362>] ? dump_stack+0x41/0x51
[<ffffffff81073169>] ? do_raw_spin_lock+0x69/0x140
[<ffffffff81065fc6>] ? wake_up_new_task+0x96/0x170
[<ffffffff8103bdaf>] ? do_fork+0x12f/0x2f0
[<ffffffff81122d50>] ? __fd_install+0x20/0x50
[<ffffffff817255f9>] ? stub_clone+0x69/0x90
[<ffffffff81725352>] ? system_call_fastpath+0x16/0x1b
sending NMI to all CPUs:
NMI backtrace for cpu 3
CPU: 3 PID: 2719 Comm: syslogd Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff8800d9d541a0 ti: ffff8800dbb8a000 task.ti: ffff8800dbb8a000
RIP: 0010:[<ffffffff8102f8e7>] [<ffffffff8102f8e7>] flat_send_IPI_mask+0x57/0x80
RSP: 0018:ffff8800dbb8be60 EFLAGS: 00010046
RAX: 0000000000000c00 RBX: 0000000000000c00 RCX: 000000000000fefd
RDX: ffffffff81c1e140 RSI: 0000000000000002 RDI: 0000000000000300
RBP: 0000000000000086 R08: 0000000000000400 R09: 00000000000002fe
R10: 00000000000002fd R11: 0000000000000006 R12: 000000000000000f
R13: ffff88011b36c6a8 R14: 0000000000000246 R15: 0000000000001311
FS: 00007f71050c86f0(0000) GS:ffff88011fd80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 000000000060ce00 CR3: 00000000daa9f000 CR4: 00000000000006e0
Stack:
0000000200000001 0000000000002710 0000000088c1cfb8 0000000088c1cfb8
ffffffff8102c9b7 ffff88011fc927c0 ffffffff8107316e ffff88011b36c1a0
00000000000127c0 ffff88011fc927c0 ffffffff81065fc6 0000000000000000
Call Trace:
[<ffffffff8102c9b7>] ? arch_trigger_all_cpu_backtrace+0x47/0x80
[<ffffffff8107316e>] ? do_raw_spin_lock+0x6e/0x140
[<ffffffff81065fc6>] ? wake_up_new_task+0x96/0x170
[<ffffffff8103bdaf>] ? do_fork+0x12f/0x2f0
[<ffffffff81122d50>] ? __fd_install+0x20/0x50
[<ffffffff817255f9>] ? stub_clone+0x69/0x90
[<ffffffff81725352>] ? system_call_fastpath+0x16/0x1b
Code: 25 00 b3 5f ff f6 c4 10 75 f2 44 89 e0 c1 e0 18 89 04 25 10 b3 5f ff 89 f0 09 d8 80 cf 04 83 fe 02 0f 44 c3 89 04 25 00 b3 5f ff <55> 9d 48 83 c4 08 5b 5d 41 5c c3 89 74 24 04 ff 92 50 01 00 00
NMI backtrace for cpu 1
CPU: 1 PID: 4877 Comm: kworker/1:3 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff8800da9f3b10 ti: ffff880074e46000 task.ti: ffff880074e46000
RIP: 0010:[<ffffffff81073222>] [<ffffffff81073222>] do_raw_spin_lock+0x122/0x140
RSP: 0018:ffff880074e475f0 EFLAGS: 00000002
RAX: 0000000000000027 RBX: ffff88011fc927c0 RCX: 000000007beba1ad
RDX: 0000000000000028 RSI: 0000000000000001 RDI: 0000000000230471
RBP: 0000000088c1cfb8 R08: 0000000000000400 R09: 0000000000000267
R10: 0000000000000266 R11: 0000000000000006 R12: 0000000088c1cfb8
R13: 0000000000000001 R14: ffffffffffffffc8 R15: ffff8800da9f3b10
FS: 0000000000000000(0000) GS:ffff88011fc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000028 CR3: 0000000057bd8000 CR4: 00000000000006e0
Stack:
ffff88011fc927c0 ffff880074e47708 0000000000000000 ffffffff817207de
ffff8800da9f3b10 ffff880074e47fd8 00000000000127c0 00000000000127c0
ffff8800da9f3b10 0000000000000000 0000000000000001 ffffffff8107d450
Call Trace:
[<ffffffff817207de>] ? __schedule+0x9e/0x810
[<ffffffff8107d450>] ? wake_up_klogd+0x30/0x40
[<ffffffff8171c3a5>] ? printk+0x54/0x56
[<ffffffff8103e2d9>] ? do_exit+0x909/0x9f0
[<ffffffff81005a8a>] ? oops_end+0x6a/0x90
[<ffffffff8171b805>] ? no_context+0x25e/0x26a
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff81099deb>] ? __sprint_symbol+0x8b/0xe0
[<ffffffff812a4c06>] ? string.isra.5+0x36/0xe0
[<ffffffff812a504c>] ? symbol_string.isra.10+0x5c/0xa0
[<ffffffff81724be2>] ? page_fault+0x22/0x30
[<ffffffff8105a85c>] ? kthread_data+0xc/0x20
[<ffffffff81054d8d>] ? wq_worker_sleeping+0xd/0x80
[<ffffffff81720b3d>] ? __schedule+0x3fd/0x810
[<ffffffff812b22eb>] ? debug_object_active_state+0x12b/0x170
[<ffffffff8103d330>] ? will_become_orphaned_pgrp+0xb0/0xb0
[<ffffffff8103e072>] ? do_exit+0x6a2/0x9f0
[<ffffffff81005a8a>] ? oops_end+0x6a/0x90
[<ffffffff8171b805>] ? no_context+0x25e/0x26a
[<ffffffff810342ff>] ? __do_page_fault+0x2af/0x460
[<ffffffff8106d263>] ? idle_balance+0x193/0x1a0
[<ffffffff817243fa>] ? _raw_spin_unlock_irq+0xa/0x10
[<ffffffff81062597>] ? finish_task_switch+0x57/0xa0
[<ffffffff817209ea>] ? __schedule+0x2aa/0x810
[<ffffffff81724be2>] ? page_fault+0x22/0x30
[<ffffffffa05c8ad6>] ? zlib_uncompress+0xc6/0x190 [squashfs]
[<ffffffffa05c8a48>] ? zlib_uncompress+0x38/0x190 [squashfs]
[<ffffffffa05c83f4>] ? squashfs_decompress+0x64/0xa0 [squashfs]
[<ffffffffa05c4109>] ? squashfs_buffer_read_async+0x69/0x2e0 [squashfs]
[<ffffffff810536d5>] ? process_one_work+0x175/0x3f0
[<ffffffff81054224>] ? worker_thread+0x114/0x3a0
[<ffffffff81054110>] ? manage_workers.isra.26+0x290/0x290
[<ffffffff8105a226>] ? kthread+0xc6/0xe0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
[<ffffffff817252ac>] ? ret_from_fork+0x7c/0xb0
[<ffffffff8105a160>] ? kthread_create_on_node+0x1a0/0x1a0
Code: 00 e9 40 ff ff ff 48 c7 c6 ae 72 9a 81 48 89 df e8 08 90 6a 00 e9 1b ff ff ff 48 c7 c6 cd 1c 97 81 e8 f7 8f 6a 00 e9 f7 fe ff ff <0f> b6 03 38 c2 0f 84 56 ff ff ff f3 90 90 eb f0 66 66 66 66 66
NMI backtrace for cpu 2
INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too long to run: 160.716 msecs
CPU: 2 PID: 0 Comm: swapper/2 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffff88011b07c1a0 ti: ffff88011b0fa000 task.ti: ffff88011b0fa000
RIP: 0010:[<ffffffff8100ae3b>] [<ffffffff8100ae3b>] default_idle+0x1b/0xa0
RSP: 0018:ffff88011b0fbf00 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0100000000000000
RDX: 0100000000000000 RSI: 0000000000000000 RDI: ffff88011fd0d280
RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011b0fbfd8
R13: ffff88011b0fbfd8 R14: ffffffff81cd8fd0 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff88011fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffffffffff600400 CR3: 00000000db23b000 CR4: 00000000000006e0
Stack:
0000000000000000 ffff88011b0fbfd8 ffffffff8107f1a1 ffff88011b0fbfd8
0000000000000000 0000000000000000 0000000000000000 0000000000000000
0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call Trace:
[<ffffffff8107f1a1>] ? cpu_startup_entry+0x141/0x220
Code: 9c 2d 05 00 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 8b 05 dd 6e cc 00 55 65 8b 2c 25 1c b0 00 00 53 85 c0 7f 17 fb f4 <8b> 05 c7 6e cc 00 65 8b 2c 25 1c b0 00 00 85 c0 7f 3b 5b 5d c3
INFO: NMI handler (arch_trigger_all_cpu_backtrace_handler) took too long to run: 202.778 msecs
NMI backtrace for cpu 0
CPU: 0 PID: 0 Comm: swapper/0 Tainted: G D 3.13.0-rc3+ #32
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
task: ffffffff81c10480 ti: ffffffff81c00000 task.ti: ffffffff81c00000
RIP: 0010:[<ffffffff8100ae3b>] [<ffffffff8100ae3b>] default_idle+0x1b/0xa0
RSP: 0018:ffffffff81c01f68 EFLAGS: 00000246
RAX: 0000000000000000 RBX: ffffffffffffffff RCX: 0100000000000000
RDX: 0100000000000000 RSI: 0000000000000000 RDI: ffff88011fc0d280
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff81c01fd8
R13: ffffffff81c01fd8 R14: ffffffff81cd8fd0 R15: 0000000000000000
FS: 0000000000000000(0000) GS:ffff88011fc00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000608060 CR3: 0000000031f37000 CR4: 00000000000006f0
Stack:
ffffffffffffffff ffffffff81c01fd8 ffffffff8107f1a1 ffffffff81c01fd8
ffffffffffffffff ffffffff81d82920 ffff88011ffcd540 ffffffff81d892c0
0000000000000000 0000000000000000 ffffffff81cf4d9c ffffffff81cf4849
Call Trace:
[<ffffffff8107f1a1>] ? cpu_startup_entry+0x141/0x220
[<ffffffff81cf4d9c>] ? start_kernel+0x376/0x381
[<ffffffff81cf4849>] ? repair_env_string+0x58/0x58
Code: 9c 2d 05 00 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 8b 05 dd 6e cc 00 55 65 8b 2c 25 1c b0 00 00 53 85 c0 7f 17 fb f4 <8b> 05 c7 6e cc 00 65 8b 2c 25 1c b0 00 00 85 c0 7f 3b 5b 5d c3
qemu: terminating on signal 15 from pid 7695
*****************************************************************
To summarise, I am entirely unconvinced about the merits of
adding the asynchronous patch. Two last points need to be made here:
1. Squashfs's readpage() is fairly unique and unusual, in that it
does readahead itself and adds multiple pages to the page cache in
one call, this is obviously because Squashfs blocks tend to be much
larger than the page size. Conversely, most other filesystems
readpage() routines add the page it has been called to read and
that is it. As such making Squashfs readpage() asynchronous
offers the real risk of introducing page races within code that
does not exist with the synchronous Squashfs readpage() implementation
or with other filesystem's asynchronous readpage() implementations.
The fact that the asynchronous Squashfs readpage() implementation
shows worse performance than the synchronous readpage() implementation
when multiple asynchronous readpage() calls do not take place, and
generates kernel oopses when multiple asynchronous readpage()
calls do take place, shows that the above fears are borne out.
2. An observation which may not be immediately obvious.
Let's look again at the performance results, specifically the best
results for the asynchronous patch where it outperforms
the synchronous readpage().
Squashfs: gzip filesystem 128K blocks
Asyn No Asyn Asyn No Asyn Asyn No Asyn
Single Single Multi Multi Percpu Percpu
-------------------------------------------------------
32K: 42.9 47.2 43.0 46.1 42.9 47.1
64K: 50.7 47.0 55.1 53.1 55.2 49.6
128K: 51.1 47.6 54.9 49.2 55.7 48.7
256K: 51.0 49.5 54.7 48.2 57.3 49.3
512K: 53.6 48.2 55.2 47.6 54.9 48.3
1024K: 51.1 47.9 54.7 47.5 54.6 46.5
Now compare them against the performance results for 256K block filesystems.
Squashfs: gzip filesystem 256K blocks
Asyn No Asyn Asyn No Asyn Asyn No Asyn
Single Single Multi Multi Percpu Percpu
-------------------------------------------------------
32K: 54.0 95.9 53.8 98.7 53.9 85.3
64K: 53.7 86.9 53.4 103 53.4 86.3
128K: 53.2 94.4 53.6 100 53.7 97.9
256K: 55.5 101 53.0 94.1 53.3 87.0
512K: 53.1 93.0 53.4 87.7 53.2 89.8
1024K: 53.2 91.4 52.7 91.3 53.0 95.4
So you're using a 128K block filesystem and add asynchronous readpage
to gain a 16% improvement. A somewhat better approach would be to
move to a 256K block filesystem, and gain a 74% performance boost.
Phillip
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: Re : Re: Re: Re : Re: [PATCH] Squashfs: add asynchronous read support
2013-12-23 3:03 ` Re : " Chanho Min
@ 2013-12-23 5:04 ` Minchan Kim
0 siblings, 0 replies; 9+ messages in thread
From: Minchan Kim @ 2013-12-23 5:04 UTC (permalink / raw)
To: Chanho Min
Cc: Phillip Lougher, linux-kernel, 임효준,
이건호
On Mon, Dec 23, 2013 at 12:03:39PM +0900, Chanho Min wrote:
>
>
> > read_pages
> > for(page_idx ...) {
> > if (!add_to_page_cache_lru)) { <-- 1)
> > mapping->a_ops->readpage(filp, page)
> > squashfs_readpage
> > for (i ...) { 2) Here, 31 pages are inserted into page cache
> > grab_cache_page_nowait <------/
> > add_to_page_cache_lru
> > }
> > }
> > /*
> > * 1) will be failed with EEXIST by 2) so every pages other than first page
> > * in list would be freed
> > */
> > page_cache_release(page)
> > }
> >
> > If you see ReadAhead works, it is just by luck as I told you.
> > Please simulate it with 64K dd.
> You're right. This luck happened frequently with 128k dd or my test.
Yeah, it was not intended by MM's readahead.
If you test it with squashfs 256K compression, you couldn't get a benefit.
If you test it with small block size dd like 32K, you couldn't, either.
It means it's very fragile. One more thing: your approach doesn't work if the
page cache already has some sparse pages, because you are solving only the
direct page copy part, which couldn't work if we read some sparse pages
in a file and many pages were reclaimed.
Please rethink.
I already explained what's the problem in your patch.
You are ignoring VM's logic. (ex, PageReadahead mark)
The squashfs is rather special due to compression FS so if we have no other way,
I'd like to support your approach but I pointed out problem in your patch and
suggest my solution to overcome the problem. It could be silly but at least,
it's time that you should prove why it's brain-damaged so the maintainer will
review this thread and decide or suggest easily. :)
Here goes again.
I suggest it would be better to implement squashfs_readpages, and it should
work with cache buffers instead of the direct page cache so that it could copy
from cache buffers to pages passed by MM without freeing them, so that it
preserves readahead-hinted pages and would work well with VM's readahead
1. although the algorithm in readahead were changed, 2. although you use small
block size dd, 3. although you use other compression size of squashfs.
Thanks.
>
> > I understand it but your patch doesn't make it.
> >
> I think my patch can make it if readahead works normally or luckily.
>
> Thanks a lot!
> Chanho,
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Kind regards,
Minchan Kim
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2013-12-23 5:04 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-12-16 5:30 [PATCH] Squashfs: add asynchronous read support Chanho Min
2013-12-17 7:27 ` Minchan Kim
2013-12-18 4:29 ` Re : " Chanho Min
2013-12-18 5:24 ` Minchan Kim
2013-12-21 2:05 ` Chanho Min
2013-12-23 0:38 ` Minchan Kim
2013-12-23 3:03 ` Re : " Chanho Min
2013-12-23 5:04 ` Minchan Kim
2013-12-23 5:03 ` Phillip Lougher
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox