* [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-28  8:15 [PATCH 1/2] zram: free meta table in zram_meta_free Minchan Kim
@ 2015-01-28  8:15 ` Minchan Kim
  2015-01-28 14:56   ` Sergey Senozhatsky
  2015-01-29 13:48   ` Ganesh Mahendran
  0 siblings, 2 replies; 43+ messages in thread
From: Minchan Kim @ 2015-01-28  8:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Nitin Gupta, Jerome Marchand,
	Sergey Senozhatsky, Ganesh Mahendran, Minchan Kim

An admin could reset zram while I/O is in flight, so we have used
zram->init_lock as a read-side lock in the I/O path to prevent the
zram meta data from suddenly being freed.

However, the init_lock is really troublesome.
We can't call zram_meta_alloc under init_lock due to a lockdep splat,
because zram_rw_page is one of the functions called from the reclaim
path and holds init_lock as a read lock, while other places in process
context hold it as a write lock. So we have done the allocation outside
the lock to avoid the lockdep warning, but it's not good for
readability, and finally I met another lockdep splat between init_lock
and cpu_hotplug, from kmem_cache_destroy, while working on zsmalloc
compaction. :(

Ideally, we would remove the horrible init_lock of zram from the rw
path. This patch removes it from the rw path and instead adds an
init_done bool variable, checked with smp_[wmb|rmb], and uses
srcu_[un]read_lock to prevent the zram meta from suddenly being freed
during I/O.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/block/zram/zram_drv.c | 85 +++++++++++++++++++++++++++++++------------
 drivers/block/zram/zram_drv.h |  5 +++
 2 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a598ada817f0..b33add453027 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <linux/err.h>
+#include <linux/srcu.h>
 
 #include "zram_drv.h"
 
@@ -53,9 +54,31 @@ static ssize_t name##_show(struct device *d,		\
 }									\
 static DEVICE_ATTR_RO(name);
 
-static inline int init_done(struct zram *zram)
+static inline bool init_done(struct zram *zram)
 {
-	return zram->meta != NULL;
+	/*
+	 * init_done can be used without holding zram->init_lock in
+	 * the read/write handler (ie, zram_make_request), but we should
+	 * make sure that zram->init_done is set only after the meta
+	 * initialization is done. See setup_init_done.
+	 */
+	bool ret = zram->init_done;
+
+	if (ret)
+		smp_rmb(); /* pair with setup_init_done */
+
+	return ret;
+}
+
+static inline void setup_init_done(struct zram *zram, bool flag)
+{
+	/*
+	 * Stores to struct zram fields should complete
+	 * before zram->init_done is set, because zram_bvec_rw
+	 * doesn't hold zram->init_lock.
+	 */
+	smp_wmb();
+	zram->init_done = flag;
 }
 
 static inline struct zram *dev_to_zram(struct device *dev)
@@ -326,6 +349,10 @@ static void zram_meta_free(struct zram_meta *meta)
 	kfree(meta);
 }
 
+static void rcu_zram_do_nothing(struct rcu_head *unused)
+{
+}
+
 static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
 {
 	char pool_name[8];
@@ -726,11 +753,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 		return;
 	}
 
-	zcomp_destroy(zram->comp);
 	zram->max_comp_streams = 1;
 
-	zram_meta_free(zram->meta);
-	zram->meta = NULL;
 	/* Reset stats */
 	memset(&zram->stats, 0, sizeof(zram->stats));
 
@@ -738,8 +762,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 	if (reset_capacity)
 		set_capacity(zram->disk, 0);
 
+	setup_init_done(zram, false);
+	call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
+	synchronize_srcu(&zram->srcu);
+	zram_meta_free(zram->meta);
+	zcomp_destroy(zram->comp);
 	up_write(&zram->init_lock);
-
 	/*
 	 * Revalidate disk out of the init_lock to avoid lockdep splat.
 	 * It's okay because disk's capacity is protected by init_lock
@@ -762,10 +790,19 @@ static ssize_t disksize_store(struct device *dev,
 	if (!disksize)
 		return -EINVAL;
 
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Cannot change disksize for initialized device\n");
+		up_write(&zram->init_lock);
+		return -EBUSY;
+	}
+
 	disksize = PAGE_ALIGN(disksize);
 	meta = zram_meta_alloc(zram->disk->first_minor, disksize);
-	if (!meta)
+	if (!meta) {
+		up_write(&zram->init_lock);
 		return -ENOMEM;
+	}
 
 	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
 	if (IS_ERR(comp)) {
@@ -775,17 +812,11 @@ static ssize_t disksize_store(struct device *dev,
 		goto out_free_meta;
 	}
 
-	down_write(&zram->init_lock);
-	if (init_done(zram)) {
-		pr_info("Cannot change disksize for initialized device\n");
-		err = -EBUSY;
-		goto out_destroy_comp;
-	}
-
 	zram->meta = meta;
 	zram->comp = comp;
 	zram->disksize = disksize;
 	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	setup_init_done(zram, true);
 	up_write(&zram->init_lock);
 
 	/*
@@ -797,10 +828,8 @@ static ssize_t disksize_store(struct device *dev,
 
 	return len;
 
-out_destroy_comp:
-	up_write(&zram->init_lock);
-	zcomp_destroy(comp);
 out_free_meta:
+	up_write(&zram->init_lock);
 	zram_meta_free(meta);
 	return err;
 }
@@ -905,9 +934,10 @@ out:
  */
 static void zram_make_request(struct request_queue *queue, struct bio *bio)
 {
+	int idx;
 	struct zram *zram = queue->queuedata;
 
-	down_read(&zram->init_lock);
+	idx = srcu_read_lock(&zram->srcu);
 	if (unlikely(!init_done(zram)))
 		goto error;
 
@@ -918,12 +948,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 	}
 
 	__zram_make_request(zram, bio);
-	up_read(&zram->init_lock);
+	srcu_read_unlock(&zram->srcu, idx);
 
 	return;
 
 error:
-	up_read(&zram->init_lock);
+	srcu_read_unlock(&zram->srcu, idx);
 	bio_io_error(bio);
 }
 
@@ -945,18 +975,20 @@ static void zram_slot_free_notify(struct block_device *bdev,
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, int rw)
 {
-	int offset, err;
+	int offset, err, idx;
 	u32 index;
 	struct zram *zram;
 	struct bio_vec bv;
 
 	zram = bdev->bd_disk->private_data;
+	idx = srcu_read_lock(&zram->srcu);
+
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
 		atomic64_inc(&zram->stats.invalid_io);
+		srcu_read_unlock(&zram->srcu, idx);
 		return -EINVAL;
 	}
 
-	down_read(&zram->init_lock);
 	if (unlikely(!init_done(zram))) {
 		err = -EIO;
 		goto out_unlock;
@@ -971,7 +1003,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 
 	err = zram_bvec_rw(zram, &bv, index, offset, rw);
 out_unlock:
-	up_read(&zram->init_lock);
+	srcu_read_unlock(&zram->srcu, idx);
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
 	 * calling page_endio.
@@ -1041,6 +1073,11 @@ static int create_device(struct zram *zram, int device_id)
 
 	init_rwsem(&zram->init_lock);
 
+	if (init_srcu_struct(&zram->srcu)) {
+		pr_err("Error initializing srcu for device %d\n", device_id);
+		goto out;
+	}
+
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!zram->queue) {
 		pr_err("Error allocating disk queue for device %d\n",
@@ -1125,8 +1162,8 @@ static void destroy_device(struct zram *zram)
 
 	del_gendisk(zram->disk);
 	put_disk(zram->disk);
-
 	blk_cleanup_queue(zram->queue);
+	cleanup_srcu_struct(&zram->srcu);
 }
 
 static int __init zram_init(void)
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e492f6bf11f1..2042c310aea8 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -105,8 +105,13 @@ struct zram {
 	struct gendisk *disk;
 	struct zcomp *comp;
 
+	struct srcu_struct srcu;
+	struct rcu_head rcu;
+
 	/* Prevent concurrent execution of device init, reset and R/W request */
 	struct rw_semaphore init_lock;
+	bool init_done;
+
 	/*
 	 * This is the limit on amount of *uncompressed* worth of data
 	 * we can store in a disk.
-- 
1.9.1


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-28  8:15 ` [PATCH v1 2/2] zram: remove init_lock in zram_make_request Minchan Kim
@ 2015-01-28 14:56   ` Sergey Senozhatsky
  2015-01-28 15:04     ` Sergey Senozhatsky
  2015-01-28 23:33     ` Minchan Kim
  2015-01-29 13:48   ` Ganesh Mahendran
  1 sibling, 2 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28 14:56 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Nitin Gupta,
	Jerome Marchand, Sergey Senozhatsky, Ganesh Mahendran

On (01/28/15 17:15), Minchan Kim wrote:
> [...]
> -static inline int init_done(struct zram *zram)
> +static inline bool init_done(struct zram *zram)
>  {
> -	return zram->meta != NULL;
> +	/*
> +	 * init_done can be used without holding zram->init_lock in
> +	 * the read/write handler (ie, zram_make_request), but we should
> +	 * make sure that zram->init_done is set only after the meta
> +	 * initialization is done. See setup_init_done.
> +	 */
> +	bool ret = zram->init_done;

I don't like re-introduced ->init_done.
another idea... how about using `zram->disksize == 0' instead of
`->init_done' (previously `->meta != NULL')? should do the trick.


and I'm not sure I get this rmb...

	-ss


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-28 14:56   ` Sergey Senozhatsky
@ 2015-01-28 15:04     ` Sergey Senozhatsky
  2015-01-28 23:33     ` Minchan Kim
  1 sibling, 0 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-28 15:04 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel, linux-mm,
	Nitin Gupta, Jerome Marchand, Ganesh Mahendran,
	sergey.senozhatsky.work

On (01/28/15 23:56), Sergey Senozhatsky wrote:
> > -static inline int init_done(struct zram *zram)
> > +static inline bool init_done(struct zram *zram)
> >  {
> > -	return zram->meta != NULL;
> > +	/*
> > +	 * init_done can be used without holding zram->init_lock in
> > +	 * read/write handler(ie, zram_make_request) but we should make sure
> > +	 * that zram->init_done should set up after meta initialization is
> > +	 * done. Look at setup_init_done.
> > +	 */
> > +	bool ret = zram->init_done;
> 
> I don't like re-introduced ->init_done.
> another idea... how about using `zram->disksize == 0' instead of
> `->init_done' (previously `->meta != NULL')? should do the trick.
> 

a typo, I meant `->disksize != 0'.
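
i.e., roughly (a sketch, assuming ->disksize is only set and cleared
under zram->init_lock):

	static inline bool init_done(struct zram *zram)
	{
		return zram->disksize != 0;
	}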

	-ss


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-28 14:56   ` Sergey Senozhatsky
  2015-01-28 15:04     ` Sergey Senozhatsky
@ 2015-01-28 23:33     ` Minchan Kim
  2015-01-29  1:57       ` Sergey Senozhatsky
  1 sibling, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-01-28 23:33 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, linux-kernel, linux-mm, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On Wed, Jan 28, 2015 at 11:56:51PM +0900, Sergey Senozhatsky wrote:
> On (01/28/15 17:15), Minchan Kim wrote:
> > [...]
> > -static inline int init_done(struct zram *zram)
> > +static inline bool init_done(struct zram *zram)
> >  {
> > -	return zram->meta != NULL;
> > +	/*
> > +	 * init_done can be used without holding zram->init_lock in
> > +	 * the read/write handler (ie, zram_make_request), but we should
> > +	 * make sure that zram->init_done is set only after the meta
> > +	 * initialization is done. See setup_init_done.
> > +	 */
> > +	bool ret = zram->init_done;
> 
> I don't like re-introduced ->init_done.
> another idea... how about using `zram->disksize == 0' instead of
> `->init_done' (previously `->meta != NULL')? should do the trick.

It could be.

> 
> 
> and I'm not sure I get this rmb...

What makes you not sure?
I think it's a clear and common pattern for smp_[wmb|rmb]. :)
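
For reference, the pairing is meant to work roughly like this (a
minimal sketch of the pattern, not the exact zram code; use() just
stands in for the real I/O work):

	/* writer (publish side), under init_lock */
	zram->meta = meta;		/* initialize everything first... */
	smp_wmb();			/* ...and order those stores... */
	zram->init_done = true;		/* ...before publishing the flag */

	/* reader (subscribe side), without init_lock */
	if (zram->init_done) {
		smp_rmb();		/* pairs with the smp_wmb() above */
		use(zram->meta);	/* meta stores are now visible */
	}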


> 
> 	-ss

-- 
Kind regards,
Minchan Kim


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-28 23:33     ` Minchan Kim
@ 2015-01-29  1:57       ` Sergey Senozhatsky
  2015-01-29  2:01         ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-29  1:57 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel@vger.kernel.org, Linux-MM,
	Nitin Gupta, Jerome Marchand, Ganesh Mahendran,
	sergey.senozhatsky.work


On Thu, Jan 29, 2015 at 8:33 AM, Minchan Kim <minchan@kernel.org> wrote:

> On Wed, Jan 28, 2015 at 11:56:51PM +0900, Sergey Senozhatsky wrote:
> > I don't like re-introduced ->init_done.
> > another idea... how about using `zram->disksize == 0' instead of
> > `->init_done' (previously `->meta != NULL')? should do the trick.
>
> It could be.
>
>
care to change it?



> >
> >
> > and I'm not sure I get this rmb...
>
> What makes you not sure?
> I think it's a clear and common pattern for smp_[wmb|rmb]. :)
>


well... what does that "if (ret)" give us? it's almost always true, because
the device is initialized during read/write operations (in 99.99% of cases).
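
i.e., the conditional could arguably just go away, roughly (a sketch of
the unconditional variant):

	static inline bool init_done(struct zram *zram)
	{
		bool ret = zram->init_done;

		smp_rmb();	/* pairs with smp_wmb() in setup_init_done() */
		return ret;
	}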

-ss


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  1:57       ` Sergey Senozhatsky
@ 2015-01-29  2:01         ` Minchan Kim
  2015-01-29  2:22           ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-01-29  2:01 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, linux-kernel@vger.kernel.org, Linux-MM,
	Nitin Gupta, Jerome Marchand, Ganesh Mahendran,
	sergey.senozhatsky.work

On Thu, Jan 29, 2015 at 10:57:38AM +0900, Sergey Senozhatsky wrote:
> On Thu, Jan 29, 2015 at 8:33 AM, Minchan Kim <minchan@kernel.org> wrote:
> 
> > On Wed, Jan 28, 2015 at 11:56:51PM +0900, Sergey Senozhatsky wrote:
> > > I don't like re-introduced ->init_done.
> > > another idea... how about using `zram->disksize == 0' instead of
> > > `->init_done' (previously `->meta != NULL')? should do the trick.
> >
> > It could be.
> >
> >
> care to change it?

Will try!

> 
> 
> 
> > >
> > >
> > > and I'm not sure I get this rmb...
> >
> > What makes you not sure?
> > I think it's a clear and common pattern for smp_[wmb|rmb]. :)
> >
> 
> 
> well... what does that "if (ret)" give us? it's almost always true, because
> the device is initialized during read/write operations (in 99.99% of cases).

If that was your concern, I'm happy to remove the check. (Actually, I
realized that myself right after I pushed the send button.) Thanks!

> 
> -ss

-- 
Kind regards,
Minchan Kim


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  2:01         ` Minchan Kim
@ 2015-01-29  2:22           ` Sergey Senozhatsky
  2015-01-29  5:28             ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-29  2:22 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran,
	sergey.senozhatsky.work

On (01/29/15 11:01), Minchan Kim wrote:
> On Thu, Jan 29, 2015 at 10:57:38AM +0900, Sergey Senozhatsky wrote:
> > On Thu, Jan 29, 2015 at 8:33 AM, Minchan Kim <minchan@kernel.org> wrote:
> > > On Wed, Jan 28, 2015 at 11:56:51PM +0900, Sergey Senozhatsky wrote:
> > > > I don't like re-introduced ->init_done.
> > > > another idea... how about using `zram->disksize == 0' instead of
> > > > `->init_done' (previously `->meta != NULL')? should do the trick.
> > >
> > > It could be.
> > >
> > >
> > care to change it?
> 
> Will try!
> 
> If that was your concern, I'm happy to remove the check. (Actually, I
> realized that myself right after I pushed the send button.) Thanks!
> 

Thanks a lot, Minchan.

and, guys, sorry for the previous html email (I was sure I had toggled
the "plain text" mode in the gmail web-interface, but somehow it has a
different meaning in the gmail world).


I'm still concerned about the performance numbers that I see on my x86_64.
it's not always, but mostly, slower. I'll give it another try (disable
lockdep, etc.), but if we lose 10% on average then, sorry, I'm not so
positive about the srcu change and will tend to vote for your initial
commit that simply moved the meta free() out of init_lock and left the
locking as is (mentioning the lockdep warning there would have been
helpful, because otherwise it just looked like we were changing code w/o
any reason).

what do you think?

	-ss


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  2:22           ` Sergey Senozhatsky
@ 2015-01-29  5:28             ` Minchan Kim
  2015-01-29  6:06               ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-01-29  5:28 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Thu, Jan 29, 2015 at 11:22:41AM +0900, Sergey Senozhatsky wrote:
> [...]
> I'm still concerned about the performance numbers that I see on my x86_64.
> it's not always, but mostly, slower. I'll give it another try (disable
> lockdep, etc.), but if we lose 10% on average then, sorry, I'm not so
> positive about the srcu change and will tend to vote for your initial
> commit that simply moved the meta free() out of init_lock and left the
> locking as is (mentioning the lockdep warning there would have been
> helpful, because otherwise it just looked like we were changing code w/o
> any reason).
> 
> what do you think?

I surely agree with you. If it suffers a 10% performance regression,
it's absolutely a no-go.

However, I believe it should be no loss, because this is one of the
reasons RCU was born: it should be a real win on the read-side lock
path compared to other locking.
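
The intended protocol is roughly this (condensed from the patch):

	/* reader (I/O path) */
	idx = srcu_read_lock(&zram->srcu);
	if (init_done(zram))
		__zram_make_request(zram, bio);
	srcu_read_unlock(&zram->srcu, idx);

	/* updater (reset path), under init_lock */
	setup_init_done(zram, false);
	synchronize_srcu(&zram->srcu);	/* wait out in-flight readers */
	zram_meta_free(zram->meta);

so the read side pays only the srcu_read_lock/unlock cost.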

Please test it with dd or something block-based to remove the FS noise.
I will also test it to confirm that on a real machine.

Thanks for the review!



> 
> 	-ss

-- 
Kind regards,
Minchan Kim


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  5:28             ` Minchan Kim
@ 2015-01-29  6:06               ` Sergey Senozhatsky
  2015-01-29  6:35                 ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-29  6:06 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (01/29/15 14:28), Minchan Kim wrote:
> > [...]
> 
> I surely agree with you. If it suffers a 10% performance regression,
> it's absolutely a no-go.
> 
> However, I believe it should be no loss, because this is one of the
> reasons RCU was born: it should be a real win on the read-side lock
> path compared to other locking.
> 
> Please test it with dd or something block-based to remove the FS noise.
> I will also test it to confirm that on a real machine.
> 

do you test with a single dd thread/process?  just dd if=foo of=bar -c... or
do you start N `dd &' processes?

for a single writer there should be no difference, no doubt. I'm more
interested in multi-writer/multi-reader/mixed use cases.

the options that I use are: iozone -t 3 -R -r 16K -s 60M -I +Z
and -I is:
	-I  Use VxFS VX_DIRECT, O_DIRECT,or O_DIRECTIO for all file operations

with O_DIRECT I don't think there is a lot of noise, but I'll try to use
different benchmarks a bit later.


	-ss


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  6:06               ` Sergey Senozhatsky
@ 2015-01-29  6:35                 ` Minchan Kim
  2015-01-29  7:08                   ` Sergey Senozhatsky
  2015-01-30  0:20                   ` Sergey Senozhatsky
  0 siblings, 2 replies; 43+ messages in thread
From: Minchan Kim @ 2015-01-29  6:35 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Thu, Jan 29, 2015 at 03:06:04PM +0900, Sergey Senozhatsky wrote:
> [...]
> 
> do you test with a single dd thread/process?  just dd if=foo of=bar -c... or
> do you start N `dd &' processes?

I tested it with multiple dd processes.

> 
> for a single writer there should be no difference, no doubt. I'm more
> interested in multi-writer/multi-reader/mixed use cases.
> 
> the options that I use are: iozone -t 3 -R -r 16K -s 60M -I +Z
> and -I is:
> 	-I  Use VxFS VX_DIRECT, O_DIRECT,or O_DIRECTIO for all file operations
> 
> with O_DIRECT I don't think there is a lot of noise, but I'll try to use
> different benchmarks a bit later.
> 

As you said, the data was not stable.
Anyway, when I read the down_read implementation, it's one atomic
instruction. Hmm, it seems to do better than srcu_read_lock, which does
more things. But I guessed most of the overhead comes from
[de]compression, memcpy and clear_page; that's why I guessed we wouldn't
see a measurable difference from that.
What's the data pattern if you use iozone? I guess it's a really simple
pattern the compressor can handle fast. I used /dev/sda as the source
for the dd write, so more realistic data. Anyway, if we have a 10%
regression even when the data is simple, I never want to merge it.
I will test it carefully, and if it turns out to be a big regression,
surely I will not go with this and will send the original patch again.

Thanks.

> 
> 	-ss

-- 
Kind regards,
Minchan Kim


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  6:35                 ` Minchan Kim
@ 2015-01-29  7:08                   ` Sergey Senozhatsky
  2015-01-30 14:41                     ` Minchan Kim
  2015-01-30  0:20                   ` Sergey Senozhatsky
  1 sibling, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-29  7:08 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (01/29/15 15:35), Minchan Kim wrote:
>
> As you said, the data was not stable.
>
yes. fread test was always slower, and the rest was mostly slower.


> Anyway, when I read the down_read implementation, it's one atomic
> instruction. Hmm, it seems to do better than srcu_read_lock, which does
> more things.
>
srcu looks heavier, agreed.

> But I guessed most of the overhead comes from [de]compression, memcpy
> and clear_page; that's why I guessed we wouldn't see a measurable
> difference from that.
> What's the data pattern if you use iozone?

by "data pattern" you mean the usage scenario? well, I usually use zram
for `make -jX', where X=[4..N], so an N concurrent read-write ops scenario.

	-ss


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-28  8:15 ` [PATCH v1 2/2] zram: remove init_lock in zram_make_request Minchan Kim
  2015-01-28 14:56   ` Sergey Senozhatsky
@ 2015-01-29 13:48   ` Ganesh Mahendran
  2015-01-29 15:12     ` Sergey Senozhatsky
  1 sibling, 1 reply; 43+ messages in thread
From: Ganesh Mahendran @ 2015-01-29 13:48 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, Linux-MM, Nitin Gupta,
	Jerome Marchand, Sergey Senozhatsky

Hello, Minchan:

2015-01-28 16:15 GMT+08:00 Minchan Kim <minchan@kernel.org>:
> An admin could reset zram while I/O is in flight, so we have used
> zram->init_lock as a read-side lock in the I/O path to prevent the
> zram meta data from suddenly being freed.

When I/O operation is running, that means the /dev/zram0 is
mounted or swaped on. Then the device could not be reset by
below code:

    /* Do not reset an active device! */
    if (bdev->bd_holders) {
        ret = -EBUSY;
        goto out;
    }

So the zram->init_lock in the I/O path is there to check whether the
device has been initialized (echo xxx > /sys/block/zram0/disksize).

Is that right?

>
> However, the init_lock is really troublesome.
> We can't call zram_meta_alloc under init_lock due to a lockdep splat

I do not know much about lockdep.

I enabled CONFIG_LOCKDEP, but in my test there is no warning
printed. Could you please tell me how this happens?

Thanks.

> [...]
> @@ -726,11 +753,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>                 return;
>         }
>
> -       zcomp_destroy(zram->comp);
>         zram->max_comp_streams = 1;
>
> -       zram_meta_free(zram->meta);
> -       zram->meta = NULL;
>         /* Reset stats */
>         memset(&zram->stats, 0, sizeof(zram->stats));
>
> @@ -738,8 +762,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>         if (reset_capacity)
>                 set_capacity(zram->disk, 0);

Should this be after synchronize_srcu()?

>
> +       setup_init_done(zram, false);
> +       call_srcu(&zram->srcu, &zram->rcu, rcu_zram_do_nothing);
> +       synchronize_srcu(&zram->srcu);
> +       zram_meta_free(zram->meta);
> +       zcomp_destroy(zram->comp);
>         up_write(&zram->init_lock);
> -
>         /*
>          * Revalidate disk out of the init_lock to avoid lockdep splat.
>          * It's okay because disk's capacity is protected by init_lock
> [...]


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29 13:48   ` Ganesh Mahendran
@ 2015-01-29 15:12     ` Sergey Senozhatsky
  2015-01-30  7:52       ` Ganesh Mahendran
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-29 15:12 UTC (permalink / raw)
  To: Ganesh Mahendran
  Cc: Minchan Kim, Andrew Morton, linux-kernel, Linux-MM, Nitin Gupta,
	Jerome Marchand, Sergey Senozhatsky, sergey.senozhatsky.work

On (01/29/15 21:48), Ganesh Mahendran wrote:
> > An admin could reset zram while I/O is in flight, so we have used
> > zram->init_lock as a read-side lock in the I/O path to prevent the
> > zram meta data from suddenly being freed.
> 
> When an I/O operation is running, that means /dev/zram0 is
> mounted or swapped on. Then the device cannot be reset by
> the code below:
> 
>     /* Do not reset an active device! */
>     if (bdev->bd_holders) {
>         ret = -EBUSY;
>         goto out;
>     }
> 
> So the zram->init_lock in the I/O path is there to check whether the
> device has been initialized (echo xxx > /sys/block/zram0/disksize).
> 

for a mounted device (w/ fs), we see the initial ->bd_holders
increment (well, it goes up and down many times while we create the
device, but that is not interesting here) in:
  vfs_kern_mount -> mount_bdev -> blkdev_get_by_path -> blkdev_get

and it goes to zero in:
  cleanup_mnt -> deactivate_super -> kill_block_super -> blkdev_put


after umount we still have an initialized device. So, *theoretically*,
we can see something like

	CPU0				CPU1
umount
reset_store			
bdev->bd_holders == 0			mount
...					zram_make_request()
zram_reset_device()

w/o zram->init_lock in both zram_reset_device() and zram_make_request(),
one of the CPUs will be a bit sad.

	-ss


* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  6:35                 ` Minchan Kim
  2015-01-29  7:08                   ` Sergey Senozhatsky
@ 2015-01-30  0:20                   ` Sergey Senozhatsky
  1 sibling, 0 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-30  0:20 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran


On (01/29/15 15:35), Minchan Kim wrote:
> 
> I tested it with multiple dd processes.
> 

Hello,

fio test (http://manpages.ubuntu.com/manpages/natty/man1/fio.1.html) configs are attached,
so we can run fio on different h/w (works against zram with fs mounted at /mnt/)

	for i in ./test-fio-*; do fio ./$i; rm bg*; done

in short

randread
BASE   READ: io=1600.0MB, aggrb=3747.8MB/s, minb=959250KB/s, maxb=982.82MB/s, mint=407msec, maxt=427msec
SRCU   READ: io=1600.0MB, aggrb=3782.6MB/s, minb=968321KB/s, maxb=977.11MB/s, mint=409msec, maxt=423msec

random read/write
BASE   READ: io=820304KB, aggrb=1296.3MB/s, minb=331838KB/s, maxb=333456KB/s, mint=615msec, maxt=618msec
SRCU   READ: io=820304KB, aggrb=1261.6MB/s, minb=322954KB/s, maxb=335639KB/s, mint=611msec, maxt=635msec
BASE  WRITE: io=818096KB, aggrb=1292.8MB/s, minb=330944KB/s, maxb=332559KB/s, mint=615msec, maxt=618msec
SRCU  WRITE: io=818096KB, aggrb=1258.2MB/s, minb=322085KB/s, maxb=334736KB/s, mint=611msec, maxt=635msec

random write
BASE  WRITE: io=1600.0MB, aggrb=692184KB/s, minb=173046KB/s, maxb=174669KB/s, mint=2345msec, maxt=2367msec
SRCU  WRITE: io=1600.0MB, aggrb=672577KB/s, minb=168144KB/s, maxb=174149KB/s, mint=2352msec, maxt=2436msec


detailed per-process metrics:

************* BASE *************

$ for i in ./test-fio-*; do fio ./$i; rm bg*; done
bgreader: (g=0): rw=randread, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=32
...
fio-2.2.5
Starting 4 processes
bgreader: Laying out IO file(s) (1 file(s) / 400MB)
bgreader: Laying out IO file(s) (1 file(s) / 400MB)
bgreader: Laying out IO file(s) (1 file(s) / 400MB)
bgreader: Laying out IO file(s) (1 file(s) / 400MB)

bgreader: (groupid=0, jobs=1): err= 0: pid=10792: Thu Jan 29 19:43:30 2015
  read : io=409600KB, bw=959251KB/s, iops=239812, runt=   427msec
    slat (usec): min=2, max=28, avg= 2.43, stdev= 0.62
    clat (usec): min=1, max=151, avg=122.97, stdev= 4.99
     lat (usec): min=4, max=174, avg=125.52, stdev= 5.08
    clat percentiles (usec):
     |  1.00th=[  106],  5.00th=[  116], 10.00th=[  121], 20.00th=[  122],
     | 30.00th=[  122], 40.00th=[  123], 50.00th=[  123], 60.00th=[  124],
     | 70.00th=[  124], 80.00th=[  124], 90.00th=[  125], 95.00th=[  127],
     | 99.00th=[  143], 99.50th=[  143], 99.90th=[  145], 99.95th=[  145],
     | 99.99th=[  147]
    lat (usec) : 2=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.97%
  cpu          : usr=39.11%, sys=55.04%, ctx=82, majf=0, minf=38
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgreader: (groupid=0, jobs=1): err= 0: pid=10793: Thu Jan 29 19:43:30 2015
  read : io=409600KB, bw=982254KB/s, iops=245563, runt=   417msec
    slat (usec): min=2, max=60, avg= 2.42, stdev= 0.56
    clat (usec): min=1, max=187, avg=123.17, stdev= 4.02
     lat (usec): min=3, max=189, avg=125.71, stdev= 4.09
    clat percentiles (usec):
     |  1.00th=[  105],  5.00th=[  121], 10.00th=[  121], 20.00th=[  122],
     | 30.00th=[  122], 40.00th=[  123], 50.00th=[  123], 60.00th=[  124],
     | 70.00th=[  124], 80.00th=[  125], 90.00th=[  126], 95.00th=[  127],
     | 99.00th=[  133], 99.50th=[  135], 99.90th=[  141], 99.95th=[  175],
     | 99.99th=[  187]
    lat (usec) : 2=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.97%
  cpu          : usr=40.53%, sys=55.88%, ctx=43, majf=0, minf=39
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgreader: (groupid=0, jobs=1): err= 0: pid=10794: Thu Jan 29 19:43:30 2015
  read : io=409600KB, bw=982.82MB/s, iops=251597, runt=   407msec
    slat (usec): min=2, max=50, avg= 2.43, stdev= 0.58
    clat (usec): min=2, max=174, avg=123.07, stdev= 3.97
     lat (usec): min=5, max=197, avg=125.62, stdev= 4.02
    clat percentiles (usec):
     |  1.00th=[  106],  5.00th=[  120], 10.00th=[  121], 20.00th=[  122],
     | 30.00th=[  122], 40.00th=[  123], 50.00th=[  123], 60.00th=[  123],
     | 70.00th=[  124], 80.00th=[  125], 90.00th=[  126], 95.00th=[  129],
     | 99.00th=[  135], 99.50th=[  137], 99.90th=[  145], 99.95th=[  153],
     | 99.99th=[  169]
    lat (usec) : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.98%
  cpu          : usr=41.03%, sys=57.74%, ctx=42, majf=0, minf=37
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgreader: (groupid=0, jobs=1): err= 0: pid=10795: Thu Jan 29 19:43:30 2015
  read : io=409600KB, bw=980.41MB/s, iops=250980, runt=   408msec
    slat (usec): min=2, max=357, avg= 2.45, stdev= 1.32
    clat (usec): min=1, max=498, avg=124.23, stdev=12.09
     lat (usec): min=4, max=501, avg=126.80, stdev=12.33
    clat percentiles (usec):
     |  1.00th=[  108],  5.00th=[  121], 10.00th=[  122], 20.00th=[  122],
     | 30.00th=[  123], 40.00th=[  123], 50.00th=[  124], 60.00th=[  124],
     | 70.00th=[  124], 80.00th=[  125], 90.00th=[  126], 95.00th=[  127],
     | 99.00th=[  139], 99.50th=[  141], 99.90th=[  274], 99.95th=[  350],
     | 99.99th=[  498]
    lat (usec) : 2=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.61%, 500=0.37%
  cpu          : usr=41.18%, sys=58.09%, ctx=86, majf=0, minf=38
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
   READ: io=1600.0MB, aggrb=3747.8MB/s, minb=959250KB/s, maxb=982.82MB/s, mint=407msec, maxt=427msec

Disk stats (read/write):
  zram0: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%
bgupdater: (g=0): rw=randrw, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=32
...
fio-2.2.5
Starting 4 processes
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)

bgupdater: (groupid=0, jobs=1): err= 0: pid=10803: Thu Jan 29 19:43:33 2015
  read : io=205076KB, bw=332916KB/s, iops=83228, runt=   616msec
    slat (usec): min=2, max=38, avg= 2.53, stdev= 0.71
    clat (usec): min=18, max=912, avg=187.23, stdev=18.71
     lat (usec): min=22, max=915, avg=189.88, stdev=18.74
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  181], 40.00th=[  183], 50.00th=[  187], 60.00th=[  191],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  203], 95.00th=[  209],
     | 99.00th=[  223], 99.50th=[  233], 99.90th=[  258], 99.95th=[  290],
     | 99.99th=[  908]
    bw (KB  /s): min=332504, max=332504, per=25.05%, avg=332504.00, stdev= 0.00
  write: io=204524KB, bw=332019KB/s, iops=83004, runt=   616msec
    slat (usec): min=4, max=731, avg= 6.21, stdev= 3.42
    clat (usec): min=1, max=910, avg=187.36, stdev=17.80
     lat (usec): min=8, max=918, avg=193.71, stdev=18.16
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  181], 40.00th=[  185], 50.00th=[  187], 60.00th=[  191],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  203], 95.00th=[  209],
     | 99.00th=[  223], 99.50th=[  231], 99.90th=[  258], 99.95th=[  282],
     | 99.99th=[  908]
    bw (KB  /s): min=332544, max=332544, per=25.12%, avg=332544.00, stdev= 0.00
    lat (usec) : 2=0.01%, 20=0.01%, 50=0.01%, 100=0.01%, 250=99.80%
    lat (usec) : 500=0.15%, 1000=0.03%
  cpu          : usr=30.36%, sys=68.99%, ctx=185, majf=0, minf=7
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgupdater: (groupid=0, jobs=1): err= 0: pid=10804: Thu Jan 29 19:43:33 2015
  read : io=205076KB, bw=331838KB/s, iops=82959, runt=   618msec
    slat (usec): min=2, max=99, avg= 2.54, stdev= 0.88
    clat (usec): min=13, max=4676, avg=187.64, stdev=78.01
     lat (usec): min=16, max=4678, avg=190.29, stdev=78.01
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  181], 40.00th=[  183], 50.00th=[  187], 60.00th=[  189],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  201], 95.00th=[  207],
     | 99.00th=[  221], 99.50th=[  239], 99.90th=[  342], 99.95th=[  462],
     | 99.99th=[ 4704]
    bw (KB  /s): min=330248, max=330248, per=24.88%, avg=330248.00, stdev= 0.00
  write: io=204524KB, bw=330945KB/s, iops=82736, runt=   618msec
    slat (usec): min=4, max=4458, avg= 6.24, stdev=19.77
    clat (usec): min=2, max=4673, avg=187.90, stdev=80.44
     lat (usec): min=6, max=4680, avg=194.28, stdev=82.87
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  181], 40.00th=[  183], 50.00th=[  187], 60.00th=[  189],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  201], 95.00th=[  207],
     | 99.00th=[  219], 99.50th=[  233], 99.90th=[  342], 99.95th=[  462],
     | 99.99th=[ 4640]
    bw (KB  /s): min=330328, max=330328, per=24.95%, avg=330328.00, stdev= 0.00
    lat (usec) : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.67%, 500=0.28%
    lat (msec) : 10=0.03%
  cpu          : usr=30.10%, sys=68.45%, ctx=67, majf=0, minf=10
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgupdater: (groupid=0, jobs=1): err= 0: pid=10805: Thu Jan 29 19:43:33 2015
  read : io=205076KB, bw=332916KB/s, iops=83228, runt=   616msec
    slat (usec): min=2, max=4810, avg= 2.63, stdev=21.50
    clat (usec): min=14, max=5018, avg=187.75, stdev=99.05
     lat (usec): min=16, max=5020, avg=190.49, stdev=101.38
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  175],
     | 30.00th=[  179], 40.00th=[  183], 50.00th=[  185], 60.00th=[  189],
     | 70.00th=[  191], 80.00th=[  195], 90.00th=[  201], 95.00th=[  207],
     | 99.00th=[  219], 99.50th=[  229], 99.90th=[  262], 99.95th=[  956],
     | 99.99th=[ 5024]
    bw (KB  /s): min=331056, max=331056, per=24.94%, avg=331056.00, stdev= 0.00
  write: io=204524KB, bw=332019KB/s, iops=83004, runt=   616msec
    slat (usec): min=4, max=49, avg= 6.11, stdev= 1.01
    clat (usec): min=2, max=5018, avg=186.97, stdev=70.14
     lat (usec): min=6, max=5024, avg=193.22, stdev=70.15
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  179], 40.00th=[  183], 50.00th=[  187], 60.00th=[  189],
     | 70.00th=[  193], 80.00th=[  195], 90.00th=[  201], 95.00th=[  207],
     | 99.00th=[  217], 99.50th=[  227], 99.90th=[  255], 99.95th=[  948],
     | 99.99th=[ 5024]
    bw (KB  /s): min=331320, max=331320, per=25.03%, avg=331320.00, stdev= 0.00
    lat (usec) : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.83%, 500=0.09%, 1000=0.03%
    lat (msec) : 10=0.03%
  cpu          : usr=30.52%, sys=68.34%, ctx=70, majf=0, minf=8
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgupdater: (groupid=0, jobs=1): err= 0: pid=10806: Thu Jan 29 19:43:33 2015
  read : io=205076KB, bw=333457KB/s, iops=83364, runt=   615msec
    slat (usec): min=2, max=40, avg= 2.53, stdev= 0.69
    clat (usec): min=18, max=287, avg=186.73, stdev=12.94
     lat (usec): min=21, max=289, avg=189.38, stdev=12.98
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  181], 40.00th=[  183], 50.00th=[  187], 60.00th=[  189],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  203], 95.00th=[  207],
     | 99.00th=[  221], 99.50th=[  229], 99.90th=[  258], 99.95th=[  270],
     | 99.99th=[  286]
    bw (KB  /s): min=332680, max=332680, per=25.06%, avg=332680.00, stdev= 0.00
  write: io=204524KB, bw=332559KB/s, iops=83139, runt=   615msec
    slat (usec): min=4, max=28, avg= 6.17, stdev= 1.02
    clat (usec): min=2, max=285, avg=186.95, stdev=12.80
     lat (usec): min=8, max=294, avg=193.26, stdev=12.89
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  181], 40.00th=[  185], 50.00th=[  187], 60.00th=[  191],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  203], 95.00th=[  209],
     | 99.00th=[  221], 99.50th=[  227], 99.90th=[  251], 99.95th=[  262],
     | 99.99th=[  274]
    bw (KB  /s): min=332672, max=332672, per=25.13%, avg=332672.00, stdev= 0.00
    lat (usec) : 4=0.01%, 20=0.01%, 50=0.01%, 100=0.01%, 250=99.85%
    lat (usec) : 500=0.14%
  cpu          : usr=30.73%, sys=69.11%, ctx=63, majf=0, minf=7
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
   READ: io=820304KB, aggrb=1296.3MB/s, minb=331838KB/s, maxb=333456KB/s, mint=615msec, maxt=618msec
  WRITE: io=818096KB, aggrb=1292.8MB/s, minb=330944KB/s, maxb=332559KB/s, mint=615msec, maxt=618msec

Disk stats (read/write):
  zram0: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%
bgwriter: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=32
...
fio-2.2.5
Starting 4 processes
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
Jobs: 4 (f=4)
bgwriter: (groupid=0, jobs=1): err= 0: pid=10814: Thu Jan 29 19:43:35 2015
  write: io=409600KB, bw=174150KB/s, iops=43537, runt=  2352msec
    slat (usec): min=11, max=3866, avg=20.72, stdev=14.58
    clat (usec): min=2, max=5074, avg=712.72, stdev=92.15
     lat (usec): min=20, max=5094, avg=733.63, stdev=93.77
    clat percentiles (usec):
     |  1.00th=[  612],  5.00th=[  676], 10.00th=[  684], 20.00th=[  700],
     | 30.00th=[  700], 40.00th=[  708], 50.00th=[  708], 60.00th=[  716],
     | 70.00th=[  716], 80.00th=[  724], 90.00th=[  732], 95.00th=[  740],
     | 99.00th=[  820], 99.50th=[ 1004], 99.90th=[ 1640], 99.95th=[ 2128],
     | 99.99th=[ 5088]
    bw (KB  /s): min=170904, max=174808, per=24.97%, avg=172816.00, stdev=1847.92
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.24%
    lat (usec) : 750=96.28%, 1000=2.94%
    lat (msec) : 2=0.46%, 4=0.03%, 10=0.03%
  cpu          : usr=10.67%, sys=88.52%, ctx=267, majf=0, minf=8
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgwriter: (groupid=0, jobs=1): err= 0: pid=10815: Thu Jan 29 19:43:35 2015
  write: io=409600KB, bw=173046KB/s, iops=43261, runt=  2367msec
    slat (usec): min=12, max=1459, avg=20.87, stdev=10.16
    clat (usec): min=2, max=2804, avg=717.25, stdev=69.33
     lat (usec): min=22, max=2825, avg=738.31, stdev=70.84
    clat percentiles (usec):
     |  1.00th=[  628],  5.00th=[  676], 10.00th=[  692], 20.00th=[  700],
     | 30.00th=[  708], 40.00th=[  708], 50.00th=[  716], 60.00th=[  716],
     | 70.00th=[  724], 80.00th=[  732], 90.00th=[  740], 95.00th=[  748],
     | 99.00th=[  812], 99.50th=[ 1080], 99.90th=[ 1800], 99.95th=[ 1992],
     | 99.99th=[ 2640]
    bw (KB  /s): min=171160, max=172768, per=24.82%, avg=171786.00, stdev=761.48
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.11%
    lat (usec) : 750=95.99%, 1000=3.25%
    lat (msec) : 2=0.60%, 4=0.04%
  cpu          : usr=10.65%, sys=88.64%, ctx=502, majf=0, minf=9
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgwriter: (groupid=0, jobs=1): err= 0: pid=10816: Thu Jan 29 19:43:35 2015
  write: io=409600KB, bw=174224KB/s, iops=43555, runt=  2351msec
    slat (usec): min=12, max=280, avg=20.67, stdev= 2.58
    clat (usec): min=2, max=1116, avg=712.42, stdev=31.10
     lat (usec): min=20, max=1145, avg=733.30, stdev=31.89
    clat percentiles (usec):
     |  1.00th=[  628],  5.00th=[  676], 10.00th=[  684], 20.00th=[  700],
     | 30.00th=[  708], 40.00th=[  708], 50.00th=[  716], 60.00th=[  716],
     | 70.00th=[  724], 80.00th=[  724], 90.00th=[  740], 95.00th=[  748],
     | 99.00th=[  772], 99.50th=[  788], 99.90th=[ 1020], 99.95th=[ 1048],
     | 99.99th=[ 1080]
    bw (KB  /s): min=171632, max=174888, per=25.00%, avg=173026.00, stdev=1359.10
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.11%
    lat (usec) : 750=94.50%, 1000=5.19%
    lat (msec) : 2=0.19%
  cpu          : usr=10.85%, sys=88.77%, ctx=472, majf=0, minf=8
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgwriter: (groupid=0, jobs=1): err= 0: pid=10817: Thu Jan 29 19:43:35 2015
  write: io=409600KB, bw=174670KB/s, iops=43667, runt=  2345msec
    slat (usec): min=11, max=6296, avg=20.66, stdev=24.99
    clat (usec): min=2, max=5509, avg=708.98, stdev=90.54
     lat (usec): min=22, max=7099, avg=729.83, stdev=94.22
    clat percentiles (usec):
     |  1.00th=[  604],  5.00th=[  676], 10.00th=[  684], 20.00th=[  700],
     | 30.00th=[  700], 40.00th=[  708], 50.00th=[  708], 60.00th=[  716],
     | 70.00th=[  716], 80.00th=[  724], 90.00th=[  732], 95.00th=[  740],
     | 99.00th=[  756], 99.50th=[  772], 99.90th=[  940], 99.95th=[ 1912],
     | 99.99th=[ 5536]
    bw (KB  /s): min=170792, max=176064, per=25.05%, avg=173360.00, stdev=2157.10
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.32%
    lat (usec) : 750=97.68%, 1000=1.92%
    lat (msec) : 2=0.03%, 10=0.03%
  cpu          : usr=10.79%, sys=88.53%, ctx=252, majf=0, minf=6
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
  WRITE: io=1600.0MB, aggrb=692184KB/s, minb=173046KB/s, maxb=174669KB/s, mint=2345msec, maxt=2367msec

Disk stats (read/write):
  zram0: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%



==========================================================================================================



************* SRCU *************

$ for i in ./test-fio-*; do fio ./$i; rm bg*; done
bgreader: (g=0): rw=randread, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=32
...
fio-2.2.5
Starting 4 processes
bgreader: Laying out IO file(s) (1 file(s) / 400MB)
bgreader: Laying out IO file(s) (1 file(s) / 400MB)
bgreader: Laying out IO file(s) (1 file(s) / 400MB)
bgreader: Laying out IO file(s) (1 file(s) / 400MB)

bgreader: (groupid=0, jobs=1): err= 0: pid=11578: Thu Jan 29 19:45:01 2015
  read : io=409600KB, bw=986988KB/s, iops=246746, runt=   415msec
    slat (usec): min=2, max=708, avg= 2.48, stdev= 2.29
    clat (usec): min=2, max=839, avg=124.69, stdev=16.01
     lat (usec): min=4, max=842, avg=127.29, stdev=16.29
    clat percentiles (usec):
     |  1.00th=[  121],  5.00th=[  122], 10.00th=[  122], 20.00th=[  122],
     | 30.00th=[  123], 40.00th=[  123], 50.00th=[  123], 60.00th=[  123],
     | 70.00th=[  124], 80.00th=[  124], 90.00th=[  125], 95.00th=[  131],
     | 99.00th=[  139], 99.50th=[  223], 99.90th=[  243], 99.95th=[  282],
     | 99.99th=[  836]
    lat (usec) : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.90%, 500=0.05%, 1000=0.03%
  cpu          : usr=40.48%, sys=57.11%, ctx=46, majf=0, minf=39
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgreader: (groupid=0, jobs=1): err= 0: pid=11579: Thu Jan 29 19:45:01 2015
  read : io=409600KB, bw=968322KB/s, iops=242080, runt=   423msec
    slat (usec): min=2, max=88, avg= 2.45, stdev= 0.67
    clat (usec): min=1, max=213, avg=123.41, stdev= 4.23
     lat (usec): min=3, max=216, avg=125.97, stdev= 4.30
    clat percentiles (usec):
     |  1.00th=[  108],  5.00th=[  121], 10.00th=[  122], 20.00th=[  122],
     | 30.00th=[  123], 40.00th=[  123], 50.00th=[  123], 60.00th=[  124],
     | 70.00th=[  124], 80.00th=[  124], 90.00th=[  125], 95.00th=[  126],
     | 99.00th=[  139], 99.50th=[  139], 99.90th=[  145], 99.95th=[  195],
     | 99.99th=[  213]
    lat (usec) : 2=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.97%
  cpu          : usr=39.24%, sys=55.79%, ctx=85, majf=0, minf=39
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgreader: (groupid=0, jobs=1): err= 0: pid=11580: Thu Jan 29 19:45:01 2015
  read : io=409600KB, bw=984615KB/s, iops=246153, runt=   416msec
    slat (usec): min=2, max=51, avg= 2.42, stdev= 0.63
    clat (usec): min=2, max=171, avg=123.01, stdev= 3.46
     lat (usec): min=4, max=174, avg=125.55, stdev= 3.50
    clat percentiles (usec):
     |  1.00th=[  120],  5.00th=[  121], 10.00th=[  121], 20.00th=[  122],
     | 30.00th=[  122], 40.00th=[  122], 50.00th=[  122], 60.00th=[  123],
     | 70.00th=[  123], 80.00th=[  123], 90.00th=[  124], 95.00th=[  126],
     | 99.00th=[  141], 99.50th=[  143], 99.90th=[  145], 99.95th=[  155],
     | 99.99th=[  171]
    lat (usec) : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.97%
  cpu          : usr=40.38%, sys=56.25%, ctx=82, majf=0, minf=37
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgreader: (groupid=0, jobs=1): err= 0: pid=11581: Thu Jan 29 19:45:01 2015
  read : io=409600KB, bw=977.11MB/s, iops=250366, runt=   409msec
    slat (usec): min=2, max=236, avg= 2.46, stdev= 0.98
    clat (usec): min=1, max=370, avg=124.55, stdev=11.16
     lat (usec): min=4, max=372, avg=127.12, stdev=11.37
    clat percentiles (usec):
     |  1.00th=[  107],  5.00th=[  121], 10.00th=[  122], 20.00th=[  122],
     | 30.00th=[  123], 40.00th=[  123], 50.00th=[  124], 60.00th=[  124],
     | 70.00th=[  124], 80.00th=[  125], 90.00th=[  125], 95.00th=[  126],
     | 99.00th=[  175], 99.50th=[  223], 99.90th=[  225], 99.95th=[  278],
     | 99.99th=[  370]
    lat (usec) : 2=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.92%, 500=0.06%
  cpu          : usr=41.56%, sys=57.95%, ctx=48, majf=0, minf=38
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=102400/w=0/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
   READ: io=1600.0MB, aggrb=3782.6MB/s, minb=968321KB/s, maxb=977.11MB/s, mint=409msec, maxt=423msec

Disk stats (read/write):
  zram0: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%
bgupdater: (g=0): rw=randrw, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=32
...
fio-2.2.5
Starting 4 processes
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)
bgupdater: Laying out IO file(s) (1 file(s) / 400MB)

bgupdater: (groupid=0, jobs=1): err= 0: pid=11592: Thu Jan 29 19:45:04 2015
  read : io=205076KB, bw=334545KB/s, iops=83636, runt=   613msec
    slat (usec): min=2, max=73, avg= 2.56, stdev= 0.73
    clat (usec): min=19, max=310, avg=186.14, stdev=13.49
     lat (usec): min=22, max=312, avg=188.81, stdev=13.55
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  179], 40.00th=[  183], 50.00th=[  187], 60.00th=[  189],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  201], 95.00th=[  207],
     | 99.00th=[  229], 99.50th=[  251], 99.90th=[  266], 99.95th=[  274],
     | 99.99th=[  298]
    bw (KB  /s): min=333496, max=333496, per=25.82%, avg=333496.00, stdev= 0.00
  write: io=204524KB, bw=333644KB/s, iops=83411, runt=   613msec
    slat (usec): min=4, max=30, avg= 6.10, stdev= 1.01
    clat (usec): min=2, max=307, avg=186.32, stdev=13.35
     lat (usec): min=8, max=313, avg=192.56, stdev=13.47
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  177],
     | 30.00th=[  181], 40.00th=[  183], 50.00th=[  187], 60.00th=[  189],
     | 70.00th=[  193], 80.00th=[  197], 90.00th=[  201], 95.00th=[  207],
     | 99.00th=[  227], 99.50th=[  249], 99.90th=[  266], 99.95th=[  274],
     | 99.99th=[  298]
    bw (KB  /s): min=333368, max=333368, per=25.88%, avg=333368.00, stdev= 0.00
    lat (usec) : 4=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=99.47%, 500=0.51%
  cpu          : usr=30.51%, sys=69.00%, ctx=68, majf=0, minf=7
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgupdater: (groupid=0, jobs=1): err= 0: pid=11593: Thu Jan 29 19:45:04 2015
  read : io=205076KB, bw=334545KB/s, iops=83636, runt=   613msec
    slat (usec): min=2, max=40, avg= 2.53, stdev= 0.65
    clat (usec): min=53, max=3694, avg=186.31, stdev=65.12
     lat (usec): min=55, max=3697, avg=188.96, stdev=65.13
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  175],
     | 30.00th=[  179], 40.00th=[  181], 50.00th=[  185], 60.00th=[  187],
     | 70.00th=[  191], 80.00th=[  195], 90.00th=[  201], 95.00th=[  207],
     | 99.00th=[  231], 99.50th=[  249], 99.90th=[  266], 99.95th=[  342],
     | 99.99th=[ 3696]
    bw (KB  /s): min=335528, max=335528, per=25.97%, avg=335528.00, stdev= 0.00
  write: io=204524KB, bw=333644KB/s, iops=83411, runt=   613msec
    slat (usec): min=4, max=160, avg= 6.08, stdev= 1.22
    clat (usec): min=2, max=3693, avg=186.31, stdev=59.38
     lat (usec): min=44, max=3699, avg=192.54, stdev=59.41
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  175],
     | 30.00th=[  179], 40.00th=[  183], 50.00th=[  185], 60.00th=[  189],
     | 70.00th=[  191], 80.00th=[  195], 90.00th=[  201], 95.00th=[  205],
     | 99.00th=[  225], 99.50th=[  247], 99.90th=[  262], 99.95th=[  346],
     | 99.99th=[ 3696]
    bw (KB  /s): min=335376, max=335376, per=26.03%, avg=335376.00, stdev= 0.00
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=99.55%, 500=0.41%
    lat (msec) : 4=0.03%
  cpu          : usr=30.34%, sys=68.68%, ctx=64, majf=0, minf=9
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgupdater: (groupid=0, jobs=1): err= 0: pid=11594: Thu Jan 29 19:45:04 2015
  read : io=205076KB, bw=322954KB/s, iops=80738, runt=   635msec
    slat (usec): min=2, max=9087, avg= 2.75, stdev=41.31
    clat (usec): min=13, max=9287, avg=193.88, stdev=237.09
     lat (usec): min=15, max=9293, avg=196.75, stdev=240.69
    clat percentiles (usec):
     |  1.00th=[  143],  5.00th=[  163], 10.00th=[  169], 20.00th=[  175],
     | 30.00th=[  179], 40.00th=[  181], 50.00th=[  185], 60.00th=[  189],
     | 70.00th=[  191], 80.00th=[  195], 90.00th=[  201], 95.00th=[  209],
     | 99.00th=[  255], 99.50th=[  350], 99.90th=[ 3536], 99.95th=[ 8256],
     | 99.99th=[ 9280]
    bw (KB  /s): min=324248, max=324248, per=25.10%, avg=324248.00, stdev= 0.00
  write: io=204524KB, bw=322085KB/s, iops=80521, runt=   635msec
    slat (usec): min=3, max=3360, avg= 6.20, stdev=15.91
    clat (usec): min=1, max=9286, avg=192.34, stdev=210.71
     lat (usec): min=6, max=9293, avg=198.68, stdev=211.36
    clat percentiles (usec):
     |  1.00th=[  145],  5.00th=[  163], 10.00th=[  169], 20.00th=[  175],
     | 30.00th=[  179], 40.00th=[  183], 50.00th=[  185], 60.00th=[  189],
     | 70.00th=[  191], 80.00th=[  195], 90.00th=[  203], 95.00th=[  209],
     | 99.00th=[  253], 99.50th=[  346], 99.90th=[ 1032], 99.95th=[ 8256],
     | 99.99th=[ 9280]
    bw (KB  /s): min=324656, max=324656, per=25.20%, avg=324656.00, stdev= 0.00
    lat (usec) : 2=0.01%, 10=0.01%, 20=0.01%, 50=0.01%, 100=0.01%
    lat (usec) : 250=98.84%, 500=0.93%, 750=0.05%
    lat (msec) : 2=0.03%, 4=0.06%, 10=0.06%
  cpu          : usr=29.29%, sys=66.14%, ctx=91, majf=0, minf=7
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgupdater: (groupid=0, jobs=1): err= 0: pid=11595: Thu Jan 29 19:45:04 2015
  read : io=205076KB, bw=335640KB/s, iops=83909, runt=   611msec
    slat (usec): min=2, max=58, avg= 2.56, stdev= 0.82
    clat (usec): min=17, max=470, avg=185.51, stdev=15.09
     lat (usec): min=20, max=472, avg=188.18, stdev=15.16
    clat percentiles (usec):
     |  1.00th=[  157],  5.00th=[  167], 10.00th=[  171], 20.00th=[  175],
     | 30.00th=[  179], 40.00th=[  181], 50.00th=[  185], 60.00th=[  187],
     | 70.00th=[  191], 80.00th=[  195], 90.00th=[  201], 95.00th=[  209],
     | 99.00th=[  233], 99.50th=[  251], 99.90th=[  270], 99.95th=[  330],
     | 99.99th=[  462]
    bw (KB  /s): min=334544, max=334544, per=25.90%, avg=334544.00, stdev= 0.00
  write: io=204524KB, bw=334736KB/s, iops=83684, runt=   611msec
    slat (usec): min=4, max=90, avg= 6.07, stdev= 1.19
    clat (usec): min=2, max=468, avg=185.70, stdev=14.68
     lat (usec): min=8, max=474, avg=191.91, stdev=14.84
    clat percentiles (usec):
     |  1.00th=[  159],  5.00th=[  167], 10.00th=[  171], 20.00th=[  175],
     | 30.00th=[  179], 40.00th=[  183], 50.00th=[  185], 60.00th=[  189],
     | 70.00th=[  191], 80.00th=[  195], 90.00th=[  203], 95.00th=[  209],
     | 99.00th=[  231], 99.50th=[  249], 99.90th=[  266], 99.95th=[  274],
     | 99.99th=[  462]
    bw (KB  /s): min=334440, max=334440, per=25.96%, avg=334440.00, stdev= 0.00
    lat (usec) : 4=0.01%, 20=0.01%, 50=0.01%, 100=0.01%, 250=99.48%
    lat (usec) : 500=0.51%
  cpu          : usr=30.44%, sys=68.58%, ctx=188, majf=0, minf=7
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=51269/w=51131/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
   READ: io=820304KB, aggrb=1261.6MB/s, minb=322954KB/s, maxb=335639KB/s, mint=611msec, maxt=635msec
  WRITE: io=818096KB, aggrb=1258.2MB/s, minb=322085KB/s, maxb=334736KB/s, mint=611msec, maxt=635msec

Disk stats (read/write):
  zram0: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%
bgwriter: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=32
...
fio-2.2.5
Starting 4 processes
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
bgwriter: Laying out IO file(s) (1 file(s) / 400MB)
Jobs: 4 (f=4)
bgwriter: (groupid=0, jobs=1): err= 0: pid=11605: Thu Jan 29 19:45:06 2015
  write: io=409600KB, bw=168144KB/s, iops=42036, runt=  2436msec
    slat (usec): min=10, max=5615, avg=21.50, stdev=37.53
    clat (usec): min=2, max=9808, avg=738.51, stdev=364.25
     lat (usec): min=40, max=9827, avg=760.22, stdev=371.10
    clat percentiles (usec):
     |  1.00th=[  612],  5.00th=[  668], 10.00th=[  684], 20.00th=[  692],
     | 30.00th=[  700], 40.00th=[  700], 50.00th=[  708], 60.00th=[  708],
     | 70.00th=[  716], 80.00th=[  724], 90.00th=[  732], 95.00th=[  756],
     | 99.00th=[ 1576], 99.50th=[ 3056], 99.90th=[ 6816], 99.95th=[ 9408],
     | 99.99th=[ 9792]
    bw (KB  /s): min=157408, max=176360, per=24.65%, avg=165770.00, stdev=8401.50
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.20%
    lat (usec) : 750=94.24%, 1000=4.14%
    lat (msec) : 2=0.63%, 4=0.45%, 10=0.33%
  cpu          : usr=10.26%, sys=85.22%, ctx=982, majf=0, minf=7
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgwriter: (groupid=0, jobs=1): err= 0: pid=11606: Thu Jan 29 19:45:06 2015
  write: io=409600KB, bw=173928KB/s, iops=43481, runt=  2355msec
    slat (usec): min=12, max=1173, avg=20.76, stdev= 7.22
    clat (usec): min=2, max=2247, avg=713.64, stdev=49.05
     lat (usec): min=22, max=2270, avg=734.58, stdev=50.26
    clat percentiles (usec):
     |  1.00th=[  620],  5.00th=[  668], 10.00th=[  684], 20.00th=[  692],
     | 30.00th=[  700], 40.00th=[  708], 50.00th=[  708], 60.00th=[  716],
     | 70.00th=[  716], 80.00th=[  724], 90.00th=[  740], 95.00th=[  756],
     | 99.00th=[  868], 99.50th=[  924], 99.90th=[ 1368], 99.95th=[ 1608],
     | 99.99th=[ 2192]
    bw (KB  /s): min=170448, max=175504, per=25.63%, avg=172406.00, stdev=2166.56
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.13%
    lat (usec) : 750=92.77%, 1000=6.92%
    lat (msec) : 2=0.17%, 4=0.01%
  cpu          : usr=10.66%, sys=88.66%, ctx=528, majf=0, minf=8
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgwriter: (groupid=0, jobs=1): err= 0: pid=11607: Thu Jan 29 19:45:06 2015
  write: io=409600KB, bw=172900KB/s, iops=43224, runt=  2369msec
    slat (usec): min=9, max=3458, avg=20.88, stdev=19.13
    clat (usec): min=2, max=4190, avg=718.03, stdev=114.69
     lat (usec): min=22, max=4210, avg=739.09, stdev=116.47
    clat percentiles (usec):
     |  1.00th=[  620],  5.00th=[  668], 10.00th=[  684], 20.00th=[  692],
     | 30.00th=[  700], 40.00th=[  700], 50.00th=[  708], 60.00th=[  708],
     | 70.00th=[  716], 80.00th=[  724], 90.00th=[  732], 95.00th=[  756],
     | 99.00th=[ 1064], 99.50th=[ 1080], 99.90th=[ 2024], 99.95th=[ 3952],
     | 99.99th=[ 4192]
    bw (KB  /s): min=168600, max=173752, per=25.49%, avg=171458.00, stdev=2129.26
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.14%
    lat (usec) : 750=94.29%, 1000=2.69%
    lat (msec) : 2=2.74%, 4=0.09%, 10=0.03%
  cpu          : usr=10.55%, sys=87.38%, ctx=373, majf=0, minf=8
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32
bgwriter: (groupid=0, jobs=1): err= 0: pid=11608: Thu Jan 29 19:45:06 2015
  write: io=409600KB, bw=174150KB/s, iops=43537, runt=  2352msec
    slat (usec): min=9, max=5199, avg=20.79, stdev=26.32
    clat (usec): min=2, max=5915, avg=712.94, stdev=149.06
     lat (usec): min=20, max=5936, avg=733.91, stdev=151.50
    clat percentiles (usec):
     |  1.00th=[  628],  5.00th=[  668], 10.00th=[  684], 20.00th=[  692],
     | 30.00th=[  700], 40.00th=[  708], 50.00th=[  708], 60.00th=[  716],
     | 70.00th=[  716], 80.00th=[  724], 90.00th=[  732], 95.00th=[  740],
     | 99.00th=[  780], 99.50th=[  836], 99.90th=[ 2512], 99.95th=[ 5536],
     | 99.99th=[ 5920]
    bw (KB  /s): min=170872, max=174024, per=25.68%, avg=172714.00, stdev=1386.45
    lat (usec) : 4=0.01%, 50=0.01%, 100=0.01%, 250=0.01%, 500=0.10%
    lat (usec) : 750=97.10%, 1000=2.55%
    lat (msec) : 2=0.08%, 4=0.07%, 10=0.09%
  cpu          : usr=10.50%, sys=88.56%, ctx=260, majf=0, minf=6
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued    : total=r=0/w=102400/d=0, short=r=0/w=0/d=0, drop=r=0/w=0/d=0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
  WRITE: io=1600.0MB, aggrb=672577KB/s, minb=168144KB/s, maxb=174149KB/s, mint=2352msec, maxt=2436msec

Disk stats (read/write):
  zram0: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%



	-ss

[-- Attachment #2: test-fio-randread --]
[-- Type: text/plain, Size: 126 bytes --]

[global]
size=400m
directory=/mnt/
ioengine=libaio
iodepth=32
direct=1
invalidate=1
numjobs=4
loops=4

[bgreader]
rw=randread

[-- Attachment #3: test-fio-randrw --]
[-- Type: text/plain, Size: 125 bytes --]

[global]
size=400m
directory=/mnt/
ioengine=libaio
iodepth=32
direct=1
invalidate=1
numjobs=4
loops=5

[bgupdater]
rw=randrw

[-- Attachment #4: test-fio-randwrite --]
[-- Type: text/plain, Size: 127 bytes --]

[global]
size=400m
directory=/mnt/
ioengine=libaio
iodepth=32
direct=1
invalidate=1
numjobs=4
loops=5

[bgwriter]
rw=randwrite

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29 15:12     ` Sergey Senozhatsky
@ 2015-01-30  7:52       ` Ganesh Mahendran
  2015-01-30  8:08         ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Ganesh Mahendran @ 2015-01-30  7:52 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Minchan Kim, Andrew Morton, linux-kernel, Linux-MM, Nitin Gupta,
	Jerome Marchand, sergey.senozhatsky.work

Hello Sergey

2015-01-29 23:12 GMT+08:00 Sergey Senozhatsky <sergey.senozhatsky@gmail.com>:
> On (01/29/15 21:48), Ganesh Mahendran wrote:
>> > Admin could reset zram during I/O operation going on so we have
>> > used zram->init_lock as read-side lock in I/O path to prevent
>> > sudden zram meta freeing.
>>
>> When an I/O operation is running, that means /dev/zram0 is
>> mounted or swapped on. Then the device cannot be reset by
>> the code below:
>>
>>     /* Do not reset an active device! */
>>     if (bdev->bd_holders) {
>>         ret = -EBUSY;
>>         goto out;
>>     }
>>
>> So the zram->init_lock in the I/O path is to check whether the device
>> has been initialized (echo xxx > /sys/block/zram0/disksize).
>>

Thanks for your explanation.

>
> for mounted device (w/fs), we see initial (well, it goes up and down

What does "w/" mean?

> many times while we create device, but this is not interesting here)
> ->bd_holders increment in:
>   vfs_kern_mount -> mount_bdev -> blkdev_get_by_path -> blkdev_get
>
> and it goes to zero in:
>   cleanup_mnt -> deactivate_super -> kill_block_super -> blkdev_put
>
>
> after umount we still have init device. so, *theoretically*, we
> can see something like
>
>         CPU0                            CPU1
> umount
> reset_store
> bdev->bd_holders == 0                   mount
> ...                                     zram_make_request()
> zram_reset_device()

In this example, the data stored in zram will be corrupted,
since CPU0 will free the meta while CPU1 is still using it.
Right?


>
> w/o zram->init_lock in both zram_reset_device() and zram_make_request()
> one of CPUs will be a bit sad.
what does "w/o" mean?

Thanks

>
>         -ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-30  7:52       ` Ganesh Mahendran
@ 2015-01-30  8:08         ` Sergey Senozhatsky
  2015-01-31  8:50           ` Ganesh Mahendran
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-30  8:08 UTC (permalink / raw)
  To: Ganesh Mahendran
  Cc: Sergey Senozhatsky, Minchan Kim, Andrew Morton, linux-kernel,
	Linux-MM, Nitin Gupta, Jerome Marchand, sergey.senozhatsky.work

On (01/30/15 15:52), Ganesh Mahendran wrote:
> >> When an I/O operation is running, that means /dev/zram0 is
> >> mounted or swapped on. Then the device cannot be reset by
> >> the code below:
> >>
> >>     /* Do not reset an active device! */
> >>     if (bdev->bd_holders) {
> >>         ret = -EBUSY;
> >>         goto out;
> >>     }
> >>
> >> So the zram->init_lock in the I/O path is to check whether the device
> >> has been initialized (echo xxx > /sys/block/zram0/disksize).
> >>
> 
> Thanks for your explanation.
> 
> >
> > for mounted device (w/fs), we see initial (well, it goes up and down
> 
> What does "w/" mean?

'with fs'

> > many times while we create device, but this is not interesting here)
> > ->bd_holders increment in:
> >   vfs_kern_mount -> mount_bdev -> blkdev_get_by_path -> blkdev_get
> >
> > and it goes to zero in:
> >   cleanup_mnt -> deactivate_super -> kill_block_super -> blkdev_put
> >
> >
> > after umount we still have init device. so, *theoretically*, we
> > can see something like
> >
> >         CPU0                            CPU1
> > umount
> > reset_store
> > bdev->bd_holders == 0                   mount
> > ...                                     zram_make_request()
> > zram_reset_device()
> 
> In this example, the data stored in zram will be corrupted.
> Since CPU0 will free meta while CPU1 is using.
> right?
> 

without ->init_lock protection, in this case we have a 'free' vs. 'use' race.

> 
> >
> > w/o zram->init_lock in both zram_reset_device() and zram_make_request()
> > one of CPUs will be a bit sad.
> what does "w/o" mean?

'without'


	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-29  7:08                   ` Sergey Senozhatsky
@ 2015-01-30 14:41                     ` Minchan Kim
  2015-01-31 11:31                       ` Sergey Senozhatsky
  2015-02-01 14:50                       ` Sergey Senozhatsky
  0 siblings, 2 replies; 43+ messages in thread
From: Minchan Kim @ 2015-01-30 14:41 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

Hello, Sergey

On Thu, Jan 29, 2015 at 04:08:35PM +0900, Sergey Senozhatsky wrote:
> On (01/29/15 15:35), Minchan Kim wrote:
> >
> > As you said, the data was not stable.
> >
> yes. fread test was always slower, and the rest was mostly slower.
> 
> 
> > Anyway, when I read the down_read implementation, it's one atomic instruction.
> > Hmm, it seems to be better than srcu_read_lock, which does more things.
> >
> srcu looks heavier, agree.

ffffffff8172c350 <down_read>:
ffffffff8172c350:       e8 7b 3f 00 00          callq  ffffffff817302d0 <__fentry__>
ffffffff8172c355:       55                      push   %rbp
ffffffff8172c356:       48 89 e5                mov    %rsp,%rbp
ffffffff8172c359:       53                      push   %rbx
ffffffff8172c35a:       48 89 fb                mov    %rdi,%rbx
ffffffff8172c35d:       48 83 ec 08             sub    $0x8,%rsp
ffffffff8172c361:       e8 9a e0 ff ff          callq  ffffffff8172a400 <_cond_resched>
ffffffff8172c366:       48 89 d8                mov    %rbx,%rax
ffffffff8172c369:       f0 48 ff 00             lock incq (%rax)
ffffffff8172c36d:       79 05                   jns    ffffffff8172c374 <down_read+0x24>
ffffffff8172c36f:       e8 5c e7 c4 ff          callq  ffffffff8137aad0 <call_rwsem_down_read_failed>
ffffffff8172c374:       48 83 c4 08             add    $0x8,%rsp
ffffffff8172c378:       5b                      pop    %rbx
ffffffff8172c379:       5d                      pop    %rbp
ffffffff8172c37a:       c3                      retq   


ffffffff810eeec0 <__srcu_read_lock>:
ffffffff810eeec0:       e8 0b 14 64 00          callq  ffffffff817302d0 <__fentry__>
ffffffff810eeec5:       48 8b 07                mov    (%rdi),%rax
ffffffff810eeec8:       55                      push   %rbp
ffffffff810eeec9:       48 89 e5                mov    %rsp,%rbp
ffffffff810eeecc:       83 e0 01                and    $0x1,%eax
ffffffff810eeecf:       48 63 d0                movslq %eax,%rdx
ffffffff810eeed2:       48 8b 4f 08             mov    0x8(%rdi),%rcx
ffffffff810eeed6:       65 48 ff 04 d1          incq   %gs:(%rcx,%rdx,8)
ffffffff810eeedb:       0f ae f0                mfence 
ffffffff810eeede:       48 83 c2 02             add    $0x2,%rdx
ffffffff810eeee2:       48 8b 4f 08             mov    0x8(%rdi),%rcx
ffffffff810eeee6:       65 48 ff 04 d1          incq   %gs:(%rcx,%rdx,8)
ffffffff810eeeeb:       5d                      pop    %rbp
ffffffff810eeeec:       c3                      retq   

Yes, __srcu_read_lock is a little bit heavier, but the instruction counts
don't differ enough to account for a 10% gap. Another candidate culprit is
__cond_resched, but I doubt that one too: our test was CPU intensive, so
I don't think scheduling latency affects total bandwidth.

A more likely culprit is your data pattern.
It seems you didn't use scramble_buffers=0 and zero_buffers in fio, so
fio fills the buffers with a random data pattern, and zram bandwidth then
varies with the compression/decompression ratio.
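
For reference, here is a sketch of the two extra lines in the [global]
section of the job files attached earlier in the thread (option names as
documented by fio; everything else unchanged):

  [global]
  ; keep the I/O buffers deterministic so every run compresses the same way
  scramble_buffers=0
  zero_buffers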

I tested your fio script with the above options added, on my real 4-CPU
machine. (NOTE: the fio shipped with Ubuntu is old and doesn't handle the
two options above well, so I had to update to a recent fio, which solved
that perfectly.)

Another thing about fio: the loops option seems to work for the write
test with overwrite=1, but not for the read test, so I used perf stat's
-r option to verify the stdev across runs.
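
That is, each result below comes from an invocation along these lines
(a sketch; 10 repetitions, matching the "(10 runs)" headers in the output):

  $ perf stat -r 10 fio test-fio-randread.txt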

In addition, I skipped the first run to remove the noise of laying out
the files, and increased the test size from 400m to 1G.

1) randread

= vanilla =

 Performance counter stats for 'fio test-fio-randread.txt' (10 runs):

       4713.879241      task-clock (msec)         #    3.160 CPUs utilized            ( +-  0.62% )
             1,131      context-switches          #    0.240 K/sec                    ( +-  2.83% )
                23      cpu-migrations            #    0.005 K/sec                    ( +-  4.40% )
            15,767      page-faults               #    0.003 M/sec                    ( +-  0.03% )
    15,134,497,088      cycles                    #    3.211 GHz                      ( +-  0.15% ) [83.36%]
    10,763,665,604      stalled-cycles-frontend   #   71.12% frontend cycles idle     ( +-  0.22% ) [83.34%]
     6,896,294,076      stalled-cycles-backend    #   45.57% backend  cycles idle     ( +-  0.29% ) [66.67%]
     9,898,608,791      instructions              #    0.65  insns per cycle        
                                                  #    1.09  stalled cycles per insn  ( +-  0.07% ) [83.33%]
     1,852,167,485      branches                  #  392.918 M/sec                    ( +-  0.07% ) [83.34%]
        14,864,143      branch-misses             #    0.80% of all branches          ( +-  0.16% ) [83.34%]

       1.491813361 seconds time elapsed                                          ( +-  0.62% )

= srcu =

 Performance counter stats for 'fio test-fio-randread.txt' (10 runs):

       4752.790715      task-clock (msec)         #    3.166 CPUs utilized            ( +-  0.48% )
             1,179      context-switches          #    0.248 K/sec                    ( +-  1.56% )
                26      cpu-migrations            #    0.005 K/sec                    ( +-  3.91% )
            15,764      page-faults               #    0.003 M/sec                    ( +-  0.02% )
    15,263,869,915      cycles                    #    3.212 GHz                      ( +-  0.25% ) [83.32%]
    10,935,658,177      stalled-cycles-frontend   #   71.64% frontend cycles idle     ( +-  0.38% ) [83.33%]
     7,067,290,320      stalled-cycles-backend    #   46.30% backend  cycles idle     ( +-  0.46% ) [66.64%]
     9,896,513,423      instructions              #    0.65  insns per cycle        
                                                  #    1.11  stalled cycles per insn  ( +-  0.07% ) [83.33%]
     1,847,612,285      branches                  #  388.743 M/sec                    ( +-  0.07% ) [83.38%]
        14,814,815      branch-misses             #    0.80% of all branches          ( +-  0.24% ) [83.37%]

       1.501284082 seconds time elapsed                                          ( +-  0.50% )

SRCU is worse by 0.63%, but the difference is really marginal.

2) randwrite

= vanilla =

 Performance counter stats for 'fio test-fio-randwrite.txt' (10 runs):

       6283.823490      task-clock (msec)         #    3.332 CPUs utilized            ( +-  0.44% )
             1,536      context-switches          #    0.245 K/sec                    ( +-  2.10% )
                25      cpu-migrations            #    0.004 K/sec                    ( +-  3.79% )
            15,914      page-faults               #    0.003 M/sec                    ( +-  0.02% )
    20,408,942,915      cycles                    #    3.248 GHz                      ( +-  0.40% ) [83.34%]
    14,398,424,739      stalled-cycles-frontend   #   70.55% frontend cycles idle     ( +-  0.62% ) [83.36%]
     9,513,822,555      stalled-cycles-backend    #   46.62% backend  cycles idle     ( +-  0.62% ) [66.65%]
    13,507,376,783      instructions              #    0.66  insns per cycle        
                                                  #    1.07  stalled cycles per insn  ( +-  0.05% ) [83.36%]
     3,155,423,934      branches                  #  502.150 M/sec                    ( +-  0.05% ) [83.34%]
        18,381,090      branch-misses             #    0.58% of all branches          ( +-  0.16% ) [83.34%]

       1.885926070 seconds time elapsed                                          ( +-  0.61% )

= srcu =

 Performance counter stats for 'fio test-fio-randwrite.txt' (10 runs):

       6152.997119      task-clock (msec)         #    3.304 CPUs utilized            ( +-  0.29% )
             1,422      context-switches          #    0.231 K/sec                    ( +-  3.45% )
                28      cpu-migrations            #    0.004 K/sec                    ( +-  7.47% )
            15,921      page-faults               #    0.003 M/sec                    ( +-  0.02% )
    19,862,315,430      cycles                    #    3.228 GHz                      ( +-  0.09% ) [83.33%]
    13,872,541,761      stalled-cycles-frontend   #   69.84% frontend cycles idle     ( +-  0.12% ) [83.34%]
     9,074,883,552      stalled-cycles-backend    #   45.69% backend  cycles idle     ( +-  0.19% ) [66.71%]
    13,494,854,651      instructions              #    0.68  insns per cycle        
                                                  #    1.03  stalled cycles per insn  ( +-  0.03% ) [83.37%]
     3,148,938,955      branches                  #  511.773 M/sec                    ( +-  0.04% ) [83.33%]
        17,701,249      branch-misses             #    0.56% of all branches          ( +-  0.23% ) [83.34%]

       1.862543230 seconds time elapsed                                          ( +-  0.35% )

SRCU is better by 1.24%.

3) randrw

= vanilla =

 Performance counter stats for 'fio test-fio-randrw.txt' (10 runs):

       5609.976477      task-clock (msec)         #    3.249 CPUs utilized            ( +-  0.34% )
             1,407      context-switches          #    0.251 K/sec                    ( +-  0.96% )
                25      cpu-migrations            #    0.004 K/sec                    ( +-  5.37% )
            15,906      page-faults               #    0.003 M/sec                    ( +-  0.05% )
    18,090,560,346      cycles                    #    3.225 GHz                      ( +-  0.35% ) [83.36%]
    12,885,393,954      stalled-cycles-frontend   #   71.23% frontend cycles idle     ( +-  0.53% ) [83.33%]
     8,570,185,547      stalled-cycles-backend    #   47.37% backend  cycles idle     ( +-  0.59% ) [66.67%]
    11,771,620,352      instructions              #    0.65  insns per cycle        
                                                  #    1.09  stalled cycles per insn  ( +-  0.05% ) [83.35%]
     2,508,014,871      branches                  #  447.063 M/sec                    ( +-  0.05% ) [83.34%]
        18,585,638      branch-misses             #    0.74% of all branches          ( +-  0.23% ) [83.35%]

       1.726691239 seconds time elapsed                                          ( +-  0.40% )

= srcu =

       5475.312828      task-clock (msec)         #    3.246 CPUs utilized            ( +-  0.59% )
             1,399      context-switches          #    0.255 K/sec                    ( +-  1.46% )
                24      cpu-migrations            #    0.004 K/sec                    ( +-  6.27% )
            15,916      page-faults               #    0.003 M/sec                    ( +-  0.04% )
    17,583,197,041      cycles                    #    3.211 GHz                      ( +-  0.11% ) [83.33%]
    12,352,657,985      stalled-cycles-frontend   #   70.25% frontend cycles idle     ( +-  0.16% ) [83.33%]
     8,173,164,212      stalled-cycles-backend    #   46.48% backend  cycles idle     ( +-  0.19% ) [66.70%]
    11,780,176,340      instructions              #    0.67  insns per cycle        
                                                  #    1.05  stalled cycles per insn  ( +-  0.05% ) [83.36%]
     2,506,722,383      branches                  #  457.823 M/sec                    ( +-  0.06% ) [83.35%]
        18,436,877      branch-misses             #    0.74% of all branches          ( +-  0.18% ) [83.32%]

       1.686877512 seconds time elapsed                                          ( +-  0.43% )

SRCU is better by 2.3%.

SRCU comes out ahead of down_read, but I don't trust either number: when I
ran perf record, [up|down]_read and srcu_read_[lock|unlock] were really
minor (about 0.5% of samples), so I think the difference is really marginal.
(For example, if we removed srcu_read_[un]lock entirely, we would gain only
about 1%.) So I don't think it's worth worrying about.

Okay, if you still have concerns about the data, how about this?
It would use even fewer instructions than [up|down]_read, so I guess
it should remove your performance concern. But I don't believe
it makes a significant difference, either.
Hope it addresses your concern.

Thanks.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-30  8:08         ` Sergey Senozhatsky
@ 2015-01-31  8:50           ` Ganesh Mahendran
  2015-01-31 11:07             ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Ganesh Mahendran @ 2015-01-31  8:50 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Minchan Kim, Andrew Morton, linux-kernel,
	Linux-MM, Nitin Gupta, Jerome Marchand

2015-01-30 16:08 GMT+08:00 Sergey Senozhatsky
<sergey.senozhatsky.work@gmail.com>:
> On (01/30/15 15:52), Ganesh Mahendran wrote:
>> >> When an I/O operation is running, that means /dev/zram0 is
>> >> mounted or swapped on. Then the device cannot be reset by
>> >> the code below:
>> >>
>> >>     /* Do not reset an active device! */
>> >>     if (bdev->bd_holders) {
>> >>         ret = -EBUSY;
>> >>         goto out;
>> >>     }
>> >>
>> >> So the zram->init_lock in the I/O path is to check whether the device
>> >> has been initialized (echo xxx > /sys/block/zram0/disksize).
>> >>
>>
>> Thanks for your explanation.
>>
>> >
>> > for mounted device (w/fs), we see initial (well, it goes up and down
>>
>> What does "w/" mean?
>
> 'with fs'
>
>> > many times while we create device, but this is not interesting here)
>> > ->bd_holders increment in:
>> >   vfs_kern_mount -> mount_bdev -> blkdev_get_by_path -> blkdev_get
>> >
>> > and it goes to zero in:
>> >   cleanup_mnt -> deactivate_super -> kill_block_super -> blkdev_put
>> >
>> >
>> > after umount we still have init device. so, *theoretically*, we
>> > can see something like
>> >
>> >         CPU0                            CPU1
>> > umount
>> > reset_store
>> > bdev->bd_holders == 0                   mount
>> > ...                                     zram_make_request()
>> > zram_reset_device()
>>
>> In this example, the data stored in zram will be corrupted.
>> Since CPU0 will free meta while CPU1 is using.
>> right?
>>
>
> without ->init_lock protection, in this case we have a 'free' vs. 'use' race.

Maybe I did not explain clearly. I sent a patch about this issue:

https://patchwork.kernel.org/patch/5754041/

Thanks

>
>>
>> >
>> > w/o zram->init_lock in both zram_reset_device() and zram_make_request()
>> > one of CPUs will be a bit sad.
>> what does "w/o" mean?
>
> 'without'
>
>
>         -ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-31  8:50           ` Ganesh Mahendran
@ 2015-01-31 11:07             ` Sergey Senozhatsky
  2015-01-31 12:59               ` Ganesh Mahendran
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-31 11:07 UTC (permalink / raw)
  To: Ganesh Mahendran
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Minchan Kim,
	Andrew Morton, linux-kernel, Linux-MM, Nitin Gupta,
	Jerome Marchand

On (01/31/15 16:50), Ganesh Mahendran wrote:
> >> > after umount we still have init device. so, *theoretically*, we
> >> > can see something like
> >> >
> >> >         CPU0                            CPU1
> >> > umount
> >> > reset_store
> >> > bdev->bd_holders == 0                   mount
> >> > ...                                     zram_make_request()
> >> > zram_reset_device()
[..]


> 
> Maybe I did not explain clearly. I sent a patch about this issue:
> 
> https://patchwork.kernel.org/patch/5754041/


excuse me? explain to me clearly what? my finding and my analysis?


this is the second time in a week that you hijack someone's work
and you don't even bother to give any credit to people.


Minchan moved zram_meta_free(meta) out of init_lock here
https://lkml.org/lkml/2015/1/21/29

I proposed to also move zs_free() of meta->handles here
https://lkml.org/lkml/2015/1/21/384


... so what happened then -- you jumped in and sent a patch.
https://lkml.org/lkml/2015/1/24/50


Minchan sent you a hint https://lkml.org/lkml/2015/1/26/471

>   but it seems the patch is based on my recent work "zram: free meta out of init_lock".



 "the patch is based on my work"!



now, for the last few days we have been discussing init_lock, and I first
expressed my concerns and spoke about the 'free' vs. 'use' problem
here (but still didn't have enough spare time to submit a patch; besides,
we are in the middle of the reset/init/write rework)

https://lkml.org/lkml/2015/1/27/1029

>
>bdev->bd_holders protects from resetting a device which has a read/write
>operation ongoing on the other CPU.
>
>I need to refresh on how ->bd_holders is actually incremented/decremented.
>can the following race condition take place?
>
>        CPU0                                    CPU1
>reset_store()
>bdev->bd_holders == false
>                                        zram_make_request
>                                                -rm- down_read(&zram->init_lock);
>                                        init_done(zram) == true
>zram_reset_device()                     valid_io_request()
>                                        __zram_make_request
>down_write(&zram->init_lock);           zram_bvec_rw
>[..]
>set_capacity(zram->disk, 0);
>zram->init_done = false;
>kick_all_cpus_sync();                   zram_bvec_write or zram_bvec_read()
>zram_meta_free(zram->meta);
>zcomp_destroy(zram->comp);              zcomp_compress() or zcomp_decompress()
>


and later here https://lkml.org/lkml/2015/1/29/645

>
>after umount we still have init device. so, *theoretically*, we
>can see something like
>
>
>        CPU0                            CPU1
>umount
>reset_store
>bdev->bd_holders == 0                   mount
>...                                     zram_make_request()
>zram_reset_device()
>



so what happened next? your patch happened next,
with a quite familiar problem description

>
>      CPU0                    CPU1
> t1:  bdput
> t2:                          mount /dev/zram0 /mnt
> t3:  zram_reset_device
>

and now you say that I don't understand something in "your analysis"?



stop doing this. this is not how it works.


	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-30 14:41                     ` Minchan Kim
@ 2015-01-31 11:31                       ` Sergey Senozhatsky
  2015-02-01 14:50                       ` Sergey Senozhatsky
  1 sibling, 0 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-01-31 11:31 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

Hello Minchan,
excellent analysis!

On (01/30/15 23:41), Minchan Kim wrote:
> Yes, __srcu_read_lock is a little bit heavier, but the instruction counts
> don't differ enough to account for a 10% gap. Another candidate culprit is
> __cond_resched, but I doubt that one too: our test was CPU intensive, so
> I don't think scheduling latency affects total bandwidth.
> 
> A more likely culprit is your data pattern.
> It seems you didn't use scramble_buffers=0 and zero_buffers in fio, so
> fio fills the buffers with a random data pattern, and zram bandwidth then
> varies with the compression/decompression ratio.

Completely agree.
Shame on me. I've gotten so used to iozone (iozone uses the same data
pattern, 0xA5; that's what the +Z option is for) that I didn't even think
about the data pattern in fio. Sorry.

> 1) randread
> SRCU is worse by 0.63%, but the difference is really marginal.
> 
> 2) randwrite
> SRCU is better by 1.24%.
> 
> 3) randrw
> SRCU is better by 2.3%.

hm, interesting. I'll re-check.

> Okay, if you still have concerns about the data, how about this?

I'm not so upset about losing 0.6234187%; my concern was about iozone's
10% difference (which looks a bit worse).


I'll review your patch. Thanks for your effort.


> > 
> > by "data pattern" you mean usage scenario? well, I usually use zram for
> > `make -jX', where X=[4..N]. so N concurrent read-write ops scenario.
> 
> What I meant is what data fills the I/O buffer, which is really important
> when evaluating zram, because the compression/decompression speeds rely on it.
> 

I see. I never test it with `make' anyway, only iozone +Z.

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-31 11:07             ` Sergey Senozhatsky
@ 2015-01-31 12:59               ` Ganesh Mahendran
  0 siblings, 0 replies; 43+ messages in thread
From: Ganesh Mahendran @ 2015-01-31 12:59 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Minchan Kim, Andrew Morton, linux-kernel,
	Linux-MM, Nitin Gupta, Jerome Marchand

Hello, Sergey

2015-01-31 19:07 GMT+08:00 Sergey Senozhatsky <sergey.senozhatsky@gmail.com>:
> On (01/31/15 16:50), Ganesh Mahendran wrote:
>> >> > after umount we still have init device. so, *theoretically*, we
>> >> > can see something like
>> >> >
>> >> >         CPU0                            CPU1
>> >> > umount
>> >> > reset_store
>> >> > bdev->bd_holders == 0                   mount
>> >> > ...                                     zram_make_request()
>> >> > zram_reset_device()
> [..]
>
>
>>
>> Maybe I did not explain clearly. I sent a patch about this issue:
>>
>> https://patchwork.kernel.org/patch/5754041/
>
>
> excuse me? explain to me clearly what? my finding and my analysis?

Sorry, I missed this mail
https://lkml.org/lkml/2015/1/27/1029

That's why I asked questions in this
https://lkml.org/lkml/2015/1/29/580
after Minchan's description.

>
>
> this is the second time in a week that you hijack someone's work
> and you don't even bother to give any credit to people.
>
>
> Minchan moved zram_meta_free(meta) out of init_lock here
> https://lkml.org/lkml/2015/1/21/29
>
> I proposed to also move zs_free() of meta->handles here
> https://lkml.org/lkml/2015/1/21/384

I thought you wanted to move the code block after
       up_write(&zram->init_lock);

And I found the code block could even be encapsulated in
zram_meta_free().

That's why I sent:
https://lkml.org/lkml/2015/1/24/50

>
>
> ... so what happened then -- you jumped in and sent a patch.
> https://lkml.org/lkml/2015/1/24/50
>
>
> Minchan sent you a hint https://lkml.org/lkml/2015/1/26/471
>
>>   but it seems the patch is based on my recent work "zram: free meta out of init_lock".
>
>
>
>  "the patch is based on my work"!
>
>
>
> now, for the last few days we have been discussing init_lock, and I first
> expressed my concerns and spoke about the 'free' vs. 'use' problem
> here (but still didn't have enough spare time to submit, besides we are in
> the middle of the reset/init/write rework)
>
> https://lkml.org/lkml/2015/1/27/1029
>
>>
>>bdev->bd_holders protects from resetting device which has read/write
>>operation ongoing on the onther CPU.
>>
>>I need to refresh on how ->bd_holders actually incremented/decremented.
>>can the following race condition take a place?
>>
>>        CPU0                                    CPU1
>>reset_store()
>>bdev->bd_holders == false
>>                                        zram_make_request
>>                                                -rm- down_read(&zram->init_lock);
>>                                        init_done(zram) == true
>>zram_reset_device()                     valid_io_request()
>>                                        __zram_make_request
>>down_write(&zram->init_lock);           zram_bvec_rw
>>[..]
>>set_capacity(zram->disk, 0);
>>zram->init_done = false;
>>kick_all_cpus_sync();                   zram_bvec_write or zram_bvec_read()
>>zram_meta_free(zram->meta);
>>zcomp_destroy(zram->comp);              zcomp_compress() or zcomp_decompress()

Sorry, I did not check this mail.

>>
>
>
> and later here https://lkml.org/lkml/2015/1/29/645
>
>>
>>after umount we still have init device. so, *theoretically*, we
>>can see something like
>>
>>
>>        CPU0                            CPU1
>>umount
>>reset_store
>>bdev->bd_holders == 0                   mount
>>...                                     zram_make_request()
>>zram_reset_device()
>>
>
>
>
> so what happened next? your patch happened next.
> with a quite familiar problem description
>
>>
>>      CPU0                    CPU1
>> t1:  bdput
>> t2:                          mount /dev/zram0 /mnt
>> t3:  zram_reset_device
>>
>
> and now you say that I don't understand something in "your analysis"?
>
>
>
> stop doing this. this is not how it works.
>
>
>         -ss
>


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-01-30 14:41                     ` Minchan Kim
  2015-01-31 11:31                       ` Sergey Senozhatsky
@ 2015-02-01 14:50                       ` Sergey Senozhatsky
  2015-02-01 15:04                         ` Sergey Senozhatsky
  2015-02-02  1:30                         ` Minchan Kim
  1 sibling, 2 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-01 14:50 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

Hello Minchan,

the idea looks good and this is something I was trying to do, except
that I used kref.

some review nitpicks are below. I also posted a modified version of your
patch, so that will save some time.


>  static inline int init_done(struct zram *zram)
>  {
> -	return zram->meta != NULL;
> +	return zram->disksize != 0;

we don't set ->disksize to 0 when creating the device. and I think
it's better to use refcount here, but set it to 0 during device creation.
(see the patch below)

> +static inline bool zram_meta_get(struct zram_meta *meta)
> +{
> +	if (!atomic_inc_not_zero(&meta->refcount))
> +		return false;
> +	return true;
> +}

I've changed it to check the likely case first: `if refcount, return true'

>  static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  {
> +	struct zram_meta *meta;
> +	u64 disksize;

not needed. (see the patch below).

> +
>  	down_write(&zram->init_lock);
>  
>  	zram->limit_pages = 0;
> @@ -728,14 +750,20 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  		return;
>  	}
>  
> +	meta = zram->meta;
> +
>  	zcomp_destroy(zram->comp);

we can't destroy zcomp before we see IO completion.

>  	zram->max_comp_streams = 1;

we'd better keep the original comp_streams number until we see IO completion.
we don't know how many RW ops we have, so completion can happen earlier.

> -	zram_meta_free(zram->meta, zram->disksize);
> -	zram->meta = NULL;
> +	disksize = zram->disksize;
> +	zram_meta_put(meta);
> +	/* Read/write handler will not handle further I/O operation. */
> +	zram->disksize = 0;

I keep it in its current position. (see below)

> +	wait_for_completion(&meta->complete);
> +	/* I/O operation under all of CPU are done so let's free */
> +	zram_meta_free(zram->meta, disksize);
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
>  
> -	zram->disksize = 0;
>  	if (reset_capacity)
>  		set_capacity(zram->disk, 0);
>  
> @@ -908,23 +936,25 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
>  {
>  	struct zram *zram = queue->queuedata;
>  
> -	down_read(&zram->init_lock);
> -	if (unlikely(!init_done(zram)))
> +	if (unlikely(!zram_meta_get(zram->meta)))
>  		goto error;
>  
> +	if (unlikely(!init_done(zram)))
> +		goto put_meta;
> +

here and later:
we can't take zram_meta_get() first and then check for init_done(zram),
because ->meta can be NULL, so it will be a NULL->refcount dereference.

let's keep ->completion and ->refcount in zram and rename zram_meta_[get|put]
to zram_[get|put].




please review the slightly modified version of your patch below.

/* the patch also reorganizes the order of struct zram members a bit, to keep
members that we use more often together and to avoid padding. nothing
critical here. */


next action items are:
-- we actually can now switch some _show() functions from init_lock to
zram_get()/zram_put() (see the sketch below)
-- address that theoretical and, in real life, very unlikely race condition
of umount-reset vs. mount-rw.
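
a sketch of the first item, assuming the get/put helpers from the patch
below and today's mem_used_total_show() body (illustration only, not a
posted patch):

	static ssize_t mem_used_total_show(struct device *dev,
			struct device_attribute *attr, char *buf)
	{
		u64 val = 0;
		struct zram *zram = dev_to_zram(dev);

		/* zram_get() succeeding implies disksize_store() ran,
		 * so ->meta stays valid until the matching zram_put() */
		if (zram_get(zram)) {
			val = zs_get_total_pages(zram->meta->mem_pool);
			zram_put(zram);
		}

		return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
	}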


no concerns about performance of this version -- we probably will not get
any faster than that.


thanks a lot for your effort!

---

 drivers/block/zram/zram_drv.c | 82 ++++++++++++++++++++++++++++++-------------
 drivers/block/zram/zram_drv.h | 17 ++++-----
 2 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index aa5a4c5..6916790 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -44,7 +44,7 @@ static const char *default_compressor = "lzo";
 static unsigned int num_devices = 1;
 
 #define ZRAM_ATTR_RO(name)						\
-static ssize_t name##_show(struct device *d,		\
+static ssize_t name##_show(struct device *d,				\
 				struct device_attribute *attr, char *b)	\
 {									\
 	struct zram *zram = dev_to_zram(d);				\
@@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name);
 
 static inline int init_done(struct zram *zram)
 {
-	return zram->meta != NULL;
+	return atomic_read(&zram->refcount);
 }
 
 static inline struct zram *dev_to_zram(struct device *dev)
@@ -358,6 +358,23 @@ out_error:
 	return NULL;
 }
 
+static inline bool zram_get(struct zram *zram)
+{
+	if (atomic_inc_not_zero(&zram->refcount))
+		return true;
+	return false;
+}
+
+/*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks
+ */
+static inline void zram_put(struct zram *zram)
+{
+	if (atomic_dec_and_test(&zram->refcount))
+		complete(&zram->io_done);
+}
+
 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
 {
 	if (*offset + bvec->bv_len >= PAGE_SIZE)
@@ -719,6 +736,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
 
 static void zram_reset_device(struct zram *zram, bool reset_capacity)
 {
+	struct zram_meta *meta;
+	struct zcomp *comp;
+
 	down_write(&zram->init_lock);
 
 	zram->limit_pages = 0;
@@ -728,14 +748,21 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 		return;
 	}
 
-	zcomp_destroy(zram->comp);
-	zram->max_comp_streams = 1;
-	zram_meta_free(zram->meta, zram->disksize);
-	zram->meta = NULL;
+	meta = zram->meta;
+	comp = zram->comp;
+	/* ->refcount will go down to 0 eventually */
+	zram_put(zram);
+
+	wait_for_completion(&zram->io_done);
+	/* I/O operation under all of CPU are done so let's free */
+	zram_meta_free(meta, disksize);
+	zcomp_destroy(comp);
+
 	/* Reset stats */
 	memset(&zram->stats, 0, sizeof(zram->stats));
-
 	zram->disksize = 0;
+	zram->max_comp_streams = 1;
+
 	if (reset_capacity)
 		set_capacity(zram->disk, 0);
 
@@ -783,6 +810,8 @@ static ssize_t disksize_store(struct device *dev,
 		goto out_destroy_comp;
 	}
 
+	init_completion(&zram->io_done);
+	atomic_set(&zram->refcount, 1);
 	zram->meta = meta;
 	zram->comp = comp;
 	zram->disksize = disksize;
@@ -795,7 +824,6 @@ static ssize_t disksize_store(struct device *dev,
 	 * so that revalidate_disk always sees up-to-date capacity.
 	 */
 	revalidate_disk(zram->disk);
-
 	return len;
 
 out_destroy_comp:
@@ -908,23 +936,24 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct zram *zram = queue->queuedata;
 
-	down_read(&zram->init_lock);
-	if (unlikely(!init_done(zram)))
+	if (unlikely(!zram_get(zram)))
 		goto error;
 
+	if (unlikely(!init_done(zram)))
+		goto put_zram;
+
 	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 					bio->bi_iter.bi_size)) {
 		atomic64_inc(&zram->stats.invalid_io);
-		goto error;
+		goto put_zram;
 	}
 
 	__zram_make_request(zram, bio);
-	up_read(&zram->init_lock);
-
+	zram_put(zram);
 	return;
-
+put_zram:
+	zram_put(zram);
 error:
-	up_read(&zram->init_lock);
 	bio_io_error(bio);
 }
 
@@ -946,21 +975,22 @@ static void zram_slot_free_notify(struct block_device *bdev,
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, int rw)
 {
-	int offset, err;
+	int offset, err = -EIO;
 	u32 index;
 	struct zram *zram;
 	struct bio_vec bv;
 
 	zram = bdev->bd_disk->private_data;
+	if (unlikely(!zram_get(zram)))
+		goto out;
+
+	if (unlikely(!init_done(zram)))
+		goto put_zram;
+
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
 		atomic64_inc(&zram->stats.invalid_io);
-		return -EINVAL;
-	}
-
-	down_read(&zram->init_lock);
-	if (unlikely(!init_done(zram))) {
-		err = -EIO;
-		goto out_unlock;
+		err = -EINVAL;
+		goto put_zram;
 	}
 
 	index = sector >> SECTORS_PER_PAGE_SHIFT;
@@ -971,8 +1001,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_offset = 0;
 
 	err = zram_bvec_rw(zram, &bv, index, offset, rw);
-out_unlock:
-	up_read(&zram->init_lock);
+put_zram:
+	zram_put(zram);
+out:
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
 	 * calling page_endio.
@@ -1041,6 +1072,7 @@ static int create_device(struct zram *zram, int device_id)
 	int ret = -ENOMEM;
 
 	init_rwsem(&zram->init_lock);
+	atomic_set(&zram->refcount, 0);
 
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!zram->queue) {
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index b05a816..7138c82 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -100,24 +100,25 @@ struct zram_meta {
 
 struct zram {
 	struct zram_meta *meta;
+	struct zcomp *comp;
 	struct request_queue *queue;
 	struct gendisk *disk;
-	struct zcomp *comp;
-
 	/* Prevent concurrent execution of device init, reset and R/W request */
 	struct rw_semaphore init_lock;
 	/*
-	 * This is the limit on amount of *uncompressed* worth of data
-	 * we can store in a disk.
+	 * the number of pages zram can consume for storing compressed data
 	 */
-	u64 disksize;	/* bytes */
+	unsigned long limit_pages;
+	atomic_t refcount;
 	int max_comp_streams;
+
 	struct zram_stats stats;
+	struct completion io_done; /* notify IO under all of cpu are done */
 	/*
-	 * the number of pages zram can consume for storing compressed data
+	 * This is the limit on amount of *uncompressed* worth of data
+	 * we can store in a disk.
 	 */
-	unsigned long limit_pages;
-
+	u64 disksize;	/* bytes */
 	char compressor[10];
 };
 #endif


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-01 14:50                       ` Sergey Senozhatsky
@ 2015-02-01 15:04                         ` Sergey Senozhatsky
  2015-02-02  1:43                           ` Minchan Kim
  2015-02-02  1:30                         ` Minchan Kim
  1 sibling, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-01 15:04 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

Sorry, guys! my bad. the patch I sent to you is incomplete. please review
this one.

On (02/01/15 23:50), Sergey Senozhatsky wrote:
> +	zram_meta_free(meta, disksize);
> 
that was my last-second in-mail modification and I didn't compile-test it.
I hit the send button and then realized that your patch didn't move the
->meta and ->comp freeing out of ->init_lock.

So this patch does it.

thanks.

---

 drivers/block/zram/zram_drv.c | 84 +++++++++++++++++++++++++++++--------------
 drivers/block/zram/zram_drv.h | 17 ++++-----
 2 files changed, 67 insertions(+), 34 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index aa5a4c5..c0b612d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -44,7 +44,7 @@ static const char *default_compressor = "lzo";
 static unsigned int num_devices = 1;
 
 #define ZRAM_ATTR_RO(name)						\
-static ssize_t name##_show(struct device *d,		\
+static ssize_t name##_show(struct device *d,				\
 				struct device_attribute *attr, char *b)	\
 {									\
 	struct zram *zram = dev_to_zram(d);				\
@@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name);
 
 static inline int init_done(struct zram *zram)
 {
-	return zram->meta != NULL;
+	return atomic_read(&zram->refcount);
 }
 
 static inline struct zram *dev_to_zram(struct device *dev)
@@ -358,6 +358,23 @@ out_error:
 	return NULL;
 }
 
+static inline bool zram_get(struct zram *zram)
+{
+	if (atomic_inc_not_zero(&zram->refcount))
+		return true;
+	return false;
+}
+
+/*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks
+ */
+static inline void zram_put(struct zram *zram)
+{
+	if (atomic_dec_and_test(&zram->refcount))
+		complete(&zram->io_done);
+}
+
 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
 {
 	if (*offset + bvec->bv_len >= PAGE_SIZE)
@@ -719,6 +736,10 @@ static void zram_bio_discard(struct zram *zram, u32 index,
 
 static void zram_reset_device(struct zram *zram, bool reset_capacity)
 {
+	struct zram_meta *meta;
+	struct zcomp *comp;
+	u64 disksize;
+
 	down_write(&zram->init_lock);
 
 	zram->limit_pages = 0;
@@ -728,19 +749,25 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
 		return;
 	}
 
-	zcomp_destroy(zram->comp);
-	zram->max_comp_streams = 1;
-	zram_meta_free(zram->meta, zram->disksize);
-	zram->meta = NULL;
+	meta = zram->meta;
+	comp = zram->comp;
+	disksize = zram->disksize;
+	/* ->refcount will go down to 0 eventually */
+	zram_put(zram);
+	wait_for_completion(&zram->io_done);
+
 	/* Reset stats */
 	memset(&zram->stats, 0, sizeof(zram->stats));
-
 	zram->disksize = 0;
+	zram->max_comp_streams = 1;
+
 	if (reset_capacity)
 		set_capacity(zram->disk, 0);
 
 	up_write(&zram->init_lock);
-
+	/* I/O operation under all of CPU are done so let's free */
+	zram_meta_free(meta, disksize);
+	zcomp_destroy(comp);
 	/*
 	 * Revalidate disk out of the init_lock to avoid lockdep splat.
 	 * It's okay because disk's capacity is protected by init_lock
@@ -783,6 +810,8 @@ static ssize_t disksize_store(struct device *dev,
 		goto out_destroy_comp;
 	}
 
+	init_completion(&zram->io_done);
+	atomic_set(&zram->refcount, 1);
 	zram->meta = meta;
 	zram->comp = comp;
 	zram->disksize = disksize;
@@ -795,7 +824,6 @@ static ssize_t disksize_store(struct device *dev,
 	 * so that revalidate_disk always sees up-to-date capacity.
 	 */
 	revalidate_disk(zram->disk);
-
 	return len;
 
 out_destroy_comp:
@@ -908,23 +936,24 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct zram *zram = queue->queuedata;
 
-	down_read(&zram->init_lock);
-	if (unlikely(!init_done(zram)))
+	if (unlikely(!zram_get(zram)))
 		goto error;
 
+	if (unlikely(!init_done(zram)))
+		goto put_zram;
+
 	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 					bio->bi_iter.bi_size)) {
 		atomic64_inc(&zram->stats.invalid_io);
-		goto error;
+		goto put_zram;
 	}
 
 	__zram_make_request(zram, bio);
-	up_read(&zram->init_lock);
-
+	zram_put(zram);
 	return;
-
+put_zram:
+	zram_put(zram);
 error:
-	up_read(&zram->init_lock);
 	bio_io_error(bio);
 }
 
@@ -946,21 +975,22 @@ static void zram_slot_free_notify(struct block_device *bdev,
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, int rw)
 {
-	int offset, err;
+	int offset, err = -EIO;
 	u32 index;
 	struct zram *zram;
 	struct bio_vec bv;
 
 	zram = bdev->bd_disk->private_data;
+	if (unlikely(!zram_get(zram)))
+		goto out;
+
+	if (unlikely(!init_done(zram)))
+		goto put_zram;
+
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
 		atomic64_inc(&zram->stats.invalid_io);
-		return -EINVAL;
-	}
-
-	down_read(&zram->init_lock);
-	if (unlikely(!init_done(zram))) {
-		err = -EIO;
-		goto out_unlock;
+		err = -EINVAL;
+		goto put_zram;
 	}
 
 	index = sector >> SECTORS_PER_PAGE_SHIFT;
@@ -971,8 +1001,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_offset = 0;
 
 	err = zram_bvec_rw(zram, &bv, index, offset, rw);
-out_unlock:
-	up_read(&zram->init_lock);
+put_zram:
+	zram_put(zram);
+out:
 	/*
 	 * If I/O fails, just return error(ie, non-zero) without
 	 * calling page_endio.
@@ -1041,6 +1072,7 @@ static int create_device(struct zram *zram, int device_id)
 	int ret = -ENOMEM;
 
 	init_rwsem(&zram->init_lock);
+	atomic_set(&zram->refcount, 0);
 
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!zram->queue) {
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index b05a816..7138c82 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -100,24 +100,25 @@ struct zram_meta {
 
 struct zram {
 	struct zram_meta *meta;
+	struct zcomp *comp;
 	struct request_queue *queue;
 	struct gendisk *disk;
-	struct zcomp *comp;
-
 	/* Prevent concurrent execution of device init, reset and R/W request */
 	struct rw_semaphore init_lock;
 	/*
-	 * This is the limit on amount of *uncompressed* worth of data
-	 * we can store in a disk.
+	 * the number of pages zram can consume for storing compressed data
 	 */
-	u64 disksize;	/* bytes */
+	unsigned long limit_pages;
+	atomic_t refcount;
 	int max_comp_streams;
+
 	struct zram_stats stats;
+	struct completion io_done; /* notify IO under all of cpu are done */
 	/*
-	 * the number of pages zram can consume for storing compressed data
+	 * This is the limit on amount of *uncompressed* worth of data
+	 * we can store in a disk.
 	 */
-	unsigned long limit_pages;
-
+	u64 disksize;	/* bytes */
 	char compressor[10];
 };
 #endif


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-01 14:50                       ` Sergey Senozhatsky
  2015-02-01 15:04                         ` Sergey Senozhatsky
@ 2015-02-02  1:30                         ` Minchan Kim
  2015-02-02  1:48                           ` Sergey Senozhatsky
  1 sibling, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  1:30 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

Hey Sergey,

On Sun, Feb 01, 2015 at 11:50:36PM +0900, Sergey Senozhatsky wrote:
> Hello Minchan,
> 
> the idea looks good and this is something I was trying to do, except
> that I used kref.
> 
> some review nitpicks are below. I also posted a modified version of your
> patch, so that will save some time.

Thanks a lot!

> 
> 
> >  static inline int init_done(struct zram *zram)
> >  {
> > -	return zram->meta != NULL;
> > +	return zram->disksize != 0;
> 
> we don't set ->disksize to 0 when creating the device. and I think
> it's better to use refcount here, but set it to 0 during device creation.
> (see the patch below)

There was a reason I didn't use refcount there.
I should have written it down.

We need something to prevent further I/O handling on other CPUs.
Otherwise, it's a livelock. For example, a new I/O 'A' in the rw path on
CPU 1 can see a non-zero refcount while another CPU has rw in flight.
Then another new I/O 'B' on CPU 2 can see a non-zero refcount while 'A'
is in flight, and another new I/O 'C' on CPU 3 can see a non-zero
refcount while 'B' is in flight. Finally, 'A' is done on CPU 1, but the
next I/O 'D' on CPU 1 can still see a non-zero refcount because 'C' on
CPU 3 is in flight. Infinite loop.
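
to put it next to the code, a minimal sketch with the names from my patch
(illustration only, not a new diff):

	/* rw path: */
	if (unlikely(!zram_meta_get(zram->meta)))	/* pins the meta */
		goto error;
	if (unlikely(!init_done(zram)))	/* reads ->disksize: fails as soon
					 * as reset clears it, so no new
					 * I/O can keep ->refcount pinned
					 * above zero */
		goto put_meta;

	/* reset path: */
	zram_meta_put(meta);
	zram->disksize = 0;	/* close the gate for new I/O ... */
	wait_for_completion(&meta->complete);	/* ... so the count drains */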

> 
> > +static inline bool zram_meta_get(struct zram_meta *meta)
> > +{
> > +	if (!atomic_inc_not_zero(&meta->refcount))
> > +		return false;
> > +	return true;
> > +}
> 
> I've changed it to check the likely case first: `if refcount, return true'

Thanks!

> 
> >  static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >  {
> > +	struct zram_meta *meta;
> > +	u64 disksize;
> 
> not needed. (see the patch below).
> 
> > +
> >  	down_write(&zram->init_lock);
> >  
> >  	zram->limit_pages = 0;
> > @@ -728,14 +750,20 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> >  		return;
> >  	}
> >  
> > +	meta = zram->meta;
> > +
> >  	zcomp_destroy(zram->comp);
> 
> we can't destroy zcomp before we see IO completion.

True.

> 
> >  	zram->max_comp_streams = 1;
> 
> we'd better keep the original comp_streams number until we see IO completion.
> we don't know how many RW ops we have, so completion can happen earlier.

Yeb.

> 
> > -	zram_meta_free(zram->meta, zram->disksize);
> > -	zram->meta = NULL;
> > +	disksize = zram->disksize;
> > +	zram_meta_put(meta);
> > +	/* Read/write handler will not handle further I/O operation. */
> > +	zram->disksize = 0;
> 
> I keep it in its current position. (see below)
> 
> > +	wait_for_completion(&meta->complete);
> > +	/* I/O operation under all of CPU are done so let's free */
> > +	zram_meta_free(zram->meta, disksize);
> >  	/* Reset stats */
> >  	memset(&zram->stats, 0, sizeof(zram->stats));
> >  
> > -	zram->disksize = 0;
> >  	if (reset_capacity)
> >  		set_capacity(zram->disk, 0);
> >  
> > @@ -908,23 +936,25 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
> >  {
> >  	struct zram *zram = queue->queuedata;
> >  
> > -	down_read(&zram->init_lock);
> > -	if (unlikely(!init_done(zram)))
> > +	if (unlikely(!zram_meta_get(zram->meta)))
> >  		goto error;
> >  
> > +	if (unlikely(!init_done(zram)))
> > +		goto put_meta;
> > +
> 
> here and later:
> we can't take zram_meta_get() first and then check for init_done(zram),
> because ->meta can be NULL, so it will be a NULL->refcount dereference.

True.
Actually, it was totally an RFC; I forgot to add the tag in the night, but
I can't escape from my shame with that excuse. Thanks!

> 
> let's keep ->completion and ->refcount in zram and rename zram_meta_[get|put]
> to zram_[get|put].

Good idea, but I still want to name it zram_meta_get/put because the
zram_get naming might suggest a refcount on struct zram rather than on
zram_meta. :)

> 
> 
> 
> 
> please review the slightly modified version of your patch below.
> 
> /* the patch also reorganizes the order of struct zram members a bit, to keep
> members that we use more often together and to avoid padding. nothing
> critical here. */
> 
> 
> next action items are:
> -- we actually can now switch some _show() functions from init_lock to
> zram_get()/zram_put()
> -- address that theoretical and, in real life, very unlikely race condition
> of umount-reset vs. mount-rw.
> 
> 
> no concerns about performance of this version -- we probably will not get
> any faster than that.
> 
> 
> thanks a lot for your effort!
> 
> ---
> 
>  drivers/block/zram/zram_drv.c | 82 ++++++++++++++++++++++++++++++-------------
>  drivers/block/zram/zram_drv.h | 17 ++++-----
>  2 files changed, 66 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index aa5a4c5..6916790 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -44,7 +44,7 @@ static const char *default_compressor = "lzo";
>  static unsigned int num_devices = 1;
>  
>  #define ZRAM_ATTR_RO(name)						\
> -static ssize_t name##_show(struct device *d,		\
> +static ssize_t name##_show(struct device *d,				\
>  				struct device_attribute *attr, char *b)	\
>  {									\
>  	struct zram *zram = dev_to_zram(d);				\
> @@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name);
>  
>  static inline int init_done(struct zram *zram)
>  {
> -	return zram->meta != NULL;
> +	return atomic_read(&zram->refcount);
>  }
>  
>  static inline struct zram *dev_to_zram(struct device *dev)
> @@ -358,6 +358,23 @@ out_error:
>  	return NULL;
>  }
>  
> +static inline bool zram_get(struct zram *zram)
> +{
> +	if (atomic_inc_not_zero(&zram->refcount))
> +		return true;
> +	return false;
> +}
> +
> +/*
> + * We want to free zram_meta in process context to avoid
> + * deadlock between reclaim path and any other locks
> + */
> +static inline void zram_put(struct zram *zram)
> +{
> +	if (atomic_dec_and_test(&zram->refcount))
> +		complete(&zram->io_done);
> +}
> +
>  static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
>  {
>  	if (*offset + bvec->bv_len >= PAGE_SIZE)
> @@ -719,6 +736,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
>  
>  static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  {
> +	struct zram_meta *meta;
> +	struct zcomp *comp;
> +
>  	down_write(&zram->init_lock);
>  
>  	zram->limit_pages = 0;
> @@ -728,14 +748,21 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  		return;
>  	}
>  
> -	zcomp_destroy(zram->comp);
> -	zram->max_comp_streams = 1;
> -	zram_meta_free(zram->meta, zram->disksize);
> -	zram->meta = NULL;
> +	meta = zram->meta;
> +	comp = zram->comp;
> +	/* ->refcount will go down to 0 eventually */
> +	zram_put(zram);
> +
> +	wait_for_completion(&zram->io_done);
> +	/* I/O operation under all of CPU are done so let's free */
> +	zram_meta_free(meta, disksize);
> +	zcomp_destroy(comp);
> +
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
> -
>  	zram->disksize = 0;
> +	zram->max_comp_streams = 1;
> +
>  	if (reset_capacity)
>  		set_capacity(zram->disk, 0);
>  
> @@ -783,6 +810,8 @@ static ssize_t disksize_store(struct device *dev,
>  		goto out_destroy_comp;
>  	}
>  
> +	init_completion(&zram->io_done);
> +	atomic_set(&zram->refcount, 1);
>  	zram->meta = meta;
>  	zram->comp = comp;
>  	zram->disksize = disksize;
> @@ -795,7 +824,6 @@ static ssize_t disksize_store(struct device *dev,
>  	 * so that revalidate_disk always sees up-to-date capacity.
>  	 */
>  	revalidate_disk(zram->disk);
> -
>  	return len;
>  
>  out_destroy_comp:
> @@ -908,23 +936,24 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
>  {
>  	struct zram *zram = queue->queuedata;
>  
> -	down_read(&zram->init_lock);
> -	if (unlikely(!init_done(zram)))
> +	if (unlikely(!zram_get(zram)))
>  		goto error;
>  
> +	if (unlikely(!init_done(zram)))
> +		goto put_zram;
> +
>  	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
>  					bio->bi_iter.bi_size)) {
>  		atomic64_inc(&zram->stats.invalid_io);
> -		goto error;
> +		goto put_zram;
>  	}
>  
>  	__zram_make_request(zram, bio);
> -	up_read(&zram->init_lock);
> -
> +	zram_put(zram);
>  	return;
> -
> +put_zram:
> +	zram_put(zram);
>  error:
> -	up_read(&zram->init_lock);
>  	bio_io_error(bio);
>  }
>  
> @@ -946,21 +975,22 @@ static void zram_slot_free_notify(struct block_device *bdev,
>  static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  		       struct page *page, int rw)
>  {
> -	int offset, err;
> +	int offset, err = -EIO;
>  	u32 index;
>  	struct zram *zram;
>  	struct bio_vec bv;
>  
>  	zram = bdev->bd_disk->private_data;
> +	if (unlikely(!zram_get(zram)))
> +		goto out;
> +
> +	if (unlikely(!init_done(zram)))
> +		goto put_zram;
> +
>  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
>  		atomic64_inc(&zram->stats.invalid_io);
> -		return -EINVAL;
> -	}
> -
> -	down_read(&zram->init_lock);
> -	if (unlikely(!init_done(zram))) {
> -		err = -EIO;
> -		goto out_unlock;
> +		err = -EINVAL;
> +		goto put_zram;
>  	}
>  
>  	index = sector >> SECTORS_PER_PAGE_SHIFT;
> @@ -971,8 +1001,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  	bv.bv_offset = 0;
>  
>  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> -out_unlock:
> -	up_read(&zram->init_lock);
> +put_zram:
> +	zram_put(zram);
> +out:
>  	/*
>  	 * If I/O fails, just return error(ie, non-zero) without
>  	 * calling page_endio.
> @@ -1041,6 +1072,7 @@ static int create_device(struct zram *zram, int device_id)
>  	int ret = -ENOMEM;
>  
>  	init_rwsem(&zram->init_lock);
> +	atomic_set(&zram->refcount, 0);
>  
>  	zram->queue = blk_alloc_queue(GFP_KERNEL);
>  	if (!zram->queue) {
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index b05a816..7138c82 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -100,24 +100,25 @@ struct zram_meta {
>  
>  struct zram {
>  	struct zram_meta *meta;
> +	struct zcomp *comp;
>  	struct request_queue *queue;
>  	struct gendisk *disk;
> -	struct zcomp *comp;
> -
>  	/* Prevent concurrent execution of device init, reset and R/W request */
>  	struct rw_semaphore init_lock;
>  	/*
> -	 * This is the limit on amount of *uncompressed* worth of data
> -	 * we can store in a disk.
> +	 * the number of pages zram can consume for storing compressed data
>  	 */
> -	u64 disksize;	/* bytes */
> +	unsigned long limit_pages;
> +	atomic_t refcount;
>  	int max_comp_streams;
> +
>  	struct zram_stats stats;
> +	struct completion io_done; /* notify IO under all of cpu are done */
>  	/*
> -	 * the number of pages zram can consume for storing compressed data
> +	 * This is the limit on amount of *uncompressed* worth of data
> +	 * we can store in a disk.
>  	 */
> -	unsigned long limit_pages;
> -
> +	u64 disksize;	/* bytes */
>  	char compressor[10];
>  };
>  #endif
> 

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-01 15:04                         ` Sergey Senozhatsky
@ 2015-02-02  1:43                           ` Minchan Kim
  2015-02-02  1:59                             ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  1:43 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Mon, Feb 02, 2015 at 12:04:16AM +0900, Sergey Senozhatsky wrote:
> Sorry, guys! my bad. the patch I sent to you is incomplete. please review
> this one.
> 
> On (02/01/15 23:50), Sergey Senozhatsky wrote:
> > +	zram_meta_free(meta, disksize);
> > 
> that was my last-second in-mail modification and I didn't compile-test it.
> I hit the send button and then realized that your patch didn't move the
> ->meta and ->comp freeing out of ->init_lock.
> 
> So this patch does it.
> 
> thanks.
> 
> ---
> 
>  drivers/block/zram/zram_drv.c | 84 +++++++++++++++++++++++++++++--------------
>  drivers/block/zram/zram_drv.h | 17 ++++-----
>  2 files changed, 67 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index aa5a4c5..c0b612d 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -44,7 +44,7 @@ static const char *default_compressor = "lzo";
>  static unsigned int num_devices = 1;
>  
>  #define ZRAM_ATTR_RO(name)						\
> -static ssize_t name##_show(struct device *d,		\
> +static ssize_t name##_show(struct device *d,				\
>  				struct device_attribute *attr, char *b)	\
>  {									\
>  	struct zram *zram = dev_to_zram(d);				\
> @@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name);
>  
>  static inline int init_done(struct zram *zram)
>  {
> -	return zram->meta != NULL;
> +	return atomic_read(&zram->refcount);

As I said in my previous mail, it could make a livelock, so I want to use
disksize here to prevent further I/O handling.

>  }
>  
>  static inline struct zram *dev_to_zram(struct device *dev)
> @@ -358,6 +358,23 @@ out_error:
>  	return NULL;
>  }
>  
> +static inline bool zram_get(struct zram *zram)
> +{
> +	if (atomic_inc_not_zero(&zram->refcount))
> +		return true;
> +	return false;
> +}
> +
> +/*
> + * We want to free zram_meta in process context to avoid
> + * deadlock between reclaim path and any other locks
> + */
> +static inline void zram_put(struct zram *zram)
> +{
> +	if (atomic_dec_and_test(&zram->refcount))
> +		complete(&zram->io_done);
> +}

Although I suggested this completion, it might be rather overkill (please
understand me, it was midnight work. :))
Instead, we could just use atomic_dec here and
wait_event(event, atomic_read(&zram->refcount) == 0) in reset.
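
for reference, a minimal sketch of that variant (my assumption: struct zram
grows a wait_queue_head_t io_wait, and the put side wakes the waiter so the
condition gets re-checked; this is not the posted patch):

	static inline void zram_meta_put(struct zram *zram)
	{
		atomic_dec(&zram->refcount);
		wake_up(&zram->io_wait);	/* let reset re-check the count */
	}

	/* in zram_reset_device(), after dropping the initial reference: */
	wait_event(zram->io_wait, atomic_read(&zram->refcount) == 0);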

Otherwise, looks good to me. I will cook up a new version based on this one
and test/send it if you don't have any strong objection by then.

Thanks for the review, Sergey!

> +
>  static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
>  {
>  	if (*offset + bvec->bv_len >= PAGE_SIZE)
> @@ -719,6 +736,10 @@ static void zram_bio_discard(struct zram *zram, u32 index,
>  
>  static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  {
> +	struct zram_meta *meta;
> +	struct zcomp *comp;
> +	u64 disksize;
> +
>  	down_write(&zram->init_lock);
>  
>  	zram->limit_pages = 0;
> @@ -728,19 +749,25 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  		return;
>  	}
>  
> -	zcomp_destroy(zram->comp);
> -	zram->max_comp_streams = 1;
> -	zram_meta_free(zram->meta, zram->disksize);
> -	zram->meta = NULL;
> +	meta = zram->meta;
> +	comp = zram->comp;
> +	disksize = zram->disksize;
> +	/* ->refcount will go down to 0 eventually */
> +	zram_put(zram);
> +	wait_for_completion(&zram->io_done);
> +
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
> -
>  	zram->disksize = 0;
> +	zram->max_comp_streams = 1;
> +
>  	if (reset_capacity)
>  		set_capacity(zram->disk, 0);
>  
>  	up_write(&zram->init_lock);
> -
> +	/* I/O operation under all of CPU are done so let's free */
> +	zram_meta_free(meta, disksize);
> +	zcomp_destroy(comp);
>  	/*
>  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
>  	 * It's okay because disk's capacity is protected by init_lock
> @@ -783,6 +810,8 @@ static ssize_t disksize_store(struct device *dev,
>  		goto out_destroy_comp;
>  	}
>  
> +	init_completion(&zram->io_done);
> +	atomic_set(&zram->refcount, 1);
>  	zram->meta = meta;
>  	zram->comp = comp;
>  	zram->disksize = disksize;
> @@ -795,7 +824,6 @@ static ssize_t disksize_store(struct device *dev,
>  	 * so that revalidate_disk always sees up-to-date capacity.
>  	 */
>  	revalidate_disk(zram->disk);
> -
>  	return len;
>  
>  out_destroy_comp:
> @@ -908,23 +936,24 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
>  {
>  	struct zram *zram = queue->queuedata;
>  
> -	down_read(&zram->init_lock);
> -	if (unlikely(!init_done(zram)))
> +	if (unlikely(!zram_get(zram)))
>  		goto error;
>  
> +	if (unlikely(!init_done(zram)))
> +		goto put_zram;
> +
>  	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
>  					bio->bi_iter.bi_size)) {
>  		atomic64_inc(&zram->stats.invalid_io);
> -		goto error;
> +		goto put_zram;
>  	}
>  
>  	__zram_make_request(zram, bio);
> -	up_read(&zram->init_lock);
> -
> +	zram_put(zram);
>  	return;
> -
> +put_zram:
> +	zram_put(zram);
>  error:
> -	up_read(&zram->init_lock);
>  	bio_io_error(bio);
>  }
>  
> @@ -946,21 +975,22 @@ static void zram_slot_free_notify(struct block_device *bdev,
>  static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  		       struct page *page, int rw)
>  {
> -	int offset, err;
> +	int offset, err = -EIO;
>  	u32 index;
>  	struct zram *zram;
>  	struct bio_vec bv;
>  
>  	zram = bdev->bd_disk->private_data;
> +	if (unlikely(!zram_get(zram)))
> +		goto out;
> +
> +	if (unlikely(!init_done(zram)))
> +		goto put_zram;
> +
>  	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
>  		atomic64_inc(&zram->stats.invalid_io);
> -		return -EINVAL;
> -	}
> -
> -	down_read(&zram->init_lock);
> -	if (unlikely(!init_done(zram))) {
> -		err = -EIO;
> -		goto out_unlock;
> +		err = -EINVAL;
> +		goto put_zram;
>  	}
>  
>  	index = sector >> SECTORS_PER_PAGE_SHIFT;
> @@ -971,8 +1001,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
>  	bv.bv_offset = 0;
>  
>  	err = zram_bvec_rw(zram, &bv, index, offset, rw);
> -out_unlock:
> -	up_read(&zram->init_lock);
> +put_zram:
> +	zram_put(zram);
> +out:
>  	/*
>  	 * If I/O fails, just return error(ie, non-zero) without
>  	 * calling page_endio.
> @@ -1041,6 +1072,7 @@ static int create_device(struct zram *zram, int device_id)
>  	int ret = -ENOMEM;
>  
>  	init_rwsem(&zram->init_lock);
> +	atomic_set(&zram->refcount, 0);
>  
>  	zram->queue = blk_alloc_queue(GFP_KERNEL);
>  	if (!zram->queue) {
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index b05a816..7138c82 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -100,24 +100,25 @@ struct zram_meta {
>  
>  struct zram {
>  	struct zram_meta *meta;
> +	struct zcomp *comp;
>  	struct request_queue *queue;
>  	struct gendisk *disk;
> -	struct zcomp *comp;
> -
>  	/* Prevent concurrent execution of device init, reset and R/W request */
>  	struct rw_semaphore init_lock;
>  	/*
> -	 * This is the limit on amount of *uncompressed* worth of data
> -	 * we can store in a disk.
> +	 * the number of pages zram can consume for storing compressed data
>  	 */
> -	u64 disksize;	/* bytes */
> +	unsigned long limit_pages;
> +	atomic_t refcount;
>  	int max_comp_streams;
> +
>  	struct zram_stats stats;
> +	struct completion io_done; /* notify IO under all of cpu are done */
>  	/*
> -	 * the number of pages zram can consume for storing compressed data
> +	 * This is the limit on amount of *uncompressed* worth of data
> +	 * we can store in a disk.
>  	 */
> -	unsigned long limit_pages;
> -
> +	u64 disksize;	/* bytes */
>  	char compressor[10];
>  };
>  #endif
> 

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  1:30                         ` Minchan Kim
@ 2015-02-02  1:48                           ` Sergey Senozhatsky
  2015-02-02  2:44                             ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  1:48 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

Hello Minchan,

On (02/02/15 10:30), Minchan Kim wrote:
> > >  static inline int init_done(struct zram *zram)
> > >  {
> > > -	return zram->meta != NULL;
> > > +	return zram->disksize != 0;
> > 
> > we don't set ->disksize to 0 when creating the device. and I think
> > it's better to use refcount here, but set it to 0 during device creation.
> > (see the patch below)
> 
> There was a reason I didn't use refcount there.
> I should have written it down.
> 
> We need something to prevent further I/O handling on other CPUs.
> Otherwise, it's a livelock. For example, a new I/O 'A' in the rw path on
> CPU 1 can see a non-zero refcount while another CPU has rw in flight.
> Then another new I/O 'B' on CPU 2 can see a non-zero refcount while 'A'
> is in flight, and another new I/O 'C' on CPU 3 can see a non-zero
> refcount while 'B' is in flight. Finally, 'A' is done on CPU 1, but the
> next I/O 'D' on CPU 1 can still see a non-zero refcount because 'C' on
> CPU 3 is in flight. Infinite loop.

sure, I did think about this, and I actually didn't find any reason not
to use ->refcount there. if the user wants to reset the device, he first
has to umount it to make the bdev->bd_holders check happy, and that's where
the IOs will be failed. so it makes sense to switch to ->refcount there, IMHO.


> > here and later:
> > we can't take zram_meta_get() first and then check for init_done(zram),
> > because ->meta can be NULL, so it will be a NULL->refcount dereference.
> 
> True.
> Actually, it was totally an RFC; I forgot to add the tag in the night, but
> I can't escape from my shame with that excuse. Thanks!

no problem at all. you were throwing solutions all week long.

> 
> > 
> > let's keep ->completion and ->refcount in zram and rename zram_meta_[get|put]
> > to zram_[get|put].
> 
> Good idea, but I still want to name it zram_meta_get/put because the
> zram_get naming might suggest a refcount on struct zram rather than on
> zram_meta. :)

no objections. but I assume we agreed to keep ->io_done completion
and ->refcount in zram.

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  1:43                           ` Minchan Kim
@ 2015-02-02  1:59                             ` Sergey Senozhatsky
  2015-02-02  2:45                               ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  1:59 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/02/15 10:43), Minchan Kim wrote:
> >  static inline int init_done(struct zram *zram)
> >  {
> > -	return zram->meta != NULL;
> > +	return atomic_read(&zram->refcount);
> 
> As I said in my previous mail, it could make a livelock, so I want to use
> disksize here to prevent further I/O handling.

just as I said in my previous email -- is this livelock really possible?
we need to umount the device to continue with reset, and umount will kill
the IOs out of our way.

the other reset caller is __exit zram_exit(). but once again, I don't
expect this function to be executed on a mounted device while the module
is in use.


> > +static inline void zram_put(struct zram *zram)
> > +{
> > +	if (atomic_dec_and_test(&zram->refcount))
> > +		complete(&zram->io_done);
> > +}
> 
> > Although I suggested this completion, it might be rather overkill (please
> > understand me, it was midnight work. :))
> > Instead, we could just use atomic_dec here and
> > wait_event(event, atomic_read(&zram->refcount) == 0) in reset.
> 

yes, I think it can do the trick.

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  1:48                           ` Sergey Senozhatsky
@ 2015-02-02  2:44                             ` Minchan Kim
  2015-02-02  4:01                               ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  2:44 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Mon, Feb 02, 2015 at 10:48:00AM +0900, Sergey Senozhatsky wrote:
> Hello Minchan,
> 
> On (02/02/15 10:30), Minchan Kim wrote:
> > > >  static inline int init_done(struct zram *zram)
> > > >  {
> > > > -	return zram->meta != NULL;
> > > > +	return zram->disksize != 0;
> > > 
> > > we don't set ->disksize to 0 when creating the device. and I think
> > > it's better to use refcount here, but set it to 0 during device creation.
> > > (see the patch below)
> > 
> > There was a reason I didn't use refcount there.
> > I should have written it down.
> > 
> > We need something to prevent further I/O handling on other CPUs.
> > Otherwise, it's a livelock. For example, a new I/O 'A' in the rw path on
> > CPU 1 can see a non-zero refcount while another CPU has rw in flight.
> > Then another new I/O 'B' on CPU 2 can see a non-zero refcount while 'A'
> > is in flight, and another new I/O 'C' on CPU 3 can see a non-zero
> > refcount while 'B' is in flight. Finally, 'A' is done on CPU 1, but the
> > next I/O 'D' on CPU 1 can still see a non-zero refcount because 'C' on
> > CPU 3 is in flight. Infinite loop.
> 
> sure, I did think about this, and I actually didn't find any reason not
> to use ->refcount there. if the user wants to reset the device, he first
> has to umount it to make the bdev->bd_holders check happy, and that's where
> the IOs will be failed. so it makes sense to switch to ->refcount there, IMHO.

If we use zram as a block device itself (not as a fs or swap) and open the
block device as !FMODE_EXCL, bd_holders stays zero, so the check is bypassed.
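
for illustration, a tiny userspace sketch of that case (my example, not
from the thread; dd behaves like this):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096] = { 0 };
		/* no O_EXCL, so bdev->bd_holders stays 0 and a
		 * bd_holders-based reset check cannot see this writer */
		int fd = open("/dev/zram0", O_RDWR);

		if (fd < 0)
			return 1;
		write(fd, buf, sizeof(buf));
		close(fd);
		return 0;
	}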

Another topic: as I haven't studied fs/block_dev.c enough, the bd_holders
handling in zram looks like a mess to me. I guess we need to study device
hotplug and implement that for zram reset rather than our own strange knob.
It should go on the TODO list. :(

> 
> 
> > > here and later:
> > > we can't take zram_meta_get() first and then check for init_done(zram),
> > > because ->meta can be NULL, so it will be a NULL->refcount dereference.
> > 
> > True.
> > Actually, it was totally an RFC; I forgot to add the tag in the night, but
> > I can't escape from my shame with that excuse. Thanks!
> 
> no problem at all. you were throwing solutions all week long.
> 
> > 
> > > 
> > > let's keep ->completion and ->refcount in zram and rename zram_meta_[get|put]
> > > to zram_[get|put].
> > 
> > Good idea, but I still want to name it zram_meta_get/put because the
> > zram_get naming might suggest a refcount on struct zram rather than on
> > zram_meta. :)
> 
> no objections. but I assume we agreed to keep ->io_done completion
> and ->refcount in zram.
> 
> 	-ss

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  1:59                             ` Sergey Senozhatsky
@ 2015-02-02  2:45                               ` Minchan Kim
  2015-02-02  3:47                                 ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  2:45 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Mon, Feb 02, 2015 at 10:59:40AM +0900, Sergey Senozhatsky wrote:
> On (02/02/15 10:43), Minchan Kim wrote:
> > >  static inline int init_done(struct zram *zram)
> > >  {
> > > -	return zram->meta != NULL;
> > > +	return atomic_read(&zram->refcount);
> > 
> > As I said in my previous mail, it could make a livelock, so I want to use
> > disksize here to prevent further I/O handling.
> 
> just as I said in my previous email -- is this livelock really possible?
> we need to umount the device to continue with reset, and umount will kill
> the IOs out of our way.
> 
> the other reset caller is __exit zram_exit(). but once again, I don't
> expect this function to be executed on a mounted device while the module
> is in use.
> 
> 
> > > +static inline void zram_put(struct zram *zram)
> > > +{
> > > +	if (atomic_dec_and_test(&zram->refcount))
> > > +		complete(&zram->io_done);
> > > +}
> > 
> > Although I suggested this completion, it might be rather overkill (please
> > understand me, it was midnight work. :))
> > Instead, we could just use atomic_dec here and
> > wait_event(event, atomic_read(&zram->refcount) == 0) in reset.
> > 
> 
> yes, I think it can do the trick.

Hey, it's not a trick. It suits our goal well. A completion
was too much, I think.

> 
> 	-ss

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
@ 2015-02-02  3:41 Minchan Kim
  2015-02-02  5:59 ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  3:41 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

Separating another issue out from my patch.

On Mon, Feb 02, 2015 at 11:44:06AM +0900, Minchan Kim wrote:
> On Mon, Feb 02, 2015 at 10:48:00AM +0900, Sergey Senozhatsky wrote:
> > Hello Minchan,
> > 
> > On (02/02/15 10:30), Minchan Kim wrote:
> > > > >  static inline int init_done(struct zram *zram)
> > > > >  {
> > > > > -	return zram->meta != NULL;
> > > > > +	return zram->disksize != 0;
> > > > 
> > > > we don't set ->disksize to 0 when creating the device. and I think
> > > > it's better to use refcount here, but set it to 0 during device creation.
> > > > (see the patch below)
> > > 
> > > There was a reason I didn't use refcount there.
> > > I should have written it down.
> > > 
> > > We need something to prevent further I/O handling on other CPUs.
> > > Otherwise, it's a livelock. For example, a new I/O 'A' in the rw path on
> > > CPU 1 can see a non-zero refcount while another CPU has rw in flight.
> > > Then another new I/O 'B' on CPU 2 can see a non-zero refcount while 'A'
> > > is in flight, and another new I/O 'C' on CPU 3 can see a non-zero
> > > refcount while 'B' is in flight. Finally, 'A' is done on CPU 1, but the
> > > next I/O 'D' on CPU 1 can still see a non-zero refcount because 'C' on
> > > CPU 3 is in flight. Infinite loop.
> > 
> > sure, I did think about this, and I actually didn't find any reason not
> > to use ->refcount there. if the user wants to reset the device, he first
> > has to umount it to make the bdev->bd_holders check happy, and that's where
> > the IOs will be failed. so it makes sense to switch to ->refcount there, IMHO.
> 
> If we use zram as a block device itself (not as a fs or swap) and open the
> block device as !FMODE_EXCL, bd_holders stays zero, so the check is bypassed.
> 
> Another topic: as I haven't studied fs/block_dev.c enough, the bd_holders
> handling in zram looks like a mess to me. I guess we need to study device
> hotplug and implement that for zram reset rather than our own strange knob.
> It should go on the TODO list. :(

Actually, I thought using bd_mutex from a custom driver was a terrible idea,
so we should work around it with device hotplug, but as I look through
other drivers, they have used the lock for a long time.
Maybe it's okay to use it in zram?
If so, Ganesh's patch is no problem for me, although I didn't review it in
detail. One thing I want to point out is that it would be better to replace
the bd_holders check with bd_openers, because a dd test opens the block
device as !EXCL, so bd_holders will be void.

What do you think about it?
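
a sketch of what I mean in reset_store() (direction only, untested; bdev
comes from bdget_disk() as in the current code):

	mutex_lock(&bdev->bd_mutex);
	/* unlike bd_holders, bd_openers also counts !FMODE_EXCL
	 * openers such as dd */
	if (bdev->bd_openers) {
		mutex_unlock(&bdev->bd_mutex);
		bdput(bdev);
		return -EBUSY;
	}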

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  2:45                               ` Minchan Kim
@ 2015-02-02  3:47                                 ` Sergey Senozhatsky
  0 siblings, 0 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  3:47 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/02/15 11:45), Minchan Kim wrote:
> On Mon, Feb 02, 2015 at 10:59:40AM +0900, Sergey Senozhatsky wrote:
> > On (02/02/15 10:43), Minchan Kim wrote:
> > > >  static inline int init_done(struct zram *zram)
> > > >  {
> > > > -	return zram->meta != NULL;
> > > > +	return atomic_read(&zram->refcount);
> > > 
> > > As I said in my previous mail, it could make a livelock, so I want to use
> > > disksize here to prevent further I/O handling.
> > 
> > just as I said in my previous email -- is this livelock really possible?
> > we need to umount the device to continue with reset, and umount will kill
> > the IOs out of our way.
> > 
> > the other reset caller is __exit zram_exit(). but once again, I don't
> > expect this function to be executed on a mounted device while the module
> > is in use.
> > 
> > 
> > > > +static inline void zram_put(struct zram *zram)
> > > > +{
> > > > +	if (atomic_dec_and_test(&zram->refcount))
> > > > +		complete(&zram->io_done);
> > > > +}
> > > 
> > > Although I suggested this completion, it might be rather overkill (please
> > > understand me, it was midnight work. :))
> > > Instead, we could just use atomic_dec here and
> > > wait_event(event, atomic_read(&zram->refcount) == 0) in reset.
> > > 
> > 
> > yes, I think it can do the trick.
> 
> Hey, it's not a trick. It suits our goal well. A completion
> was too much, I think.
> 

sure :)

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  2:44                             ` Minchan Kim
@ 2015-02-02  4:01                               ` Sergey Senozhatsky
  2015-02-02  4:28                                 ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  4:01 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/02/15 11:44), Minchan Kim wrote:
> > sure, I did think about this. and I actually didn't find any reason not
> > to use ->refcount there. if the user wants to reset the device, he first
> > should umount it to make the bdev->bd_holders check happy. and that's where
> > IOs will fail, so it makes sense to switch to ->refcount there, IMHO.
> 
> If we use zram as a block device itself (not a fs or swap) and open the
> block device as !FMODE_EXCL, bd_holders will be zero.
> 

hm.
I don't mind using ->disksize there, but personally I'd maybe prefer
to use ->refcount, which just looks less hacky. zram's most common use
cases are a ram swap device or a ram device with a fs, so it looks
a bit like we care about some corner case here.

just my opinion, no objections against ->disksize != 0.

I need to check fs/block_dev. can we switch away from ->bd_holders?

> Another topic: as I haven't studied fs/block_dev.c enough, relying on
> bd_holders in zram would be a mess. I guess we need to study device
> hotplug and implement it for zram reset rather than our own strange
> knob. It should go on the TODO list. :(

ok, need to investigate this later.
let's land current activities first.

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  4:01                               ` Sergey Senozhatsky
@ 2015-02-02  4:28                                 ` Minchan Kim
  2015-02-02  5:09                                   ` Sergey Senozhatsky
  2015-02-02  5:10                                   ` Minchan Kim
  0 siblings, 2 replies; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  4:28 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Mon, Feb 02, 2015 at 01:01:24PM +0900, Sergey Senozhatsky wrote:
> On (02/02/15 11:44), Minchan Kim wrote:
> > > sure, I did think about this. and I actually didn't find any reason not
> > > to use ->refcount there. if the user wants to reset the device, he first
> > > should umount it to make the bdev->bd_holders check happy. and that's where
> > > IOs will fail, so it makes sense to switch to ->refcount there, IMHO.
> > 
> > If we use zram as a block device itself (not a fs or swap) and open the
> > block device as !FMODE_EXCL, bd_holders will be zero.
> > 
> 
> hm.
> I don't mind using ->disksize there, but personally I'd maybe prefer
> to use ->refcount, which just looks less hacky. zram's most common use
> cases are a ram swap device or a ram device with a fs, so it looks
> a bit like we care about some corner case here.

Maybe, but I always test zram with dd so it's not a corner case for me. :)

> 
> just my opinion, no objections against ->disksize != 0.

Thanks. Below is a draft for v2. Please review.

BTW, you pointed out the race between bdev open/close and reset, and
it's clearly a bug, although it's rare in real practice.
So, I want to fix it earlier than this patch and mark it for -stable
if we can fix it easily, like Ganesh's work.
Once it lands, we can rebase this patch on it.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  4:28                                 ` Minchan Kim
@ 2015-02-02  5:09                                   ` Sergey Senozhatsky
  2015-02-02  5:18                                     ` Minchan Kim
  2015-02-02  5:10                                   ` Minchan Kim
  1 sibling, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  5:09 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran


the patch mostly looks good, except for one place:

On (02/02/15 13:28), Minchan Kim wrote:
> @@ -783,6 +812,8 @@ static ssize_t disksize_store(struct device *dev,
>  		goto out_destroy_comp;
>  	}
>  
> +	init_waitqueue_head(&zram->io_done);
> +	zram_meta_get(zram);

it was
+       init_completion(&zram->io_done);
+       atomic_set(&zram->refcount, 1);

I think we need to replace zram_meta_get(zram) with atomic_set(&zram->refcount, 1).

->refcount is 0 by default and atomic_inc_not_zero(&zram->refcount) will not
increment it here, nor anywhere else.
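
a tiny illustration of the bootstrap problem (assuming, as in the
driver, that the struct comes out of kzalloc() with ->refcount == 0;
refcount_bootstrap_demo is just an illustrative name):

	#include <linux/atomic.h>
	#include <linux/bug.h>

	static void refcount_bootstrap_demo(void)
	{
		atomic_t ref = ATOMIC_INIT(0);

		/* atomic_inc_not_zero() refuses to revive a zero count:
		 * it only takes a reference on an object that is alive
		 */
		WARN_ON(atomic_inc_not_zero(&ref));	/* fails, ref stays 0 */

		atomic_set(&ref, 1);			/* the owner seeds the count */
		WARN_ON(!atomic_inc_not_zero(&ref));	/* succeeds, ref == 2 */
	}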


>  	zram->meta = meta;
>  	zram->comp = comp;
>  	zram->disksize = disksize;
> @@ -838,8 +869,8 @@ static ssize_t reset_store(struct device *dev,
>  	/* Make sure all pending I/O is finished */
>  	fsync_bdev(bdev);
>  	bdput(bdev);
> -

[..]

> @@ -1041,6 +1075,7 @@ static int create_device(struct zram *zram, int device_id)
>  	int ret = -ENOMEM;
>  
>  	init_rwsem(&zram->init_lock);
> +	atomic_set(&zram->refcount, 0);

sorry, I forgot that zram is kzalloc()-allocated. so we can drop

	atomic_set(&zram->refcount, 0)

>  	zram->queue = blk_alloc_queue(GFP_KERNEL);
>  	if (!zram->queue) {


	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  4:28                                 ` Minchan Kim
  2015-02-02  5:09                                   ` Sergey Senozhatsky
@ 2015-02-02  5:10                                   ` Minchan Kim
  1 sibling, 0 replies; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  5:10 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Mon, Feb 02, 2015 at 01:28:47PM +0900, Minchan Kim wrote:
> On Mon, Feb 02, 2015 at 01:01:24PM +0900, Sergey Senozhatsky wrote:
> > On (02/02/15 11:44), Minchan Kim wrote:
> > > > sure, I did think about this. and I actually didn't find any reason not
> > > > to use ->refcount there. if the user wants to reset the device, he first
> > > > should umount it to make the bdev->bd_holders check happy. and that's where
> > > > IOs will fail, so it makes sense to switch to ->refcount there, IMHO.
> > > 
> > > If we use zram as a block device itself (not a fs or swap) and open the
> > > block device as !FMODE_EXCL, bd_holders will be zero.
> > > 
> > 
> > hm.
> > I don't mind using ->disksize there, but personally I'd maybe prefer
> > to use ->refcount, which just looks less hacky. zram's most common use
> > cases are a ram swap device or a ram device with a fs, so it looks
> > a bit like we care about some corner case here.
> 
> Maybe, but I always test zram with dd so it's not a corner case for me. :)
> 
> > 
> > just my opinion, no objections against ->disksize != 0.
> 
> Thanks. Below is a draft for v2. Please review.
> 
> BTW, you pointed out the race between bdev open/close and reset, and
> it's clearly a bug, although it's rare in real practice.
> So, I want to fix it earlier than this patch and mark it for -stable
> if we can fix it easily, like Ganesh's work.
> Once it lands, we can rebase this patch on it.
> 
> From 699502b4e0c84b3d7b33f8754cf1c0109b16c012 Mon Sep 17 00:00:00 2001
> From: Minchan Kim <minchan@kernel.org>
> Date: Mon, 2 Feb 2015 10:36:28 +0900
> Subject: [PATCH v2] zram: remove init_lock in zram_make_request
> 
> Admin could reset zram while I/O is in flight, so we have used
> zram->init_lock as a read-side lock in the I/O path to prevent
> sudden zram meta freeing.
> 
> However, the init_lock is really troublesome.
> We can't call zram_meta_alloc under init_lock due to a lockdep splat,
> because zram_rw_page is one of the functions in the reclaim path and
> holds it as a read lock while other places in process context hold it
> as a write lock. So, we have done the allocation outside the lock to
> avoid the lockdep warning, but it's not good for readability and,
> finally, I met another lockdep splat between init_lock and cpu_hotplug
> from kmem_cache_destroy while working on zsmalloc compaction. :(
> 
> Yes, the ideal is to remove the horrible init_lock of zram in the rw
> path. This patch removes it in the rw path and instead adds an atomic
> refcount for meta lifetime management and a completion to free meta in
> process context. It's important to free meta in process context because
> some of the resource destruction needs a mutex lock, which could be
> held if we released the resource in reclaim context, so it would
> deadlock again.
> 
> Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  drivers/block/zram/zram_drv.c | 85 ++++++++++++++++++++++++++++++-------------
>  drivers/block/zram/zram_drv.h | 20 +++++-----
>  2 files changed, 71 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index aa5a4c5..c6d505c 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name);
>  
>  static inline int init_done(struct zram *zram)
>  {
> -	return zram->meta != NULL;
> +	return zram->disksize != 0;
>  }
>  
>  static inline struct zram *dev_to_zram(struct device *dev)
> @@ -358,6 +358,18 @@ out_error:
>  	return NULL;
>  }
>  
> +static inline bool zram_meta_get(struct zram *zram)
> +{
> +	if (atomic_inc_not_zero(&zram->refcount))
> +		return true;
> +	return false;
> +}
> +
> +static inline void zram_meta_put(struct zram *zram)
> +{
> +	atomic_dec(&zram->refcount);
> +}
> +
>  static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
>  {
>  	if (*offset + bvec->bv_len >= PAGE_SIZE)
> @@ -719,6 +731,10 @@ static void zram_bio_discard(struct zram *zram, u32 index,
>  
>  static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  {
> +	struct zram_meta *meta;
> +	struct zcomp *comp;
> +	u64 disksize;
> +
>  	down_write(&zram->init_lock);
>  
>  	zram->limit_pages = 0;
> @@ -728,19 +744,32 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
>  		return;
>  	}
>  
> -	zcomp_destroy(zram->comp);
> -	zram->max_comp_streams = 1;
> -	zram_meta_free(zram->meta, zram->disksize);
> -	zram->meta = NULL;
> +	meta = zram->meta;
> +	comp = zram->comp;
> +	disksize = zram->disksize;
> +	zram->disksize = 0;
> +	/*
> +	 * ->refcount will go down to 0 eventually and rw handler cannot
> +	 * handle further I/O by init_done checking.
> +	 */
> +	zram_meta_put(zram);
> +	/*
> +	 * We want to free zram_meta in process context to avoid
> +	 * deadlock between reclaim path and any other locks
> +	 */
> +	wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
> +
>  	/* Reset stats */
>  	memset(&zram->stats, 0, sizeof(zram->stats));
> +	zram->max_comp_streams = 1;
>  
> -	zram->disksize = 0;
>  	if (reset_capacity)
>  		set_capacity(zram->disk, 0);
>  
>  	up_write(&zram->init_lock);
> -
> +	/* I/O operation under all of CPU are done so let's free */
> +	zram_meta_free(meta, disksize);
> +	zcomp_destroy(comp);
>  	/*
>  	 * Revalidate disk out of the init_lock to avoid lockdep splat.
>  	 * It's okay because disk's capacity is protected by init_lock
> @@ -783,6 +812,8 @@ static ssize_t disksize_store(struct device *dev,
>  		goto out_destroy_comp;
>  	}
>  
> +	init_waitqueue_head(&zram->io_done);
> +	zram_meta_get(zram);
        
Argh, it should be

        atomic_set(&zram->refcount, 1);

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  5:09                                   ` Sergey Senozhatsky
@ 2015-02-02  5:18                                     ` Minchan Kim
  2015-02-02  5:28                                       ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  5:18 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Mon, Feb 02, 2015 at 02:09:12PM +0900, Sergey Senozhatsky wrote:
> 
> the patch mostly looks good, except for one place:
> 
> On (02/02/15 13:28), Minchan Kim wrote:
> > @@ -783,6 +812,8 @@ static ssize_t disksize_store(struct device *dev,
> >  		goto out_destroy_comp;
> >  	}
> >  
> > +	init_waitqueue_head(&zram->io_done);
> > +	zram_meta_get(zram);
> 
> it was
> +       init_completion(&zram->io_done);
> +       atomic_set(&zram->refcount, 1);
> 
> I think we need to replace zram_meta_get(zram) with atomic_set(&zram->refcount, 1).
> 
> ->refcount is 0 by default and atomic_inc_not_zero(&zram->refcount) will not
> increment it here, nor anywhere else.
> 
> 
> >  	zram->meta = meta;
> >  	zram->comp = comp;
> >  	zram->disksize = disksize;
> > @@ -838,8 +869,8 @@ static ssize_t reset_store(struct device *dev,
> >  	/* Make sure all pending I/O is finished */
> >  	fsync_bdev(bdev);
> >  	bdput(bdev);
> > -
> 
> [..]
> 
> > @@ -1041,6 +1075,7 @@ static int create_device(struct zram *zram, int device_id)
> >  	int ret = -ENOMEM;
> >  
> >  	init_rwsem(&zram->init_lock);
> > +	atomic_set(&zram->refcount, 0);
> 
> sorry, I forgot that zram is kzalloc()-allocated. so we can drop
> 
> 	atomic_set(&zram->refcount, 0)
> 

Everything is fixed. Ready to send the patch.
But before sending, I hope we fix the umount race issue first.

Thanks a lot, Sergey!

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  5:18                                     ` Minchan Kim
@ 2015-02-02  5:28                                       ` Sergey Senozhatsky
  0 siblings, 0 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  5:28 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/02/15 14:18), Minchan Kim wrote:
> Everything is fixed. Ready to send the patch.
> But before sending, I hope we fix the umount race issue first.
> 

Thanks a lot, Minchan!


OK, surely I don't mind fixing the umount race first, I just didn't want
to interrupt you in the middle of the locking rework. the race is hardly
possible, but still.


I didn't review Ganesh's patch yet, I'll try to find some time to take a
look later today.

I'm also planning to send a small `struct zram' cleanup patch by the end
of the day.


	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  3:41 [PATCH v1 2/2] zram: remove init_lock in zram_make_request Minchan Kim
@ 2015-02-02  5:59 ` Sergey Senozhatsky
  2015-02-02  6:18   ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  5:59 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/02/15 12:41), Minchan Kim wrote:
> > If we use zram as a block device itself (not a fs or swap) and open the
> > block device as !FMODE_EXCL, bd_holders will be zero.
> > 
> > Another topic: as I haven't studied fs/block_dev.c enough, relying on
> > bd_holders in zram would be a mess. I guess we need to study device
> > hotplug and implement it for zram reset rather than our own strange
> > knob. It should go on the TODO list. :(
> 
> Actually, I thought using bd_mutex from a custom driver was a terrible
> idea, so we should work around it with device hotplug, but as I looked
> through other drivers, they have used the lock for a long time.
> Maybe it's okay to use it in zram?
> If so, Ganesh's patch is no problem to me, although I didn't review it in detail.
> One thing I want to point out is that it would be better to replace the
> bd_holders check with bd_openers, because a dd test opens the block
> device as !EXCL, so bd_holders will be zero.
> 
> What do you think about it?
> 

a quick idea:
can we additionally move all bd flush and put work after zram_reset_device(zram, true)
and, perhaps, replace ->bd_holders with something else?

zram_reset_device() will not return while we have active IOs, and pending IOs will be
invalidated by ->disksize != 0.

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  5:59 ` Sergey Senozhatsky
@ 2015-02-02  6:18   ` Minchan Kim
  2015-02-02  7:06     ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-02  6:18 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Mon, Feb 02, 2015 at 02:59:23PM +0900, Sergey Senozhatsky wrote:
> On (02/02/15 12:41), Minchan Kim wrote:
> > > If we use zram as a block device itself (not a fs or swap) and open the
> > > block device as !FMODE_EXCL, bd_holders will be zero.
> > > 
> > > Another topic: as I haven't studied fs/block_dev.c enough, relying on
> > > bd_holders in zram would be a mess. I guess we need to study device
> > > hotplug and implement it for zram reset rather than our own strange
> > > knob. It should go on the TODO list. :(
> > 
> > Actually, I thought using bd_mutex from a custom driver was a terrible
> > idea, so we should work around it with device hotplug, but as I looked
> > through other drivers, they have used the lock for a long time.
> > Maybe it's okay to use it in zram?
> > If so, Ganesh's patch is no problem to me, although I didn't review it in detail.
> > One thing I want to point out is that it would be better to replace the
> > bd_holders check with bd_openers, because a dd test opens the block
> > device as !EXCL, so bd_holders will be zero.
> > 
> > What do you think about it?
> > 
> 
> a quick idea:
> can we additionally move all bd flush and put work after zram_reset_device(zram, true)
> and, perhaps, replace ->bd_holders with something else?
> 
> zram_reset_device() will not return while we have active IOs, and pending IOs will be
> invalidated by ->disksize != 0.

Sorry, I don't get it. Could you describe your concern about active I/O?
My concern is just the race between bdev open/close and zram's bd_holders check.
I don't see any simple solution without bd_mutex.
If we can close the race, anything could be a solution.
Once we close the race, we should return -EBUSY if anyone has the zram device open,
so a bd_openers check would be better than bd_holders.

> 
> 	-ss

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  6:18   ` Minchan Kim
@ 2015-02-02  7:06     ` Sergey Senozhatsky
  2015-02-03  1:54       ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-02  7:06 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/02/15 15:18), Minchan Kim wrote:
> > a quick idea:
> > can we additionally move all bd flush and put work after zram_reset_device(zram, true)
> > and, perhaps, replace ->bd_holders with something else?
> > 
> > zram_reset_device() will not return while we have active IOs, and pending IOs will be
> > invalidated by ->disksize != 0.
> 
> Sorry, I don't get it. Could you describe your concern about active I/O?
> My concern is just the race between bdev open/close and zram's bd_holders check.
> I don't see any simple solution without bd_mutex.
> If we can close the race, anything could be a solution.
> Once we close the race, we should return -EBUSY if anyone has the zram device open,
> so a bd_openers check would be better than bd_holders.
> 

yeah, sorry. nevermind.


So, guys, how about doing it differently, in fewer lines of code,
hopefully. Don't move reset_store()'s work to zram_reset_device().
Instead, move

	set_capacity(zram->disk, 0);
	revalidate_disk(zram->disk);

out from zram_reset_device() to reset_store(). these two functions are
executed only when called from reset_store() anyway. this will also let
us drop the `bool reset_capacity' param from zram_reset_device().


so we will do in reset_store()

	mutex_lock(&bdev->bd_mutex);

	fsync_bdev(bdev);
	zram_reset_device(zram);
	set_capacity(zram->disk, 0);

	mutex_unlock(&bdev->bd_mutex);

	revalidate_disk(zram->disk);
	bdput(bdev);



and change the zram_reset_device(zram, false) call to simply zram_reset_device(zram)
in __exit zram_exit(void).
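
with that, zram_reset_device() would lose its bool parameter; roughly
(a sketch of the resulting shape under the move above, body abbreviated):

	static void zram_reset_device(struct zram *zram)
	{
		down_write(&zram->init_lock);
		if (!init_done(zram)) {
			up_write(&zram->init_lock);
			return;
		}
		/* free meta/comp and reset stats as before, but no
		 * set_capacity()/revalidate_disk() in here anymore
		 */
		up_write(&zram->init_lock);
	}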

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-02  7:06     ` Sergey Senozhatsky
@ 2015-02-03  1:54       ` Sergey Senozhatsky
  2015-02-03  3:02         ` Minchan Kim
  0 siblings, 1 reply; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-03  1:54 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/02/15 16:06), Sergey Senozhatsky wrote:
> So, guys, how about doing it differently, in less lines of code,
> hopefully. Don't move reset_store()'s work to zram_reset_device().
> Instead, move
> 
> 	set_capacity(zram->disk, 0);
> 	revalidate_disk(zram->disk);
> 
> out from zram_reset_device() to reset_store(). these two functions are
> executed only when called from reset_store() anyway. this will also let
> us drop the `bool reset_capacity' param from zram_reset_device().
> 
> 
> so we will do in reset_store()
> 
> 	mutex_lock(&bdev->bd_mutex);
> 
> 	fsync_bdev(bdev);
> 	zram_reset_device(zram);
> 	set_capacity(zram->disk, 0);
> 
> 	mutex_unlock(&bdev->bd_mutex);
> 
> 	revalidate_disk(zram->disk);
> 	bdput(bdev);
> 
> 
> 
> and change the zram_reset_device(zram, false) call to simply zram_reset_device(zram)
> in __exit zram_exit(void).
> 

Hello,

Minchan, Ganesh, I sent a patch last night with the above solution.
does it look ok to you?

Minchan, I think I'll send my small struct zram clean-up patch after
your init_lock patch. what's your opinion?

	-ss


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-03  1:54       ` Sergey Senozhatsky
@ 2015-02-03  3:02         ` Minchan Kim
  2015-02-03  3:56           ` Sergey Senozhatsky
  0 siblings, 1 reply; 43+ messages in thread
From: Minchan Kim @ 2015-02-03  3:02 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Andrew Morton, linux-kernel@vger.kernel.org,
	Linux-MM, Nitin Gupta, Jerome Marchand, Ganesh Mahendran

On Tue, Feb 03, 2015 at 10:54:33AM +0900, Sergey Senozhatsky wrote:
> On (02/02/15 16:06), Sergey Senozhatsky wrote:
> > So, guys, how about doing it differently, in less lines of code,
> > hopefully. Don't move reset_store()'s work to zram_reset_device().
> > Instead, move
> > 
> > 	set_capacity(zram->disk, 0);
> > 	revalidate_disk(zram->disk);
> > 
> > out from zram_reset_device() to reset_store(). these two functions are
> > executed only when called from reset_store() anyway. this will also let
> > us drop the `bool reset_capacity' param from zram_reset_device().
> > 
> > 
> > so we will do in reset_store()
> > 
> > 	mutex_lock(&bdev->bd_mutex);
> > 
> > 	fsync_bdev(bdev);
> > 	zram_reset_device(zram);
> > 	set_capacity(zram->disk, 0);
> > 
> > 	mutex_unlock(&bdev->bd_mutex);
> > 
> > 	revalidate_disk(zram->disk);
> > 	bdput(bdev);
> > 
> > 
> > 
> > and change the zram_reset_device(zram, false) call to simply zram_reset_device(zram)
> > in __exit zram_exit(void).
> > 
> 
> Hello,
> 
> Minchan, Ganesh, I sent a patch last night with the above solution.
> does it look ok to you?

I just sent feedback.

> 
> Minchan, I think I'll send my small struct zram clean-up patch after
> your init_lock patch. what's your opinion?

Good for me.

Thanks.
-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v1 2/2] zram: remove init_lock in zram_make_request
  2015-02-03  3:02         ` Minchan Kim
@ 2015-02-03  3:56           ` Sergey Senozhatsky
  0 siblings, 0 replies; 43+ messages in thread
From: Sergey Senozhatsky @ 2015-02-03  3:56 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Sergey Senozhatsky, Sergey Senozhatsky, Andrew Morton,
	linux-kernel@vger.kernel.org, Linux-MM, Nitin Gupta,
	Jerome Marchand, Ganesh Mahendran

On (02/03/15 12:02), Minchan Kim wrote:
> On Tue, Feb 03, 2015 at 10:54:33AM +0900, Sergey Senozhatsky wrote:
> > On (02/02/15 16:06), Sergey Senozhatsky wrote:
> > > So, guys, how about doing it differently, in less lines of code,
> > > hopefully. Don't move reset_store()'s work to zram_reset_device().
> > > Instead, move
> > > 
> > > 	set_capacity(zram->disk, 0);
> > > 	revalidate_disk(zram->disk);
> > > 
> > > out from zram_reset_device() to reset_store(). these two functions are
> > > executed only when called from reset_store() anyway. this will also let
> > > us drop the `bool reset_capacity' param from zram_reset_device().
> > > 
> > > 
> > > so we will do in reset_store()
> > > 
> > > 	mutex_lock(&bdev->bd_mutex);
> > > 
> > > 	fsync_bdev(bdev);
> > > 	zram_reset_device(zram);
> > > 	set_capacity(zram->disk, 0);
> > > 
> > > 	mutex_unlock(&bdev->bd_mutex);
> > > 
> > > 	revalidate_disk(zram->disk);
> > > 	bdput(bdev);
> > > 
> > > 
> > > 
> > > and change the zram_reset_device(zram, false) call to simply zram_reset_device(zram)
> > > in __exit zram_exit(void).
> > > 
> > 
> > Hello,
> > 
> > Minchan, Ganesh, I sent a patch last night with the above solution.
> > does it look ok to you?
> 
> I just sent feedback.
> 

thanks.
yeah, !FMODE_EXCL mode.

how do you want to handle it -- do you want to send a separate patch, or
should I send an incremental one-liner and ask Andrew to squash them?

	-ss

> > 
> > Minchan, I think I'll send my small struct zram clean-up patch after
> > your init_lock patch. what's your opinion?
> 
> Good for me.
> 
> Thanks.
> -- 
> Kind regards,
> Minchan Kim
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2015-02-03  3:56 UTC | newest]

Thread overview: 43+ messages
2015-02-02  3:41 [PATCH v1 2/2] zram: remove init_lock in zram_make_request Minchan Kim
2015-02-02  5:59 ` Sergey Senozhatsky
2015-02-02  6:18   ` Minchan Kim
2015-02-02  7:06     ` Sergey Senozhatsky
2015-02-03  1:54       ` Sergey Senozhatsky
2015-02-03  3:02         ` Minchan Kim
2015-02-03  3:56           ` Sergey Senozhatsky
  -- strict thread matches above, loose matches on Subject: below --
2015-01-28  8:15 [PATCH 1/2] zram: free meta table in zram_meta_free Minchan Kim
2015-01-28  8:15 ` [PATCH v1 2/2] zram: remove init_lock in zram_make_request Minchan Kim
2015-01-28 14:56   ` Sergey Senozhatsky
2015-01-28 15:04     ` Sergey Senozhatsky
2015-01-28 23:33     ` Minchan Kim
2015-01-29  1:57       ` Sergey Senozhatsky
2015-01-29  2:01         ` Minchan Kim
2015-01-29  2:22           ` Sergey Senozhatsky
2015-01-29  5:28             ` Minchan Kim
2015-01-29  6:06               ` Sergey Senozhatsky
2015-01-29  6:35                 ` Minchan Kim
2015-01-29  7:08                   ` Sergey Senozhatsky
2015-01-30 14:41                     ` Minchan Kim
2015-01-31 11:31                       ` Sergey Senozhatsky
2015-02-01 14:50                       ` Sergey Senozhatsky
2015-02-01 15:04                         ` Sergey Senozhatsky
2015-02-02  1:43                           ` Minchan Kim
2015-02-02  1:59                             ` Sergey Senozhatsky
2015-02-02  2:45                               ` Minchan Kim
2015-02-02  3:47                                 ` Sergey Senozhatsky
2015-02-02  1:30                         ` Minchan Kim
2015-02-02  1:48                           ` Sergey Senozhatsky
2015-02-02  2:44                             ` Minchan Kim
2015-02-02  4:01                               ` Sergey Senozhatsky
2015-02-02  4:28                                 ` Minchan Kim
2015-02-02  5:09                                   ` Sergey Senozhatsky
2015-02-02  5:18                                     ` Minchan Kim
2015-02-02  5:28                                       ` Sergey Senozhatsky
2015-02-02  5:10                                   ` Minchan Kim
2015-01-30  0:20                   ` Sergey Senozhatsky
2015-01-29 13:48   ` Ganesh Mahendran
2015-01-29 15:12     ` Sergey Senozhatsky
2015-01-30  7:52       ` Ganesh Mahendran
2015-01-30  8:08         ` Sergey Senozhatsky
2015-01-31  8:50           ` Ganesh Mahendran
2015-01-31 11:07             ` Sergey Senozhatsky
2015-01-31 12:59               ` Ganesh Mahendran
