diff for duplicates of <20150202042847.GG6402@blaptop> diff --git a/a/1.txt b/N1/1.txt index 33d4741..12226e6 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -27,3 +27,281 @@ it's cleary bug although it's rare in real practice. So, I want to fix it earlier than this patch and mark it as -stable if we can fix it easily like Ganesh's work. If it gets landing, we could make this patch rebased on it. + +>From 699502b4e0c84b3d7b33f8754cf1c0109b16c012 Mon Sep 17 00:00:00 2001 +From: Minchan Kim <minchan@kernel.org> +Date: Mon, 2 Feb 2015 10:36:28 +0900 +Subject: [PATCH v2] zram: remove init_lock in zram_make_request + +Admin could reset zram during I/O operation going on so we have +used zram->init_lock as read-side lock in I/O path to prevent +sudden zram meta freeing. + +However, the init_lock is really troublesome. +We can't do call zram_meta_alloc under init_lock due to lockdep splat +because zram_rw_page is one of the function under reclaim path and +hold it as read_lock while other places in process context hold it +as write_lock. So, we have used allocation out of the lock to avoid +lockdep warn but it's not good for readability and fainally, I met +another lockdep splat between init_lock and cpu_hotplug from +kmem_cache_destroy during working zsmalloc compaction. :( + +Yes, the ideal is to remove horrible init_lock of zram in rw path. +This patch removes it in rw path and instead, add atomic refcount +for meta lifetime management and completion to free meta in process +context. It's important to free meta in process context because +some of resource destruction needs mutex lock, which could be held +if we releases the resource in reclaim context so it's deadlock, +again. + +Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Signed-off-by: Minchan Kim <minchan@kernel.org> +--- + drivers/block/zram/zram_drv.c | 85 ++++++++++++++++++++++++++++++------------- + drivers/block/zram/zram_drv.h | 20 +++++----- + 2 files changed, 71 insertions(+), 34 deletions(-) + +diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c +index aa5a4c5..c6d505c 100644 +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name); + + static inline int init_done(struct zram *zram) + { +- return zram->meta != NULL; ++ return zram->disksize != 0; + } + + static inline struct zram *dev_to_zram(struct device *dev) +@@ -358,6 +358,18 @@ out_error: + return NULL; + } + ++static inline bool zram_meta_get(struct zram *zram) ++{ ++ if (atomic_inc_not_zero(&zram->refcount)) ++ return true; ++ return false; ++} ++ ++static inline void zram_meta_put(struct zram *zram) ++{ ++ atomic_dec(&zram->refcount); ++} ++ + static void update_position(u32 *index, int *offset, struct bio_vec *bvec) + { + if (*offset + bvec->bv_len >= PAGE_SIZE) +@@ -719,6 +731,10 @@ static void zram_bio_discard(struct zram *zram, u32 index, + + static void zram_reset_device(struct zram *zram, bool reset_capacity) + { ++ struct zram_meta *meta; ++ struct zcomp *comp; ++ u64 disksize; ++ + down_write(&zram->init_lock); + + zram->limit_pages = 0; +@@ -728,19 +744,32 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) + return; + } + +- zcomp_destroy(zram->comp); +- zram->max_comp_streams = 1; +- zram_meta_free(zram->meta, zram->disksize); +- zram->meta = NULL; ++ meta = zram->meta; ++ comp = zram->comp; ++ disksize = zram->disksize; ++ zram->disksize = 0; ++ /* ++ * ->refcount will go down to 0 eventually and rw handler cannot ++ * handle further I/O by init_done checking. ++ */ ++ zram_meta_put(zram); ++ /* ++ * We want to free zram_meta in process context to avoid ++ * deadlock between reclaim path and any other locks ++ */ ++ wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); ++ + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); ++ zram->max_comp_streams = 1; + +- zram->disksize = 0; + if (reset_capacity) + set_capacity(zram->disk, 0); + + up_write(&zram->init_lock); +- ++ /* I/O operation under all of CPU are done so let's free */ ++ zram_meta_free(meta, disksize); ++ zcomp_destroy(comp); + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock +@@ -783,6 +812,8 @@ static ssize_t disksize_store(struct device *dev, + goto out_destroy_comp; + } + ++ init_waitqueue_head(&zram->io_done); ++ zram_meta_get(zram); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; +@@ -838,8 +869,8 @@ static ssize_t reset_store(struct device *dev, + /* Make sure all pending I/O is finished */ + fsync_bdev(bdev); + bdput(bdev); +- + zram_reset_device(zram, true); ++ + return len; + + out: +@@ -908,23 +939,24 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) + { + struct zram *zram = queue->queuedata; + +- down_read(&zram->init_lock); +- if (unlikely(!init_done(zram))) ++ if (unlikely(!zram_meta_get(zram))) + goto error; + ++ if (unlikely(!init_done(zram))) ++ goto put_zram; ++ + if (!valid_io_request(zram, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size)) { + atomic64_inc(&zram->stats.invalid_io); +- goto error; ++ goto put_zram; + } + + __zram_make_request(zram, bio); +- up_read(&zram->init_lock); +- ++ zram_meta_put(zram); + return; +- ++put_zram: ++ zram_meta_put(zram); + error: +- up_read(&zram->init_lock); + bio_io_error(bio); + } + +@@ -946,21 +978,22 @@ static void zram_slot_free_notify(struct block_device *bdev, + static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) + { +- int offset, err; ++ int offset, err = -EIO; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; ++ if (unlikely(!zram_meta_get(zram))) ++ goto out; ++ ++ if (unlikely(!init_done(zram))) ++ goto put_zram; ++ + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); +- return -EINVAL; +- } +- +- down_read(&zram->init_lock); +- if (unlikely(!init_done(zram))) { +- err = -EIO; +- goto out_unlock; ++ err = -EINVAL; ++ goto put_zram; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; +@@ -971,8 +1004,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +-out_unlock: +- up_read(&zram->init_lock); ++put_zram: ++ zram_meta_put(zram); ++out: + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. +@@ -1041,6 +1075,7 @@ static int create_device(struct zram *zram, int device_id) + int ret = -ENOMEM; + + init_rwsem(&zram->init_lock); ++ atomic_set(&zram->refcount, 0); + + zram->queue = blk_alloc_queue(GFP_KERNEL); + if (!zram->queue) { +diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h +index b05a816..6085335 100644 +--- a/drivers/block/zram/zram_drv.h ++++ b/drivers/block/zram/zram_drv.h +@@ -100,24 +100,26 @@ struct zram_meta { + + struct zram { + struct zram_meta *meta; ++ struct zcomp *comp; + struct request_queue *queue; + struct gendisk *disk; +- struct zcomp *comp; +- +- /* Prevent concurrent execution of device init, reset and R/W request */ ++ /* Prevent concurrent execution of device init */ + struct rw_semaphore init_lock; + /* +- * This is the limit on amount of *uncompressed* worth of data +- * we can store in a disk. ++ * the number of pages zram can consume for storing compressed data + */ +- u64 disksize; /* bytes */ ++ unsigned long limit_pages; ++ atomic_t refcount; /* refcount for zram_meta */ + int max_comp_streams; ++ + struct zram_stats stats; ++ /* wait all IO under all of cpu are done */ ++ wait_queue_head_t io_done; + /* +- * the number of pages zram can consume for storing compressed data ++ * This is the limit on amount of *uncompressed* worth of data ++ * we can store in a disk. + */ +- unsigned long limit_pages; +- ++ u64 disksize; /* bytes */ + char compressor[10]; + }; + #endif +-- +1.9.3 + + + +> +> I need to check fs/block_dev. can we switch away from ->bd_holders? +> +> > Another topic: As I didn't see enough fs/block_dev.c bd_holders in zram +> > would be mess. I guess we need to study hotplug of device and implement +> > it for zram reset rather than strange own konb. It should go TODO. :( +> +> ok, need to investigate this later. +> let's land current activities first. +> +> -ss + +-- +Kind regards, +Minchan Kim diff --git a/a/content_digest b/N1/content_digest index 3d7caeb..a08981e 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -49,6 +49,284 @@ "it's cleary bug although it's rare in real practice.\n" "So, I want to fix it earlier than this patch and mark it as -stable\n" "if we can fix it easily like Ganesh's work.\n" - If it gets landing, we could make this patch rebased on it. + "If it gets landing, we could make this patch rebased on it.\n" + "\n" + ">From 699502b4e0c84b3d7b33f8754cf1c0109b16c012 Mon Sep 17 00:00:00 2001\n" + "From: Minchan Kim <minchan@kernel.org>\n" + "Date: Mon, 2 Feb 2015 10:36:28 +0900\n" + "Subject: [PATCH v2] zram: remove init_lock in zram_make_request\n" + "\n" + "Admin could reset zram during I/O operation going on so we have\n" + "used zram->init_lock as read-side lock in I/O path to prevent\n" + "sudden zram meta freeing.\n" + "\n" + "However, the init_lock is really troublesome.\n" + "We can't do call zram_meta_alloc under init_lock due to lockdep splat\n" + "because zram_rw_page is one of the function under reclaim path and\n" + "hold it as read_lock while other places in process context hold it\n" + "as write_lock. So, we have used allocation out of the lock to avoid\n" + "lockdep warn but it's not good for readability and fainally, I met\n" + "another lockdep splat between init_lock and cpu_hotplug from\n" + "kmem_cache_destroy during working zsmalloc compaction. :(\n" + "\n" + "Yes, the ideal is to remove horrible init_lock of zram in rw path.\n" + "This patch removes it in rw path and instead, add atomic refcount\n" + "for meta lifetime management and completion to free meta in process\n" + "context. It's important to free meta in process context because\n" + "some of resource destruction needs mutex lock, which could be held\n" + "if we releases the resource in reclaim context so it's deadlock,\n" + "again.\n" + "\n" + "Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>\n" + "Signed-off-by: Minchan Kim <minchan@kernel.org>\n" + "---\n" + " drivers/block/zram/zram_drv.c | 85 ++++++++++++++++++++++++++++++-------------\n" + " drivers/block/zram/zram_drv.h | 20 +++++-----\n" + " 2 files changed, 71 insertions(+), 34 deletions(-)\n" + "\n" + "diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c\n" + "index aa5a4c5..c6d505c 100644\n" + "--- a/drivers/block/zram/zram_drv.c\n" + "+++ b/drivers/block/zram/zram_drv.c\n" + "@@ -55,7 +55,7 @@ static DEVICE_ATTR_RO(name);\n" + " \n" + " static inline int init_done(struct zram *zram)\n" + " {\n" + "-\treturn zram->meta != NULL;\n" + "+\treturn zram->disksize != 0;\n" + " }\n" + " \n" + " static inline struct zram *dev_to_zram(struct device *dev)\n" + "@@ -358,6 +358,18 @@ out_error:\n" + " \treturn NULL;\n" + " }\n" + " \n" + "+static inline bool zram_meta_get(struct zram *zram)\n" + "+{\n" + "+\tif (atomic_inc_not_zero(&zram->refcount))\n" + "+\t\treturn true;\n" + "+\treturn false;\n" + "+}\n" + "+\n" + "+static inline void zram_meta_put(struct zram *zram)\n" + "+{\n" + "+\tatomic_dec(&zram->refcount);\n" + "+}\n" + "+\n" + " static void update_position(u32 *index, int *offset, struct bio_vec *bvec)\n" + " {\n" + " \tif (*offset + bvec->bv_len >= PAGE_SIZE)\n" + "@@ -719,6 +731,10 @@ static void zram_bio_discard(struct zram *zram, u32 index,\n" + " \n" + " static void zram_reset_device(struct zram *zram, bool reset_capacity)\n" + " {\n" + "+\tstruct zram_meta *meta;\n" + "+\tstruct zcomp *comp;\n" + "+\tu64 disksize;\n" + "+\n" + " \tdown_write(&zram->init_lock);\n" + " \n" + " \tzram->limit_pages = 0;\n" + "@@ -728,19 +744,32 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)\n" + " \t\treturn;\n" + " \t}\n" + " \n" + "-\tzcomp_destroy(zram->comp);\n" + "-\tzram->max_comp_streams = 1;\n" + "-\tzram_meta_free(zram->meta, zram->disksize);\n" + "-\tzram->meta = NULL;\n" + "+\tmeta = zram->meta;\n" + "+\tcomp = zram->comp;\n" + "+\tdisksize = zram->disksize;\n" + "+\tzram->disksize = 0;\n" + "+\t/*\n" + "+\t * ->refcount will go down to 0 eventually and rw handler cannot\n" + "+\t * handle further I/O by init_done checking.\n" + "+\t */\n" + "+\tzram_meta_put(zram);\n" + "+\t/*\n" + "+\t * We want to free zram_meta in process context to avoid\n" + "+\t * deadlock between reclaim path and any other locks\n" + "+\t */\n" + "+\twait_event(zram->io_done, atomic_read(&zram->refcount) == 0);\n" + "+\n" + " \t/* Reset stats */\n" + " \tmemset(&zram->stats, 0, sizeof(zram->stats));\n" + "+\tzram->max_comp_streams = 1;\n" + " \n" + "-\tzram->disksize = 0;\n" + " \tif (reset_capacity)\n" + " \t\tset_capacity(zram->disk, 0);\n" + " \n" + " \tup_write(&zram->init_lock);\n" + "-\n" + "+\t/* I/O operation under all of CPU are done so let's free */\n" + "+\tzram_meta_free(meta, disksize);\n" + "+\tzcomp_destroy(comp);\n" + " \t/*\n" + " \t * Revalidate disk out of the init_lock to avoid lockdep splat.\n" + " \t * It's okay because disk's capacity is protected by init_lock\n" + "@@ -783,6 +812,8 @@ static ssize_t disksize_store(struct device *dev,\n" + " \t\tgoto out_destroy_comp;\n" + " \t}\n" + " \n" + "+\tinit_waitqueue_head(&zram->io_done);\n" + "+\tzram_meta_get(zram);\n" + " \tzram->meta = meta;\n" + " \tzram->comp = comp;\n" + " \tzram->disksize = disksize;\n" + "@@ -838,8 +869,8 @@ static ssize_t reset_store(struct device *dev,\n" + " \t/* Make sure all pending I/O is finished */\n" + " \tfsync_bdev(bdev);\n" + " \tbdput(bdev);\n" + "-\n" + " \tzram_reset_device(zram, true);\n" + "+\n" + " \treturn len;\n" + " \n" + " out:\n" + "@@ -908,23 +939,24 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)\n" + " {\n" + " \tstruct zram *zram = queue->queuedata;\n" + " \n" + "-\tdown_read(&zram->init_lock);\n" + "-\tif (unlikely(!init_done(zram)))\n" + "+\tif (unlikely(!zram_meta_get(zram)))\n" + " \t\tgoto error;\n" + " \n" + "+\tif (unlikely(!init_done(zram)))\n" + "+\t\tgoto put_zram;\n" + "+\n" + " \tif (!valid_io_request(zram, bio->bi_iter.bi_sector,\n" + " \t\t\t\t\tbio->bi_iter.bi_size)) {\n" + " \t\tatomic64_inc(&zram->stats.invalid_io);\n" + "-\t\tgoto error;\n" + "+\t\tgoto put_zram;\n" + " \t}\n" + " \n" + " \t__zram_make_request(zram, bio);\n" + "-\tup_read(&zram->init_lock);\n" + "-\n" + "+\tzram_meta_put(zram);\n" + " \treturn;\n" + "-\n" + "+put_zram:\n" + "+\tzram_meta_put(zram);\n" + " error:\n" + "-\tup_read(&zram->init_lock);\n" + " \tbio_io_error(bio);\n" + " }\n" + " \n" + "@@ -946,21 +978,22 @@ static void zram_slot_free_notify(struct block_device *bdev,\n" + " static int zram_rw_page(struct block_device *bdev, sector_t sector,\n" + " \t\t struct page *page, int rw)\n" + " {\n" + "-\tint offset, err;\n" + "+\tint offset, err = -EIO;\n" + " \tu32 index;\n" + " \tstruct zram *zram;\n" + " \tstruct bio_vec bv;\n" + " \n" + " \tzram = bdev->bd_disk->private_data;\n" + "+\tif (unlikely(!zram_meta_get(zram)))\n" + "+\t\tgoto out;\n" + "+\n" + "+\tif (unlikely(!init_done(zram)))\n" + "+\t\tgoto put_zram;\n" + "+\n" + " \tif (!valid_io_request(zram, sector, PAGE_SIZE)) {\n" + " \t\tatomic64_inc(&zram->stats.invalid_io);\n" + "-\t\treturn -EINVAL;\n" + "-\t}\n" + "-\n" + "-\tdown_read(&zram->init_lock);\n" + "-\tif (unlikely(!init_done(zram))) {\n" + "-\t\terr = -EIO;\n" + "-\t\tgoto out_unlock;\n" + "+\t\terr = -EINVAL;\n" + "+\t\tgoto put_zram;\n" + " \t}\n" + " \n" + " \tindex = sector >> SECTORS_PER_PAGE_SHIFT;\n" + "@@ -971,8 +1004,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,\n" + " \tbv.bv_offset = 0;\n" + " \n" + " \terr = zram_bvec_rw(zram, &bv, index, offset, rw);\n" + "-out_unlock:\n" + "-\tup_read(&zram->init_lock);\n" + "+put_zram:\n" + "+\tzram_meta_put(zram);\n" + "+out:\n" + " \t/*\n" + " \t * If I/O fails, just return error(ie, non-zero) without\n" + " \t * calling page_endio.\n" + "@@ -1041,6 +1075,7 @@ static int create_device(struct zram *zram, int device_id)\n" + " \tint ret = -ENOMEM;\n" + " \n" + " \tinit_rwsem(&zram->init_lock);\n" + "+\tatomic_set(&zram->refcount, 0);\n" + " \n" + " \tzram->queue = blk_alloc_queue(GFP_KERNEL);\n" + " \tif (!zram->queue) {\n" + "diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h\n" + "index b05a816..6085335 100644\n" + "--- a/drivers/block/zram/zram_drv.h\n" + "+++ b/drivers/block/zram/zram_drv.h\n" + "@@ -100,24 +100,26 @@ struct zram_meta {\n" + " \n" + " struct zram {\n" + " \tstruct zram_meta *meta;\n" + "+\tstruct zcomp *comp;\n" + " \tstruct request_queue *queue;\n" + " \tstruct gendisk *disk;\n" + "-\tstruct zcomp *comp;\n" + "-\n" + "-\t/* Prevent concurrent execution of device init, reset and R/W request */\n" + "+\t/* Prevent concurrent execution of device init */\n" + " \tstruct rw_semaphore init_lock;\n" + " \t/*\n" + "-\t * This is the limit on amount of *uncompressed* worth of data\n" + "-\t * we can store in a disk.\n" + "+\t * the number of pages zram can consume for storing compressed data\n" + " \t */\n" + "-\tu64 disksize;\t/* bytes */\n" + "+\tunsigned long limit_pages;\n" + "+\tatomic_t refcount; /* refcount for zram_meta */\n" + " \tint max_comp_streams;\n" + "+\n" + " \tstruct zram_stats stats;\n" + "+\t/* wait all IO under all of cpu are done */\n" + "+\twait_queue_head_t io_done;\n" + " \t/*\n" + "-\t * the number of pages zram can consume for storing compressed data\n" + "+\t * This is the limit on amount of *uncompressed* worth of data\n" + "+\t * we can store in a disk.\n" + " \t */\n" + "-\tunsigned long limit_pages;\n" + "-\n" + "+\tu64 disksize;\t/* bytes */\n" + " \tchar compressor[10];\n" + " };\n" + " #endif\n" + "-- \n" + "1.9.3\n" + "\n" + "\n" + "\n" + "> \n" + "> I need to check fs/block_dev. can we switch away from ->bd_holders?\n" + "> \n" + "> > Another topic: As I didn't see enough fs/block_dev.c bd_holders in zram\n" + "> > would be mess. I guess we need to study hotplug of device and implement\n" + "> > it for zram reset rather than strange own konb. It should go TODO. :(\n" + "> \n" + "> ok, need to investigate this later.\n" + "> let's land current activities first.\n" + "> \n" + "> \t-ss\n" + "\n" + "-- \n" + "Kind regards,\n" + Minchan Kim -2fe0d30174015fa81c0b9cfc18a4dff43db2b6bd6f56bbc9c4834f43524dc91b +9f92a0e90155bea2d8930d5ff58ce542f8ceca2736958c1af18ea51252666a40
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.