Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH 12/18] scsi: respect unchecked_isa_dma for blk-mq
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

Currently blk-mq always allocates the sense buffer using normal GFP_KERNEL
allocation.  Refactor the cmd pool code to split the cmd and sense allocation
and share the code to allocate the sense buffers as well as the sense buffer
slab caches between the legacy and blk-mq path.

Note that this switches to lazy allocation of the sense slab caches - the
slab caches (not the actual allocations) won't be destroyed until the scsi
module is unloaded instead of keeping track of hosts using them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/scsi/hosts.c     |  4 ++++
 drivers/scsi/scsi.c      | 24 ++++---------------
 drivers/scsi/scsi_lib.c  | 62 +++++++++++++++++++++++++++++++++++++++++++++---
 drivers/scsi/scsi_priv.h |  5 ++++
 4 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 258a3f9..6d29c4a 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -213,6 +213,10 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 		goto fail;
 	}
 
+	error = scsi_init_sense_cache(shost);
+	if (error)
+		goto fail;
+
 	if (shost_use_blk_mq(shost)) {
 		error = scsi_mq_setup_tags(shost);
 		if (error)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 0f93892..469aa0f 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -100,22 +100,18 @@ EXPORT_SYMBOL(scsi_sd_pm_domain);
 
 struct scsi_host_cmd_pool {
 	struct kmem_cache	*cmd_slab;
-	struct kmem_cache	*sense_slab;
 	unsigned int		users;
 	char			*cmd_name;
-	char			*sense_name;
 	unsigned int		slab_flags;
 };
 
 static struct scsi_host_cmd_pool scsi_cmd_pool = {
 	.cmd_name	= "scsi_cmd_cache",
-	.sense_name	= "scsi_sense_cache",
 	.slab_flags	= SLAB_HWCACHE_ALIGN,
 };
 
 static struct scsi_host_cmd_pool scsi_cmd_dma_pool = {
 	.cmd_name	= "scsi_cmd_cache(DMA)",
-	.sense_name	= "scsi_sense_cache(DMA)",
 	.slab_flags	= SLAB_HWCACHE_ALIGN|SLAB_CACHE_DMA,
 };
 
@@ -136,7 +132,7 @@ scsi_host_free_command(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
 
 	if (cmd->prot_sdb)
 		kmem_cache_free(scsi_sdb_cache, cmd->prot_sdb);
-	kmem_cache_free(pool->sense_slab, cmd->sense_buffer);
+	scsi_free_sense_buffer(shost, cmd->sense_buffer);
 	kmem_cache_free(pool->cmd_slab, cmd);
 }
 
@@ -158,7 +154,8 @@ scsi_host_alloc_command(struct Scsi_Host *shost, gfp_t gfp_mask)
 	if (!cmd)
 		goto fail;
 
-	cmd->sense_buffer = kmem_cache_alloc(pool->sense_slab, gfp_mask);
+	cmd->sense_buffer = scsi_alloc_sense_buffer(shost, gfp_mask,
+			NUMA_NO_NODE);
 	if (!cmd->sense_buffer)
 		goto fail_free_cmd;
 
@@ -171,7 +168,7 @@ scsi_host_alloc_command(struct Scsi_Host *shost, gfp_t gfp_mask)
 	return cmd;
 
 fail_free_sense:
-	kmem_cache_free(pool->sense_slab, cmd->sense_buffer);
+	scsi_free_sense_buffer(shost, cmd->sense_buffer);
 fail_free_cmd:
 	kmem_cache_free(pool->cmd_slab, cmd);
 fail:
@@ -301,7 +298,6 @@ scsi_find_host_cmd_pool(struct Scsi_Host *shost)
 static void
 scsi_free_host_cmd_pool(struct scsi_host_cmd_pool *pool)
 {
-	kfree(pool->sense_name);
 	kfree(pool->cmd_name);
 	kfree(pool);
 }
@@ -317,8 +313,7 @@ scsi_alloc_host_cmd_pool(struct Scsi_Host *shost)
 		return NULL;
 
 	pool->cmd_name = kasprintf(GFP_KERNEL, "%s_cmd", hostt->proc_name);
-	pool->sense_name = kasprintf(GFP_KERNEL, "%s_sense", hostt->proc_name);
-	if (!pool->cmd_name || !pool->sense_name) {
+	if (!pool->cmd_name) {
 		scsi_free_host_cmd_pool(pool);
 		return NULL;
 	}
@@ -357,12 +352,6 @@ scsi_get_host_cmd_pool(struct Scsi_Host *shost)
 						   pool->slab_flags, NULL);
 		if (!pool->cmd_slab)
 			goto out_free_pool;
-
-		pool->sense_slab = kmem_cache_create(pool->sense_name,
-						     SCSI_SENSE_BUFFERSIZE, 0,
-						     pool->slab_flags, NULL);
-		if (!pool->sense_slab)
-			goto out_free_slab;
 	}
 
 	pool->users++;
@@ -371,8 +360,6 @@ scsi_get_host_cmd_pool(struct Scsi_Host *shost)
 	mutex_unlock(&host_cmd_pool_mutex);
 	return retval;
 
-out_free_slab:
-	kmem_cache_destroy(pool->cmd_slab);
 out_free_pool:
 	if (hostt->cmd_size) {
 		scsi_free_host_cmd_pool(pool);
@@ -398,7 +385,6 @@ static void scsi_put_host_cmd_pool(struct Scsi_Host *shost)
 
 	if (!--pool->users) {
 		kmem_cache_destroy(pool->cmd_slab);
-		kmem_cache_destroy(pool->sense_slab);
 		if (hostt->cmd_size) {
 			scsi_free_host_cmd_pool(pool);
 			hostt->cmd_pool = NULL;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e9e1e14..3d6b364 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -39,6 +39,58 @@
 
 
 struct kmem_cache *scsi_sdb_cache;
+static struct kmem_cache *scsi_sense_cache;
+static struct kmem_cache *scsi_sense_isadma_cache;
+static DEFINE_MUTEX(scsi_sense_cache_mutex);
+
+static inline struct kmem_cache *
+scsi_select_sense_cache(struct Scsi_Host *shost)
+{
+	return shost->unchecked_isa_dma ?
+		scsi_sense_isadma_cache : scsi_sense_cache;
+}
+
+void scsi_free_sense_buffer(struct Scsi_Host *shost,
+		unsigned char *sense_buffer)
+{
+	kmem_cache_free(scsi_select_sense_cache(shost), sense_buffer);
+}
+
+unsigned char *scsi_alloc_sense_buffer(struct Scsi_Host *shost, gfp_t gfp_mask,
+		int numa_node)
+{
+	return kmem_cache_alloc_node(scsi_select_sense_cache(shost), gfp_mask,
+			numa_node);
+}
+
+int scsi_init_sense_cache(struct Scsi_Host *shost)
+{
+	struct kmem_cache *cache;
+	int ret = 0;
+
+	cache = scsi_select_sense_cache(shost);
+	if (cache)
+		return 0;
+
+	mutex_lock(&scsi_sense_cache_mutex);
+	if (shost->unchecked_isa_dma) {
+		scsi_sense_isadma_cache =
+			kmem_cache_create("scsi_sense_cache(DMA)",
+			SCSI_SENSE_BUFFERSIZE, 0,
+			SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA, NULL);
+		if (!scsi_sense_isadma_cache)
+			ret = -ENOMEM;
+	} else {
+		scsi_sense_cache =
+			kmem_cache_create("scsi_sense_cache",
+			SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN, NULL);
+		if (!scsi_sense_cache)
+			ret = -ENOMEM;
+	}
+
+	mutex_unlock(&scsi_sense_cache_mutex);
+	return ret;
+}
 
 /*
  * When to reinvoke queueing after a resource shortage. It's 3 msecs to
@@ -1981,10 +2033,11 @@ static int scsi_init_request(void *data, struct request *rq,
 		unsigned int hctx_idx, unsigned int request_idx,
 		unsigned int numa_node)
 {
+	struct Scsi_Host *shost = data;
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
 
-	cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL,
-			numa_node);
+	cmd->sense_buffer =
+		scsi_alloc_sense_buffer(shost, GFP_KERNEL, numa_node);
 	if (!cmd->sense_buffer)
 		return -ENOMEM;
 	return 0;
@@ -1993,9 +2046,10 @@ static int scsi_init_request(void *data, struct request *rq,
 static void scsi_exit_request(void *data, struct request *rq,
 		unsigned int hctx_idx, unsigned int request_idx)
 {
+	struct Scsi_Host *shost = data;
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
 
-	kfree(cmd->sense_buffer);
+	scsi_free_sense_buffer(shost, cmd->sense_buffer);
 }
 
 static int scsi_map_queues(struct blk_mq_tag_set *set)
@@ -2208,6 +2262,8 @@ int __init scsi_init_queue(void)
 
 void scsi_exit_queue(void)
 {
+	kmem_cache_destroy(scsi_sense_cache);
+	kmem_cache_destroy(scsi_sense_isadma_cache);
 	kmem_cache_destroy(scsi_sdb_cache);
 }
 
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 193636a..1a712c6 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -30,6 +30,11 @@ extern void scsi_exit_hosts(void);
 
 /* scsi.c */
 extern bool scsi_use_blk_mq;
+void scsi_free_sense_buffer(struct Scsi_Host *shost,
+		unsigned char *sense_buffer);
+unsigned char *scsi_alloc_sense_buffer(struct Scsi_Host *shost, gfp_t gfp_mask,
+		int numa_node);
+int scsi_init_sense_cache(struct Scsi_Host *shost);
 extern int scsi_setup_command_freelist(struct Scsi_Host *shost);
 extern void scsi_destroy_command_freelist(struct Scsi_Host *shost);
 #ifdef CONFIG_SCSI_LOGGING
-- 
2.1.4


^ permalink raw reply related

* [PATCH 11/18] scsi: remove gfp_flags member in scsi_host_cmd_pool
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

When using the slab allocator we already decide at cache creation time if
an allocation comes from a GFP_DMA pool using the SLAB_CACHE_DMA flag,
and there is no point passing the kmalloc-family-only GFP_DMA flag to
kmem_cache_alloc.  Drop all the infrastructure for doing so.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
---
 drivers/scsi/scsi.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 75455d4..0f93892 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -105,7 +105,6 @@ struct scsi_host_cmd_pool {
 	char			*cmd_name;
 	char			*sense_name;
 	unsigned int		slab_flags;
-	gfp_t			gfp_mask;
 };
 
 static struct scsi_host_cmd_pool scsi_cmd_pool = {
@@ -118,7 +117,6 @@ static struct scsi_host_cmd_pool scsi_cmd_dma_pool = {
 	.cmd_name	= "scsi_cmd_cache(DMA)",
 	.sense_name	= "scsi_sense_cache(DMA)",
 	.slab_flags	= SLAB_HWCACHE_ALIGN|SLAB_CACHE_DMA,
-	.gfp_mask	= __GFP_DMA,
 };
 
 static DEFINE_MUTEX(host_cmd_pool_mutex);
@@ -156,12 +154,11 @@ scsi_host_alloc_command(struct Scsi_Host *shost, gfp_t gfp_mask)
 	struct scsi_host_cmd_pool *pool = shost->cmd_pool;
 	struct scsi_cmnd *cmd;
 
-	cmd = kmem_cache_zalloc(pool->cmd_slab, gfp_mask | pool->gfp_mask);
+	cmd = kmem_cache_zalloc(pool->cmd_slab, gfp_mask);
 	if (!cmd)
 		goto fail;
 
-	cmd->sense_buffer = kmem_cache_alloc(pool->sense_slab,
-					     gfp_mask | pool->gfp_mask);
+	cmd->sense_buffer = kmem_cache_alloc(pool->sense_slab, gfp_mask);
 	if (!cmd->sense_buffer)
 		goto fail_free_cmd;
 
@@ -327,10 +324,8 @@ scsi_alloc_host_cmd_pool(struct Scsi_Host *shost)
 	}
 
 	pool->slab_flags = SLAB_HWCACHE_ALIGN;
-	if (shost->unchecked_isa_dma) {
+	if (shost->unchecked_isa_dma)
 		pool->slab_flags |= SLAB_CACHE_DMA;
-		pool->gfp_mask = __GFP_DMA;
-	}
 
 	if (hostt->cmd_size)
 		hostt->cmd_pool = pool;
@@ -424,7 +419,6 @@ static void scsi_put_host_cmd_pool(struct Scsi_Host *shost)
  */
 int scsi_setup_command_freelist(struct Scsi_Host *shost)
 {
-	const gfp_t gfp_mask = shost->unchecked_isa_dma ? GFP_DMA : GFP_KERNEL;
 	struct scsi_cmnd *cmd;
 
 	spin_lock_init(&shost->free_list_lock);
@@ -437,7 +431,7 @@ int scsi_setup_command_freelist(struct Scsi_Host *shost)
 	/*
 	 * Get one backup command for this host.
 	 */
-	cmd = scsi_host_alloc_command(shost, gfp_mask);
+	cmd = scsi_host_alloc_command(shost, GFP_KERNEL);
 	if (!cmd) {
 		scsi_put_host_cmd_pool(shost);
 		shost->cmd_pool = NULL;
-- 
2.1.4


^ permalink raw reply related

* [PATCH 10/18] scsi_dh_hp_sw: switch to scsi_execute_req_flags()
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel, Hannes Reinecke, Hannes Reinecke
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

From: Hannes Reinecke <hare@suse.de>

Switch to scsi_execute_req_flags() instead of using the block interface
directly.  This will set REQ_QUIET and REQ_PREEMPT, but this is okay as
we're evaluating the errors anyway and should be able to send the command
even if the device is quiesced.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/device_handler/scsi_dh_hp_sw.c | 222 ++++++++--------------------
 1 file changed, 65 insertions(+), 157 deletions(-)

diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
index 308e871..be43c94 100644
--- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c
+++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
@@ -38,13 +38,10 @@
 #define HP_SW_PATH_PASSIVE		1
 
 struct hp_sw_dh_data {
-	unsigned char sense[SCSI_SENSE_BUFFERSIZE];
 	int path_state;
 	int retries;
 	int retry_cnt;
 	struct scsi_device *sdev;
-	activate_complete	callback_fn;
-	void			*callback_data;
 };
 
 static int hp_sw_start_stop(struct hp_sw_dh_data *);
@@ -56,43 +53,34 @@ static int hp_sw_start_stop(struct hp_sw_dh_data *);
  *
  * Returns SCSI_DH_DEV_OFFLINED if the sdev is on the passive path
  */
-static int tur_done(struct scsi_device *sdev, unsigned char *sense)
+static int tur_done(struct scsi_device *sdev, struct hp_sw_dh_data *h,
+		    struct scsi_sense_hdr *sshdr)
 {
-	struct scsi_sense_hdr sshdr;
-	int ret;
+	int ret = SCSI_DH_IO;
 
-	ret = scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, &sshdr);
-	if (!ret) {
-		sdev_printk(KERN_WARNING, sdev,
-			    "%s: sending tur failed, no sense available\n",
-			    HP_SW_NAME);
-		ret = SCSI_DH_IO;
-		goto done;
-	}
-	switch (sshdr.sense_key) {
+	switch (sshdr->sense_key) {
 	case UNIT_ATTENTION:
 		ret = SCSI_DH_IMM_RETRY;
 		break;
 	case NOT_READY:
-		if ((sshdr.asc == 0x04) && (sshdr.ascq == 2)) {
+		if (sshdr->asc == 0x04 && sshdr->ascq == 2) {
 			/*
 			 * LUN not ready - Initialization command required
 			 *
 			 * This is the passive path
 			 */
-			ret = SCSI_DH_DEV_OFFLINED;
+			h->path_state = HP_SW_PATH_PASSIVE;
+			ret = SCSI_DH_OK;
 			break;
 		}
 		/* Fallthrough */
 	default:
 		sdev_printk(KERN_WARNING, sdev,
 			   "%s: sending tur failed, sense %x/%x/%x\n",
-			   HP_SW_NAME, sshdr.sense_key, sshdr.asc,
-			   sshdr.ascq);
+			   HP_SW_NAME, sshdr->sense_key, sshdr->asc,
+			   sshdr->ascq);
 		break;
 	}
-
-done:
 	return ret;
 }
 
@@ -105,131 +93,36 @@ static int tur_done(struct scsi_device *sdev, unsigned char *sense)
  */
 static int hp_sw_tur(struct scsi_device *sdev, struct hp_sw_dh_data *h)
 {
-	struct request *req;
-	int ret;
+	unsigned char cmd[6] = { TEST_UNIT_READY };
+	struct scsi_sense_hdr sshdr;
+	int ret = SCSI_DH_OK, res;
+	u64 req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
+		REQ_FAILFAST_DRIVER;
 
 retry:
-	req = blk_get_request(sdev->request_queue, WRITE, GFP_NOIO);
-	if (IS_ERR(req))
-		return SCSI_DH_RES_TEMP_UNAVAIL;
-
-	blk_rq_set_block_pc(req);
-	req->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
-			  REQ_FAILFAST_DRIVER;
-	req->cmd_len = COMMAND_SIZE(TEST_UNIT_READY);
-	req->cmd[0] = TEST_UNIT_READY;
-	req->timeout = HP_SW_TIMEOUT;
-	req->sense = h->sense;
-	memset(req->sense, 0, SCSI_SENSE_BUFFERSIZE);
-	req->sense_len = 0;
-
-	ret = blk_execute_rq(req->q, NULL, req, 1);
-	if (ret == -EIO) {
-		if (req->sense_len > 0) {
-			ret = tur_done(sdev, h->sense);
-		} else {
+	res = scsi_execute_req_flags(sdev, cmd, DMA_NONE, NULL, 0, &sshdr,
+				     HP_SW_TIMEOUT, HP_SW_RETRIES,
+				     NULL, req_flags, 0);
+	if (res) {
+		if (scsi_sense_valid(&sshdr))
+			ret = tur_done(sdev, h, &sshdr);
+		else {
 			sdev_printk(KERN_WARNING, sdev,
 				    "%s: sending tur failed with %x\n",
-				    HP_SW_NAME, req->errors);
+				    HP_SW_NAME, res);
 			ret = SCSI_DH_IO;
 		}
 	} else {
 		h->path_state = HP_SW_PATH_ACTIVE;
 		ret = SCSI_DH_OK;
 	}
-	if (ret == SCSI_DH_IMM_RETRY) {
-		blk_put_request(req);
+	if (ret == SCSI_DH_IMM_RETRY)
 		goto retry;
-	}
-	if (ret == SCSI_DH_DEV_OFFLINED) {
-		h->path_state = HP_SW_PATH_PASSIVE;
-		ret = SCSI_DH_OK;
-	}
-
-	blk_put_request(req);
 
 	return ret;
 }
 
 /*
- * start_done - Handle START STOP UNIT return status
- * @sdev: sdev the command has been sent to
- * @errors: blk error code
- */
-static int start_done(struct scsi_device *sdev, unsigned char *sense)
-{
-	struct scsi_sense_hdr sshdr;
-	int rc;
-
-	rc = scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, &sshdr);
-	if (!rc) {
-		sdev_printk(KERN_WARNING, sdev,
-			    "%s: sending start_stop_unit failed, "
-			    "no sense available\n",
-			    HP_SW_NAME);
-		return SCSI_DH_IO;
-	}
-	switch (sshdr.sense_key) {
-	case NOT_READY:
-		if ((sshdr.asc == 0x04) && (sshdr.ascq == 3)) {
-			/*
-			 * LUN not ready - manual intervention required
-			 *
-			 * Switch-over in progress, retry.
-			 */
-			rc = SCSI_DH_RETRY;
-			break;
-		}
-		/* fall through */
-	default:
-		sdev_printk(KERN_WARNING, sdev,
-			   "%s: sending start_stop_unit failed, sense %x/%x/%x\n",
-			   HP_SW_NAME, sshdr.sense_key, sshdr.asc,
-			   sshdr.ascq);
-		rc = SCSI_DH_IO;
-	}
-
-	return rc;
-}
-
-static void start_stop_endio(struct request *req, int error)
-{
-	struct hp_sw_dh_data *h = req->end_io_data;
-	unsigned err = SCSI_DH_OK;
-
-	if (error || host_byte(req->errors) != DID_OK ||
-			msg_byte(req->errors) != COMMAND_COMPLETE) {
-		sdev_printk(KERN_WARNING, h->sdev,
-			    "%s: sending start_stop_unit failed with %x\n",
-			    HP_SW_NAME, req->errors);
-		err = SCSI_DH_IO;
-		goto done;
-	}
-
-	if (req->sense_len > 0) {
-		err = start_done(h->sdev, h->sense);
-		if (err == SCSI_DH_RETRY) {
-			err = SCSI_DH_IO;
-			if (--h->retry_cnt) {
-				blk_put_request(req);
-				err = hp_sw_start_stop(h);
-				if (err == SCSI_DH_OK)
-					return;
-			}
-		}
-	}
-done:
-	req->end_io_data = NULL;
-	__blk_put_request(req->q, req);
-	if (h->callback_fn) {
-		h->callback_fn(h->callback_data, err);
-		h->callback_fn = h->callback_data = NULL;
-	}
-	return;
-
-}
-
-/*
  * hp_sw_start_stop - Send START STOP UNIT command
  * @sdev: sdev command should be sent to
  *
@@ -237,26 +130,48 @@ static void start_stop_endio(struct request *req, int error)
  */
 static int hp_sw_start_stop(struct hp_sw_dh_data *h)
 {
-	struct request *req;
-
-	req = blk_get_request(h->sdev->request_queue, WRITE, GFP_ATOMIC);
-	if (IS_ERR(req))
-		return SCSI_DH_RES_TEMP_UNAVAIL;
-
-	blk_rq_set_block_pc(req);
-	req->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
-			  REQ_FAILFAST_DRIVER;
-	req->cmd_len = COMMAND_SIZE(START_STOP);
-	req->cmd[0] = START_STOP;
-	req->cmd[4] = 1;	/* Start spin cycle */
-	req->timeout = HP_SW_TIMEOUT;
-	req->sense = h->sense;
-	memset(req->sense, 0, SCSI_SENSE_BUFFERSIZE);
-	req->sense_len = 0;
-	req->end_io_data = h;
+	unsigned char cmd[6] = { START_STOP, 0, 0, 0, 1, 0 };
+	struct scsi_sense_hdr sshdr;
+	struct scsi_device *sdev = h->sdev;
+	int res, rc = SCSI_DH_OK;
+	int retry_cnt = HP_SW_RETRIES;
+	u64 req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
+		REQ_FAILFAST_DRIVER;
 
-	blk_execute_rq_nowait(req->q, NULL, req, 1, start_stop_endio);
-	return SCSI_DH_OK;
+retry:
+	res = scsi_execute_req_flags(sdev, cmd, DMA_NONE, NULL, 0, &sshdr,
+				     HP_SW_TIMEOUT, HP_SW_RETRIES,
+				     NULL, req_flags, 0);
+	if (res) {
+		if (!scsi_sense_valid(&sshdr)) {
+			sdev_printk(KERN_WARNING, sdev,
+				    "%s: sending start_stop_unit failed, "
+				    "no sense available\n", HP_SW_NAME);
+			return SCSI_DH_IO;
+		}
+		switch (sshdr.sense_key) {
+		case NOT_READY:
+			if (sshdr.asc == 0x04 && sshdr.ascq == 3) {
+				/*
+				 * LUN not ready - manual intervention required
+				 *
+				 * Switch-over in progress, retry.
+				 */
+				if (--retry_cnt)
+					goto retry;
+				rc = SCSI_DH_RETRY;
+				break;
+			}
+			/* fall through */
+		default:
+			sdev_printk(KERN_WARNING, sdev,
+				    "%s: sending start_stop_unit failed, "
+				    "sense %x/%x/%x\n", HP_SW_NAME,
+				    sshdr.sense_key, sshdr.asc, sshdr.ascq);
+			rc = SCSI_DH_IO;
+		}
+	}
+	return rc;
 }
 
 static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
@@ -290,15 +205,8 @@ static int hp_sw_activate(struct scsi_device *sdev,
 
 	ret = hp_sw_tur(sdev, h);
 
-	if (ret == SCSI_DH_OK && h->path_state == HP_SW_PATH_PASSIVE) {
-		h->retry_cnt = h->retries;
-		h->callback_fn = fn;
-		h->callback_data = data;
+	if (ret == SCSI_DH_OK && h->path_state == HP_SW_PATH_PASSIVE)
 		ret = hp_sw_start_stop(h);
-		if (ret == SCSI_DH_OK)
-			return 0;
-		h->callback_fn = h->callback_data = NULL;
-	}
 
 	if (fn)
 		fn(data, ret);
-- 
2.1.4


^ permalink raw reply related

* [PATCH 09/18] scsi_dh_emc: switch to scsi_execute_req_flags()
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel, Hannes Reinecke, Hannes Reinecke
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

From: Hannes Reinecke <hare@suse.de>

Switch to scsi_execute_req_flags() and scsi_get_vpd_page() instead of
open-coding it.  Using scsi_execute_req_flags() will set REQ_QUIET and
REQ_PREEMPT, but this is okay as we're evaluating the errors anyway and
should be able to send the command even if the device is quiesced.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/device_handler/scsi_dh_emc.c | 247 +++++++-----------------------
 1 file changed, 56 insertions(+), 191 deletions(-)

diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c
index 5b80746..4a7679f 100644
--- a/drivers/scsi/device_handler/scsi_dh_emc.c
+++ b/drivers/scsi/device_handler/scsi_dh_emc.c
@@ -88,12 +88,6 @@ struct clariion_dh_data {
 	 */
 	unsigned char buffer[CLARIION_BUFFER_SIZE];
 	/*
-	 * SCSI sense buffer for commands -- assumes serial issuance
-	 * and completion sequence of all commands for same multipath.
-	 */
-	unsigned char sense[SCSI_SENSE_BUFFERSIZE];
-	unsigned int senselen;
-	/*
 	 * LUN state
 	 */
 	int lun_state;
@@ -116,44 +110,38 @@ struct clariion_dh_data {
 /*
  * Parse MODE_SELECT cmd reply.
  */
-static int trespass_endio(struct scsi_device *sdev, char *sense)
+static int trespass_endio(struct scsi_device *sdev,
+			  struct scsi_sense_hdr *sshdr)
 {
 	int err = SCSI_DH_IO;
-	struct scsi_sense_hdr sshdr;
-
-	if (!scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, &sshdr)) {
-		sdev_printk(KERN_ERR, sdev, "%s: Found valid sense data 0x%2x, "
-			    "0x%2x, 0x%2x while sending CLARiiON trespass "
-			    "command.\n", CLARIION_NAME, sshdr.sense_key,
-			    sshdr.asc, sshdr.ascq);
 
-		if ((sshdr.sense_key == 0x05) && (sshdr.asc == 0x04) &&
-		     (sshdr.ascq == 0x00)) {
-			/*
-			 * Array based copy in progress -- do not send
-			 * mode_select or copy will be aborted mid-stream.
-			 */
-			sdev_printk(KERN_INFO, sdev, "%s: Array Based Copy in "
-				    "progress while sending CLARiiON trespass "
-				    "command.\n", CLARIION_NAME);
-			err = SCSI_DH_DEV_TEMP_BUSY;
-		} else if ((sshdr.sense_key == 0x02) && (sshdr.asc == 0x04) &&
-			    (sshdr.ascq == 0x03)) {
-			/*
-			 * LUN Not Ready - Manual Intervention Required
-			 * indicates in-progress ucode upgrade (NDU).
-			 */
-			sdev_printk(KERN_INFO, sdev, "%s: Detected in-progress "
-				    "ucode upgrade NDU operation while sending "
-				    "CLARiiON trespass command.\n", CLARIION_NAME);
-			err = SCSI_DH_DEV_TEMP_BUSY;
-		} else
-			err = SCSI_DH_DEV_FAILED;
-	} else {
-		sdev_printk(KERN_INFO, sdev,
-			    "%s: failed to send MODE SELECT, no sense available\n",
-			    CLARIION_NAME);
-	}
+	sdev_printk(KERN_ERR, sdev, "%s: Found valid sense data 0x%2x, "
+		    "0x%2x, 0x%2x while sending CLARiiON trespass "
+		    "command.\n", CLARIION_NAME, sshdr->sense_key,
+		    sshdr->asc, sshdr->ascq);
+
+	if (sshdr->sense_key == 0x05 && sshdr->asc == 0x04 &&
+	    sshdr->ascq == 0x00) {
+		/*
+		 * Array based copy in progress -- do not send
+		 * mode_select or copy will be aborted mid-stream.
+		 */
+		sdev_printk(KERN_INFO, sdev, "%s: Array Based Copy in "
+			    "progress while sending CLARiiON trespass "
+			    "command.\n", CLARIION_NAME);
+		err = SCSI_DH_DEV_TEMP_BUSY;
+	} else if (sshdr->sense_key == 0x02 && sshdr->asc == 0x04 &&
+		   sshdr->ascq == 0x03) {
+		/*
+		 * LUN Not Ready - Manual Intervention Required
+		 * indicates in-progress ucode upgrade (NDU).
+		 */
+		sdev_printk(KERN_INFO, sdev, "%s: Detected in-progress "
+			    "ucode upgrade NDU operation while sending "
+			    "CLARiiON trespass command.\n", CLARIION_NAME);
+		err = SCSI_DH_DEV_TEMP_BUSY;
+	} else
+		err = SCSI_DH_DEV_FAILED;
 	return err;
 }
 
@@ -257,103 +245,15 @@ static char * parse_sp_model(struct scsi_device *sdev, unsigned char *buffer)
 	return sp_model;
 }
 
-/*
- * Get block request for REQ_BLOCK_PC command issued to path.  Currently
- * limited to MODE_SELECT (trespass) and INQUIRY (VPD page 0xC0) commands.
- *
- * Uses data and sense buffers in hardware handler context structure and
- * assumes serial servicing of commands, both issuance and completion.
- */
-static struct request *get_req(struct scsi_device *sdev, int cmd,
-				unsigned char *buffer)
-{
-	struct request *rq;
-	int len = 0;
-
-	rq = blk_get_request(sdev->request_queue,
-			(cmd != INQUIRY) ? WRITE : READ, GFP_NOIO);
-	if (IS_ERR(rq)) {
-		sdev_printk(KERN_INFO, sdev, "get_req: blk_get_request failed");
-		return NULL;
-	}
-
-	blk_rq_set_block_pc(rq);
-	rq->cmd_len = COMMAND_SIZE(cmd);
-	rq->cmd[0] = cmd;
-
-	switch (cmd) {
-	case MODE_SELECT:
-		len = sizeof(short_trespass);
-		rq->cmd[1] = 0x10;
-		rq->cmd[4] = len;
-		break;
-	case MODE_SELECT_10:
-		len = sizeof(long_trespass);
-		rq->cmd[1] = 0x10;
-		rq->cmd[8] = len;
-		break;
-	case INQUIRY:
-		len = CLARIION_BUFFER_SIZE;
-		rq->cmd[4] = len;
-		memset(buffer, 0, len);
-		break;
-	default:
-		BUG_ON(1);
-		break;
-	}
-
-	rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
-			 REQ_FAILFAST_DRIVER;
-	rq->timeout = CLARIION_TIMEOUT;
-	rq->retries = CLARIION_RETRIES;
-
-	if (blk_rq_map_kern(rq->q, rq, buffer, len, GFP_NOIO)) {
-		blk_put_request(rq);
-		return NULL;
-	}
-
-	return rq;
-}
-
-static int send_inquiry_cmd(struct scsi_device *sdev, int page,
-			    struct clariion_dh_data *csdev)
-{
-	struct request *rq = get_req(sdev, INQUIRY, csdev->buffer);
-	int err;
-
-	if (!rq)
-		return SCSI_DH_RES_TEMP_UNAVAIL;
-
-	rq->sense = csdev->sense;
-	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
-	rq->sense_len = csdev->senselen = 0;
-
-	rq->cmd[0] = INQUIRY;
-	if (page != 0) {
-		rq->cmd[1] = 1;
-		rq->cmd[2] = page;
-	}
-	err = blk_execute_rq(sdev->request_queue, NULL, rq, 1);
-	if (err == -EIO) {
-		sdev_printk(KERN_INFO, sdev,
-			    "%s: failed to send %s INQUIRY: %x\n",
-			    CLARIION_NAME, page?"EVPD":"standard",
-			    rq->errors);
-		csdev->senselen = rq->sense_len;
-		err = SCSI_DH_IO;
-	}
-
-	blk_put_request(rq);
-
-	return err;
-}
-
 static int send_trespass_cmd(struct scsi_device *sdev,
 			    struct clariion_dh_data *csdev)
 {
-	struct request *rq;
 	unsigned char *page22;
-	int err, len, cmd;
+	unsigned char cdb[COMMAND_SIZE(MODE_SELECT)];
+	int err, res = SCSI_DH_OK, len;
+	struct scsi_sense_hdr sshdr;
+	u64 req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
+		REQ_FAILFAST_DRIVER;
 
 	if (csdev->flags & CLARIION_SHORT_TRESPASS) {
 		page22 = short_trespass;
@@ -361,40 +261,37 @@ static int send_trespass_cmd(struct scsi_device *sdev,
 			/* Set Honor Reservations bit */
 			page22[6] |= 0x80;
 		len = sizeof(short_trespass);
-		cmd = MODE_SELECT;
+		cdb[0] = MODE_SELECT;
+		cdb[1] = 0x10;
+		cdb[4] = len;
 	} else {
 		page22 = long_trespass;
 		if (!(csdev->flags & CLARIION_HONOR_RESERVATIONS))
 			/* Set Honor Reservations bit */
 			page22[10] |= 0x80;
 		len = sizeof(long_trespass);
-		cmd = MODE_SELECT_10;
+		cdb[0] = MODE_SELECT_10;
+		cdb[8] = len;
 	}
 	BUG_ON((len > CLARIION_BUFFER_SIZE));
 	memcpy(csdev->buffer, page22, len);
 
-	rq = get_req(sdev, cmd, csdev->buffer);
-	if (!rq)
-		return SCSI_DH_RES_TEMP_UNAVAIL;
-
-	rq->sense = csdev->sense;
-	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
-	rq->sense_len = csdev->senselen = 0;
-
-	err = blk_execute_rq(sdev->request_queue, NULL, rq, 1);
-	if (err == -EIO) {
-		if (rq->sense_len) {
-			err = trespass_endio(sdev, csdev->sense);
-		} else {
+	err = scsi_execute_req_flags(sdev, cdb, DMA_TO_DEVICE,
+				     csdev->buffer, len, &sshdr,
+				     CLARIION_TIMEOUT * HZ, CLARIION_RETRIES,
+				     NULL, req_flags, 0);
+	if (err) {
+		if (scsi_sense_valid(&sshdr))
+			res = trespass_endio(sdev, &sshdr);
+		else {
 			sdev_printk(KERN_INFO, sdev,
 				    "%s: failed to send MODE SELECT: %x\n",
-				    CLARIION_NAME, rq->errors);
+				    CLARIION_NAME, err);
+			res = SCSI_DH_IO;
 		}
 	}
 
-	blk_put_request(rq);
-
-	return err;
+	return res;
 }
 
 static int clariion_check_sense(struct scsi_device *sdev,
@@ -464,21 +361,7 @@ static int clariion_std_inquiry(struct scsi_device *sdev,
 	int err;
 	char *sp_model;
 
-	err = send_inquiry_cmd(sdev, 0, csdev);
-	if (err != SCSI_DH_OK && csdev->senselen) {
-		struct scsi_sense_hdr sshdr;
-
-		if (scsi_normalize_sense(csdev->sense, SCSI_SENSE_BUFFERSIZE,
-					 &sshdr)) {
-			sdev_printk(KERN_ERR, sdev, "%s: INQUIRY sense code "
-				    "%02x/%02x/%02x\n", CLARIION_NAME,
-				    sshdr.sense_key, sshdr.asc, sshdr.ascq);
-		}
-		err = SCSI_DH_IO;
-		goto out;
-	}
-
-	sp_model = parse_sp_model(sdev, csdev->buffer);
+	sp_model = parse_sp_model(sdev, sdev->inquiry);
 	if (!sp_model) {
 		err = SCSI_DH_DEV_UNSUPP;
 		goto out;
@@ -500,30 +383,12 @@ static int clariion_std_inquiry(struct scsi_device *sdev,
 static int clariion_send_inquiry(struct scsi_device *sdev,
 				 struct clariion_dh_data *csdev)
 {
-	int err, retry = CLARIION_RETRIES;
-
-retry:
-	err = send_inquiry_cmd(sdev, 0xC0, csdev);
-	if (err != SCSI_DH_OK && csdev->senselen) {
-		struct scsi_sense_hdr sshdr;
-
-		err = scsi_normalize_sense(csdev->sense, SCSI_SENSE_BUFFERSIZE,
-					   &sshdr);
-		if (!err)
-			return SCSI_DH_IO;
-
-		err = clariion_check_sense(sdev, &sshdr);
-		if (retry > 0 && err == ADD_TO_MLQUEUE) {
-			retry--;
-			goto retry;
-		}
-		sdev_printk(KERN_ERR, sdev, "%s: INQUIRY sense code "
-			    "%02x/%02x/%02x\n", CLARIION_NAME,
-			      sshdr.sense_key, sshdr.asc, sshdr.ascq);
-		err = SCSI_DH_IO;
-	} else {
+	int err = SCSI_DH_IO;
+
+	if (!scsi_get_vpd_page(sdev, 0xC0, csdev->buffer,
+			       CLARIION_BUFFER_SIZE))
 		err = parse_sp_info_reply(sdev, csdev);
-	}
+
 	return err;
 }
 
-- 
2.1.4


^ permalink raw reply related

* [PATCH 08/18] scsi_dh_rdac: switch to scsi_execute_req_flags()
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel, Hannes Reinecke, Hannes Reinecke
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

From: Hannes Reinecke <hare@suse.de>

Switch to scsi_execute_req_flags() and scsi_get_vpd_page() instead of
open-coding it.  Using scsi_execute_req_flags() will set REQ_QUIET and
REQ_PREEMPT, but this is okay as we're evaluating the errors anyway and
should be able to send the command even if the device is quiesced.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/device_handler/scsi_dh_rdac.c | 174 +++++++++--------------------
 1 file changed, 51 insertions(+), 123 deletions(-)

diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c
index 00d9c32..b64eaae 100644
--- a/drivers/scsi/device_handler/scsi_dh_rdac.c
+++ b/drivers/scsi/device_handler/scsi_dh_rdac.c
@@ -205,7 +205,6 @@ struct rdac_dh_data {
 #define RDAC_NON_PREFERRED	1
 	char			preferred;
 
-	unsigned char		sense[SCSI_SENSE_BUFFERSIZE];
 	union			{
 		struct c2_inquiry c2;
 		struct c4_inquiry c4;
@@ -262,40 +261,12 @@ do { \
 		sdev_printk(KERN_INFO, sdev, RDAC_NAME ": " f "\n", ## arg); \
 } while (0);
 
-static struct request *get_rdac_req(struct scsi_device *sdev,
-			void *buffer, unsigned buflen, int rw)
+static unsigned int rdac_failover_get(struct rdac_controller *ctlr,
+				      struct list_head *list,
+				      unsigned char *cdb)
 {
-	struct request *rq;
-	struct request_queue *q = sdev->request_queue;
-
-	rq = blk_get_request(q, rw, GFP_NOIO);
-
-	if (IS_ERR(rq)) {
-		sdev_printk(KERN_INFO, sdev,
-				"get_rdac_req: blk_get_request failed.\n");
-		return NULL;
-	}
-	blk_rq_set_block_pc(rq);
-
-	if (buflen && blk_rq_map_kern(q, rq, buffer, buflen, GFP_NOIO)) {
-		blk_put_request(rq);
-		sdev_printk(KERN_INFO, sdev,
-				"get_rdac_req: blk_rq_map_kern failed.\n");
-		return NULL;
-	}
-
-	rq->cmd_flags |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
-			 REQ_FAILFAST_DRIVER;
-	rq->retries = RDAC_RETRIES;
-	rq->timeout = RDAC_TIMEOUT;
-
-	return rq;
-}
-
-static struct request *rdac_failover_get(struct scsi_device *sdev,
-			struct rdac_dh_data *h, struct list_head *list)
-{
-	struct request *rq;
+	struct scsi_device *sdev = ctlr->ms_sdev;
+	struct rdac_dh_data *h = sdev->handler_data;
 	struct rdac_mode_common *common;
 	unsigned data_size;
 	struct rdac_queue_data *qdata;
@@ -332,27 +303,17 @@ static struct request *rdac_failover_get(struct scsi_device *sdev,
 		lun_table[qdata->h->lun] = 0x81;
 	}
 
-	/* get request for block layer packet command */
-	rq = get_rdac_req(sdev, &h->ctlr->mode_select, data_size, WRITE);
-	if (!rq)
-		return NULL;
-
 	/* Prepare the command. */
 	if (h->ctlr->use_ms10) {
-		rq->cmd[0] = MODE_SELECT_10;
-		rq->cmd[7] = data_size >> 8;
-		rq->cmd[8] = data_size & 0xff;
+		cdb[0] = MODE_SELECT_10;
+		cdb[7] = data_size >> 8;
+		cdb[8] = data_size & 0xff;
 	} else {
-		rq->cmd[0] = MODE_SELECT;
-		rq->cmd[4] = data_size;
+		cdb[0] = MODE_SELECT;
+		cdb[4] = data_size;
 	}
-	rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
-
-	rq->sense = h->sense;
-	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
-	rq->sense_len = 0;
 
-	return rq;
+	return data_size;
 }
 
 static void release_controller(struct kref *kref)
@@ -400,46 +361,14 @@ static struct rdac_controller *get_controller(int index, char *array_name,
 	return ctlr;
 }
 
-static int submit_inquiry(struct scsi_device *sdev, int page_code,
-			  unsigned int len, struct rdac_dh_data *h)
-{
-	struct request *rq;
-	struct request_queue *q = sdev->request_queue;
-	int err = SCSI_DH_RES_TEMP_UNAVAIL;
-
-	rq = get_rdac_req(sdev, &h->inq, len, READ);
-	if (!rq)
-		goto done;
-
-	/* Prepare the command. */
-	rq->cmd[0] = INQUIRY;
-	rq->cmd[1] = 1;
-	rq->cmd[2] = page_code;
-	rq->cmd[4] = len;
-	rq->cmd_len = COMMAND_SIZE(INQUIRY);
-
-	rq->sense = h->sense;
-	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
-	rq->sense_len = 0;
-
-	err = blk_execute_rq(q, NULL, rq, 1);
-	if (err == -EIO)
-		err = SCSI_DH_IO;
-
-	blk_put_request(rq);
-done:
-	return err;
-}
-
 static int get_lun_info(struct scsi_device *sdev, struct rdac_dh_data *h,
 			char *array_name, u8 *array_id)
 {
-	int err, i;
-	struct c8_inquiry *inqp;
+	int err = SCSI_DH_IO, i;
+	struct c8_inquiry *inqp = &h->inq.c8;
 
-	err = submit_inquiry(sdev, 0xC8, sizeof(struct c8_inquiry), h);
-	if (err == SCSI_DH_OK) {
-		inqp = &h->inq.c8;
+	if (!scsi_get_vpd_page(sdev, 0xC8, (unsigned char *)inqp,
+			       sizeof(struct c8_inquiry))) {
 		if (inqp->page_code != 0xc8)
 			return SCSI_DH_NOSYS;
 		if (inqp->page_id[0] != 'e' || inqp->page_id[1] != 'd' ||
@@ -453,20 +382,20 @@ static int get_lun_info(struct scsi_device *sdev, struct rdac_dh_data *h,
 		*(array_name+ARRAY_LABEL_LEN-1) = '\0';
 		memset(array_id, 0, UNIQUE_ID_LEN);
 		memcpy(array_id, inqp->array_unique_id, inqp->array_uniq_id_len);
+		err = SCSI_DH_OK;
 	}
 	return err;
 }
 
 static int check_ownership(struct scsi_device *sdev, struct rdac_dh_data *h)
 {
-	int err, access_state;
+	int err = SCSI_DH_IO, access_state;
 	struct rdac_dh_data *tmp;
-	struct c9_inquiry *inqp;
+	struct c9_inquiry *inqp = &h->inq.c9;
 
 	h->state = RDAC_STATE_ACTIVE;
-	err = submit_inquiry(sdev, 0xC9, sizeof(struct c9_inquiry), h);
-	if (err == SCSI_DH_OK) {
-		inqp = &h->inq.c9;
+	if (!scsi_get_vpd_page(sdev, 0xC9, (unsigned char *)inqp,
+			       sizeof(struct c9_inquiry))) {
 		/* detect the operating mode */
 		if ((inqp->avte_cvp >> 5) & 0x1)
 			h->mode = RDAC_MODE_IOSHIP; /* LUN in IOSHIP mode */
@@ -501,6 +430,7 @@ static int check_ownership(struct scsi_device *sdev, struct rdac_dh_data *h)
 			tmp->sdev->access_state = access_state;
 		}
 		rcu_read_unlock();
+		err = SCSI_DH_OK;
 	}
 
 	return err;
@@ -509,12 +439,11 @@ static int check_ownership(struct scsi_device *sdev, struct rdac_dh_data *h)
 static int initialize_controller(struct scsi_device *sdev,
 		struct rdac_dh_data *h, char *array_name, u8 *array_id)
 {
-	int err, index;
-	struct c4_inquiry *inqp;
+	int err = SCSI_DH_IO, index;
+	struct c4_inquiry *inqp = &h->inq.c4;
 
-	err = submit_inquiry(sdev, 0xC4, sizeof(struct c4_inquiry), h);
-	if (err == SCSI_DH_OK) {
-		inqp = &h->inq.c4;
+	if (!scsi_get_vpd_page(sdev, 0xC4, (unsigned char *)inqp,
+			       sizeof(struct c4_inquiry))) {
 		/* get the controller index */
 		if (inqp->slot_id[1] == 0x31)
 			index = 0;
@@ -530,18 +459,18 @@ static int initialize_controller(struct scsi_device *sdev,
 			h->sdev = sdev;
 		}
 		spin_unlock(&list_lock);
+		err = SCSI_DH_OK;
 	}
 	return err;
 }
 
 static int set_mode_select(struct scsi_device *sdev, struct rdac_dh_data *h)
 {
-	int err;
-	struct c2_inquiry *inqp;
+	int err = SCSI_DH_IO;
+	struct c2_inquiry *inqp = &h->inq.c2;
 
-	err = submit_inquiry(sdev, 0xC2, sizeof(struct c2_inquiry), h);
-	if (err == SCSI_DH_OK) {
-		inqp = &h->inq.c2;
+	if (!scsi_get_vpd_page(sdev, 0xC2, (unsigned char *)inqp,
+			       sizeof(struct c2_inquiry))) {
 		/*
 		 * If more than MODE6_MAX_LUN luns are supported, use
 		 * mode select 10
@@ -550,36 +479,35 @@ static int set_mode_select(struct scsi_device *sdev, struct rdac_dh_data *h)
 			h->ctlr->use_ms10 = 1;
 		else
 			h->ctlr->use_ms10 = 0;
+		err = SCSI_DH_OK;
 	}
 	return err;
 }
 
 static int mode_select_handle_sense(struct scsi_device *sdev,
-					unsigned char *sensebuf)
+				    struct scsi_sense_hdr *sense_hdr)
 {
-	struct scsi_sense_hdr sense_hdr;
-	int err = SCSI_DH_IO, ret;
+	int err = SCSI_DH_IO;
 	struct rdac_dh_data *h = sdev->handler_data;
 
-	ret = scsi_normalize_sense(sensebuf, SCSI_SENSE_BUFFERSIZE, &sense_hdr);
-	if (!ret)
+	if (!scsi_sense_valid(sense_hdr))
 		goto done;
 
-	switch (sense_hdr.sense_key) {
+	switch (sense_hdr->sense_key) {
 	case NO_SENSE:
 	case ABORTED_COMMAND:
 	case UNIT_ATTENTION:
 		err = SCSI_DH_RETRY;
 		break;
 	case NOT_READY:
-		if (sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x01)
+		if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x01)
 			/* LUN Not Ready and is in the Process of Becoming
 			 * Ready
 			 */
 			err = SCSI_DH_RETRY;
 		break;
 	case ILLEGAL_REQUEST:
-		if (sense_hdr.asc == 0x91 && sense_hdr.ascq == 0x36)
+		if (sense_hdr->asc == 0x91 && sense_hdr->ascq == 0x36)
 			/*
 			 * Command Lock contention
 			 */
@@ -592,7 +520,7 @@ static int mode_select_handle_sense(struct scsi_device *sdev,
 	RDAC_LOG(RDAC_LOG_FAILOVER, sdev, "array %s, ctlr %d, "
 		"MODE_SELECT returned with sense %02x/%02x/%02x",
 		(char *) h->ctlr->array_name, h->ctlr->index,
-		sense_hdr.sense_key, sense_hdr.asc, sense_hdr.ascq);
+		sense_hdr->sense_key, sense_hdr->asc, sense_hdr->ascq);
 
 done:
 	return err;
@@ -602,13 +530,16 @@ static void send_mode_select(struct work_struct *work)
 {
 	struct rdac_controller *ctlr =
 		container_of(work, struct rdac_controller, ms_work);
-	struct request *rq;
 	struct scsi_device *sdev = ctlr->ms_sdev;
 	struct rdac_dh_data *h = sdev->handler_data;
-	struct request_queue *q = sdev->request_queue;
-	int err, retry_cnt = RDAC_RETRY_COUNT;
+	int err = SCSI_DH_OK, retry_cnt = RDAC_RETRY_COUNT;
 	struct rdac_queue_data *tmp, *qdata;
 	LIST_HEAD(list);
+	unsigned char cdb[COMMAND_SIZE(MODE_SELECT_10)];
+	struct scsi_sense_hdr sshdr;
+	unsigned int data_size;
+	u64 req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
+		REQ_FAILFAST_DRIVER;
 
 	spin_lock(&ctlr->ms_lock);
 	list_splice_init(&ctlr->ms_head, &list);
@@ -616,21 +547,19 @@ static void send_mode_select(struct work_struct *work)
 	ctlr->ms_sdev = NULL;
 	spin_unlock(&ctlr->ms_lock);
 
-retry:
-	err = SCSI_DH_RES_TEMP_UNAVAIL;
-	rq = rdac_failover_get(sdev, h, &list);
-	if (!rq)
-		goto done;
+ retry:
+	data_size = rdac_failover_get(ctlr, &list, cdb);
 
 	RDAC_LOG(RDAC_LOG_FAILOVER, sdev, "array %s, ctlr %d, "
 		"%s MODE_SELECT command",
 		(char *) h->ctlr->array_name, h->ctlr->index,
 		(retry_cnt == RDAC_RETRY_COUNT) ? "queueing" : "retrying");
 
-	err = blk_execute_rq(q, NULL, rq, 1);
-	blk_put_request(rq);
-	if (err != SCSI_DH_OK) {
-		err = mode_select_handle_sense(sdev, h->sense);
+	if (scsi_execute_req_flags(sdev, cdb, DMA_TO_DEVICE,
+				   &h->ctlr->mode_select, data_size, &sshdr,
+				   RDAC_TIMEOUT * HZ,
+				   RDAC_RETRIES, NULL, req_flags, 0)) {
+		err = mode_select_handle_sense(sdev, &sshdr);
 		if (err == SCSI_DH_RETRY && retry_cnt--)
 			goto retry;
 		if (err == SCSI_DH_IMM_RETRY)
@@ -643,7 +572,6 @@ static void send_mode_select(struct work_struct *work)
 				(char *) h->ctlr->array_name, h->ctlr->index);
 	}
 
-done:
 	list_for_each_entry_safe(qdata, tmp, &list, entry) {
 		list_del(&qdata->entry);
 		if (err == SCSI_DH_OK)
-- 
2.1.4


^ permalink raw reply related

* [PATCH 07/18] dm: always defer request allocation to the owner of the request_queue
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

DM already calls blk_mq_alloc_request on the request_queue of the
underlying device if it is a blk-mq device.  But now that we allow drivers
to allocate additional data and initialize it ahead of time, we need to do
the same for all drivers.  Doing so and using the new cmd_size
infrastructure in the block layer greatly simplifies the dm-rq and mpath
code, and should also make arbitrary combinations of SQ and MQ devices
with SQ or MQ device mapper tables easily possible as a further step.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-core.h          |   1 -
 drivers/md/dm-mpath.c         | 132 ++++------------------
 drivers/md/dm-rq.c            | 251 ++++++++++--------------------------------
 drivers/md/dm-rq.h            |   2 +-
 drivers/md/dm-target.c        |   7 --
 drivers/md/dm.c               |  30 ++---
 drivers/md/dm.h               |   3 +-
 include/linux/device-mapper.h |   3 -
 8 files changed, 85 insertions(+), 344 deletions(-)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 40ceba1..136fda3 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -92,7 +92,6 @@ struct mapped_device {
 	 * io objects are allocated from here.
 	 */
 	mempool_t *io_pool;
-	mempool_t *rq_pool;
 
 	struct bio_set *bs;
 
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6400cff..784f237 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -92,12 +92,6 @@ struct multipath {
 
 	unsigned queue_mode;
 
-	/*
-	 * We must use a mempool of dm_mpath_io structs so that we
-	 * can resubmit bios on error.
-	 */
-	mempool_t *mpio_pool;
-
 	struct mutex work_mutex;
 	struct work_struct trigger_event;
 
@@ -115,8 +109,6 @@ struct dm_mpath_io {
 
 typedef int (*action_fn) (struct pgpath *pgpath);
 
-static struct kmem_cache *_mpio_cache;
-
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
@@ -209,7 +201,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
 
-		m->mpio_pool = NULL;
 		m->queue_mode = DM_TYPE_NONE;
 
 		m->ti = ti;
@@ -229,16 +220,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
 			m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
 		else
 			m->queue_mode = DM_TYPE_REQUEST_BASED;
-	}
-
-	if (m->queue_mode == DM_TYPE_REQUEST_BASED) {
-		unsigned min_ios = dm_get_reserved_rq_based_ios();
-
-		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
-		if (!m->mpio_pool)
-			return -ENOMEM;
-	}
-	else if (m->queue_mode == DM_TYPE_BIO_BASED) {
+	} else if (m->queue_mode == DM_TYPE_BIO_BASED) {
 		INIT_WORK(&m->process_queued_bios, process_queued_bios);
 		/*
 		 * bio-based doesn't support any direct scsi_dh management;
@@ -263,7 +245,6 @@ static void free_multipath(struct multipath *m)
 
 	kfree(m->hw_handler_name);
 	kfree(m->hw_handler_params);
-	mempool_destroy(m->mpio_pool);
 	kfree(m);
 }
 
@@ -272,38 +253,6 @@ static struct dm_mpath_io *get_mpio(union map_info *info)
 	return info->ptr;
 }
 
-static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
-{
-	struct dm_mpath_io *mpio;
-
-	if (!m->mpio_pool) {
-		/* Use blk-mq pdu memory requested via per_io_data_size */
-		mpio = get_mpio(info);
-		memset(mpio, 0, sizeof(*mpio));
-		return mpio;
-	}
-
-	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
-	if (!mpio)
-		return NULL;
-
-	memset(mpio, 0, sizeof(*mpio));
-	info->ptr = mpio;
-
-	return mpio;
-}
-
-static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
-{
-	/* Only needed for non blk-mq (.request_fn) multipath */
-	if (m->mpio_pool) {
-		struct dm_mpath_io *mpio = info->ptr;
-
-		info->ptr = NULL;
-		mempool_free(mpio, m->mpio_pool);
-	}
-}
-
 static size_t multipath_per_bio_data_size(void)
 {
 	return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
@@ -530,16 +479,17 @@ static bool must_push_back_bio(struct multipath *m)
 /*
  * Map cloned requests (request-based multipath)
  */
-static int __multipath_map(struct dm_target *ti, struct request *clone,
-			   union map_info *map_context,
-			   struct request *rq, struct request **__clone)
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+				   union map_info *map_context,
+				   struct request **__clone)
 {
 	struct multipath *m = ti->private;
 	int r = DM_MAPIO_REQUEUE;
-	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
+	size_t nr_bytes = blk_rq_bytes(rq);
 	struct pgpath *pgpath;
 	struct block_device *bdev;
-	struct dm_mpath_io *mpio;
+	struct dm_mpath_io *mpio = get_mpio(map_context);
+	struct request *clone;
 
 	/* Do we need to select a new pgpath? */
 	pgpath = lockless_dereference(m->current_pgpath);
@@ -556,42 +506,23 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 		return r;
 	}
 
-	mpio = set_mpio(m, map_context);
-	if (!mpio)
-		/* ENOMEM, requeue */
-		return r;
-
+	memset(mpio, 0, sizeof(*mpio));
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
 	bdev = pgpath->path.dev->bdev;
 
-	if (clone) {
-		/*
-		 * Old request-based interface: allocated clone is passed in.
-		 * Used by: .request_fn stacked on .request_fn path(s).
-		 */
-		clone->q = bdev_get_queue(bdev);
-		clone->rq_disk = bdev->bd_disk;
-		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	} else {
-		/*
-		 * blk-mq request-based interface; used by both:
-		 * .request_fn stacked on blk-mq path(s) and
-		 * blk-mq stacked on blk-mq path(s).
-		 */
-		clone = blk_mq_alloc_request(bdev_get_queue(bdev),
-					     rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
-		if (IS_ERR(clone)) {
-			/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
-			clear_request_fn_mpio(m, map_context);
-			return r;
-		}
-		clone->bio = clone->biotail = NULL;
-		clone->rq_disk = bdev->bd_disk;
-		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-		*__clone = clone;
+	clone = blk_get_request(bdev_get_queue(bdev),
+			rq->cmd_flags | REQ_NOMERGE,
+			GFP_ATOMIC);
+	if (IS_ERR(clone)) {
+		/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
+		return r;
 	}
+	clone->bio = clone->biotail = NULL;
+	clone->rq_disk = bdev->bd_disk;
+	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+	*__clone = clone;
 
 	if (pgpath->pg->ps.type->start_io)
 		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
@@ -600,22 +531,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	return DM_MAPIO_REMAPPED;
 }
 
-static int multipath_map(struct dm_target *ti, struct request *clone,
-			 union map_info *map_context)
-{
-	return __multipath_map(ti, clone, map_context, NULL, NULL);
-}
-
-static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
-				   union map_info *map_context,
-				   struct request **clone)
-{
-	return __multipath_map(ti, NULL, map_context, rq, clone);
-}
-
 static void multipath_release_clone(struct request *clone)
 {
-	blk_mq_free_request(clone);
+	blk_put_request(clone);
 }
 
 /*
@@ -1187,7 +1105,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->num_write_same_bios = 1;
 	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		ti->per_io_data_size = multipath_per_bio_data_size();
-	else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
+	else
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
@@ -1610,7 +1528,6 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
-	clear_request_fn_mpio(m, map_context);
 
 	return r;
 }
@@ -2060,7 +1977,6 @@ static struct target_type multipath_target = {
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
-	.map_rq = multipath_map,
 	.clone_and_map_rq = multipath_clone_and_map,
 	.release_clone_rq = multipath_release_clone,
 	.rq_end_io = multipath_end_io,
@@ -2080,11 +1996,6 @@ static int __init dm_multipath_init(void)
 {
 	int r;
 
-	/* allocate a slab for the dm_mpath_ios */
-	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
-	if (!_mpio_cache)
-		return -ENOMEM;
-
 	r = dm_register_target(&multipath_target);
 	if (r < 0) {
 		DMERR("request-based register failed %d", r);
@@ -2120,8 +2031,6 @@ static int __init dm_multipath_init(void)
 bad_alloc_kmultipathd:
 	dm_unregister_target(&multipath_target);
 bad_register_target:
-	kmem_cache_destroy(_mpio_cache);
-
 	return r;
 }
 
@@ -2131,7 +2040,6 @@ static void __exit dm_multipath_exit(void)
 	destroy_workqueue(kmultipathd);
 
 	dm_unregister_target(&multipath_target);
-	kmem_cache_destroy(_mpio_cache);
 }
 
 module_init(dm_multipath_init);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f12916..8d06834 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -109,28 +109,6 @@ void dm_stop_queue(struct request_queue *q)
 		dm_mq_stop_queue(q);
 }
 
-static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
-						gfp_t gfp_mask)
-{
-	return mempool_alloc(md->io_pool, gfp_mask);
-}
-
-static void free_old_rq_tio(struct dm_rq_target_io *tio)
-{
-	mempool_free(tio, tio->md->io_pool);
-}
-
-static struct request *alloc_old_clone_request(struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	return mempool_alloc(md->rq_pool, gfp_mask);
-}
-
-static void free_old_clone_request(struct mapped_device *md, struct request *rq)
-{
-	mempool_free(rq, md->rq_pool);
-}
-
 /*
  * Partial completion handling for request-based dm
  */
@@ -185,7 +163,7 @@ static void end_clone_bio(struct bio *clone)
 
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
-	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+	return blk_mq_rq_to_pdu(rq);
 }
 
 static void rq_end_stats(struct mapped_device *md, struct request *orig)
@@ -233,31 +211,6 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	dm_put(md);
 }
 
-static void free_rq_clone(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-
-	blk_rq_unprep_clone(clone);
-
-	/*
-	 * It is possible for a clone_old_rq() allocated clone to
-	 * get passed in -- it may not yet have a request_queue.
-	 * This is known to occur if the error target replaces
-	 * a multipath target that has a request_fn queue stacked
-	 * on blk-mq queue(s).
-	 */
-	if (clone->q && clone->q->mq_ops)
-		/* stacked on blk-mq queue(s) */
-		tio->ti->type->release_clone_rq(clone);
-	else if (!md->queue->mq_ops)
-		/* request_fn queue stacked on request_fn queue(s) */
-		free_old_clone_request(md, clone);
-
-	if (!md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
 /*
  * Complete the clone and the original request.
  * Must be called without clone's queue lock held,
@@ -270,7 +223,9 @@ static void dm_end_request(struct request *clone, int error)
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	free_rq_clone(clone);
+	blk_rq_unprep_clone(clone);
+	tio->ti->type->release_clone_rq(clone);
+
 	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		blk_end_request_all(rq, error);
@@ -279,22 +234,6 @@ static void dm_end_request(struct request *clone, int error)
 	rq_completed(md, rw, true);
 }
 
-static void dm_unprep_request(struct request *rq)
-{
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-	struct request *clone = tio->clone;
-
-	if (!rq->q->mq_ops) {
-		rq->special = NULL;
-		rq->rq_flags &= ~RQF_DONTPREP;
-	}
-
-	if (clone)
-		free_rq_clone(clone);
-	else if (!tio->md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
 /*
  * Requeue the original request of a clone.
  */
@@ -333,7 +272,10 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
 	int rw = rq_data_dir(rq);
 
 	rq_end_stats(md, rq);
-	dm_unprep_request(rq);
+	if (tio->clone) {
+		blk_rq_unprep_clone(tio->clone);
+		tio->ti->type->release_clone_rq(tio->clone);
+	}
 
 	if (!rq->q->mq_ops)
 		dm_old_requeue_request(rq);
@@ -388,14 +330,11 @@ static void dm_softirq_done(struct request *rq)
 	if (!clone) {
 		rq_end_stats(tio->md, rq);
 		rw = rq_data_dir(rq);
-		if (!rq->q->mq_ops) {
+		if (!rq->q->mq_ops)
 			blk_end_request_all(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-			free_old_rq_tio(tio);
-		} else {
+		else
 			blk_mq_end_request(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-		}
+		rq_completed(tio->md, rw, false);
 		return;
 	}
 
@@ -439,16 +378,6 @@ static void end_clone_request(struct request *clone, int error)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
-	if (!clone->q->mq_ops) {
-		/*
-		 * For just cleaning up the information of the queue in which
-		 * the clone was dispatched.
-		 * The clone is *NOT* freed actually here because it is alloced
-		 * from dm own mempool (RQF_ALLOCED isn't set).
-		 */
-		__blk_put_request(clone->q, clone);
-	}
-
 	/*
 	 * Actual request completion is done in a softirq context which doesn't
 	 * hold the clone's queue lock.  Otherwise, deadlock could occur because:
@@ -506,28 +435,6 @@ static int setup_clone(struct request *clone, struct request *rq,
 	return 0;
 }
 
-static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
-				    struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	/*
-	 * Create clone for use with .request_fn request_queue
-	 */
-	struct request *clone;
-
-	clone = alloc_old_clone_request(md, gfp_mask);
-	if (!clone)
-		return NULL;
-
-	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		free_old_clone_request(md, clone);
-		return NULL;
-	}
-
-	return clone;
-}
-
 static void map_tio_request(struct kthread_work *work);
 
 static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
@@ -549,60 +456,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
 		kthread_init_work(&tio->work, map_tio_request);
 }
 
-static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
-					       struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	struct dm_rq_target_io *tio;
-	int srcu_idx;
-	struct dm_table *table;
-
-	tio = alloc_old_rq_tio(md, gfp_mask);
-	if (!tio)
-		return NULL;
-
-	init_tio(tio, rq, md);
-
-	table = dm_get_live_table(md, &srcu_idx);
-	/*
-	 * Must clone a request if this .request_fn DM device
-	 * is stacked on .request_fn device(s).
-	 */
-	if (!dm_table_all_blk_mq_devices(table)) {
-		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
-			dm_put_live_table(md, srcu_idx);
-			free_old_rq_tio(tio);
-			return NULL;
-		}
-	}
-	dm_put_live_table(md, srcu_idx);
-
-	return tio;
-}
-
-/*
- * Called with the queue lock held.
- */
-static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_rq_target_io *tio;
-
-	if (unlikely(rq->special)) {
-		DMWARN("Already has something in rq->special.");
-		return BLKPREP_KILL;
-	}
-
-	tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
-	if (!tio)
-		return BLKPREP_DEFER;
-
-	rq->special = tio;
-	rq->rq_flags |= RQF_DONTPREP;
-
-	return BLKPREP_OK;
-}
-
 /*
  * Returns:
  * DM_MAPIO_*       : the request has been processed as indicated
@@ -617,31 +470,18 @@ static int map_request(struct dm_rq_target_io *tio)
 	struct request *rq = tio->orig;
 	struct request *clone = NULL;
 
-	if (tio->clone) {
-		clone = tio->clone;
-		r = ti->type->map_rq(ti, clone, &tio->info);
-		if (r == DM_MAPIO_DELAY_REQUEUE)
-			return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */
-	} else {
-		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
-		if (r < 0) {
-			/* The target wants to complete the I/O */
-			dm_kill_unmapped_request(rq, r);
-			return r;
-		}
-		if (r == DM_MAPIO_REMAPPED &&
-		    setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
-	}
-
+	r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
 		/* The target has taken the I/O to submit by itself later */
 		break;
 	case DM_MAPIO_REMAPPED:
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
+
 		/* The target has remapped the I/O so dispatch it */
 		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
 				     blk_rq_pos(rq));
@@ -700,6 +540,29 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	dm_get(md);
 }
 
+static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq)
+{
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+	/*
+	 * Must initialize md member of tio, otherwise it won't
+	 * be available in dm_mq_queue_rq.
+	 */
+	tio->md = md;
+
+	if (md->init_tio_pdu) {
+		/* target-specific per-io data is immediately after the tio */
+		tio->info.ptr = tio + 1;
+	}
+
+	return 0;
+}
+
+static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp)
+{
+	return __dm_rq_init_rq(q->rq_alloc_data, rq);
+}
+
 static void map_tio_request(struct kthread_work *work)
 {
 	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
@@ -794,6 +657,7 @@ static void dm_old_request_fn(struct request_queue *q)
 		dm_start_request(md, rq);
 
 		tio = tio_from_request(rq);
+		init_tio(tio, rq, md);
 		/* Establish tio->ti before queuing work (map_tio_request) */
 		tio->ti = ti;
 		kthread_queue_work(&md->kworker, &tio->work);
@@ -804,10 +668,22 @@ static void dm_old_request_fn(struct request_queue *q)
 /*
  * Fully initialize a .request_fn request-based queue.
  */
-int dm_old_init_request_queue(struct mapped_device *md)
+int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
 {
+	struct dm_target *immutable_tgt;
+
 	/* Fully initialize the queue */
+	md->queue->cmd_size = sizeof(struct dm_rq_target_io);
+	md->queue->rq_alloc_data = md;
 	md->queue->request_fn = dm_old_request_fn;
+	md->queue->init_rq_fn = dm_rq_init_rq;
+
+	immutable_tgt = dm_table_get_immutable_target(t);
+	if (immutable_tgt && immutable_tgt->per_io_data_size) {
+		/* any target-specific per-io data is immediately after the tio */
+		md->queue->cmd_size += immutable_tgt->per_io_data_size;
+		md->init_tio_pdu = true;
+	}
 	if (blk_init_allocated_queue(md->queue) < 0)
 		return -EINVAL;
 
@@ -816,7 +692,6 @@ int dm_old_init_request_queue(struct mapped_device *md)
 
 	dm_init_normal_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
-	blk_queue_prep_rq(md->queue, dm_old_prep_fn);
 
 	/* Initialize the request-based DM worker thread */
 	kthread_init_worker(&md->kworker);
@@ -837,21 +712,7 @@ static int dm_mq_init_request(void *data, struct request *rq,
 		       unsigned int hctx_idx, unsigned int request_idx,
 		       unsigned int numa_node)
 {
-	struct mapped_device *md = data;
-	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
-
-	/*
-	 * Must initialize md member of tio, otherwise it won't
-	 * be available in dm_mq_queue_rq.
-	 */
-	tio->md = md;
-
-	if (md->init_tio_pdu) {
-		/* target-specific per-io data is immediately after the tio */
-		tio->info.ptr = tio + 1;
-	}
-
-	return 0;
+	return __dm_rq_init_rq(data, rq);
 }
 
 static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index 4da06ca..f0020d2 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -48,7 +48,7 @@ struct dm_rq_clone_bio_info {
 bool dm_use_blk_mq_default(void);
 bool dm_use_blk_mq(struct mapped_device *md);
 
-int dm_old_init_request_queue(struct mapped_device *md);
+int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t);
 int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
 void dm_mq_cleanup_mapped_device(struct mapped_device *md);
 
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 710ae28..43d3445 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -131,12 +131,6 @@ static int io_err_map(struct dm_target *tt, struct bio *bio)
 	return -EIO;
 }
 
-static int io_err_map_rq(struct dm_target *ti, struct request *clone,
-			 union map_info *map_context)
-{
-	return -EIO;
-}
-
 static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
 				   union map_info *map_context,
 				   struct request **clone)
@@ -161,7 +155,6 @@ static struct target_type error_target = {
 	.ctr  = io_err_ctr,
 	.dtr  = io_err_dtr,
 	.map  = io_err_map,
-	.map_rq = io_err_map_rq,
 	.clone_and_map_rq = io_err_clone_and_map_rq,
 	.release_clone_rq = io_err_release_clone_rq,
 	.direct_access = io_err_direct_access,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3086da5..ff4a29a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -91,7 +91,6 @@ static int dm_numa_node = DM_NUMA_NODE;
  */
 struct dm_md_mempools {
 	mempool_t *io_pool;
-	mempool_t *rq_pool;
 	struct bio_set *bs;
 };
 
@@ -1419,7 +1418,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
 	if (md->kworker_task)
 		kthread_stop(md->kworker_task);
 	mempool_destroy(md->io_pool);
-	mempool_destroy(md->rq_pool);
 	if (md->bs)
 		bioset_free(md->bs);
 
@@ -1595,12 +1593,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+	BUG_ON(!p || md->io_pool || md->bs);
 
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
-	md->rq_pool = p->rq_pool;
-	p->rq_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
 
@@ -1777,7 +1773,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 
 	switch (type) {
 	case DM_TYPE_REQUEST_BASED:
-		r = dm_old_init_request_queue(md);
+		r = dm_old_init_request_queue(md, t);
 		if (r) {
 			DMERR("Cannot initialize queue for request-based mapped device");
 			return r;
@@ -2493,7 +2489,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 					    unsigned integrity, unsigned per_io_data_size)
 {
 	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
-	struct kmem_cache *cachep = NULL;
 	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
@@ -2503,20 +2498,16 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 	case DM_TYPE_DAX_BIO_BASED:
-		cachep = _io_cache;
 		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+	
+		pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+		if (!pools->io_pool)
+			goto out;
 		break;
 	case DM_TYPE_REQUEST_BASED:
-		cachep = _rq_tio_cache;
-		pool_size = dm_get_reserved_rq_based_ios();
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-		/* fall through to setup remaining rq-based pools */
 	case DM_TYPE_MQ_REQUEST_BASED:
-		if (!pool_size)
-			pool_size = dm_get_reserved_rq_based_ios();
+		pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_io_data_size is used for blk-mq pdu at queue allocation */
 		break;
@@ -2524,12 +2515,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		BUG();
 	}
 
-	if (cachep) {
-		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-		if (!pools->io_pool)
-			goto out;
-	}
-
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
 		goto out;
@@ -2551,7 +2536,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 		return;
 
 	mempool_destroy(pools->io_pool);
-	mempool_destroy(pools->rq_pool);
 
 	if (pools->bs)
 		bioset_free(pools->bs);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index f0aad08..f298b01 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -95,8 +95,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
 /*
  * To check whether the target type is request-based or not (bio-based).
  */
-#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
-				    ((t)->type->clone_and_map_rq != NULL))
+#define dm_target_request_based(t) ((t)->type->clone_and_map_rq != NULL)
 
 /*
  * To check whether the target type is a hybrid (capable of being
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ef7962e..a7e6903 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -55,8 +55,6 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
  * = 2: The target wants to push back the io
  */
 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio);
-typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
-				  union map_info *map_context);
 typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti,
 					    struct request *rq,
 					    union map_info *map_context,
@@ -163,7 +161,6 @@ struct target_type {
 	dm_ctr_fn ctr;
 	dm_dtr_fn dtr;
 	dm_map_fn map;
-	dm_map_request_fn map_rq;
 	dm_clone_and_map_request_fn clone_and_map_rq;
 	dm_release_clone_request_fn release_clone_rq;
 	dm_endio_fn end_io;
-- 
2.1.4


^ permalink raw reply related

* [PATCH 06/18] dm: remove incomplete BLOCK_PC support
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

DM tries to copy a few fields around for BLOCK_PC requests, but given
that no dm-target ever wires up scsi_cmd_ioctl, BLOCK_PC requests can't
actually be sent to dm.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 drivers/md/dm-rq.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 93f6e9f..3f12916 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -270,19 +270,6 @@ static void dm_end_request(struct request *clone, int error)
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
-		rq->errors = clone->errors;
-		rq->resid_len = clone->resid_len;
-
-		if (rq->sense)
-			/*
-			 * We are using the sense buffer of the original
-			 * request.
-			 * So setting the length of the sense data is enough.
-			 */
-			rq->sense_len = clone->sense_len;
-	}
-
 	free_rq_clone(clone);
 	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
@@ -511,9 +498,6 @@ static int setup_clone(struct request *clone, struct request *rq,
 	if (r)
 		return r;
 
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
-- 
2.1.4


^ permalink raw reply related

* [PATCH 05/18] block: allow specifying size for extra command data
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

This mirrors the blk-mq capabilities to allocate extra driver-specific
data behind struct request by setting a cmd_size field, as well as having
a constructor / destructor for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-core.c       | 59 ++++++++++++++++++++++++++++++++++++++++----------
 block/blk-flush.c      |  5 ++---
 block/blk-sysfs.c      |  7 ++++--
 include/linux/blkdev.h |  7 ++++++
 4 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 54b5512..7de7164 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -606,17 +606,41 @@ void blk_cleanup_queue(struct request_queue *q)
 EXPORT_SYMBOL(blk_cleanup_queue);
 
 /* Allocate memory local to the request queue */
-static void *alloc_request_struct(gfp_t gfp_mask, void *data)
+static void *alloc_request_simple(gfp_t gfp_mask, void *data)
 {
-	int nid = (int)(long)data;
-	return kmem_cache_alloc_node(request_cachep, gfp_mask, nid);
+	struct request_queue *q = data;
+
+	return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
 }
 
-static void free_request_struct(void *element, void *unused)
+static void free_request_simple(void *element, void *data)
 {
 	kmem_cache_free(request_cachep, element);
 }
 
+static void *alloc_request_size(gfp_t gfp_mask, void *data)
+{
+	struct request_queue *q = data;
+	struct request *rq;
+
+	rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
+			q->node);
+	if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
+		kfree(rq);
+		rq = NULL;
+	}
+	return rq;
+}
+
+static void free_request_size(void *element, void *data)
+{
+	struct request_queue *q = data;
+
+	if (q->exit_rq_fn)
+		q->exit_rq_fn(q, element);
+	kfree(element);
+}
+
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask)
 {
@@ -629,10 +653,15 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q,
 	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
 	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 
-	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct,
-					  free_request_struct,
-					  (void *)(long)q->node, gfp_mask,
-					  q->node);
+	if (q->cmd_size) {
+		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
+				alloc_request_size, free_request_size,
+				q, gfp_mask, q->node);
+	} else {
+		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
+				alloc_request_simple, free_request_simple,
+				q, gfp_mask, q->node);
+	}
 	if (!rl->rq_pool)
 		return -ENOMEM;
 
@@ -846,12 +875,15 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 int blk_init_allocated_queue(struct request_queue *q)
 {
-	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
+	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
 	if (!q->fq)
 		return -ENOMEM;
 
+	if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
+		goto out_free_flush_queue;
+
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
-		goto fail;
+		goto out_exit_flush_rq;
 
 	INIT_WORK(&q->timeout_work, blk_timeout_work);
 	q->queue_flags		|= QUEUE_FLAG_DEFAULT;
@@ -869,13 +901,16 @@ int blk_init_allocated_queue(struct request_queue *q)
 	/* init elevator */
 	if (elevator_init(q, NULL)) {
 		mutex_unlock(&q->sysfs_lock);
-		goto fail;
+		goto out_exit_flush_rq;
 	}
 
 	mutex_unlock(&q->sysfs_lock);
 	return 0;
 
-fail:
+out_exit_flush_rq:
+	if (q->exit_rq_fn)
+		q->exit_rq_fn(q, q->fq->flush_rq);
+out_free_flush_queue:
 	blk_free_flush_queue(q->fq);
 	wbt_exit(q);
 	return -ENOMEM;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index d7de34e..bf3ba3c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -547,11 +547,10 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 	if (!fq)
 		goto fail;
 
-	if (q->mq_ops) {
+	if (q->mq_ops)
 		spin_lock_init(&fq->mq_flush_lock);
-		rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
-	}
 
+	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
 	fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
 	if (!fq->flush_rq)
 		goto fail_rq;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1dbce05..894f773 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -814,10 +814,13 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	if (!q->mq_ops)
+	if (!q->mq_ops) {
+		if (q->exit_rq_fn)
+			q->exit_rq_fn(q, q->fq->flush_rq);
 		blk_free_flush_queue(q->fq);
-	else
+	} else {
 		blk_mq_release(q);
+	}
 
 	blk_trace_shutdown(q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a036c4a..648ecf5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -273,6 +273,8 @@ typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
 typedef int (bsg_job_fn) (struct bsg_job *);
+typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
+typedef void (exit_rq_fn)(struct request_queue *, struct request *);
 
 enum blk_eh_timer_return {
 	BLK_EH_NOT_HANDLED,
@@ -408,6 +410,8 @@ struct request_queue {
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 	lld_busy_fn		*lld_busy_fn;
+	init_rq_fn		*init_rq_fn;
+	exit_rq_fn		*exit_rq_fn;
 
 	const struct blk_mq_ops	*mq_ops;
 
@@ -572,6 +576,9 @@ struct request_queue {
 	struct bio_set		*bio_split;
 
 	bool			mq_sysfs_init_done;
+
+	size_t			cmd_size;
+	void			*rq_alloc_data;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
-- 
2.1.4


^ permalink raw reply related

* [PATCH 04/18] block: simplify blk_init_allocated_queue
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

Return an errno value instead of the passed in queue so that the callers
don't have to keep track of two queues, and move the assignment of the
request_fn and lock to the caller as passing them as arguments doesn't
simplify anything.  While we're at it also remove two pointless NULL
assignments, given that the request queue structure is zeroed on allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-core.c       | 38 +++++++++++++++-----------------------
 drivers/md/dm-rq.c     |  3 ++-
 include/linux/blkdev.h |  3 +--
 3 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a84c1b9..54b5512 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -823,15 +823,19 @@ EXPORT_SYMBOL(blk_init_queue);
 struct request_queue *
 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
-	struct request_queue *uninit_q, *q;
+	struct request_queue *q;
 
-	uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
-	if (!uninit_q)
+	q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+	if (!q)
 		return NULL;
 
-	q = blk_init_allocated_queue(uninit_q, rfn, lock);
-	if (!q)
-		blk_cleanup_queue(uninit_q);
+	q->request_fn = rfn;
+	if (lock)
+		q->queue_lock = lock;
+	if (blk_init_allocated_queue(q) < 0) {
+		blk_cleanup_queue(q);
+		return NULL;
+	}
 
 	return q;
 }
@@ -839,30 +843,19 @@ EXPORT_SYMBOL(blk_init_queue_node);
 
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
-struct request_queue *
-blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
-			 spinlock_t *lock)
-{
-	if (!q)
-		return NULL;
 
+int blk_init_allocated_queue(struct request_queue *q)
+{
 	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
 	if (!q->fq)
-		return NULL;
+		return -ENOMEM;
 
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
 		goto fail;
 
 	INIT_WORK(&q->timeout_work, blk_timeout_work);
-	q->request_fn		= rfn;
-	q->prep_rq_fn		= NULL;
-	q->unprep_rq_fn		= NULL;
 	q->queue_flags		|= QUEUE_FLAG_DEFAULT;
 
-	/* Override internal queue lock with supplied lock pointer */
-	if (lock)
-		q->queue_lock		= lock;
-
 	/*
 	 * This also sets hw/phys segments, boundary and size
 	 */
@@ -880,13 +873,12 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	}
 
 	mutex_unlock(&q->sysfs_lock);
-
-	return q;
+	return 0;
 
 fail:
 	blk_free_flush_queue(q->fq);
 	wbt_exit(q);
-	return NULL;
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d7275f..93f6e9f 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -823,7 +823,8 @@ static void dm_old_request_fn(struct request_queue *q)
 int dm_old_init_request_queue(struct mapped_device *md)
 {
 	/* Fully initialize the queue */
-	if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL))
+	md->queue->request_fn = dm_old_request_fn;
+	if (blk_init_allocated_queue(md->queue) < 0)
 		return -EINVAL;
 
 	/* disable dm_old_request_fn's merge heuristic by default */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8e0b57e..a036c4a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1131,8 +1131,7 @@ extern void blk_unprep_request(struct request *);
 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
 					spinlock_t *lock, int node_id);
 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
-extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
-						      request_fn_proc *, spinlock_t *);
+extern int blk_init_allocated_queue(struct request_queue *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
-- 
2.1.4


^ permalink raw reply related

* [PATCH 03/18] block: fix elevator init check
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

We can't initialize the elevator fields for flushes as flushes share space
in struct request with the elevator data.  But currently we can't
communicate that a request is a flush through blk_get_request as we
can only pass READ or WRITE, and the low-level code looks at the
possibly NULL bio to check for a flush.

Fix this by allowing to pass any block op and flags, and by checking for
the flush flags in __get_request.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-core.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b830e14..a84c1b9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1022,25 +1022,6 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
 	return 0;
 }
 
-/*
- * Determine if elevator data should be initialized when allocating the
- * request associated with @bio.
- */
-static bool blk_rq_should_init_elevator(struct bio *bio)
-{
-	if (!bio)
-		return true;
-
-	/*
-	 * Flush requests do not use the elevator so skip initialization.
-	 * This allows a request to share the flush and elevator data.
-	 */
-	if (op_is_flush(bio->bi_opf))
-		return false;
-
-	return true;
-}
-
 /**
  * __get_request - get a free request
  * @rl: request list to allocate from
@@ -1119,10 +1100,13 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	 * request is freed.  This guarantees icq's won't be destroyed and
 	 * makes creating new ones safe.
 	 *
+	 * Flush requests do not use the elevator so skip initialization.
+	 * This allows a request to share the flush and elevator data.
+	 *
 	 * Also, lookup icq while holding queue_lock.  If it doesn't exist,
 	 * it will be created after releasing queue_lock.
 	 */
-	if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
+	if (!op_is_flush(op) && !blk_queue_bypass(q)) {
 		rq_flags |= RQF_ELVPRIV;
 		q->nr_rqs_elvpriv++;
 		if (et->icq_cache && ioc)
@@ -1276,8 +1260,6 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 {
 	struct request *rq;
 
-	BUG_ON(rw != READ && rw != WRITE);
-
 	/* create ioc upfront */
 	create_io_context(gfp_mask, q->node);
 
-- 
2.1.4


^ permalink raw reply related

* [PATCH 02/18] md: cleanup bio op / flags handling in raid1_write_request
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

No need for the local variables, the bio is still live and we can just
assign the bits we want directly.  Makes me wonder why we can't assign
all the bio flags to start with.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/raid1.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7b0f647..67b0365 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1170,10 +1170,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	int i, disks;
 	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
-	const int op = bio_op(bio);
-	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-	const unsigned long do_flush_fua = (bio->bi_opf &
-						(REQ_PREFLUSH | REQ_FUA));
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
@@ -1389,7 +1385,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 				   conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		bio_set_op_attrs(mbio, op, do_flush_fua | do_sync);
+		mbio->bi_opf = bio_op(bio) |
+			(bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA));
 		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
 		    !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
 		    conf->raid_disks - mddev->degraded > 1)
-- 
2.1.4


^ permalink raw reply related

* [PATCH 01/18] block: add a op_is_flush helper
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel
In-Reply-To: <1485365126-23210-1-git-send-email-hch@lst.de>

This centralizes the checks for bios that need to go into the flush
state machine.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c             |  8 ++++----
 block/blk-mq-sched.c         |  5 ++---
 block/blk-mq.c               |  4 ++--
 drivers/md/bcache/request.c  |  2 +-
 drivers/md/dm-cache-target.c | 13 +++----------
 drivers/md/dm-thin.c         | 13 +++++--------
 include/linux/blk_types.h    |  9 +++++++++
 7 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a61f140..b830e14 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1035,7 +1035,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 	 * Flush requests do not use the elevator so skip initialization.
 	 * This allows a request to share the flush and elevator data.
 	 */
-	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA))
+	if (op_is_flush(bio->bi_opf))
 		return false;
 
 	return true;
@@ -1641,7 +1641,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) {
+	if (op_is_flush(bio->bi_opf)) {
 		spin_lock_irq(q->queue_lock);
 		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
@@ -2145,7 +2145,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+	if (op_is_flush(rq->cmd_flags))
 		where = ELEVATOR_INSERT_FLUSH;
 
 	add_acct_request(q, rq, where);
@@ -3256,7 +3256,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		/*
 		 * rq is already accounted, so use raw insert
 		 */
-		if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+		if (op_is_flush(rq->cmd_flags))
 			__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
 		else
 			__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index d05061f..3bd66e5 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -111,7 +111,6 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct request *rq;
-	const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);
 
 	blk_queue_enter_live(q);
 	ctx = blk_mq_get_ctx(q);
@@ -126,7 +125,7 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 		 * Flush requests are special and go directly to the
 		 * dispatch list.
 		 */
-		if (!is_flush && e->type->ops.mq.get_request) {
+		if (!op_is_flush(op) && e->type->ops.mq.get_request) {
 			rq = e->type->ops.mq.get_request(q, op, data);
 			if (rq)
 				rq->rq_flags |= RQF_QUEUED;
@@ -138,7 +137,7 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 	}
 
 	if (rq) {
-		if (!is_flush) {
+		if (!op_is_flush(op)) {
 			rq->elv.icq = NULL;
 			if (e && e->type->icq_cache)
 				blk_mq_sched_assign_ioc(q, rq, bio);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ee69e5e..e229f8a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1378,7 +1378,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
-	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+	const int is_flush_fua = op_is_flush(bio->bi_opf);
 	struct blk_mq_alloc_data data;
 	struct request *rq;
 	unsigned int request_count = 0, srcu_idx;
@@ -1498,7 +1498,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
-	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+	const int is_flush_fua = op_is_flush(bio->bi_opf);
 	struct blk_plug *plug;
 	unsigned int request_count = 0;
 	struct blk_mq_alloc_data data;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 76d2087..01035e7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -666,7 +666,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->iop.write_prio	= 0;
 	s->iop.error		= 0;
 	s->iop.flags		= 0;
-	s->iop.flush_journal	= (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0;
+	s->iop.flush_journal	= op_is_flush(bio->bi_opf);
 	s->iop.wq		= bcache_wq;
 
 	return s;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index e04c61e..5b9cf56 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -787,8 +787,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
 	spin_lock_irqsave(&cache->lock, flags);
-	if (cache->need_tick_bio &&
-	    !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) &&
+	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 	    bio_op(bio) != REQ_OP_DISCARD) {
 		pb->tick = true;
 		cache->need_tick_bio = false;
@@ -828,11 +827,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 	return to_oblock(block_nr);
 }
 
-static int bio_triggers_commit(struct cache *cache, struct bio *bio)
-{
-	return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-}
-
 /*
  * You must increment the deferred set whilst the prison cell is held.  To
  * encourage this, we ask for 'cell' to be passed in.
@@ -884,7 +878,7 @@ static void issue(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
 
-	if (!bio_triggers_commit(cache, bio)) {
+	if (!op_is_flush(bio->bi_opf)) {
 		accounted_request(cache, bio);
 		return;
 	}
@@ -1069,8 +1063,7 @@ static void dec_io_migrations(struct cache *cache)
 
 static bool discard_or_flush(struct bio *bio)
 {
-	return bio_op(bio) == REQ_OP_DISCARD ||
-	       bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+	return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
 }
 
 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index d1c05c1..110982d 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 
 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 {
-	return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
+	return op_is_flush(bio->bi_opf) &&
 		dm_thin_changed_this_transaction(tc->td);
 }
 
@@ -870,8 +870,7 @@ static void __inc_remap_and_issue_cell(void *context,
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(&cell->bios))) {
-		if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-		    bio_op(bio) == REQ_OP_DISCARD)
+		if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
 			bio_list_add(&info->defer_bios, bio);
 		else {
 			inc_all_io_entry(info->tc->pool, bio);
@@ -1716,9 +1715,8 @@ static void __remap_and_issue_shared_cell(void *context,
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(&cell->bios))) {
-		if ((bio_data_dir(bio) == WRITE) ||
-		    (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-		     bio_op(bio) == REQ_OP_DISCARD))
+		if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
+		    bio_op(bio) == REQ_OP_DISCARD)
 			bio_list_add(&info->defer_bios, bio);
 		else {
 			struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));;
@@ -2635,8 +2633,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) ||
-	    bio_op(bio) == REQ_OP_DISCARD) {
+	if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
 		thin_defer_bio_with_throttle(tc, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0e5b1cd..37c9a43 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -221,6 +221,15 @@ static inline bool op_is_write(unsigned int op)
 }
 
 /*
+ * Check if the bio or request is one that needs special treatment in the
+ * flush state machine.
+ */
+static inline bool op_is_flush(unsigned int op)
+{
+	return op & (REQ_FUA | REQ_PREFLUSH);
+}
+
+/*
  * Reads are always treated as synchronous, as are requests with the FUA or
  * PREFLUSH flag.  Other operations may be marked as synchronous using the
  * REQ_SYNC flag.
-- 
2.1.4


^ permalink raw reply related

* split scsi passthrough fields out of struct request V2
From: Christoph Hellwig @ 2017-01-25 17:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Mike Snitzer, Junichi Nomura, linux-block, linux-scsi, linux-raid,
	dm-devel

Hi all,

this series splits the support for SCSI passthrough commands from the
main struct request used all over the block layer into a separate
scsi_request structure that drivers that want to support SCSI passthrough
need to embed as the first thing in their request-private data,
similar to how we handle NVMe passthrough commands.

To support this I've added support for private data after the
request structure to the legacy request path, so that it can
be treated the same way as the blk-mq path.  Compared to the current
scsi_cmnd allocator that actually is a major simplification.

Changes since V1:
 - fix handling of a NULL sense pointer in __scsi_execute
 - clean up handling of the flush flags in the block layer and MD
 - additional small cleanup in dm-rq

^ permalink raw reply

* Raid wiki - was Re: MDADM grow /dev/md0 - chunk size
From: Wols Lists @ 2017-01-25 17:04 UTC (permalink / raw)
  To: NeilBrown, linux-raid
In-Reply-To: <87lgu2emgs.fsf@notabene.neil.brown.name>

On 22/01/17 22:52, NeilBrown wrote:
> And... please don't send nag emails so soon - it was barely more than
> 24hours after the original.  This just comes across as rude and
> impatient.  People have other commitments.
> My rule of thumb is to wait at least a week before resending - and then
> resend the full text of the original.  Your nag email was not only too
> soon, but contained no detail and so was useless.

Just to point out - to everyone :-) - that the linux wiki is currently
being maintained so pointing people at it is a good idea.

And there's a page called "asking for help". Not only does this tell
people all the commands (like "cat /proc/mdstat", "mdadm --detail",
"mdadm --examine") they should be running to get the info the list is
likely to want, but

*almost the first thing* on that page is a reminder that many of us here
are volunteers, and prompt replies are a privilege not a right. Lusers
get spoilt in that we take pride in our work.

But can I ask, if you do point people at the wiki, just take a quick
glance yourself first. And if you do spot a mistake on the wiki,
brickbats and fixes are welcome :-)

Cheers,
Wol

^ permalink raw reply

* Re: Soft-Raid 0 Performance | Transfer two Data-Streams (CPU+FPGA) to the same Soft-Raid
From: Eric Schwarz @ 2017-01-25 16:49 UTC (permalink / raw)
  To: Coly Li; +Cc: linux-raid
In-Reply-To: <f2c7fa6f-3061-9baa-d2dd-1abbe7e22cfe@suse.de>

Hi Coly,

>> 1.) I have set-up a softraid (raid level 0) with mdadm using two M.2
>> modules. For one module the throughput is ~350MB/s (no mdadm) for two
>> modules the throughput is ~500MB/s which is less than factor 1,5 of 
>> the
>> throughput of a single drive. The filesystem used is ext4. Is there
>> someone having some values for comparison? For me the throughput gain
>> seems to be too little. The test was done using a HP Z840 workstation.
> 
> Hi Eric,
> 
> Could you attach your testing script as well, then we can have a look.

Create RAID0:
$ mdadm --create /dev/md0 --auto md --level=0 --raid-devices=2 /dev/sdb1 
/dev/sdc1

Write to bare RAID0:
$ dd if=/dev/zero of=/dev/md0 bs=1G count=1 oflag=dsync

dd will show the throughput

Cheers
Eric

^ permalink raw reply

* Re: errors when reading in one section of a disk
From: Wols Lists @ 2017-01-25 16:42 UTC (permalink / raw)
  To: Boylan, Ross, linux-raid@vger.kernel.org
In-Reply-To: <3b82fbda-b3a7-4bcf-8c1e-cbb2b2fd7075@exhybrid01.net.ucsf.edu>

On 25/01/17 02:43, Boylan, Ross wrote:
> It looks as if the RAID arrays survive the redefinition of the physical devices (e.g., sde is reattached as sdj), but it's hard to tell since the partition with the error is mounted read-only and thus generates errors.  I have been rebooting the whole system shortly after the problem happens.  Should  the /dev/md* devices survive such remapping beneath them?
> 
Everything is moving to UUIDs. And raid, iirc, has done so.

In other words, when your system is running, it doesn't care what you
refer to the drives as - sde or sdj is immaterial - it converts those to
UUIDs before saving them in the config, so when it re-assembles the
array after some sort of reset, it gets the right drive. Linux has a
whole bunch of symlinks in /dev to make life easy for the sysadm, and
raid does what makes life easy for it.

Thing is, udev explicitly does NOT guarantee things like sda, sdb, etc
(and md0, md1, md27, md126 etc) will be preserved - they are allocated
in the order the devices are found and there are quite a few systems out
there that are distinctly non-deterministic so all the little quirks
WILL have been found and ironed out. It's just "fortunate" that PC
hardware happens to be - for the most part - deterministic giving the
same result every time.

> After the remapping the file systems the md devices supported on the host computer were still mounted (albeit ro for one of them). mdadm -D did not list the names of the component devices and the info in proc (or maybe it was run) seemed stale, since it still had the old device names (e.g., sde3 rather than sdj3).

I'm guessing linux reset the Vantec box, rediscovering and moving all
the drives, but raid didn't realise anything had happened so it was
still using the old names. Not a good place to be, I don't think...

Cheers,
Wol

^ permalink raw reply

* Re: Input/Output error reading from a clean raid
From: John Stoffel @ 2017-01-25 15:54 UTC (permalink / raw)
  To: Salatiel Filho; +Cc: Andreas Klauer, linux-raid
In-Reply-To: <CAGmni9qF_LjdzYtsMJ9YmHoymG=SN6p2Mta-COEhbyReQFbNTA@mail.gmail.com>

>>>>> "Salatiel" == Salatiel Filho <salatiel.filho@gmail.com> writes:

Salatiel> On Mon, Jan 23, 2017 at 2:34 PM, Andreas Klauer
Salatiel> <Andreas.Klauer@metamorpher.de> wrote:
>> On Mon, Jan 23, 2017 at 11:02:24AM -0300, Salatiel Filho wrote:
>>> mdadm mdadm --examine-badblocks /dev/sdd1 /dev/sdg1 /dev/sdf1  /dev/sde1
>>> 
>>> Bad-blocks on /dev/sdd1:
>>> 1515723072 for 512 sectors
>>> Bad-blocks on /dev/sde1:
>>> 1515723072 for 512 sectors
>> 
>> md believes you have bad blocks in identical places so it won't return
>> whatever data is in these blocks. Thus you get read errors even if there
>> is no bad block on the disk itself. Those bad block entries can be caused
>> by cable or controller flukes, making temporary problems permanent...
>> 
>> Personally I disable the bad block list everywhere.
>> 
>> You can search this list for old messages regarding --examine-badblocks,
>> this problem came up several times. Clearing the mdadm bad block list is
>> worth a try. There's an undocumented option, update=force-no-bbl or such.
>> 
>> Regards
>> Andreas Klauer

Salatiel> Thanks all of you for the help.
Salatiel> Andreas, the force-no-bbl from mdadm 3.4 did the trick. I was able to
Salatiel> retrieve all files and their md5 matches, so it is great =)

Great news, glad I could help, wish I had pin-pointed the root cause
better.

john

^ permalink raw reply

* Re: errors when reading in one section of a disk
From: John Stoffel @ 2017-01-25 15:44 UTC (permalink / raw)
  To: Boylan, Ross; +Cc: linux-raid@vger.kernel.org
In-Reply-To: <3b82fbda-b3a7-4bcf-8c1e-cbb2b2fd7075@exhybrid01.net.ucsf.edu>

Ross> I suspect this problem has little to do with RAID, but since I'm
Ross> using it I thought I'd start here.  I have 3 disks in a Vantec
Ross> HX4 4 SATA drive box, connected by USB.  Repeatedly, when I try
Ross> to read certain sectors of one disk every disk in the box seems
Ross> to reset and then get remounted.  I thought this was a transient
Ross> power failure the first time (though I'm on UPS), but when it
Ross> happened again I figured the problem was elsewhere.  I am
Ross> surprised that a read failure on one drive seems to reset the
Ross> entire connection to the box.

Ross> The disk is a relatively new WD Red Pro, and so seems unlikely
Ross> to have a physical failure.  The Vantec box is almost 5 years
Ross> old, and seems a better suspect.  The only reason I can think
Ross> that reading a particular spot on one disk would make it fail is
Ross> that this is perhaps accompanied by some other activity at the
Ross> same time.  The Vantec itself supplies no RAID or anything
Ross> fancy, just access to the drives.

I'd just replace the Vantec box, the USB-SATA bridge is probably A)
slow, and B) flaky.  

^ permalink raw reply

* drives failed during reshape, array won't even force-assemble
From: Thomas Warntjen @ 2017-01-25 13:27 UTC (permalink / raw)
  To: linux-raid

On my new Ubuntu Server 16.4 LTS server I have an old RAID5 made from 
5+1 WD Red 3TB drives which I wanted to upgrade first to RAID6 (5+2) and 
then to 6 data disks, so I added 2 new drives and started the reshape:

# mdadm /dev/md1 --grow --level=6 --backup=/root/raid6.backupfile

When the reshape was at ~70% some wonky cabling caused some of the 
drives to temporarily fail (I heard the drives spin down after I 
accidentally touched the cable - SMART says the disks are ok and another 
array on those disks starts just fine).
After a reboot, the array won't start, marking all the drives as spares 
(md1):

# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4] [linear] [multipath] 
[raid0] [raid10]
md1 : inactive sdg3[3](S) sdj3[1](S) sdi3[6](S) sdh3[0](S) sdc3[2](S) 
sdd3[4](S) sdf3[5](S) sde3[8](S)
       23429580800 blocks super 0.91

md127 : active (auto-read-only) raid6 sdj1[7] sdi1[4] sdg1[2] sdh1[6] 
sdc1[0] sdf1[1] sde1[5] sdd1[3]
       6346752 blocks super 1.2 level 6, 512k chunk, algorithm 2 [8/8] 
[UUUUUUUU]

md0 : active raid1 sdb1[2] sda1[1]
       240022528 blocks super 1.2 [2/2] [UU]
       bitmap: 1/2 pages [4KB], 65536KB chunk


# mdadm --detail /dev/md1
/dev/md1:
         Version : 0.91
      Raid Level : raid0
   Total Devices : 8
Preferred Minor : 0
     Persistence : Superblock is persistent

           State : inactive

       New Level : raid6
      New Layout : left-symmetric
   New Chunksize : 64K

            UUID : 7a58ed4f:baf1934e:a2963c6e:a542ed71
          Events : 0.12370980

     Number   Major   Minor   RaidDevice

        -       8       35        -        /dev/sdc3
        -       8       51        -        /dev/sdd3
        -       8       67        -        /dev/sde3
        -       8       83        -        /dev/sdf3
        -       8       99        -        /dev/sdg3
        -       8      115        -        /dev/sdh3
        -       8      131        -        /dev/sdi3
        -       8      147        -        /dev/sdj3


Since that was the second time the reshape was interrupted (the first 
time was an intentional reboot) I thought I knew what I was doing and 
stopped and force-assembled the array. That didn't work and probably 
borked it some more...

So according to the RAID-Wiki 
(https://raid.wiki.kernel.org/index.php/Recovering_a_failed_software_RAID) 
I stopped the array and created overlay files (and copied the backup-file).

mdadm -E tells me that probably sdd and sdf were the failing drives:

# parallel --tag -k mdadm -E ::: $OVERLAYS|grep -E 'Update'
/dev/mapper/sdc3            Update Time : Tue Jan 24 21:03:00 2017
/dev/mapper/sdd3            Update Time : Tue Jan 24 21:02:49 2017
/dev/mapper/sde3            Update Time : Tue Jan 24 21:10:19 2017
/dev/mapper/sdf3            Update Time : Tue Jan 24 21:02:49 2017
/dev/mapper/sdh3            Update Time : Tue Jan 24 21:03:00 2017
/dev/mapper/sdi3            Update Time : Tue Jan 24 21:10:19 2017
/dev/mapper/sdj3            Update Time : Tue Jan 24 21:03:00 2017
/dev/mapper/sdg3            Update Time : Tue Jan 24 21:10:19 2017

# parallel --tag -k mdadm -E ::: $OVERLAYS|grep -E 'Events'
/dev/mapper/sdc3                 Events : 12370980
/dev/mapper/sdd3                 Events : 12370974
/dev/mapper/sde3                 Events : 12370980
/dev/mapper/sdf3                 Events : 12370974
/dev/mapper/sdh3                 Events : 12370980
/dev/mapper/sdi3                 Events : 12370980
/dev/mapper/sdj3                 Events : 12370980
/dev/mapper/sdg3                 Events : 12370980


Obviously the disks have diverging ideas about the health of the array 
and interestingly also about their own identity:

/dev/sdc3:
       Number   Major   Minor   RaidDevice State
this     2       8       35        2      active sync   /dev/sdc3

    0     0       8      131        0      active sync   /dev/sdi3
    1     1       8      163        1      active sync
    2     2       8       35        2      active sync   /dev/sdc3
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       0        0        4      faulty removed
    5     5       0        0        5      faulty removed
    6     6       8      147        6      active   /dev/sdj3
    7     7       8       67        7      spare   /dev/sde3

/dev/sdd3:
       Number   Major   Minor   RaidDevice State
this     4       8       51        4      active sync   /dev/sdd3

    0     0       8      131        0      active sync   /dev/sdi3
    1     1       8      163        1      active sync
    2     2       8       35        2      active sync   /dev/sdc3
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       8       51        4      active sync   /dev/sdd3
    5     5       8       83        5      active sync   /dev/sdf3
    6     6       8      147        6      active   /dev/sdj3
    7     7       8       67        7      spare   /dev/sde3

/dev/sde3:
       Number   Major   Minor   RaidDevice State
this     8       8       67        8      spare   /dev/sde3

    0     0       0        0        0      removed
    1     1       0        0        1      faulty removed
    2     2       0        0        2      faulty removed
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       0        0        4      faulty removed
    5     5       0        0        5      faulty removed
    6     6       8      147        6      active   /dev/sdj3
    7     7       8      131        7      faulty   /dev/sdi3

/dev/sdf3:
       Number   Major   Minor   RaidDevice State
this     5       8       83        5      active sync   /dev/sdf3

    0     0       8      131        0      active sync   /dev/sdi3
    1     1       8      163        1      active sync
    2     2       8       35        2      active sync   /dev/sdc3
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       8       51        4      active sync   /dev/sdd3
    5     5       8       83        5      active sync   /dev/sdf3
    6     6       8      147        6      active   /dev/sdj3
    7     7       8       67        7      spare   /dev/sde3

/dev/sdg3:
       Number   Major   Minor   RaidDevice State
this     3       8      115        3      active sync   /dev/sdh3

    0     0       0        0        0      removed
    1     1       0        0        1      faulty removed
    2     2       0        0        2      faulty removed
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       0        0        4      faulty removed
    5     5       0        0        5      faulty removed
    6     6       8      147        6      active   /dev/sdj3
    7     7       8      131        7      faulty   /dev/sdi3

/dev/sdh3:
       Number   Major   Minor   RaidDevice State
this     0       8      131        0      active sync   /dev/sdi3

    0     0       8      131        0      active sync   /dev/sdi3
    1     1       8      163        1      active sync
    2     2       8       35        2      active sync   /dev/sdc3
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       0        0        4      faulty removed
    5     5       0        0        5      faulty removed
    6     6       8      147        6      active   /dev/sdj3
    7     7       8       67        7      spare   /dev/sde3

/dev/sdi3:
       Number   Major   Minor   RaidDevice State
this     6       8      147        6      active   /dev/sdj3

    0     0       0        0        0      removed
    1     1       0        0        1      faulty removed
    2     2       0        0        2      faulty removed
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       0        0        4      faulty removed
    5     5       0        0        5      faulty removed
    6     6       8      147        6      active   /dev/sdj3
    7     7       8      131        7      faulty   /dev/sdi3

/dev/sdj3:
       Number   Major   Minor   RaidDevice State
this     1       8      163        1      active sync

    0     0       8      131        0      active sync   /dev/sdi3
    1     1       8      163        1      active sync
    2     2       8       35        2      active sync   /dev/sdc3
    3     3       8      115        3      active sync   /dev/sdh3
    4     4       0        0        4      faulty removed
    5     5       0        0        5      faulty removed
    6     6       8      147        6      active   /dev/sdj3
    7     7       8       67        7      spare   /dev/sde3


(for reference)

# l /dev/mapper/
total 0
drwxr-xr-x  2 root root     220 Jan 25 12:34 .
drwxr-xr-x 20 root root    5.5K Jan 25 12:34 ..
crw-------  1 root root 10, 236 Jan 25 12:20 control
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sdc3 -> ../dm-4
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sdd3 -> ../dm-6
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sde3 -> ../dm-5
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sdf3 -> ../dm-7
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sdg3 -> ../dm-2
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sdh3 -> ../dm-3
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sdi3 -> ../dm-0
lrwxrwxrwx  1 root root       7 Jan 25 12:55 sdj3 -> ../dm-1


The event-count of the drives doesn't look too bad, so I try to assemble 
the array:

# mdadm --assemble /dev/md1 $OVERLAYS --verbose 
--backup-file=raid6.backupfile
mdadm: looking for devices for /dev/md1
mdadm: /dev/mapper/sdc3 is identified as a member of /dev/md1, slot 2.
mdadm: /dev/mapper/sdd3 is identified as a member of /dev/md1, slot 4.
mdadm: /dev/mapper/sde3 is identified as a member of /dev/md1, slot 8.
mdadm: /dev/mapper/sdf3 is identified as a member of /dev/md1, slot 5.
mdadm: /dev/mapper/sdh3 is identified as a member of /dev/md1, slot 0.
mdadm: /dev/mapper/sdi3 is identified as a member of /dev/md1, slot 6.
mdadm: /dev/mapper/sdj3 is identified as a member of /dev/md1, slot 1.
mdadm: /dev/mapper/sdg3 is identified as a member of /dev/md1, slot 3.
mdadm: ignoring /dev/mapper/sdg3 as it reports /dev/mapper/sdc3 as failed
mdadm: ignoring /dev/mapper/sdi3 as it reports /dev/mapper/sdc3 as failed
mdadm: device 16 in /dev/md1 has wrong state in superblock, but 
/dev/mapper/sde3 seems ok
mdadm: /dev/md1 has an active reshape - checking if critical section 
needs to be restored
mdadm: restoring critical section
mdadm: added /dev/mapper/sdj3 to /dev/md1 as 1
mdadm: added /dev/mapper/sdc3 to /dev/md1 as 2
mdadm: no uptodate device for slot 3 of /dev/md1
mdadm: added /dev/mapper/sdd3 to /dev/md1 as 4 (possibly out of date)
mdadm: added /dev/mapper/sdf3 to /dev/md1 as 5 (possibly out of date)
mdadm: no uptodate device for slot 6 of /dev/md1
mdadm: added /dev/mapper/sde3 to /dev/md1 as 8
mdadm: added /dev/mapper/sdh3 to /dev/md1 as 0
mdadm: /dev/md1 assembled from 3 drives and 1 spare - not enough to 
start the array.


that was to be expected, now with --force:

# mdadm --assemble /dev/md1 $OVERLAYS --verbose 
--backup-file=raid6.backupfile --force
mdadm: looking for devices for /dev/md1
mdadm: /dev/mapper/sdc3 is identified as a member of /dev/md1, slot 2.
mdadm: /dev/mapper/sdd3 is identified as a member of /dev/md1, slot 4.
mdadm: /dev/mapper/sde3 is identified as a member of /dev/md1, slot 8.
mdadm: /dev/mapper/sdf3 is identified as a member of /dev/md1, slot 5.
mdadm: /dev/mapper/sdh3 is identified as a member of /dev/md1, slot 0.
mdadm: /dev/mapper/sdi3 is identified as a member of /dev/md1, slot 6.
mdadm: /dev/mapper/sdj3 is identified as a member of /dev/md1, slot 1.
mdadm: /dev/mapper/sdg3 is identified as a member of /dev/md1, slot 3.
mdadm: clearing FAULTY flag for device 2 in /dev/md1 for /dev/mapper/sde3
mdadm: Marking array /dev/md1 as 'clean'
mdadm: /dev/md1 has an active reshape - checking if critical section 
needs to be restored
mdadm: restoring critical section
mdadm: added /dev/mapper/sdj3 to /dev/md1 as 1
mdadm: added /dev/mapper/sdc3 to /dev/md1 as 2
mdadm: added /dev/mapper/sdg3 to /dev/md1 as 3
mdadm: added /dev/mapper/sdd3 to /dev/md1 as 4 (possibly out of date)
mdadm: added /dev/mapper/sdf3 to /dev/md1 as 5 (possibly out of date)
mdadm: added /dev/mapper/sdi3 to /dev/md1 as 6
mdadm: added /dev/mapper/sde3 to /dev/md1 as 8
mdadm: added /dev/mapper/sdh3 to /dev/md1 as 0
mdadm: failed to RUN_ARRAY /dev/md1: Input/output error


in the kern.log the following messages appeared:

Jan 25 13:02:51 Oghma kernel: [  765.051249] md: md1 stopped.
Jan 25 13:03:04 Oghma kernel: [  778.562635] md: bind<dm-1>
Jan 25 13:03:04 Oghma kernel: [  778.562780] md: bind<dm-4>
Jan 25 13:03:04 Oghma kernel: [  778.562891] md: bind<dm-2>
Jan 25 13:03:04 Oghma kernel: [  778.562999] md: bind<dm-6>
Jan 25 13:03:04 Oghma kernel: [  778.563104] md: bind<dm-7>
Jan 25 13:03:04 Oghma kernel: [  778.563207] md: bind<dm-0>
Jan 25 13:03:04 Oghma kernel: [  778.563400] md: bind<dm-5>
Jan 25 13:03:04 Oghma kernel: [  778.563577] md: bind<dm-3>
Jan 25 13:03:04 Oghma kernel: [  778.563720] md: kicking non-fresh dm-7 
from array!
Jan 25 13:03:04 Oghma kernel: [  778.563729] md: unbind<dm-7>
Jan 25 13:03:04 Oghma kernel: [  778.577201] md: export_rdev(dm-7)
Jan 25 13:03:04 Oghma kernel: [  778.577213] md: kicking non-fresh dm-6 
from array!
Jan 25 13:03:04 Oghma kernel: [  778.577223] md: unbind<dm-6>
Jan 25 13:03:04 Oghma kernel: [  778.605194] md: export_rdev(dm-6)
Jan 25 13:03:04 Oghma kernel: [  778.607491] md/raid:md1: reshape will 
continue
Jan 25 13:03:04 Oghma kernel: [  778.607541] md/raid:md1: device dm-3 
operational as raid disk 0
Jan 25 13:03:04 Oghma kernel: [  778.607545] md/raid:md1: device dm-2 
operational as raid disk 3
Jan 25 13:03:04 Oghma kernel: [  778.607549] md/raid:md1: device dm-4 
operational as raid disk 2
Jan 25 13:03:04 Oghma kernel: [  778.607551] md/raid:md1: device dm-1 
operational as raid disk 1
Jan 25 13:03:04 Oghma kernel: [  778.608605] md/raid:md1: allocated 7548kB
Jan 25 13:03:04 Oghma kernel: [  778.608733] md/raid:md1: not enough 
operational devices (3/7 failed)
Jan 25 13:03:04 Oghma kernel: [  778.608760] RAID conf printout:
Jan 25 13:03:04 Oghma kernel: [  778.608763]  --- level:6 rd:7 wd:4
Jan 25 13:03:04 Oghma kernel: [  778.608766]  disk 0, o:1, dev:dm-3
Jan 25 13:03:04 Oghma kernel: [  778.608769]  disk 1, o:1, dev:dm-1
Jan 25 13:03:04 Oghma kernel: [  778.608771]  disk 2, o:1, dev:dm-4
Jan 25 13:03:04 Oghma kernel: [  778.608773]  disk 3, o:1, dev:dm-2
Jan 25 13:03:04 Oghma kernel: [  778.608776]  disk 6, o:1, dev:dm-0
Jan 25 13:03:04 Oghma kernel: [  778.609364] md/raid:md1: failed to run 
raid set.
Jan 25 13:03:04 Oghma kernel: [  778.609367] md: pers->run() failed ...
Jan 25 13:03:04 Oghma kernel: [  778.609509] md: md1 stopped.
Jan 25 13:03:04 Oghma kernel: [  778.609519] md: unbind<dm-3>
Jan 25 13:03:04 Oghma kernel: [  778.629256] md: export_rdev(dm-3)
Jan 25 13:03:04 Oghma kernel: [  778.629273] md: unbind<dm-5>
Jan 25 13:03:04 Oghma kernel: [  778.649237] md: export_rdev(dm-5)
Jan 25 13:03:04 Oghma kernel: [  778.649255] md: unbind<dm-0>
Jan 25 13:03:04 Oghma kernel: [  778.665242] md: export_rdev(dm-0)
Jan 25 13:03:04 Oghma kernel: [  778.665259] md: unbind<dm-2>
Jan 25 13:03:04 Oghma kernel: [  778.681241] md: export_rdev(dm-2)
Jan 25 13:03:04 Oghma kernel: [  778.681258] md: unbind<dm-4>
Jan 25 13:03:04 Oghma kernel: [  778.693306] md: export_rdev(dm-4)
Jan 25 13:03:04 Oghma kernel: [  778.693323] md: unbind<dm-1>
Jan 25 13:03:04 Oghma kernel: [  778.705242] md: export_rdev(dm-1)


This seems to be the same problem this guy had 5 years ago 
https://www.spinics.net/lists/raid/msg37483.html but he got enough disks 
going to start the array.
What else is there I can do? This is my last hope :/

kernel: 4.4.0-59-generic #80-Ubuntu SMP Fri Jan 6 17:47:47 UTC 2017 
x86_64 x86_64 x86_64 GNU/Linux
mdadm: installed was "v3.3 - 3rd September 2013", now updated to "v3.4 - 
28th January 2016"

Thanks in advance!

^ permalink raw reply

* performance collapse: 9 mio IOPS to 1.5 mio with MD RAID0
From: Tobias Oberstein @ 2017-01-25 11:45 UTC (permalink / raw)
  To: linux-raid

Hi,

I have a storage consisting of 8 NVMe drives (16 logical drives) that I 
verified (FIO) is able to do >9 million 4kB random read IOPS if I run 
FIO on the set of individual NVMes.

However, when I create a MD (RAID-0) over the 16 NVMes and run the same 
tests, performance collapses:

ioengine=sync, individual NVMes: IOPS=9191k
ioengine=sync, MD (RAID-0) over NVMes: IOPS=1562k

Using ioengine=psync, the performance collapse isn't as dramatic, but 
still very significant:

ioengine=psync, individual NVMes: IOPS=9395k
ioengine=psync, MD (RAID-0) over NVMes: IOPS=4117k

--

All detail results (including runs under Linux perf) and FIO control 
files are here

https://github.com/oberstet/scratchbox/tree/master/cruncher/sync-engines-perf

--

With sync/MD, top in perf is

   82.77%  fio      [kernel.kallsyms]   [k] osq_lock
    3.12%  fio      [kernel.kallsyms]   [k] nohz_balance_exit_idle
    1.40%  fio      [kernel.kallsyms]   [k] trigger_load_balance
    1.01%  fio      [kernel.kallsyms]   [k] native_queued_spin_lock_slowpath

With psync/MD, top in perf is

   45.56%  fio      [kernel.kallsyms]   [k] md_make_request
    4.33%  fio      [kernel.kallsyms]   [k] osq_lock
    3.40%  fio      [kernel.kallsyms]   [k] native_queued_spin_lock_slowpath
    3.23%  fio      [kernel.kallsyms]   [k] _raw_spin_lock
    2.21%  fio      [kernel.kallsyms]   [k] raid0_make_request

--

Of course there isn't a free lunch, but a performance collapse in this 
order for a RAID-0, that is pure striping, seems excessive.

What's going on?

Cheers,
/Tobias

MD device was created like this:

sudo mdadm --create /dev/md1 \
   --chunk=8 \
   --level=0 \
   --raid-devices=16 \
   /dev/nvme0n1 \
   /dev/nvme1n1 \
   /dev/nvme2n1 \
   /dev/nvme3n1 \
   /dev/nvme4n1 \
   /dev/nvme5n1 \
   /dev/nvme6n1 \
   /dev/nvme7n1 \
   /dev/nvme8n1 \
   /dev/nvme9n1 \
   /dev/nvme10n1 \
   /dev/nvme11n1 \
   /dev/nvme12n1 \
   /dev/nvme13n1 \
   /dev/nvme14n1 \
   /dev/nvme15n1

The NVMes are low-level formatted with 4k sectors. Before, I had 512 
bytes (default), and the perf. collapse was even more dramatic.

The chunk size of 8k is used because this is supposed to carry database 
workloads later.

My target workload is PostgreSQL which is 100% 8k and lseek/read/write 
(not using pread/pwrite or pvread/pvwrite etc).

^ permalink raw reply

* [PATCH] md linear: fix a race between linear_add() and linear_congested()
From: colyli @ 2017-01-25 11:15 UTC (permalink / raw)
  To: linux-raid; +Cc: Coly Li, Shaohua Li, Neil Brown, stable

Recently I received a report that on a Linux v3.0 based kernel, hot adding a
disk to a md linear device causes a kernel crash at linear_congested(). From
the crash image analysis, I find in linear_congested(), mddev->raid_disks
contains value N, but conf->disks[] only has N-1 pointers available. Then
a dereference of a NULL pointer crashes the kernel.

There is a race between linear_add() and linear_congested(); the RCU code
used in these two functions cannot avoid the race. Since Linux v4.0 the
RCU code is replaced by introducing mddev_suspend().  After checking the
upstream code, it seems linear_congested() is not called in the
generic_make_request() code path, so mddev_suspend() cannot prevent it
from being called. The possible race still exists.

Here I explain how the race still exists in current code.  On a machine
that has many CPUs, on one CPU, linear_add() is called to add a hard disk to
a md linear device; at the same time on another CPU, linear_congested() is
called to detect whether this md linear device is congested before issuing
an I/O request onto it.

Now I use a possible code execution time sequence to demo how the possible
race happens, 

seq    linear_add()                linear_congested()
 0                                 conf=mddev->private
 1   oldconf=mddev->private
 2   mddev->raid_disks++
 3                              for (i=0; i<mddev->raid_disks;i++)
 4                                bdev_get_queue(conf->disks[i].rdev->bdev)
 5   mddev->private=newconf

In linear_add() mddev->raid_disks is increased in time seq 2, and on
another CPU in linear_congested() the for-loop iterates conf->disks[i] by
the increased mddev->raid_disks in time seq 3,4. But conf with one more
element (which is a pointer to struct dev_info type) in conf->disks[] is
not updated yet, so accessing its structure member in time seq 4 will cause
a NULL pointer dereference fault.

The fix is to update mddev->private with the new value before increasing
mddev->raid_disks, and to make sure on other CPUs they are seen to be
updated in the same order as linear_add() does (otherwise the race may still
happen), a smp_mb() is necessary.

A question is, with this fix, if mddev->private is updated to the new value in
linear_add(), but in linear_congested() the for-loop still tests the old value
of mddev->raid_disks, then the iteration will miss the last element of
conf->disks[]. My answer is: don't worry about it, it's OK. The reasons are,
 - When updating mddev->private, the md linear device is suspended, no I/O
   may happen, so it is safe to miss the congestion status of the last
   newly added hard disk.
 - In the worst case linear_congested() returns 0 and I/O is sent to this md
   linear device, but the newly added hard disk is congested; then the I/O
   request will be blocked for a while if it just happens to hit the newly
   added hard disk. linear_congested() is in the code path of wb_congested(),
   which is quite hot in the write back code path. Compared to adding locking
   code in linear_congested(), the cost of the worst case is acceptable.

The bug is reported on a Linux v3.0 based kernel; the fix can and should be
applied to all kernels since Linux v3.0. I see linear_add() was merged into
mainline in Linux v2.6.18, so stable kernel maintainers of versions after
that may consider picking this fix as well.

Signed-off-by: Coly Li <colyli@suse.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Neil Brown <neilb@suse.com>
Cc: stable@vger.kernel.org
---
 drivers/md/linear.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5975c99..48ccfad 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -196,10 +196,22 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
 	if (!newconf)
 		return -ENOMEM;

+	/* In linear_congested(), mddev->raid_disks and mddev->private
+	 * are accessed without protection by mddev_suspend(). If, on
+	 * another CPU, linear_congested() still sees the old value of
+	 * mddev->private but already sees the increased value of
+	 * mddev->raid_disks, the last iteration over conf->disks[i].rdev
+	 * will trigger a NULL pointer dereference. To avoid this race,
+	 * mddev->private must be updated before increasing
+	 * mddev->raid_disks, and a smp_mb() is required between them. Then
+	 * in linear_congested(), we are sure the updated mddev->private is
+	 * seen when iterating conf->disks[i].
+	 */
 	mddev_suspend(mddev);
 	oldconf = mddev->private;
-	mddev->raid_disks++;
 	mddev->private = newconf;
+	smp_mb();
+	mddev->raid_disks++;
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev_resume(mddev);
-- 
2.6.6

^ permalink raw reply related

* errors when reading in one section of a disk
From: Boylan, Ross @ 2017-01-25  2:43 UTC (permalink / raw)
  To: linux-raid@vger.kernel.org

I suspect this problem has little to do with RAID, but since I'm using it I thought I'd start here.
I have 3 disks in a Vantec HX4 4 SATA drive box, connected by USB.  Repeatedly, when I try to read certain sectors of one disk every disk in the box seems to reset and then get remounted.  I thought this was a transient power failure the first time (though I'm on UPS), but when it happened again I figured the problem was elsewhere.  I am surprised that a read failure on one drive seems to reset the entire connection to the box.

The disk is a relatively new WD Red Pro, and so seems unlikely to have a physical failure.  The Vantec box is almost 5 years old, and seems a better suspect.  The only reason I can think that reading a particular spot on one disk would make it fail is that this is perhaps accompanied by some other activity at the same time.  The Vantec itself supplies no RAID or anything fancy, just access to the drives.

The setup is only nominally RAID1, since I have yet to add a spare drive for either array in use.

It looks as if the RAID arrays survive the redefinition of the physical devices (e.g., sde is reattached as sdj), but it's hard to tell since the partition with the error is mounted read-only and thus generates errors.  I have been rebooting the whole system shortly after the problem happens.  Should the /dev/md* devices survive such remapping beneath them?

After the remapping, the file systems that the md devices supported on the host computer were still mounted (albeit ro for one of them). mdadm -D did not list the names of the component devices, and the info in proc (or maybe it was run) seemed stale, since it still had the old device names (e.g., sde3 rather than sdj3).

Linux Kernel 3.16.  Hmm, these problems are occurring a day after a security upgrade of the kernel:
[UPGRADE] linux-image-3.16.0-0.bpo.4-amd64:amd64 3.16.36-1+deb8u2~bpo70+1 -> 3.16.39-1~bpo70+1
There is a very long list of changes between those versions: http://ftp-master.metadata.debian.org/changelogs//main/l/linux/linux_3.16.39-1~bpo70+1_changelog



Here are the kern.log entries from the event start:
Jan 24 15:22:59 tempserver kernel: [96418.113742] usb 2-4: USB disconnect, device number 3
Jan 24 15:22:59 tempserver kernel: [96418.117141] scsi 7:0:0:2: rejecting I/O to offline device
Jan 24 15:22:59 tempserver kernel: [96418.117149] scsi 7:0:0:2: [sde] killing request
Jan 24 15:22:59 tempserver kernel: [96418.117172] scsi 7:0:0:2: [sde] Unhandled error code
Jan 24 15:22:59 tempserver kernel: [96418.117174] scsi 7:0:0:2: [sde]  
Jan 24 15:22:59 tempserver kernel: [96418.117175] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
Jan 24 15:22:59 tempserver kernel: [96418.117177] scsi 7:0:0:2: [sde] CDB: 
Jan 24 15:22:59 tempserver kernel: [96418.117178] Read(16): 88 40 00 00 00 01 93 d9 41 a8 00 00 00 f0 00 00
Jan 24 15:22:59 tempserver kernel: [96418.117188] end_request: I/O error, dev sde, sector 6775456168  # 2nd time it was 6775455984
Jan 24 15:22:59 tempserver kernel: [96418.117202] scsi 7:0:0:2: rejecting I/O to offline device
Jan 24 15:22:59 tempserver kernel: [96418.117205] scsi 7:0:0:2: [sde] killing request
Jan 24 15:22:59 tempserver kernel: [96418.117208] scsi 7:0:0:2: [sde] Unhandled error code
Jan 24 15:22:59 tempserver kernel: [96418.117210] scsi 7:0:0:2: [sde]  
Jan 24 15:22:59 tempserver kernel: [96418.117211] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
Jan 24 15:22:59 tempserver kernel: [96418.117212] scsi 7:0:0:2: [sde] CDB: 
Jan 24 15:22:59 tempserver kernel: [96418.117213] Read(16): 88 00 00 00 00 01 93 d9 42 98 00 00 00 20 00 00
Jan 24 15:22:59 tempserver kernel: [96418.117220] end_request: I/O error, dev sde, sector 6775456408
Jan 24 15:23:01 tempserver kernel: [96419.765989] usb 2-4: new SuperSpeed USB device number 4 using xhci_hcd
Jan 24 15:23:01 tempserver kernel: [96419.782952] usb 2-4: New USB device found, idVendor=152d, idProduct=0551
Jan 24 15:23:01 tempserver kernel: [96419.782955] usb 2-4: New USB device strings: Mfr=1, Product=2, SerialNumber=5
Jan 24 15:23:01 tempserver kernel: [96419.782957] usb 2-4: Product: USB to ATA/ATAPI Bridge
Jan 24 15:23:01 tempserver kernel: [96419.782958] usb 2-4: Manufacturer: JMicron
Jan 24 15:23:01 tempserver kernel: [96419.782958] usb 2-4: SerialNumber: DA00862620FF
Jan 24 15:23:01 tempserver kernel: [96419.785081] usb-storage 2-4:1.0: USB Mass Storage device detected
Jan 24 15:23:01 tempserver kernel: [96419.785189] scsi8 : usb-storage 2-4:1.0
Jan 24 15:23:02 tempserver kernel: [96420.783801] scsi 8:0:0:0: Direct-Access     WDC WD20 01FASS-00W2B0         PQ: 0 ANSI: 2 CCS
Jan 24 15:23:02 tempserver kernel: [96420.784024] scsi 8:0:0:1: Direct-Access     WDC WD20 EARS-00MVWB0          PQ: 0 ANSI: 2 CCS
Jan 24 15:23:02 tempserver kernel: [96420.784240] scsi 8:0:0:2: Direct-Access     WDC WD40 01FFSX-68JNUN0        PQ: 0 ANSI: 2 CCS

The original read error seems to be in sde3, which is md/media3 aka md127 (see partition map below).  But there are subsequent failures on md126 aka media4.  The writes originate from VirtualBox which stores control info on media3 and has virtual disks on media3 and media4 (AFAIK this is all via regular files--the virtual disks are simply files on the host OS).  Here's the rest of the log, and then a bunch of diagnostic info on my system.  I'm not sure if attachments are OK, so this is all inline.

Thanks for any help you can offer.
Ross Boylan

Jan 24 15:23:02 tempserver kernel: [96420.784571] sd 8:0:0:0: Attached scsi generic sg2 type 0
Jan 24 15:23:02 tempserver kernel: [96420.784721] sd 8:0:0:1: Attached scsi generic sg3 type 0
Jan 24 15:23:02 tempserver kernel: [96420.784812] sd 8:0:0:0: [sdh] 3907029168 512-byte logical blocks: (2.00 TB/1.81 TiB)
Jan 24 15:23:02 tempserver kernel: [96420.784831] sd 8:0:0:2: Attached scsi generic sg4 type 0
Jan 24 15:23:02 tempserver kernel: [96420.785191] sd 8:0:0:0: [sdh] Write Protect is off
Jan 24 15:23:02 tempserver kernel: [96420.785194] sd 8:0:0:0: [sdh] Mode Sense: 28 00 00 00
Jan 24 15:23:02 tempserver kernel: [96420.785342] sd 8:0:0:1: [sdi] 3907029168 512-byte logical blocks: (2.00 TB/1.81 TiB)
Jan 24 15:23:02 tempserver kernel: [96420.785475] sd 8:0:0:2: [sdj] Very big device. Trying to use READ CAPACITY(16).
Jan 24 15:23:02 tempserver kernel: [96420.785704] sd 8:0:0:0: [sdh] No Caching mode page found
Jan 24 15:23:02 tempserver kernel: [96420.785706] sd 8:0:0:0: [sdh] Assuming drive cache: write through
Jan 24 15:23:02 tempserver kernel: [96420.786061] sd 8:0:0:1: [sdi] Write Protect is off
Jan 24 15:23:02 tempserver kernel: [96420.786063] sd 8:0:0:1: [sdi] Mode Sense: 28 00 00 00
Jan 24 15:23:02 tempserver kernel: [96420.786220] sd 8:0:0:2: [sdj] 7814037168 512-byte logical blocks: (4.00 TB/3.63 TiB)
Jan 24 15:23:02 tempserver kernel: [96420.786527] sd 8:0:0:1: [sdi] No Caching mode page found
Jan 24 15:23:02 tempserver kernel: [96420.786530] sd 8:0:0:1: [sdi] Assuming drive cache: write through
Jan 24 15:23:02 tempserver kernel: [96420.786794] sd 8:0:0:2: [sdj] Write Protect is off
Jan 24 15:23:02 tempserver kernel: [96420.786796] sd 8:0:0:2: [sdj] Mode Sense: 28 00 00 00
Jan 24 15:23:02 tempserver kernel: [96420.787246] sd 8:0:0:2: [sdj] No Caching mode page found
Jan 24 15:23:02 tempserver kernel: [96420.787249] sd 8:0:0:2: [sdj] Assuming drive cache: write through
Jan 24 15:23:02 tempserver kernel: [96420.797382] sd 8:0:0:2: [sdj] Very big device. Trying to use READ CAPACITY(16).
Jan 24 15:23:02 tempserver kernel: [96420.927411] Alternate GPT is invalid, using primary GPT.
Jan 24 15:23:02 tempserver kernel: [96420.927424]  sdi: sdi1 sdi2
Jan 24 15:23:02 tempserver kernel: [96420.928383]  sdh: sdh1 sdh2 sdh3
Jan 24 15:23:02 tempserver kernel: [96420.937783] sd 8:0:0:1: [sdi] Attached SCSI disk
Jan 24 15:23:02 tempserver kernel: [96420.938368]  sdj: sdj1 sdj2 sdj3
Jan 24 15:23:02 tempserver kernel: [96420.943147] sd 8:0:0:0: [sdh] Attached SCSI disk
Jan 24 15:23:02 tempserver kernel: [96420.944894] sd 8:0:0:2: [sdj] Very big device. Trying to use READ CAPACITY(16).
Jan 24 15:23:02 tempserver kernel: [96420.945971] sd 8:0:0:2: [sdj] Attached SCSI disk
Jan 24 15:23:04 tempserver kernel: [96423.135634] md: super_written gets error=-19, uptodate=0
Jan 24 15:23:04 tempserver kernel: [96423.135666] EXT4-fs warning (device md127): ext4_end_bio:317: I/O error -5 writing to inode 75497479 (offset 0 size 0 starting block 15947872)
Jan 24 15:23:04 tempserver kernel: [96423.135670] Buffer I/O error on device md127, logical block 15947872
Jan 24 15:23:04 tempserver kernel: [96423.135676] EXT4-fs warning (device md127): ext4_end_bio:317: I/O error -5 writing to inode 75497479 (offset 397312 size 4096 starting block 15947873)
Jan 24 15:23:04 tempserver kernel: [96423.135679] Buffer I/O error on device md127, logical block 15947873
Jan 24 15:23:04 tempserver kernel: [96423.135684] JBD2: Detected IO errors while flushing file data on md127-8
Jan 24 15:23:04 tempserver kernel: [96423.135687] Aborting journal on device md127-8.
Jan 24 15:23:04 tempserver kernel: [96423.135698] Buffer I/O error on device md127, logical block 243826688
Jan 24 15:23:04 tempserver kernel: [96423.135698] lost page write due to I/O error on md127
Jan 24 15:23:04 tempserver kernel: [96423.135702] JBD2: Error -5 detected when updating journal superblock for md127-8.
Jan 24 15:23:04 tempserver kernel: [96423.339670] md: super_written gets error=-19, uptodate=0
Jan 24 15:23:12 tempserver kernel: [96431.622932] md: super_written gets error=-19, uptodate=0
Jan 24 15:23:12 tempserver kernel: [96431.622951] Buffer I/O error on device md127, logical block 0
Jan 24 15:23:12 tempserver kernel: [96431.622952] lost page write due to I/O error on md127
Jan 24 15:23:12 tempserver kernel: [96431.622958] EXT4-fs error (device md127): ext4_journal_check_start:56: Detected aborted journal
Jan 24 15:23:12 tempserver kernel: [96431.622961] EXT4-fs (md127): Remounting filesystem read-only
Jan 24 15:23:12 tempserver kernel: [96431.622963] EXT4-fs (md127): previous I/O error to superblock detected
Jan 24 15:23:12 tempserver kernel: [96431.622971] Buffer I/O error on device md127, logical block 0
Jan 24 15:23:12 tempserver kernel: [96431.622972] lost page write due to I/O error on md127
Jan 24 15:23:13 tempserver kernel: [96431.663207] plugin-containe[11485]: segfault at 141aa150e3c4 ip 00007fe380dfe522 sp 00007fff062466e8 error 6 in libflashplayer.so[7fe38078d000+107b000]
Jan 24 15:23:13 tempserver kernel: [96431.823785] md: super_written gets error=-19, uptodate=0
Jan 24 15:23:32 tempserver kernel: [96451.362478] device br0 left promiscuous mode
Jan 24 15:23:32 tempserver kernel: [96451.381227] vboxnetflt: 0 out of 11296830 packets were not sent (directed to host)
Jan 24 15:23:55 tempserver kernel: [96474.392410] md: super_written gets error=-19, uptodate=0
Jan 24 15:23:55 tempserver kernel: [96474.392454] Aborting journal on device md126-8.
Jan 24 15:23:55 tempserver kernel: [96474.392464] Buffer I/O error on device md126, logical block 243826688
Jan 24 15:23:55 tempserver kernel: [96474.392465] lost page write due to I/O error on md126
Jan 24 15:23:55 tempserver kernel: [96474.392469] JBD2: Error -5 detected when updating journal superblock for md126-8.
Jan 24 15:23:55 tempserver kernel: [96474.596458] md: super_written gets error=-19, uptodate=0
Jan 24 15:36:05 tempserver kernel: [97204.943952] md: super_written gets error=-19, uptodate=0
Jan 24 15:36:05 tempserver kernel: [97204.952725] Buffer I/O error on device md126, logical block 488344000
Jan 24 15:36:05 tempserver kernel: [97204.952737] Buffer I/O error on device md126, logical block 488344000
Jan 24 15:36:05 tempserver kernel: [97204.953310] Buffer I/O error on device md126, logical block 488344014
Jan 24 15:36:05 tempserver kernel: [97204.953317] Buffer I/O error on device md126, logical block 488344014
Jan 24 15:36:05 tempserver kernel: [97204.953342] Buffer I/O error on device md126, logical block 488344015
Jan 24 15:36:05 tempserver kernel: [97204.953347] Buffer I/O error on device md126, logical block 488344015
Jan 24 15:36:05 tempserver kernel: [97204.953598] Buffer I/O error on device md126, logical block 488344015
Jan 24 15:36:06 tempserver kernel: [97205.157682] md: super_written gets error=-19, uptodate=0
Jan 24 15:38:55 tempserver kernel: [97374.191852] md: super_written gets error=-19, uptodate=0
Jan 24 15:38:55 tempserver kernel: [97374.192780] Buffer I/O error on device md126, logical block 488344000
Jan 24 15:38:55 tempserver kernel: [97374.192786] Buffer I/O error on device md126, logical block 488344000
Jan 24 15:38:55 tempserver kernel: [97374.197645] Buffer I/O error on device md126, logical block 488344014
Jan 24 15:38:55 tempserver kernel: [97374.197656] Buffer I/O error on device md126, logical block 488344014
Jan 24 15:38:55 tempserver kernel: [97374.197927] Buffer I/O error on device md126, logical block 488344015
Jan 24 15:38:55 tempserver kernel: [97374.197932] Buffer I/O error on device md126, logical block 488344015
Jan 24 15:38:55 tempserver kernel: [97374.198258] Buffer I/O error on device md126, logical block 488344015
Jan 24 15:38:55 tempserver kernel: [97374.399491] md: super_written gets error=-19, uptodate=0
Jan 24 15:44:44 tempserver kernel: [97723.869340] md: super_written gets error=-19, uptodate=0
Jan 24 15:44:44 tempserver kernel: [97724.072512] md: super_written gets error=-19, uptodate=0
Jan 24 15:59:35 tempserver kernel: [98615.191953] Buffer I/O error on device md127, logical block 488344256
Jan 24 15:59:35 tempserver kernel: [98615.191962] Buffer I/O error on device md127, logical block 488344256
Jan 24 15:59:35 tempserver kernel: [98615.191970] Buffer I/O error on device md127, logical block 488344270
Jan 24 15:59:35 tempserver kernel: [98615.191973] Buffer I/O error on device md127, logical block 488344270
Jan 24 15:59:35 tempserver kernel: [98615.191979] Buffer I/O error on device md127, logical block 0
Jan 24 15:59:35 tempserver kernel: [98615.191986] Buffer I/O error on device md127, logical block 488344271
Jan 24 15:59:35 tempserver kernel: [98615.192017] Buffer I/O error on device md127, logical block 488344271
Jan 24 15:59:35 tempserver kernel: [98615.192022] Buffer I/O error on device md127, logical block 488344271
Jan 24 15:59:35 tempserver kernel: [98615.192027] Buffer I/O error on device md127, logical block 488344271
Jan 24 15:59:35 tempserver kernel: [98615.192031] Buffer I/O error on device md127, logical block 488344271
Jan 24 16:09:34 tempserver kernel: [99214.978331] md: super_written gets error=-19, uptodate=0
Jan 24 16:09:35 tempserver kernel: [99215.181417] md: super_written gets error=-19, uptodate=0
Jan 24 16:09:36 tempserver kernel: [99216.470117] nfsd: last server has exited, flushing export cache
Jan 24 16:09:37 tempserver kernel: [99217.660950] Ebtables v2.0 registered
Jan 24 16:09:38 tempserver kernel: [99218.145566] Ebtables v2.0 unregistered
Jan 24 16:10:08 tempserver kernel: [99248.258963] md: super_written gets error=-19, uptodate=0
Jan 24 16:10:08 tempserver kernel: [99248.461482] md: super_written gets error=-19, uptodate=0

root@tempserver:~# date
Tue Jan 24 16:48:33 PST 2017
root@tempserver:~# ls /dev/sd?
/dev/sda  /dev/sdb  /dev/sdc  /dev/sdd	/dev/sde  /dev/sdf  /dev/sdg
root@tempserver:~# ls -lR /dev/disk/
/dev/disk/:
total 0
drwxr-xr-x 2 root root 2580 Jan 24 16:43 by-id
drwxr-xr-x 2 root root   80 Jan 24 16:43 by-label
drwxr-xr-x 2 root root  460 Jan 24 16:43 by-path
drwxr-xr-x 2 root root  760 Jan 24 16:43 by-uuid

/dev/disk/by-id:
total 0
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-ASUS_BW-12B1ST_D1D0CL102702 -> ../../sr0
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-INTEL_SSDSC2CT240A4_CVKI3111011H240DGN -> ../../sda
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-INTEL_SSDSC2CT240A4_CVKI3111011H240DGN-part1 -> ../../sda1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-INTEL_SSDSC2CT240A4_CVKI3111011H240DGN-part2 -> ../../sda2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-INTEL_SSDSC2CT240A4_CVKI3111011H240DGN-part5 -> ../../sda5
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-ST31000528AS_5VP2GNTB -> ../../sdf
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-ST31000528AS_5VP2GNTB-part1 -> ../../sdf1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-ST31000528AS_5VP2GNTB-part2 -> ../../sdf2
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-WDC_WD2001FASS-00W2B0_WD-WMAY00862620 -> ../../sdc
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD2001FASS-00W2B0_WD-WMAY00862620-part1 -> ../../sdc1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD2001FASS-00W2B0_WD-WMAY00862620-part2 -> ../../sdc2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD2001FASS-00W2B0_WD-WMAY00862620-part3 -> ../../sdc3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-WDC_WD20EARS-00MVWB0_WD-WMAZA2177901 -> ../../sdd
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD20EARS-00MVWB0_WD-WMAZA2177901-part1 -> ../../sdd1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD20EARS-00MVWB0_WD-WMAZA2177901-part2 -> ../../sdd2
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-WDC_WD30EFRX-68AX9N0_WD-WMC1T3337275 -> ../../sdg
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD30EFRX-68AX9N0_WD-WMC1T3337275-part1 -> ../../sdg1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD30EFRX-68AX9N0_WD-WMC1T3337275-part2 -> ../../sdg2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD30EFRX-68AX9N0_WD-WMC1T3337275-part3 -> ../../sdg3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD30EFRX-68AX9N0_WD-WMC1T3337275-part4 -> ../../sdg4
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-WDC_WD4000FYYZ-01UL1B2_WD-WMC130E91SHR -> ../../sdb
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD4000FYYZ-01UL1B2_WD-WMC130E91SHR-part1 -> ../../sdb1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD4000FYYZ-01UL1B2_WD-WMC130E91SHR-part2 -> ../../sdb2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD4000FYYZ-01UL1B2_WD-WMC130E91SHR-part3 -> ../../sdb3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 ata-WDC_WD4001FFSX-68JNUN0_WD-WMC5D0D0FC30 -> ../../sde
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD4001FFSX-68JNUN0_WD-WMC5D0D0FC30-part1 -> ../../sde1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD4001FFSX-68JNUN0_WD-WMC5D0D0FC30-part2 -> ../../sde2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 ata-WDC_WD4001FFSX-68JNUN0_WD-WMC5D0D0FC30-part3 -> ../../sde3
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-amd64 -> ../../dm-21
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-bacula--backup -> ../../dm-11
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-boot_rescue -> ../../dm-16
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-chroot -> ../../dm-20
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-cyrspool3_raw -> ../../dm-12
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-cyrspool4_raw -> ../../dm-18
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-htpc_raw -> ../../dm-22
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-media -> ../../dm-19
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-root2_raw -> ../../dm-14
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-root_rescue -> ../../dm-15
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-spare -> ../../dm-17
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-daisy-swap2_raw -> ../../dm-13
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-local -> ../../dm-27
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-backup -> ../../dm-0
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-corn_vdisk -> ../../dm-4
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-emergency_raw -> ../../dm-8
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-local_raw -> ../../dm-7
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-media01b -> ../../dm-9
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-tcorn_raw -> ../../dm-3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-thome_raw -> ../../dm-2
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-sunflower-tjessie_raw -> ../../dm-10
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-troot_raw -> ../../dm-1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-usr -> ../../dm-5
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-name-sunflower-var_raw -> ../../dm-6
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-tcorn -> ../../dm-25
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-thome -> ../../dm-24
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-troot -> ../../dm-23
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-name-var -> ../../dm-26
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-CRYPT-LUKS1-2bb2a8af61d84197bb5bf0ada5f8c145-troot -> ../../dm-23
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-CRYPT-LUKS1-5b2669f5f53d426aa020222328b7b1b5-var_unformatted -> ../../dm-26
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-CRYPT-LUKS1-8e44946b36714849bf932b1fea424152-thome_unformatted -> ../../dm-24
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-CRYPT-LUKS1-8f81993c99bd45878c6691cc34f7a72d-tcorn_unformatted -> ../../dm-25
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-CRYPT-LUKS1-c4c3307e1dc74355b856d0bb74c9b34d-local_unformatted -> ../../dm-27
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2Lwehm1xZkVI2ym848WteFyvcU0WMI1SmxbTe5 -> ../../dm-0
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2Lwehm5rfsZVH44ldAowRfKLUqIbOUgI4DNm6t -> ../../dm-8
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmgqkTmQkgfRwVc9T9DXrt367flkvDySxZ -> ../../dm-5
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmlDm0ycEdcp9iuYRrds6PYMAJaqCrBdfN -> ../../dm-6
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmqtwcajNbZPU79hzjRIZOQ0Mfp1KqWHUi -> ../../dm-1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmSK7oOXpTLFeETtX3H5fs3Xh03wYznC1l -> ../../dm-9
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmTXs2ZnNyqXnBosaETfSn768SDzdWwOSa -> ../../dm-2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmuEA7zvu1IbpGn1dTRTSoTH1HB9fLzMrW -> ../../dm-7
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmuTNw0CbcB2KnjHimq6qivMq9v5QBoI2E -> ../../dm-4
lrwxrwxrwx 1 root root 10 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmXvn23A9ek1cLamHDhMlj0pH0HoDiasUd -> ../../dm-3
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-XoFdIZYHwuly4qbL9Fs02JawEn2LwehmyHVB0zmJLScJ0HAvMKt64l4xN4pXi0mj -> ../../dm-10
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe90vCqlTJEJz9XNqDiLqiAFb3layCyLU37 -> ../../dm-20
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe99ipPWHOKmw0yofhIvCOVboVu8UMDM4u4 -> ../../dm-15
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9GREOTHce1L8jwqvNJFAKxSmnJ0dPTd9G -> ../../dm-11
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9L1Bjf25QpiD4oECosMnoQRDVmVsh0qkE -> ../../dm-13
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9mcPIsz7WhlkqQC2X2oVfxY79HBAofIHj -> ../../dm-22
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9MSso91I10LYVQabovqhnlZDWUjEdx0V5 -> ../../dm-17
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9Q2MlxfXKVRxtrZ0x2nvDPcrmImPt1pmR -> ../../dm-14
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9qhvdeinb4t1wLcJSpahjLPKj7vLIUVAC -> ../../dm-18
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9Ssp6mtjMM0QcG0o0RVd6MiMzfSbqaEp6 -> ../../dm-12
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9sunZGxMZUjSELYuiH8dO9KkGls13MI1m -> ../../dm-21
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9U2iTmorRBKzgoAFaOryb3gSRiJF3NKC2 -> ../../dm-16
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dm-uuid-LVM-z2DbsrYwb0J56tBMDTcfxH7cumFodpe9vGye9ccRLab1PTYCTHNeH01PZ325Dot0 -> ../../dm-19
lrwxrwxrwx 1 root root 11 Jan 24 16:43 md-name-tempserver:media3 -> ../../md127
lrwxrwxrwx 1 root root 11 Jan 24 16:43 md-name-tempserver:media4 -> ../../md126
lrwxrwxrwx 1 root root 11 Jan 24 16:43 md-uuid-285852a7:f02b9cdf:f2dca45a:f6a50616 -> ../../md127
lrwxrwxrwx 1 root root 11 Jan 24 16:43 md-uuid-69440821:e9aa9259:ab2b06ce:d09bedc7 -> ../../md126
lrwxrwxrwx 1 root root  9 Jan 24 16:43 scsi-SATA_INTEL_SSDSC2CT2CVKI3111011H240DGN -> ../../sda
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_INTEL_SSDSC2CT2CVKI3111011H240DGN-part1 -> ../../sda1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_INTEL_SSDSC2CT2CVKI3111011H240DGN-part2 -> ../../sda2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_INTEL_SSDSC2CT2CVKI3111011H240DGN-part5 -> ../../sda5
lrwxrwxrwx 1 root root  9 Jan 24 16:43 scsi-SATA_ST31000528AS_5VP2GNTB -> ../../sdf
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_ST31000528AS_5VP2GNTB-part1 -> ../../sdf1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_ST31000528AS_5VP2GNTB-part2 -> ../../sdf2
lrwxrwxrwx 1 root root  9 Jan 24 16:43 scsi-SATA_WDC_WD30EFRX-68_WD-WMC1T3337275 -> ../../sdg
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_WDC_WD30EFRX-68_WD-WMC1T3337275-part1 -> ../../sdg1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_WDC_WD30EFRX-68_WD-WMC1T3337275-part2 -> ../../sdg2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_WDC_WD30EFRX-68_WD-WMC1T3337275-part3 -> ../../sdg3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SATA_WDC_WD30EFRX-68_WD-WMC1T3337275-part4 -> ../../sdg4
lrwxrwxrwx 1 root root  9 Jan 24 16:43 scsi-SWDC_WD4000FYYZ-01UL1B2_310000104102 -> ../../sdb
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SWDC_WD4000FYYZ-01UL1B2_310000104102-part1 -> ../../sdb1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SWDC_WD4000FYYZ-01UL1B2_310000104102-part2 -> ../../sdb2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 scsi-SWDC_WD4000FYYZ-01UL1B2_310000104102-part3 -> ../../sdb3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 wwn-0x5000c5001e215923 -> ../../sdf
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x5000c5001e215923-part1 -> ../../sdf1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x5000c5001e215923-part2 -> ../../sdf2
lrwxrwxrwx 1 root root  9 Jan 24 16:43 wwn-0x50014ee05962817d -> ../../sdb
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee05962817d-part1 -> ../../sdb1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee05962817d-part2 -> ../../sdb2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee05962817d-part3 -> ../../sdb3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 wwn-0x50014ee0aebe906f -> ../../sde
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee0aebe906f-part1 -> ../../sde1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee0aebe906f-part2 -> ../../sde2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee0aebe906f-part3 -> ../../sde3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 wwn-0x50014ee6ab45f9a7 -> ../../sdd
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee6ab45f9a7-part1 -> ../../sdd1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee6ab45f9a7-part2 -> ../../sdd2
lrwxrwxrwx 1 root root  9 Jan 24 16:43 wwn-0x50014ee6addf95c6 -> ../../sdg
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee6addf95c6-part1 -> ../../sdg1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee6addf95c6-part2 -> ../../sdg2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee6addf95c6-part3 -> ../../sdg3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x50014ee6addf95c6-part4 -> ../../sdg4
lrwxrwxrwx 1 root root  9 Jan 24 16:43 wwn-0x55cd2e400003c916 -> ../../sda
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x55cd2e400003c916-part1 -> ../../sda1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x55cd2e400003c916-part2 -> ../../sda2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 wwn-0x55cd2e400003c916-part5 -> ../../sda5

/dev/disk/by-label:
total 0
lrwxrwxrwx 1 root root 10 Jan 24 16:43 2TB -> ../../sdb3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 backup\x20NTFS -> ../../sdb2

/dev/disk/by-path:
total 0
lrwxrwxrwx 1 root root  9 Jan 24 16:43 pci-0000:00:14.0-usb-0:2:1.0-scsi-0:0:0:0 -> ../../sdb
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:2:1.0-scsi-0:0:0:0-part1 -> ../../sdb1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:2:1.0-scsi-0:0:0:0-part2 -> ../../sdb2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:2:1.0-scsi-0:0:0:0-part3 -> ../../sdb3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:0 -> ../../sdc
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:0-part1 -> ../../sdc1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:0-part2 -> ../../sdc2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:0-part3 -> ../../sdc3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:1 -> ../../sdd
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:1-part1 -> ../../sdd1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:1-part2 -> ../../sdd2
lrwxrwxrwx 1 root root  9 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:2 -> ../../sde
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:2-part1 -> ../../sde1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:2-part2 -> ../../sde2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:14.0-usb-0:4:1.0-scsi-0:0:0:2-part3 -> ../../sde3
lrwxrwxrwx 1 root root  9 Jan 24 16:43 pci-0000:00:1f.2-scsi-0:0:0:0 -> ../../sdg
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:1f.2-scsi-0:0:0:0-part1 -> ../../sdf1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:1f.2-scsi-0:0:0:0-part2 -> ../../sdg2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:1f.2-scsi-0:0:0:0-part3 -> ../../sdg3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:1f.2-scsi-0:0:0:0-part4 -> ../../sdg4
lrwxrwxrwx 1 root root 10 Jan 24 16:43 pci-0000:00:1f.2-scsi-0:0:0:0-part5 -> ../../sda5

/dev/disk/by-uuid:
total 0
lrwxrwxrwx 1 root root 10 Jan 24 16:43 0662CED262CEC621 -> ../../sdb3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 081b7fd9-577f-46ad-a659-b74a03fe6e3f -> ../../dm-9
lrwxrwxrwx 1 root root 10 Jan 24 16:43 2707f7ec-48cc-4c41-98ec-4dc5ee8bb8dd -> ../../sdc2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 2bb2a8af-61d8-4197-bb5b-f0ada5f8c145 -> ../../dm-1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 2f3af1c2-9fc4-4aef-a21e-cd115b5e2a8f -> ../../sda1
lrwxrwxrwx 1 root root 10 Jan 24 16:43 303e4b54-0fd7-4139-97d1-a3b97a7e367b -> ../../sdg3
lrwxrwxrwx 1 root root 10 Jan 24 16:43 5b2669f5-f53d-426a-a020-222328b7b1b5 -> ../../dm-6
lrwxrwxrwx 1 root root 11 Jan 24 16:43 5c60cd58-788e-41c4-a5eb-78109e64bfdd -> ../../dm-14
lrwxrwxrwx 1 root root 11 Jan 24 16:43 5ff96b07-d4c2-4b9f-b481-62d77492c18e -> ../../dm-10
lrwxrwxrwx 1 root root 11 Jan 24 16:43 623c8d83-e4c5-497c-bc93-7bda97501c34 -> ../../dm-26
lrwxrwxrwx 1 root root 11 Jan 24 16:43 65bf0b61-0d0b-44ac-9e10-e22d4949b8a5 -> ../../dm-18
lrwxrwxrwx 1 root root 11 Jan 24 16:43 67dc2d8b-6cf7-4e71-8916-764260e54d47 -> ../../md126
lrwxrwxrwx 1 root root 11 Jan 24 16:43 81792fa0-2c78-4a7d-a3d3-5f2f8e7a5eee -> ../../dm-12
lrwxrwxrwx 1 root root 11 Jan 24 16:43 87345578-0806-48ba-b2c2-492fb4b5022c -> ../../dm-17
lrwxrwxrwx 1 root root 10 Jan 24 16:43 8e44946b-3671-4849-bf93-2b1fea424152 -> ../../dm-2
lrwxrwxrwx 1 root root 10 Jan 24 16:43 8f81993c-99bd-4587-8c66-91cc34f7a72d -> ../../dm-3
lrwxrwxrwx 1 root root 11 Jan 24 16:43 91b78c10-140f-4b25-aace-2cab439164be -> ../../dm-15
lrwxrwxrwx 1 root root 10 Jan 24 16:43 9839fbc0-250d-483e-b741-b4e868cb5723 -> ../../dm-0
lrwxrwxrwx 1 root root 10 Jan 24 16:43 9a5935ce-1760-4f5f-81a4-83bb25ee8fd6 -> ../../dm-5
lrwxrwxrwx 1 root root 10 Jan 24 16:43 a0ee2630-2804-435c-b45d-2710c7a49dd1 -> ../../sdg4
lrwxrwxrwx 1 root root 10 Jan 24 16:43 af09e385-caa4-4163-ab0b-b9208a263b7e -> ../../sdf1
lrwxrwxrwx 1 root root 11 Jan 24 16:43 b04de76a-16fc-458a-a3f9-1e991706e53a -> ../../dm-20
lrwxrwxrwx 1 root root 11 Jan 24 16:43 b3874960-ae0e-463b-bb40-924b214b5583 -> ../../dm-16
lrwxrwxrwx 1 root root 10 Jan 24 16:43 B4F46594F4655A1E -> ../../sdb2
lrwxrwxrwx 1 root root 11 Jan 24 16:43 ba5d596d-67cf-4546-afec-e38937a89603 -> ../../dm-22
lrwxrwxrwx 1 root root 11 Jan 24 16:43 c2c3c7dd-7e38-4073-a2d5-e33c1a8eead9 -> ../../dm-21
lrwxrwxrwx 1 root root 10 Jan 24 16:43 c4c3307e-1dc7-4355-b856-d0bb74c9b34d -> ../../dm-7
lrwxrwxrwx 1 root root 11 Jan 24 16:43 d0536249-aaac-4973-99b5-d8d192429f15 -> ../../dm-19
lrwxrwxrwx 1 root root 11 Jan 24 16:43 d1a627a2-cfcc-4b56-b8d4-52cd0d59c255 -> ../../dm-25
lrwxrwxrwx 1 root root 10 Jan 24 16:43 d53c9913-1277-4432-b62d-ab801359e237 -> ../../sda5
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dd3e1ea2-3333-40d3-846f-7ba26e44fdc1 -> ../../dm-27
lrwxrwxrwx 1 root root 11 Jan 24 16:43 dd9c3a51-e3ba-4d79-8f7d-95c6e94a3c4d -> ../../md127
lrwxrwxrwx 1 root root 11 Jan 24 16:43 e2078a85-f24d-4d04-b557-29a802231a4c -> ../../dm-11
lrwxrwxrwx 1 root root 10 Jan 24 16:43 f1083f5b-4c52-4b42-ad53-5da55f90de65 -> ../../dm-8
lrwxrwxrwx 1 root root 11 Jan 24 16:43 f8a05932-b9bd-463d-a058-97533a248b87 -> ../../dm-24
lrwxrwxrwx 1 root root 11 Jan 24 16:43 fae964d1-464e-4b03-bbdd-d5854b979918 -> ../../dm-23
root@tempserver:~# mdadm -D /dev/md126
/dev/md126:
        Version : 1.2
  Creation Time : Thu Feb 18 14:09:00 2016
     Raid Level : raid1
     Array Size : 1953376064 (1862.88 GiB 2000.26 GB)
  Used Dev Size : 1953376064 (1862.88 GiB 2000.26 GB)
   Raid Devices : 1
  Total Devices : 1
    Persistence : Superblock is persistent

    Update Time : Tue Jan 24 16:46:10 2017
          State : clean 
 Active Devices : 1
Working Devices : 1
 Failed Devices : 0
  Spare Devices : 0

           Name : tempserver:media4  (local to host tempserver)
           UUID : 69440821:e9aa9259:ab2b06ce:d09bedc7
         Events : 28

    Number   Major   Minor   RaidDevice State
       0       8       66        0      active sync   /dev/sde2
root@tempserver:~# mdadm -D /dev/md127
/dev/md127:
        Version : 1.2
  Creation Time : Sat Dec 12 15:50:57 2015
     Raid Level : raid1
     Array Size : 1953377088 (1862.89 GiB 2000.26 GB)
  Used Dev Size : 1953377088 (1862.89 GiB 2000.26 GB)
   Raid Devices : 1
  Total Devices : 1
    Persistence : Superblock is persistent

    Update Time : Tue Jan 24 16:48:18 2017
          State : clean 
 Active Devices : 1
Working Devices : 1
 Failed Devices : 0
  Spare Devices : 0

           Name : tempserver:media3  (local to host tempserver)
           UUID : 285852a7:f02b9cdf:f2dca45a:f6a50616
         Events : 42

    Number   Major   Minor   RaidDevice State
       0       8       67        0      active sync   /dev/sde3
root@tempserver:~# parted /dev/sde p  # see a bit further down for same info in sectors
Model: WDC WD40 01FFSX-68JNUN0 (scsi)
Disk /dev/sde: 4001GB
Sector size (logical/physical): 512B/512B
Partition Table: gpt

Number  Start   End     Size    File system  Name            Flags
 1      1049kB  2097kB  1049kB               boot            bios_grub
 2      2097kB  2000GB  2000GB               wdredpro-spare
 3      2000GB  4001GB  2000GB               media3

root@tempserver:~# mdadm -v -E /dev/sde3
/dev/sde3:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : 285852a7:f02b9cdf:f2dca45a:f6a50616
           Name : tempserver:media3  (local to host tempserver)
  Creation Time : Sat Dec 12 15:50:57 2015
     Raid Level : raid1
   Raid Devices : 1

 Avail Dev Size : 3906754560 (1862.89 GiB 2000.26 GB)
     Array Size : 1953377088 (1862.89 GiB 2000.26 GB)
  Used Dev Size : 3906754176 (1862.89 GiB 2000.26 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
          State : clean
    Device UUID : f3e9d7c0:45950e91:dd342a67:2a4cd675

    Update Time : Tue Jan 24 16:48:18 2017
       Checksum : 5a04fd2 - correct
         Events : 42


   Device Role : Active device 0
   Array State : A ('A' == active, '.' == missing)
root@tempserver:~# parted /dev/sde
GNU Parted 2.3
Using /dev/sde
Welcome to GNU Parted! Type 'help' to view a list of commands.
(parted) unit s
(parted) p
Model: WDC WD40 01FFSX-68JNUN0 (scsi)
Disk /dev/sde: 7814037168s
Sector size (logical/physical): 512B/512B
Partition Table: gpt

Number  Start        End          Size         File system  Name            Flags
 1      2048s        4095s        2048s                     boot            bios_grub
 2      4096s        3907018751s  3907014656s               wdredpro-spare
 3      3907018752s  7814035455s  3907016704s               media3

(parted) q
root@tempserver:~# df -h .
Filesystem         Size  Used Avail Use% Mounted on
/dev/mapper/troot  959M  433M  478M  48% /
root@tempserver:~# cd
root@tempserver:~# pwd
/root
root@tempserver:~# date; smartctl -H -i -l scterc /dev/sde
Tue Jan 24 17:28:33 PST 2017
smartctl 5.41 2011-06-09 r3365 [x86_64-linux-3.16.0-0.bpo.4-amd64] (local build)
Copyright (C) 2002-11 by Bruce Allen, http://smartmontools.sourceforge.net

=== START OF INFORMATION SECTION ===
Device Model:     WDC WD4001FFSX-68JNUN0
Serial Number:    WD-WMC5D0D0FC30
LU WWN Device Id: 5 0014ee 0aebe906f
Firmware Version: 81.00A81
User Capacity:    4,000,787,030,016 bytes [4.00 TB]
Sector Sizes:     512 bytes logical, 4096 bytes physical
Device is:        Not in smartctl database [for details use: -P showall]
ATA Version is:   8
ATA Standard is:  Exact ATA specification draft version not indicated
Local Time is:    Tue Jan 24 17:28:33 2017 PST
SMART support is: Available - device has SMART capability.
SMART support is: Enabled

=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED

Error Write SCT (Get) Error Recovery Control Command failed: ATA output registers not supported
Warning: device does not support SCT (Get) Error Recovery Control command

root@tempserver:~# mdadm -E /dev/sde
/dev/sde:
   MBR Magic : aa55
Partition[0] :   4294967295 sectors at            1 (type ee)
root@tempserver:~# mdadm -E /dev/sde[23]
/dev/sde2:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : 69440821:e9aa9259:ab2b06ce:d09bedc7
           Name : tempserver:media4  (local to host tempserver)
  Creation Time : Thu Feb 18 14:09:00 2016
     Raid Level : raid1
   Raid Devices : 1

 Avail Dev Size : 3906752512 (1862.88 GiB 2000.26 GB)
     Array Size : 1953376064 (1862.88 GiB 2000.26 GB)
  Used Dev Size : 3906752128 (1862.88 GiB 2000.26 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
          State : clean
    Device UUID : 41e5317f:225a70b9:5970930e:6a5ff665

    Update Time : Tue Jan 24 16:46:10 2017
       Checksum : 9d338c13 - correct
         Events : 28


   Device Role : Active device 0
   Array State : A ('A' == active, '.' == missing)
/dev/sde3:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : 285852a7:f02b9cdf:f2dca45a:f6a50616
           Name : tempserver:media3  (local to host tempserver)
  Creation Time : Sat Dec 12 15:50:57 2015
     Raid Level : raid1
   Raid Devices : 1

 Avail Dev Size : 3906754560 (1862.89 GiB 2000.26 GB)
     Array Size : 1953377088 (1862.89 GiB 2000.26 GB)
  Used Dev Size : 3906754176 (1862.89 GiB 2000.26 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
          State : clean
    Device UUID : f3e9d7c0:45950e91:dd342a67:2a4cd675

    Update Time : Tue Jan 24 16:48:18 2017
       Checksum : 5a04fd2 - correct
         Events : 42


   Device Role : Active device 0
   Array State : A ('A' == active, '.' == missing)
root@tempserver:~# mdadm -D /dev/md/media3
/dev/md/media3:
        Version : 1.2
  Creation Time : Sat Dec 12 15:50:57 2015
     Raid Level : raid1
     Array Size : 1953377088 (1862.89 GiB 2000.26 GB)
  Used Dev Size : 1953377088 (1862.89 GiB 2000.26 GB)
   Raid Devices : 1
  Total Devices : 1
    Persistence : Superblock is persistent

    Update Time : Tue Jan 24 16:48:18 2017
          State : clean 
 Active Devices : 1
Working Devices : 1
 Failed Devices : 0
  Spare Devices : 0

           Name : tempserver:media3  (local to host tempserver)
           UUID : 285852a7:f02b9cdf:f2dca45a:f6a50616
         Events : 42

    Number   Major   Minor   RaidDevice State
       0       8       67        0      active sync   /dev/sde3
root@tempserver:~# mdadm -D /dev/md/media4
/dev/md/media4:
        Version : 1.2
  Creation Time : Thu Feb 18 14:09:00 2016
     Raid Level : raid1
     Array Size : 1953376064 (1862.88 GiB 2000.26 GB)
  Used Dev Size : 1953376064 (1862.88 GiB 2000.26 GB)
   Raid Devices : 1
  Total Devices : 1
    Persistence : Superblock is persistent

    Update Time : Tue Jan 24 16:46:10 2017
          State : clean 
 Active Devices : 1
Working Devices : 1
 Failed Devices : 0
  Spare Devices : 0

           Name : tempserver:media4  (local to host tempserver)
           UUID : 69440821:e9aa9259:ab2b06ce:d09bedc7
         Events : 28

    Number   Major   Minor   RaidDevice State
       0       8       66        0      active sync   /dev/sde2
root@tempserver:~# /usr/local/src/lsdrv/lsdrv 
Traceback (most recent call last):
  File "/usr/local/src/lsdrv/lsdrv", line 469, in <module>
    devstat = os.stat('/dev/'+vg_name+'/'+lv_name)
OSError: [Errno 2] No such file or directory: '/dev/daisy/cyrlib'
root@tempserver:~# cat /proc/mdstat
Personalities : [raid1] 
md126 : active raid1 sde2[0]
      1953376064 blocks super 1.2 [1/1] [U]
      
md127 : active raid1 sde3[0]
      1953377088 blocks super 1.2 [1/1] [U]
      
unused devices: <none>



^ permalink raw reply

* [PATCH v1] md/r5cache: improve journal device efficiency
From: Song Liu @ 2017-01-24 22:08 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu, jsorensen

It is important to be able to flush all stripes in raid5-cache.
Therefore, we need to reserve some space on the journal device for
these flushes. If the flush operation includes pending writes to the
stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
for the flush out. This reduces the efficiency of journal space.
If we exclude these pending writes from the flush operation, we only
need (conf->max_degraded + 1) pages per stripe.

With this patch, when log space is critical (R5C_LOG_CRITICAL=1),
pending writes will be excluded from stripe flush out. Therefore,
we can reduce reserved space for flush out and thus improve journal
device efficiency.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 36 +++++++++++++++++++++++++-----------
 drivers/md/raid5.c       | 42 +++++++++++++++++++++++++++++++++---------
 2 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 76c0e50..00fe64b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -389,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
 /*
  * Total log space (in sectors) needed to flush all data in cache
  *
- * Currently, writing-out phase automatically includes all pending writes
- * to the same sector. So the reclaim of each stripe takes up to
- * (conf->raid_disks + 1) pages of log space.
+ * To avoid deadlock due to log space, it is necessary to reserve log
+ * space to flush critical stripes (stripes that are occupying log space near
+ * last_checkpoint). This function helps check how much log space is
+ * required to flush all cached stripes.
  *
- * To totally avoid deadlock due to log space, the code reserves
- * (conf->raid_disks + 1) pages for each stripe in cache, which is not
- * necessary in most cases.
+ * To reduce log space requirements, two mechanisms are used to give cache
+ * flush higher priorities:
+ *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
+ *       stripes ALREADY in journal can be flushed w/o pending writes;
+ *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
+ *       can be delayed (r5l_add_no_space_stripe).
  *
- * To improve this, we will need writing-out phase to be able to NOT include
- * pending writes, which will reduce the requirement to
- * (conf->max_degraded + 1) pages per stripe in cache.
+ * In cache flush, the stripe goes through 1 and then 2. For a stripe that
+ * already passed 1, flushing it requires at most (conf->raid_disks + 1)
+ * pages of journal space. For a stripe that has not passed 1, flushing it
+ * requires (conf->max_degraded + 1) pages of journal space. There are at
+ * most (conf->group_cnt + 1) stripes that passed 1. So total journal space
+ * required to flush all cached stripes (in pages) is:
+ *
+ *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks + 1)
+ * or
+ *     (stripe_in_journal_count) * (max_degraded + 1) +
+ *     (group_cnt + 1) * (raid_disks - max_degraded)
  */
 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 {
@@ -408,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
 	if (!r5c_is_writeback(log))
 		return 0;
 
-	return BLOCK_SECTORS * (conf->raid_disks + 1) *
-		atomic_read(&log->stripe_in_journal_count);
+	return BLOCK_SECTORS *
+		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
+		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
 }
 
 /*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b62f671..b0d1345 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2951,12 +2951,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  *      like to flush data in journal to RAID disks first, so complex rmw
  *      is handled in the write patch (handle_stripe_dirtying).
  *
+ *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
+ *
+ *      It is important to be able to flush all stripes in raid5-cache.
+ *      Therefore, we need to reserve some space on the journal device for
+ *      these flushes. If the flush operation includes pending writes to the
+ *      stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
+ *      for the flush out. If we exclude these pending writes from flush
+ *      operation, we only need (conf->max_degraded + 1) pages per stripe.
+ *      Therefore, excluding pending writes in these cases enables more
+ *      efficient use of the journal device.
+ *
+ *      Note: To make sure the stripe makes progress, we only delay
+ *      towrite for stripes with data already in journal (injournal > 0).
+ *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
+ *      no_space_stripes list.
+ *
  */
-static inline bool delay_towrite(struct r5dev *dev,
-				   struct stripe_head_state *s)
+static inline bool delay_towrite(struct r5conf *conf,
+				 struct r5dev *dev,
+				 struct stripe_head_state *s)
 {
-	return !test_bit(R5_OVERWRITE, &dev->flags) &&
-		!test_bit(R5_Insync, &dev->flags) && s->injournal;
+	/* case 1 above */
+	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+	    !test_bit(R5_Insync, &dev->flags) && s->injournal)
+		return true;
+	/* case 2 above */
+	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+	    s->injournal > 0)
+		return true;
+	return false;
 }
 
 static void
@@ -2979,7 +3003,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 
-			if (dev->towrite && !delay_towrite(dev, s)) {
+			if (dev->towrite && !delay_towrite(conf, dev, s)) {
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantdrain, &dev->flags);
 				if (!expand)
@@ -3731,7 +3755,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if (((dev->towrite && !delay_towrite(dev, s)) ||
+		if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
 		     i == sh->pd_idx || i == sh->qd_idx ||
 		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
@@ -3755,8 +3779,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 		}
 	}
 
-	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
-		(unsigned long long)sh->sector, rmw, rcw);
+	pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
+		 (unsigned long long)sh->sector, sh->state, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
@@ -3796,7 +3820,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (((dev->towrite && !delay_towrite(dev, s)) ||
+			if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
 			     i == sh->pd_idx || i == sh->qd_idx ||
 			     test_bit(R5_InJournal, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-- 
2.9.3


^ permalink raw reply related

* Re: Input/Output error reading from a clean raid
From: Wols Lists @ 2017-01-24 21:58 UTC (permalink / raw)
  To: Salatiel Filho, Andreas Klauer; +Cc: linux-raid
In-Reply-To: <CAGmni9qF_LjdzYtsMJ9YmHoymG=SN6p2Mta-COEhbyReQFbNTA@mail.gmail.com>

On 24/01/17 21:15, Salatiel Filho wrote:
> I really think it is very unlikely that two different disks from two
> different brands would have problems at exactly the same block.
> I have a question: who populates the badblock list? Is it the check
> action sent to /sys/block/md??/md/sync_action, or does each read error
> update it?

I think it's a known problem - nobody seems to know quite why it happens,
but when a block is added to the badblocks list it seems to get added to
every device. Given that modern hard drives are supposed to relocate bad
blocks and not need a badblock list, I think that's why it's not been
found: most people, especially those in the know, just tend to disable
OS-level badblocks.

Cheers,
Wol

^ permalink raw reply

* Re: Input/Output error reading from a clean raid
From: Salatiel Filho @ 2017-01-24 21:15 UTC (permalink / raw)
  To: Andreas Klauer; +Cc: linux-raid
In-Reply-To: <20170123173411.GA9270@metamorpher.de>

On Mon, Jan 23, 2017 at 2:34 PM, Andreas Klauer
<Andreas.Klauer@metamorpher.de> wrote:
> On Mon, Jan 23, 2017 at 11:02:24AM -0300, Salatiel Filho wrote:
>> mdadm mdadm --examine-badblocks /dev/sdd1 /dev/sdg1 /dev/sdf1  /dev/sde1
>>
>> Bad-blocks on /dev/sdd1:
>>           1515723072 for 512 sectors
>> Bad-blocks on /dev/sde1:
>>           1515723072 for 512 sectors
>
> md believes you have bad blocks in identical places so it won't return
> whatever data is in these blocks. Thus you get read errors even if there
> is no bad block on the disk itself. Those bad block entries can be caused
> by cable or controller flukes, making temporary problems permanent...
>
> Personally I disable the bad block list everywhere.
>
> You can search this list for old messages regarding --examine-badblocks,
> this problem came up several times. Clearing the mdadm bad block list is
> worth a try. There's an undocumented option, update=force-no-bbl or such.
>
> Regards
> Andreas Klauer

Thanks all of you for the help.
Andreas, the force-no-bbl from mdadm 3.4 did the trick. I was able to
retrieve all files and their md5 matches, so it is great =)
I really think it is very unlikely that two different disks from two
different brands would have problems at exactly the same block.
I have a question: who populates the badblock list? Is it the check
action sent to /sys/block/md??/md/sync_action, or does each read error
update it?
I think it was maybe some problem with the cable (it is a 4-disk USB3 bay).
Anyway, thank you very much !

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox