dm-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
From: Mike Snitzer <snitzer@redhat.com>
To: Sagi Grimberg <sagig@dev.mellanox.co.il>
Cc: Christoph Hellwig <hch@infradead.org>,
	"keith.busch@intel.com" <keith.busch@intel.com>,
	dm-devel@redhat.com,
	"linux-nvme@lists.infradead.org" <linux-nvme@lists.infradead.org>,
	Bart Van Assche <bart.vanassche@sandisk.com>
Subject: Re: dm-multipath low performance with blk-mq
Date: Mon, 25 Jan 2016 16:40:16 -0500	[thread overview]
Message-ID: <20160125214016.GA10060@redhat.com> (raw)
In-Reply-To: <20160119224512.GA10515@redhat.com>

On Tue, Jan 19 2016 at  5:45P -0500,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Mon, Jan 18 2016 at  7:04am -0500,
> Sagi Grimberg <sagig@dev.mellanox.co.il> wrote:
> 
> > Hi All,
> > 
> > I've recently tried out dm-multipath over a "super-fast" nvme device
> > and noticed a serious lock contention in dm-multipath that requires some
> > extra attention. The nvme device is a simple loopback device emulation
> > backed by null_blk device.
> > 
> > With this I've seen dm-multipath pushing around ~470K IOPs while
> > the native (loopback) nvme performance can easily push up to 1500K+ IOPs.
> > 
> > perf output [1] reveals a huge lock contention on the multipath lock
> > which is a per-dm_target contention point which seem to defeat the
> > purpose of the blk-mq I/O path.
> > 
> > The two current bottlenecks seem to come from multipath_busy and
> > __multipath_map. Would it make better sense to move to a percpu_ref
> > model with freeze/unfreeze logic for updates similar to what blk-mq
> > is doing?
> >
> > Thoughts?
> 
> Your perf output clearly does identify the 'struct multipath' spinlock
> as a bottleneck.
> 
> Is it fair to assume that implied in your test is that you increased
> md->tag_set.nr_hw_queues to > 1 in dm_init_request_based_blk_mq_queue()?
> 
> I'd like to start by replicating your testbed.  So I'll see about
> setting up the nvme loop driver you referenced in earlier mail.
> Can you share your fio job file and fio commandline for your test?

Would still appreciate answers to my 2 questions above (did you modify
md->tag_set.nr_hw_queues and can you share your fio job?)

I've yet to reproduce your config (using hch's nvme loop driver) or
test to verify your findings but I did develop a patch that switches
from spinlock_t to rwlock_t.  I've only compile tested this but I'll try
to reproduce your setup and then test this patch to see if it helps.

Your worst offenders (multipath_busy and __multipath_map) are now using
a read lock in the fast path.

 drivers/md/dm-mpath.c | 127 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 83 insertions(+), 44 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index cfa29f5..34aadb1 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -11,6 +11,7 @@
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
+#include <linux/rwlock_types.h>
 #include <linux/blkdev.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
@@ -67,7 +68,7 @@ struct multipath {
 	const char *hw_handler_name;
 	char *hw_handler_params;
 
-	spinlock_t lock;
+	rwlock_t lock;
 
 	unsigned nr_priority_groups;
 	struct list_head priority_groups;
@@ -189,7 +190,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
 		INIT_LIST_HEAD(&m->priority_groups);
-		spin_lock_init(&m->lock);
+		rwlock_init(&m->lock);
 		m->queue_io = 1;
 		m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
 		INIT_WORK(&m->trigger_event, trigger_event);
@@ -386,12 +387,24 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	struct pgpath *pgpath;
 	struct block_device *bdev;
 	struct dm_mpath_io *mpio;
+	bool use_write_lock = false;
 
-	spin_lock_irq(&m->lock);
+retry:
+	if (!use_write_lock)
+		read_lock_irq(&m->lock);
+	else
+		write_lock_irq(&m->lock);
 
 	/* Do we need to select a new pgpath? */
-	if (!m->current_pgpath ||
-	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
+	if (!use_write_lock) {
+		if (!m->current_pgpath ||
+		    (!m->queue_io && (m->repeat_count && m->repeat_count == 1))) {
+			use_write_lock = true;
+			read_unlock_irq(&m->lock);
+			goto retry;
+		}
+	} else if (!m->current_pgpath ||
+		   (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
 		__choose_pgpath(m, nr_bytes);
 
 	pgpath = m->current_pgpath;
@@ -401,13 +414,23 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 			r = -EIO;	/* Failed */
 		goto out_unlock;
 	} else if (m->queue_io || m->pg_init_required) {
+		if (!use_write_lock) {
+			use_write_lock = true;
+			read_unlock_irq(&m->lock);
+			goto retry;
+		}
 		__pg_init_all_paths(m);
 		goto out_unlock;
 	}
 
+	if (!use_write_lock)
+		read_unlock_irq(&m->lock);
+	else
+		write_unlock_irq(&m->lock);
+
 	if (set_mapinfo(m, map_context) < 0)
 		/* ENOMEM, requeue */
-		goto out_unlock;
+		return r;
 
 	mpio = map_context->ptr;
 	mpio->pgpath = pgpath;
@@ -415,8 +438,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 
 	bdev = pgpath->path.dev->bdev;
 
-	spin_unlock_irq(&m->lock);
-
 	if (clone) {
 		/* Old request-based interface: allocated clone is passed in */
 		clone->q = bdev_get_queue(bdev);
@@ -443,7 +464,10 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	return DM_MAPIO_REMAPPED;
 
 out_unlock:
-	spin_unlock_irq(&m->lock);
+	if (!use_write_lock)
+		read_unlock_irq(&m->lock);
+	else
+		write_unlock_irq(&m->lock);
 
 	return r;
 }
@@ -474,14 +498,15 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 
 	if (save_old_value)
 		m->saved_queue_if_no_path = m->queue_if_no_path;
 	else
 		m->saved_queue_if_no_path = queue_if_no_path;
 	m->queue_if_no_path = queue_if_no_path;
-	spin_unlock_irqrestore(&m->lock, flags);
+
+	write_unlock_irqrestore(&m->lock, flags);
 
 	if (!queue_if_no_path)
 		dm_table_run_md_queue_async(m->ti->table);
@@ -898,12 +923,12 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
 	while (1) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 
-		spin_lock_irqsave(&m->lock, flags);
+		read_lock_irqsave(&m->lock, flags);
 		if (!m->pg_init_in_progress) {
-			spin_unlock_irqrestore(&m->lock, flags);
+			read_unlock_irqrestore(&m->lock, flags);
 			break;
 		}
-		spin_unlock_irqrestore(&m->lock, flags);
+		read_unlock_irqrestore(&m->lock, flags);
 
 		io_schedule();
 	}
@@ -916,18 +941,18 @@ static void flush_multipath_work(struct multipath *m)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 	m->pg_init_disabled = 1;
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 
 	flush_workqueue(kmpath_handlerd);
 	multipath_wait_for_pg_init_completion(m);
 	flush_workqueue(kmultipathd);
 	flush_work(&m->trigger_event);
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 	m->pg_init_disabled = 0;
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 }
 
 static void multipath_dtr(struct dm_target *ti)
@@ -946,7 +971,7 @@ static int fail_path(struct pgpath *pgpath)
 	unsigned long flags;
 	struct multipath *m = pgpath->pg->m;
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 
 	if (!pgpath->is_active)
 		goto out;
@@ -968,7 +993,7 @@ static int fail_path(struct pgpath *pgpath)
 	schedule_work(&m->trigger_event);
 
 out:
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 
 	return 0;
 }
@@ -982,7 +1007,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	unsigned long flags;
 	struct multipath *m = pgpath->pg->m;
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 
 	if (pgpath->is_active)
 		goto out;
@@ -1014,7 +1039,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	schedule_work(&m->trigger_event);
 
 out:
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 	if (run_queue)
 		dm_table_run_md_queue_async(m->ti->table);
 
@@ -1049,13 +1074,13 @@ static void bypass_pg(struct multipath *m, struct priority_group *pg,
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 
 	pg->bypassed = bypassed;
 	m->current_pgpath = NULL;
 	m->current_pg = NULL;
 
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 
 	schedule_work(&m->trigger_event);
 }
@@ -1076,7 +1101,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 		return -EINVAL;
 	}
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 	list_for_each_entry(pg, &m->priority_groups, list) {
 		pg->bypassed = 0;
 		if (--pgnum)
@@ -1086,7 +1111,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 		m->current_pg = NULL;
 		m->next_pg = pg;
 	}
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 
 	schedule_work(&m->trigger_event);
 	return 0;
@@ -1125,14 +1150,14 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
 	unsigned long flags;
 	int limit_reached = 0;
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 
 	if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
 		m->pg_init_required = 1;
 	else
 		limit_reached = 1;
 
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 
 	return limit_reached;
 }
@@ -1186,7 +1211,7 @@ static void pg_init_done(void *data, int errors)
 		fail_path(pgpath);
 	}
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 	if (errors) {
 		if (pgpath == m->current_pgpath) {
 			DMERR("Could not failover device. Error %d.", errors);
@@ -1213,7 +1238,7 @@ static void pg_init_done(void *data, int errors)
 	wake_up(&m->pg_init_wait);
 
 out:
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 }
 
 static void activate_path(struct work_struct *work)
@@ -1272,7 +1297,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
 
-	spin_lock_irqsave(&m->lock, flags);
+	read_lock_irqsave(&m->lock, flags);
 	if (!m->nr_valid_paths) {
 		if (!m->queue_if_no_path) {
 			if (!__must_push_back(m))
@@ -1282,7 +1307,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 				r = error;
 		}
 	}
-	spin_unlock_irqrestore(&m->lock, flags);
+	read_unlock_irqrestore(&m->lock, flags);
 
 	return r;
 }
@@ -1340,9 +1365,9 @@ static void multipath_resume(struct dm_target *ti)
 	struct multipath *m = (struct multipath *) ti->private;
 	unsigned long flags;
 
-	spin_lock_irqsave(&m->lock, flags);
+	write_lock_irqsave(&m->lock, flags);
 	m->queue_if_no_path = m->saved_queue_if_no_path;
-	spin_unlock_irqrestore(&m->lock, flags);
+	write_unlock_irqrestore(&m->lock, flags);
 }
 
 /*
@@ -1372,7 +1397,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 	unsigned pg_num;
 	char state;
 
-	spin_lock_irqsave(&m->lock, flags);
+	read_lock_irqsave(&m->lock, flags);
 
 	/* Features */
 	if (type == STATUSTYPE_INFO)
@@ -1467,7 +1492,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 		break;
 	}
 
-	spin_unlock_irqrestore(&m->lock, flags);
+	read_unlock_irqrestore(&m->lock, flags);
 }
 
 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
@@ -1534,16 +1559,27 @@ out:
 }
 
 static int multipath_prepare_ioctl(struct dm_target *ti,
-		struct block_device **bdev, fmode_t *mode)
+				   struct block_device **bdev, fmode_t *mode)
 {
 	struct multipath *m = ti->private;
 	unsigned long flags;
 	int r;
+	bool use_write_lock = false;
 
-	spin_lock_irqsave(&m->lock, flags);
+retry:
+	if (!use_write_lock)
+		read_lock_irqsave(&m->lock, flags);
+	else
+		write_lock_irqsave(&m->lock, flags);
 
-	if (!m->current_pgpath)
+	if (!m->current_pgpath) {
+		if (!use_write_lock) {
+			use_write_lock = true;
+			read_unlock_irqrestore(&m->lock, flags);
+			goto retry;
+		}
 		__choose_pgpath(m, 0);
+	}
 
 	if (m->current_pgpath) {
 		if (!m->queue_io) {
@@ -1562,17 +1598,20 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 			r = -EIO;
 	}
 
-	spin_unlock_irqrestore(&m->lock, flags);
+	if (!use_write_lock)
+		read_unlock_irqrestore(&m->lock, flags);
+	else
+		write_unlock_irqrestore(&m->lock, flags);
 
 	if (r == -ENOTCONN) {
-		spin_lock_irqsave(&m->lock, flags);
+		write_lock_irqsave(&m->lock, flags);
 		if (!m->current_pg) {
 			/* Path status changed, redo selection */
 			__choose_pgpath(m, 0);
 		}
 		if (m->pg_init_required)
 			__pg_init_all_paths(m);
-		spin_unlock_irqrestore(&m->lock, flags);
+		write_unlock_irqrestore(&m->lock, flags);
 		dm_table_run_md_queue_async(m->ti->table);
 	}
 
@@ -1627,7 +1666,7 @@ static int multipath_busy(struct dm_target *ti)
 	struct pgpath *pgpath;
 	unsigned long flags;
 
-	spin_lock_irqsave(&m->lock, flags);
+	read_lock_irqsave(&m->lock, flags);
 
 	/* pg_init in progress or no paths available */
 	if (m->pg_init_in_progress ||
@@ -1674,7 +1713,7 @@ static int multipath_busy(struct dm_target *ti)
 		busy = 0;
 
 out:
-	spin_unlock_irqrestore(&m->lock, flags);
+	read_unlock_irqrestore(&m->lock, flags);
 
 	return busy;
 }

  reply	other threads:[~2016-01-25 21:40 UTC|newest]

Thread overview: 65+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <569CD4D6.2040908@dev.mellanox.co.il>
2016-01-19 10:37 ` dm-multipath low performance with blk-mq Sagi Grimberg
2016-01-19 22:45   ` Mike Snitzer
2016-01-25 21:40     ` Mike Snitzer [this message]
2016-01-25 23:37       ` Benjamin Marzinski
2016-01-26 13:29         ` Mike Snitzer
2016-01-26 14:01           ` Hannes Reinecke
2016-01-26 14:47             ` Mike Snitzer
2016-01-26 14:56               ` Christoph Hellwig
2016-01-26 15:27                 ` Mike Snitzer
2016-01-26 15:57             ` Benjamin Marzinski
2016-01-27 11:14           ` Sagi Grimberg
2016-01-27 17:48             ` Mike Snitzer
2016-01-27 17:51               ` Jens Axboe
2016-01-27 18:16                 ` Mike Snitzer
2016-01-27 18:26                   ` Jens Axboe
2016-01-27 19:14                     ` Mike Snitzer
2016-01-27 19:50                       ` Jens Axboe
2016-01-27 17:56               ` Sagi Grimberg
2016-01-27 18:42                 ` Mike Snitzer
2016-01-27 19:49                   ` Jens Axboe
2016-01-27 20:45                     ` Mike Snitzer
2016-01-29 23:35                 ` Mike Snitzer
2016-01-30  8:52                   ` Hannes Reinecke
2016-01-30 19:12                     ` Mike Snitzer
2016-02-01  6:46                       ` Hannes Reinecke
2016-02-03 18:04                         ` Mike Snitzer
2016-02-03 18:24                           ` Mike Snitzer
2016-02-03 19:22                             ` Mike Snitzer
2016-02-04  6:54                             ` Hannes Reinecke
2016-02-04 13:54                               ` Mike Snitzer
2016-02-04 13:58                                 ` Hannes Reinecke
2016-02-04 14:09                                   ` Mike Snitzer
2016-02-04 14:32                                     ` Hannes Reinecke
2016-02-04 14:44                                       ` Mike Snitzer
2016-02-05 15:13                                 ` [RFC PATCH] dm: fix excessive dm-mq context switching Mike Snitzer
2016-02-05 18:05                                   ` Mike Snitzer
2016-02-05 19:19                                     ` Mike Snitzer
2016-02-07 15:41                                       ` Sagi Grimberg
2016-02-07 16:07                                         ` Mike Snitzer
2016-02-07 16:42                                           ` Sagi Grimberg
2016-02-07 16:37                                         ` Bart Van Assche
2016-02-07 16:43                                           ` Sagi Grimberg
2016-02-07 16:53                                             ` Mike Snitzer
2016-02-07 16:54                                             ` Sagi Grimberg
2016-02-07 17:20                                               ` Mike Snitzer
2016-02-08 12:21                                                 ` Sagi Grimberg
2016-02-08 14:34                                                   ` Mike Snitzer
2016-02-09  7:50                                                 ` Hannes Reinecke
2016-02-09 14:55                                                   ` Mike Snitzer
2016-02-09 15:32                                                     ` Hannes Reinecke
2016-02-10  0:45                                                       ` Mike Snitzer
2016-02-11  1:50                                                         ` RCU-ified dm-mpath for testing/review Mike Snitzer
2016-02-11  3:35                                                           ` Mike Snitzer
2016-02-11 15:34                                                           ` Mike Snitzer
2016-02-12 15:18                                                             ` Hannes Reinecke
2016-02-12 15:26                                                               ` Mike Snitzer
2016-02-12 16:04                                                                 ` Hannes Reinecke
2016-02-12 18:00                                                                   ` Mike Snitzer
2016-02-15  6:47                                                                     ` Hannes Reinecke
2016-01-26  1:49       ` dm-multipath low performance with blk-mq Benjamin Marzinski
2016-01-26 16:03       ` Mike Snitzer
2016-01-26 16:44         ` Christoph Hellwig
2016-01-27  2:09           ` Mike Snitzer
2016-01-27 11:10             ` Sagi Grimberg
2016-01-26 21:40         ` Benjamin Marzinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160125214016.GA10060@redhat.com \
    --to=snitzer@redhat.com \
    --cc=bart.vanassche@sandisk.com \
    --cc=dm-devel@redhat.com \
    --cc=hch@infradead.org \
    --cc=keith.busch@intel.com \
    --cc=linux-nvme@lists.infradead.org \
    --cc=sagig@dev.mellanox.co.il \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).