From: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
To: neilb@suse.de
Cc: linux-raid@vger.kernel.org,
Florian-Ewald Mueller <florian-ewald.mueller@profitbricks.com>,
Sebastian Parschauer <sebastian.riemer@profitbricks.com>
Subject: [RFC PATCH 2/4] md: introduce request function mode support
Date: Wed, 4 Jun 2014 19:10:00 +0200
Message-ID: <1401901802-16296-3-git-send-email-sebastian.riemer@profitbricks.com>
In-Reply-To: <1401901802-16296-1-git-send-email-sebastian.riemer@profitbricks.com>
From: Florian-Ewald Mueller <florian-ewald.mueller@profitbricks.com>
This introduces the writable module parameter 'rq_mode', which sets
the I/O processing mode for all subsequently created MD devices. Set
it to 0 for the default mode (the make_request function mode) to
process I/O bio by bio, or set it to 1 for the new request function
mode to process I/O request by request. Common code is shared between
both modes. In request function mode, the bios of each fetched request
are cloned and submitted to the personality from a per-device
workqueue; the original request is completed once all clones have
finished.

The advantage of the new mode is that an I/O scheduler can be used and
the block layer takes care of the I/O statistics.
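
For illustration, switching modes could look like this (a sketch,
assuming md is built as the md_mod module so that the parameter is
exposed under /sys/module/md_mod/parameters/; the loop devices are
just placeholders):

  # create subsequent arrays in request function mode
  echo 1 > /sys/module/md_mod/parameters/rq_mode
  mdadm --create /dev/md0 --level=1 --raid-devices=2 /dev/loop0 /dev/loop1

  # revert to the default make_request (bio-by-bio) mode
  echo 0 > /sys/module/md_mod/parameters/rq_mode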
Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@profitbricks.com>
[spars: merged commits, changed description, fixed checkpatch warnings]
Signed-off-by: Sebastian Parschauer <sebastian.riemer@profitbricks.com>
---
drivers/md/md.c | 280 +++++++++++++++++++++++++++++++++++++++++++++++++------
drivers/md/md.h | 7 ++
2 files changed, 257 insertions(+), 30 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8c653f9..0e5c420 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -56,8 +56,6 @@
#ifdef BIO_ACCOUNTING_EXTENSION
-#include <linux/ratelimit.h>
-
struct md_bio_private {
void (*orig_bio_endio)(struct bio *, int);
void *orig_bio_private;
@@ -68,6 +66,30 @@ struct md_bio_private {
static struct kmem_cache *md_bio_private_cache __read_mostly;
+#endif /* BIO_ACCOUNTING_EXTENSION */
+
+#ifdef MD_REQUEST_FUNCTION
+
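+/*
+ * Per-request context: tracks the cloned bios of one fetched request and
+ * completes the original request once all clones have finished.
+ */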
+struct md_request_clone {
+ struct work_struct work;
+ struct mddev *mdp;
+ struct request *req;
+ struct bio_list bios;
+ atomic_t cnt;
+ int err;
+};
+
+#define MD_RQ_MODE_DEFAULT 0
+
+static unsigned int rq_mode __read_mostly = MD_RQ_MODE_DEFAULT;
+static struct kmem_cache *md_request_clone_cache __read_mostly;
+
+#endif /* MD_REQUEST_FUNCTION */
+
+#if defined BIO_ACCOUNTING_EXTENSION || defined MD_REQUEST_FUNCTION
+
+#include <linux/ratelimit.h>
+
static DEFINE_RATELIMIT_STATE(md_ratelimit_state,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -78,7 +100,7 @@ static inline int __must_check md_valid_ptr(const void *p)
}
#define VALID_PTR(p) md_valid_ptr(p)
-#endif /* BIO_ACCOUNTING_EXTENSION */
+#endif /* BIO_ACCOUNTING_EXTENSION || MD_REQUEST_FUNCTION */
#ifndef MODULE
static void autostart_arrays(int part);
@@ -326,31 +348,17 @@ static void md_bio_endio(struct bio *bio, int err)
#endif /* BIO_ACCOUNTING_EXTENSION */
-/* Rather than calling directly into the personality make_request function,
- * IO requests come here first so that we can check if the device is
- * being suspended pending a reconfiguration.
- * We hold a refcount over the call to ->make_request. By the time that
- * call has finished, the bio has been linked into some internal structure
- * and so is visible to ->quiesce(), so we don't need the refcount any more.
- */
-static void md_make_request(struct request_queue *q, struct bio *bio)
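+/* Check the device state and take the active_io reference; a nonzero
+ * return means the bio has already been completed here and must not be
+ * passed to the personality.
+ */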
+static inline int md_make_request_head(struct mddev *mddev, struct bio *bio)
{
const int rw = bio_data_dir(bio);
- struct mddev *mddev = q->queuedata;
- int cpu;
- unsigned int sectors;
-#ifdef BIO_ACCOUNTING_EXTENSION
- struct md_bio_private *mbp;
-#endif /* BIO_ACCOUNTING_EXTENSION */
- if (mddev == NULL || mddev->pers == NULL
- || !mddev->ready) {
+ if (mddev == NULL || mddev->pers == NULL || !mddev->ready) {
bio_io_error(bio);
- return;
+ return 1;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
- return;
+ return 1;
}
smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
@@ -369,6 +377,39 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
}
atomic_inc(&mddev->active_io);
rcu_read_unlock();
+ return 0;
+}
+
+static inline void md_make_request_body(struct mddev *mddev, struct bio *bio)
+{
+ mddev->pers->make_request(mddev, bio);
+}
+
+static inline void md_make_request_tail(struct mddev *mddev, struct bio *bio)
+{
+ if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+ wake_up(&mddev->sb_wait);
+}
+
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request. By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static void md_make_request(struct request_queue *q, struct bio *bio)
+{
+ const int rw = bio_data_dir(bio);
+ struct mddev *mddev = q->queuedata;
+ int cpu;
+ unsigned int sectors;
+#ifdef BIO_ACCOUNTING_EXTENSION
+ struct md_bio_private *mbp;
+#endif /* BIO_ACCOUNTING_EXTENSION */
+
+ if (unlikely(md_make_request_head(mddev, bio)))
+ return;
/*
* save the sectors now since our bio can
@@ -397,7 +438,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
bio->bi_private = mbp;
}
#endif /* BIO_ACCOUNTING_EXTENSION */
- mddev->pers->make_request(mddev, bio);
+ md_make_request_body(mddev, bio);
#ifndef BIO_ACCOUNTING_EXTENSION
cpu = part_stat_lock();
@@ -406,10 +447,131 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
part_stat_unlock();
#endif /* !BIO_ACCOUNTING_EXTENSION */
- if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
- wake_up(&mddev->sb_wait);
+ md_make_request_tail(mddev, bio);
+}
+
+#ifdef MD_REQUEST_FUNCTION
+
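+/* Submit one bio through the shared head/body/tail sequence. */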
+static inline void md_make_request_bio(struct mddev *mddev, struct bio *bio)
+{
+ if (unlikely(md_make_request_head(mddev, bio)))
+ return;
+ md_make_request_body(mddev, bio);
+ md_make_request_tail(mddev, bio);
+}
+
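+/*
+ * Drop one reference; when the last clone has finished, complete the
+ * original request with the recorded error and free the context.
+ */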
+static inline void md_request_clone_release(struct md_request_clone *rcl)
+{
+ if (atomic_dec_and_test(&rcl->cnt)) {
+ blk_end_request_all(rcl->req, rcl->err);
+ kmem_cache_free(md_request_clone_cache, rcl);
+ }
+}
+
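+/* Completion of a cloned bio: record any error and drop one reference. */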
+static void md_request_bio_endio(struct bio *bio, int err)
+{
+ struct md_request_clone *rcl = bio->bi_private;
+
+ if (unlikely(err < 0))
+ rcl->err = err;
+
+ bio_put(bio);
+ md_request_clone_release(rcl);
+}
+
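+/*
+ * Worker: push the cloned bios through the regular bio submission path,
+ * then drop the initial reference taken in md_process_request().
+ */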
+static void md_request_clone_worker(struct work_struct *wkp)
+{
+ struct md_request_clone *rcl =
+ container_of(wkp, struct md_request_clone, work);
+ struct bio_list *blp = &rcl->bios;
+ struct mddev *mddev = rcl->mdp;
+ struct bio *bio;
+
+ bio = bio_list_pop(blp);
+ while (VALID_PTR(bio)) {
+ md_make_request_bio(mddev, bio);
+ bio = bio_list_pop(blp);
+ }
+ md_request_clone_release(rcl);
}
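+
+/*
+ * Clone each bio of a fetched request and queue the clones for submission
+ * on the per-device workqueue. The context starts with one reference,
+ * which the worker drops after submitting all clones.
+ */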
+static inline int md_process_request(struct mddev *mddev, struct request *req)
+{
+ struct md_request_clone *rcl;
+
+ struct bio *bio, *clone;
+ int error;
+
+ rcl = kmem_cache_alloc(md_request_clone_cache, GFP_NOIO);
+ if (unlikely(!VALID_PTR(rcl))) {
+ if (__ratelimit(&md_ratelimit_state))
+ pr_warn("%s: [%s] kmem_cache_alloc failed\n",
+ __func__, mdname(mddev));
+ return -ENOMEM;
+ }
+ rcl->err = 0;
+ rcl->req = req;
+ rcl->mdp = mddev;
+ atomic_set(&rcl->cnt, 1);
+ bio_list_init(&rcl->bios);
+ bio = req->bio;
+ while (VALID_PTR(bio)) {
+ clone = bio_clone(bio, GFP_NOWAIT);
+ if (unlikely(!VALID_PTR(clone))) {
+ if (__ratelimit(&md_ratelimit_state))
+ pr_warn("%s: [%s] bio_clone failed\n",
+ __func__, mdname(mddev));
+ error = -ENOMEM;
+ goto error_out;
+ }
+ clone->bi_private = rcl;
+ clone->bi_end_io = md_request_bio_endio;
+ bio_list_add(&rcl->bios, clone);
+ atomic_inc(&rcl->cnt);
+ bio = bio->bi_next;
+ }
+ INIT_WORK(&rcl->work, md_request_clone_worker);
+ queue_work(mddev->request_wq, &rcl->work);
+ return 0;
+error_out:
+ bio = bio_list_pop(&rcl->bios);
+ while (VALID_PTR(bio)) {
+ bio_put(bio);
+ bio = bio_list_pop(&rcl->bios);
+ }
+ kmem_cache_free(md_request_clone_cache, rcl);
+ return error;
+}
+
+#ifndef blk_fs_request
+#define blk_fs_request(p) ((p)->cmd_type == REQ_TYPE_FS)
+#endif /* !blk_fs_request */
+
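+/*
+ * Request function: entered with the queue lock held. The lock is
+ * dropped around md_process_request(), which may sleep on allocation.
+ */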
+static void md_request_function(struct request_queue *rqp)
+{
+ struct mddev *mddev = rqp->queuedata;
+
+ struct request *req;
+ int rc;
+
+ while ((req = blk_fetch_request(rqp)) != NULL) {
+ if (unlikely(!blk_fs_request(req))) {
+ if (__ratelimit(&md_ratelimit_state))
+ pr_warn("%s: [%s] non-fs request\n",
+ __func__, mdname(mddev));
+ __blk_end_request_all(req, -ENOTSUPP);
+ continue;
+ }
+ spin_unlock_irq(rqp->queue_lock);
+ rc = md_process_request(mddev, req);
+ spin_lock_irq(rqp->queue_lock);
+ if (unlikely(rc < 0))
+ __blk_end_request_all(req, rc);
+ }
+}
+
+#endif /* MD_REQUEST_FUNCTION */
+
/* mddev_suspend makes sure no new requests are submitted
* to the device, and that any requests that have been submitted
* are completely handled.
@@ -567,8 +729,15 @@ static void mddev_put(struct mddev *mddev)
*/
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
queue_work(md_misc_wq, &mddev->del_work);
- } else
+ } else {
+#ifdef MD_REQUEST_FUNCTION
+ if (likely(VALID_PTR(mddev->request_wq))) {
+ destroy_workqueue(mddev->request_wq);
+ mddev->request_wq = NULL;
+ }
+#endif /* MD_REQUEST_FUNCTION */
kfree(mddev);
+ }
}
spin_unlock(&all_mddevs_lock);
if (bs)
@@ -4923,6 +5092,13 @@ static void md_free(struct kobject *ko)
if (mddev->queue)
blk_cleanup_queue(mddev->queue);
+#ifdef MD_REQUEST_FUNCTION
+ if (likely(VALID_PTR(mddev->request_wq))) {
+ destroy_workqueue(mddev->request_wq);
+ mddev->request_wq = NULL;
+ }
+#endif /* MD_REQUEST_FUNCTION */
+
kfree(mddev);
}
@@ -4990,12 +5166,32 @@ static int md_alloc(dev_t dev, char *name)
}
error = -ENOMEM;
- mddev->queue = blk_alloc_queue(GFP_KERNEL);
- if (!mddev->queue)
- goto abort;
+#ifdef MD_REQUEST_FUNCTION
+ if (!rq_mode) {
+#endif /* MD_REQUEST_FUNCTION */
+ mddev->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!mddev->queue)
+ goto abort;
+ blk_queue_make_request(mddev->queue, md_make_request);
+#ifdef MD_REQUEST_FUNCTION
+ } else {
+ mddev->request_wq =
+ create_singlethread_workqueue(mdname(mddev));
+ if (unlikely(!VALID_PTR(mddev->request_wq))) {
+			pr_warn("%s: create_singlethread_workqueue(%s) failed\n",
+				__func__, mdname(mddev));
+ goto abort;
+ }
+ mddev->queue = blk_init_queue(md_request_function, NULL);
+ if (!mddev->queue) {
+ destroy_workqueue(mddev->request_wq);
+ mddev->request_wq = NULL;
+ goto abort;
+ }
+ }
+#endif /* MD_REQUEST_FUNCTION */
mddev->queue->queuedata = mddev;
- blk_queue_make_request(mddev->queue, md_make_request);
blk_set_stacking_limits(&mddev->queue->limits);
disk = alloc_disk(1 << shift);
@@ -8714,11 +8910,23 @@ static int __init md_init(void)
#ifdef BIO_ACCOUNTING_EXTENSION
md_bio_private_cache = KMEM_CACHE(md_bio_private, 0);
if (unlikely(!VALID_PTR(md_bio_private_cache))) {
- pr_err("%s: KMEM_CACHE failed\n", __func__);
+ pr_err("%s: KMEM_CACHE (bio_priv) failed\n", __func__);
return -ENOMEM;
}
#endif /* BIO_ACCOUNTING_EXTENSION */
+#ifdef MD_REQUEST_FUNCTION
+ md_request_clone_cache = KMEM_CACHE(md_request_clone, 0);
+ if (unlikely(!VALID_PTR(md_request_clone_cache))) {
+ pr_err("%s: KMEM_CACHE (req_clone) failed\n", __func__);
+#ifdef BIO_ACCOUNTING_EXTENSION
+ kmem_cache_destroy(md_bio_private_cache);
+ md_bio_private_cache = NULL;
+#endif /* BIO_ACCOUNTING_EXTENSION */
+ return -ENOMEM;
+ }
+#endif /* MD_REQUEST_FUNCTION */
+
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
if (!md_wq)
goto err_wq;
@@ -8856,6 +9064,13 @@ static __exit void md_exit(void)
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
+#ifdef MD_REQUEST_FUNCTION
+ if (likely(VALID_PTR(md_request_clone_cache))) {
+ kmem_cache_destroy(md_request_clone_cache);
+ md_request_clone_cache = NULL;
+ }
+#endif /* MD_REQUEST_FUNCTION */
+
#ifdef BIO_ACCOUNTING_EXTENSION
if (likely(VALID_PTR(md_bio_private_cache))) {
kmem_cache_destroy(md_bio_private_cache);
@@ -8887,6 +9102,11 @@ module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
+#ifdef MD_REQUEST_FUNCTION
+module_param(rq_mode, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(rq_mode, "I/O processing mode for subsequently created MD devices (default: 0 [make_request mode], 1 [request function mode])");
+#endif /* MD_REQUEST_FUNCTION */
+
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f0e9171..8d639e0 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -25,6 +25,10 @@
#include <linux/workqueue.h>
#if 1
+#define MD_REQUEST_FUNCTION
+#endif
+
+#if 1
#define BIO_ACCOUNTING_EXTENSION
#endif
@@ -455,6 +459,9 @@ struct mddev {
#ifdef BIO_ACCOUNTING_EXTENSION
struct md_stats stats;
#endif /* BIO_ACCOUNTING_EXTENSION */
+#ifdef MD_REQUEST_FUNCTION
+ struct workqueue_struct *request_wq;
+#endif /* MD_REQUEST_FUNCTION */
};
--
1.7.9.5