From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org, Kent Overstreet <kmo@daterainc.com>,
Linus Torvalds <torvalds@linux-foundation.org>
Subject: [ 06/52] bcache: Fix a writeback performance regression
Date: Wed, 2 Oct 2013 21:05:24 -0700 [thread overview]
Message-ID: <20131003040522.625750151@linuxfoundation.org> (raw)
In-Reply-To: <20131003040522.190209641@linuxfoundation.org>
3.10-stable review patch. If anyone has any objections, please let me know.
------------------
From: Kent Overstreet <kmo@daterainc.com>
commit c2a4f3183a1248f615a695fbd8905da55ad11bba upstream.
Background writeback works by scanning the btree for dirty data and
adding those keys into a fixed-size buffer (the keybuf), then writing
each dirty key in the keybuf to the backing device.
When read_dirty() finishes and it's time to scan for more dirty data, we
need to wait for the outstanding writeback IOs to finish - they still
take up slots in the keybuf (so that foreground writes can check for
them to avoid races). Without that wait, we'll continually rescan when
we're able to add at most a key or two to the keybuf, and each rescan
takes locks, which starves foreground IO. Doh.
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
drivers/md/bcache/bcache.h | 7 ++----
drivers/md/bcache/util.c | 11 +++++++++-
drivers/md/bcache/util.h | 12 ++++++++---
drivers/md/bcache/writeback.c | 43 ++++++++++++++++++++----------------------
4 files changed, 43 insertions(+), 30 deletions(-)
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -499,7 +499,7 @@ struct cached_dev {
*/
atomic_t has_dirty;
- struct ratelimit writeback_rate;
+ struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
/*
@@ -508,10 +508,9 @@ struct cached_dev {
*/
sector_t last_read;
- /* Number of writeback bios in flight */
- atomic_t in_flight;
+ /* Limit number of writeback bios in flight */
+ struct semaphore in_flight;
struct closure_with_timer writeback;
- struct closure_waitlist writeback_wait;
struct keybuf writeback_keys;
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_s
stats->last = now ?: 1;
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequen
(ewma) >> factor; \
})
-struct ratelimit {
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
uint64_t next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond
+ * The units here correspond to the units passed to bch_next_delay()
+ */
unsigned rate;
};
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
{
d->next = local_clock();
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
#define __DIV_SAFE(n, d, zero) \
({ \
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -91,11 +91,15 @@ static void update_writeback_rate(struct
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
+ uint64_t ret;
+
if (atomic_read(&dc->disk.detaching) ||
!dc->writeback_percent)
return 0;
- return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+ ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+ return min_t(uint64_t, ret, HZ);
}
/* Background writeback */
@@ -165,7 +169,7 @@ static void refill_dirty(struct closure
up_write(&dc->writeback_lock);
- ratelimit_reset(&dc->writeback_rate);
+ bch_ratelimit_reset(&dc->writeback_rate);
/* Punt to workqueue only so we don't recurse and blow the stack */
continue_at(cl, read_dirty, dirty_wq);
@@ -246,9 +250,7 @@ static void write_dirty_finish(struct cl
}
bch_keybuf_del(&dc->writeback_keys, w);
- atomic_dec_bug(&dc->in_flight);
-
- closure_wake_up(&dc->writeback_wait);
+ up(&dc->in_flight);
closure_return_with_destructor(cl, dirty_io_destructor);
}
@@ -278,7 +280,7 @@ static void write_dirty(struct closure *
trace_bcache_write_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty_finish, dirty_wq);
+ continue_at(cl, write_dirty_finish, system_wq);
}
static void read_dirty_endio(struct bio *bio, int error)
@@ -299,7 +301,7 @@ static void read_dirty_submit(struct clo
trace_bcache_read_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty, dirty_wq);
+ continue_at(cl, write_dirty, system_wq);
}
static void read_dirty(struct closure *cl)
@@ -324,12 +326,9 @@ static void read_dirty(struct closure *c
if (delay > 0 &&
(KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)) {
- w->private = NULL;
-
- closure_delay(&dc->writeback, delay);
- continue_at(cl, read_dirty, dirty_wq);
- }
+ jiffies_to_msecs(delay) > 50))
+ while (delay)
+ delay = schedule_timeout(delay);
dc->last_read = KEY_OFFSET(&w->key);
@@ -354,15 +353,10 @@ static void read_dirty(struct closure *c
pr_debug("%s", pkey(&w->key));
- closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+ down(&dc->in_flight);
+ closure_call(&io->cl, read_dirty_submit, NULL, cl);
delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
- atomic_inc(&dc->in_flight);
-
- if (!closure_wait_event(&dc->writeback_wait, cl,
- atomic_read(&dc->in_flight) < 64))
- continue_at(cl, read_dirty, dirty_wq);
}
if (0) {
@@ -372,11 +366,16 @@ err:
bch_keybuf_del(&dc->writeback_keys, w);
}
- refill_dirty(cl);
+ /*
+ * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+ * freed) before refilling again
+ */
+ continue_at(cl, refill_dirty, dirty_wq);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
+ sema_init(&dc->in_flight, 64);
closure_init_unlocked(&dc->writeback);
init_rwsem(&dc->writeback_lock);
@@ -406,7 +405,7 @@ void bch_writeback_exit(void)
int __init bch_writeback_init(void)
{
- dirty_wq = create_singlethread_workqueue("bcache_writeback");
+ dirty_wq = create_workqueue("bcache_writeback");
if (!dirty_wq)
return -ENOMEM;
next prev parent reply other threads:[~2013-10-03 4:05 UTC|newest]
Thread overview: 57+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-10-03 4:05 [ 00/52] 3.10.15-stable review Greg Kroah-Hartman
2013-10-03 4:05 ` [ 01/52] block: Fix bio_copy_data() Greg Kroah-Hartman
2013-10-03 4:05 ` [ 02/52] sysv: Add forgotten superblock lock init for v7 fs Greg Kroah-Hartman
2013-10-03 4:05 ` [ 03/52] bcache: Fix a dumb journal discard bug Greg Kroah-Hartman
2013-10-03 4:05 ` [ 04/52] bcache: Strip endline when writing the label through sysfs Greg Kroah-Hartman
2013-10-03 4:05 ` [ 05/52] bcache: Fix for when no journal entries are found Greg Kroah-Hartman
2013-10-03 4:05 ` Greg Kroah-Hartman [this message]
2013-10-03 4:05 ` [ 07/52] bcache: Fix a flush/fua performance bug Greg Kroah-Hartman
2013-10-03 4:05 ` [ 08/52] bcache: Fix a dumb CPU spinning bug in writeback Greg Kroah-Hartman
2013-10-03 4:05 ` [ 09/52] bcache: Fix a shrinker deadlock Greg Kroah-Hartman
2013-10-03 4:05 ` [ 10/52] bcache: Fix for handling overlapping extents when reading in a btree node Greg Kroah-Hartman
2013-10-03 4:05 ` [ 11/52] bcache: Fix flushes in writeback mode Greg Kroah-Hartman
2013-10-03 4:05 ` [ 12/52] x86/reboot: Add quirk to make Dell C6100 use reboot=pci automatically Greg Kroah-Hartman
2013-10-03 4:05 ` [ 13/52] tools lib lk: Uninclude linux/magic.h in debugfs.c Greg Kroah-Hartman
2013-10-03 4:05 ` [ 14/52] x86, efi: Dont map Boot Services on i386 Greg Kroah-Hartman
2013-10-03 4:05 ` [ 15/52] mei: make me client counters less error prone Greg Kroah-Hartman
2013-10-03 4:05 ` [ 16/52] mei: bus: stop wait for read during cl state transition Greg Kroah-Hartman
2013-10-03 4:05 ` [ 17/52] mei: cancel stall timers in mei_reset Greg Kroah-Hartman
2013-10-03 4:05 ` [ 18/52] tty: Fix SIGTTOU not sent with tcflush() Greg Kroah-Hartman
2013-10-03 4:05 ` [ 19/52] serial: tegra: fix tty-kref leak Greg Kroah-Hartman
2013-10-03 4:05 ` [ 20/52] serial: pch_uart: fix tty-kref leak in rx-error path Greg Kroah-Hartman
2013-10-03 4:05 ` [ 21/52] serial: pch_uart: fix tty-kref leak in dma-rx path Greg Kroah-Hartman
2013-10-03 4:05 ` [ 22/52] ARM: 7837/3: fix Thumb-2 bug in AES assembler code Greg Kroah-Hartman
2013-10-03 4:05 ` [ 23/52] staging: vt6656: [BUG] main_usb.c oops on device_close move flag earlier Greg Kroah-Hartman
2013-10-03 4:05 ` [ 24/52] staging: vt6656: [BUG] iwctl_siwencodeext return if device not open Greg Kroah-Hartman
2013-10-03 4:05 ` [ 25/52] drm/i915/tv: clear adjusted_mode.flags Greg Kroah-Hartman
2013-10-03 4:05 ` [ 26/52] xhci: Ensure a command structure points to the correct trb on the command ring Greg Kroah-Hartman
2013-10-03 4:05 ` [ 27/52] xhci: Fix oops happening after address device timeout Greg Kroah-Hartman
2013-10-03 4:05 ` [ 28/52] USB: fix PM config symbol in uhci-hcd, ehci-hcd, and xhci-hcd Greg Kroah-Hartman
2013-10-03 4:05 ` [ 29/52] xhci: Fix race between ep halt and URB cancellation Greg Kroah-Hartman
2013-10-03 4:05 ` [ 30/52] USB: OHCI: accept very late isochronous URBs Greg Kroah-Hartman
2013-10-03 4:05 ` [ 31/52] USB: UHCI: " Greg Kroah-Hartman
2013-10-03 4:05 ` [ 32/52] USB: Fix breakage in ffs_fs_mount() Greg Kroah-Hartman
2013-10-03 4:05 ` [ 33/52] fsl/usb: Resolve PHY_CLK_VLD instability issue for ULPI phy Greg Kroah-Hartman
2013-10-03 4:05 ` [ 34/52] usb: dwc3: pci: add support for BayTrail Greg Kroah-Hartman
2013-10-03 4:05 ` [ 35/52] usb: dwc3: add support for Merrifield Greg Kroah-Hartman
2013-10-03 4:05 ` [ 36/52] usb/core/devio.c: Dont reject control message to endpoint with wrong direction bit Greg Kroah-Hartman
2013-10-03 4:05 ` [ 37/52] driver core : Fix use after free of dev->parent in device_shutdown Greg Kroah-Hartman
2013-10-03 4:05 ` [ 38/52] dm snapshot: workaround for a false positive lockdep warning Greg Kroah-Hartman
2013-10-03 4:05 ` [ 39/52] dm-snapshot: fix performance degradation due to small hash size Greg Kroah-Hartman
2013-10-03 4:05 ` [ 40/52] dm mpath: disable WRITE SAME if it fails Greg Kroah-Hartman
2013-10-03 4:05 ` [ 41/52] dm-raid: silence compiler warning on rebuilds_per_group Greg Kroah-Hartman
2013-10-03 4:06 ` [ 42/52] drm/i915: preserve pipe A quirk in i9xx_set_pipeconf Greg Kroah-Hartman
2013-10-03 4:06 ` [ 43/52] drm/i915/dp: increase i2c-over-aux retry interval on AUX DEFER Greg Kroah-Hartman
2013-10-03 4:06 ` [ 44/52] drm/radeon: avoid UVD corruption on AGP cards using GPU gart Greg Kroah-Hartman
2013-10-03 4:06 ` [ 45/52] drm/radeon: Make r100_cp_ring_info() and radeon_ring_gfx() safe (v2) Greg Kroah-Hartman
2013-10-03 4:06 ` [ 46/52] drm/radeon: disable tests/benchmarks if accel is disabled Greg Kroah-Hartman
2013-10-03 4:06 ` [ 47/52] drm/radeon: add missing hdmi callbacks for rv6xx Greg Kroah-Hartman
2013-10-03 4:06 ` [ 48/52] drm/radeon: fix hdmi audio on DCE3.0/3.1 asics Greg Kroah-Hartman
2013-10-03 4:06 ` [ 49/52] ARM: mxs: stub out mxs_pm_init for !CONFIG_PM Greg Kroah-Hartman
2013-10-03 4:06 ` [ 50/52] hwmon: (applesmc) Check key count before proceeding Greg Kroah-Hartman
2013-10-03 4:06 ` [ 51/52] ALSA: compress: Fix compress device unregister Greg Kroah-Hartman
2013-10-03 4:06 ` [ 52/52] drm/i915: fix gen4 digital port hotplug definitions Greg Kroah-Hartman
2013-10-03 13:32 ` [ 00/52] 3.10.15-stable review Guenter Roeck
2013-10-03 18:41 ` Greg Kroah-Hartman
2013-10-03 22:54 ` Shuah Khan
2013-10-03 23:04 ` Greg Kroah-Hartman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20131003040522.625750151@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=kmo@daterainc.com \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).