From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org, Kent Overstreet <kmo@daterainc.com>,
Linus Torvalds <torvalds@linux-foundation.org>
Subject: [ 06/52] bcache: Fix a writeback performance regression
Date: Wed, 2 Oct 2013 21:05:24 -0700 [thread overview]
Message-ID: <20131003040522.625750151@linuxfoundation.org> (raw)
In-Reply-To: <20131003040522.190209641@linuxfoundation.org>
3.10-stable review patch. If anyone has any objections, please let me know.
------------------
From: Kent Overstreet <kmo@daterainc.com>
commit c2a4f3183a1248f615a695fbd8905da55ad11bba upstream.
Background writeback works by scanning the btree for dirty data and
adding those keys into a fixed size buffer, then for each dirty key in
the keybuf writing it to the backing device.
When read_dirty() finishes and it's time to scan for more dirty data, we
need to wait for the outstanding writeback IO to finish - they still
take up slots in the keybuf (so that foreground writes can check for
them to avoid races) - without that wait, we'll continually rescan when
we'll be able to add at most a key or two to the keybuf, and that takes
locks that starves foreground IO. Doh.
Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
drivers/md/bcache/bcache.h | 7 ++----
drivers/md/bcache/util.c | 11 +++++++++-
drivers/md/bcache/util.h | 12 ++++++++---
drivers/md/bcache/writeback.c | 43 ++++++++++++++++++++----------------------
4 files changed, 43 insertions(+), 30 deletions(-)
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -499,7 +499,7 @@ struct cached_dev {
*/
atomic_t has_dirty;
- struct ratelimit writeback_rate;
+ struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
/*
@@ -508,10 +508,9 @@ struct cached_dev {
*/
sector_t last_read;
- /* Number of writeback bios in flight */
- atomic_t in_flight;
+ /* Limit number of writeback bios in flight */
+ struct semaphore in_flight;
struct closure_with_timer writeback;
- struct closure_waitlist writeback_wait;
struct keybuf writeback_keys;
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_s
stats->last = now ?: 1;
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequen
(ewma) >> factor; \
})
-struct ratelimit {
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
uint64_t next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond
+ * The units here correspond to the units passed to bch_next_delay()
+ */
unsigned rate;
};
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
{
d->next = local_clock();
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
#define __DIV_SAFE(n, d, zero) \
({ \
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -91,11 +91,15 @@ static void update_writeback_rate(struct
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
+ uint64_t ret;
+
if (atomic_read(&dc->disk.detaching) ||
!dc->writeback_percent)
return 0;
- return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+ ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+ return min_t(uint64_t, ret, HZ);
}
/* Background writeback */
@@ -165,7 +169,7 @@ static void refill_dirty(struct closure
up_write(&dc->writeback_lock);
- ratelimit_reset(&dc->writeback_rate);
+ bch_ratelimit_reset(&dc->writeback_rate);
/* Punt to workqueue only so we don't recurse and blow the stack */
continue_at(cl, read_dirty, dirty_wq);
@@ -246,9 +250,7 @@ static void write_dirty_finish(struct cl
}
bch_keybuf_del(&dc->writeback_keys, w);
- atomic_dec_bug(&dc->in_flight);
-
- closure_wake_up(&dc->writeback_wait);
+ up(&dc->in_flight);
closure_return_with_destructor(cl, dirty_io_destructor);
}
@@ -278,7 +280,7 @@ static void write_dirty(struct closure *
trace_bcache_write_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty_finish, dirty_wq);
+ continue_at(cl, write_dirty_finish, system_wq);
}
static void read_dirty_endio(struct bio *bio, int error)
@@ -299,7 +301,7 @@ static void read_dirty_submit(struct clo
trace_bcache_read_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty, dirty_wq);
+ continue_at(cl, write_dirty, system_wq);
}
static void read_dirty(struct closure *cl)
@@ -324,12 +326,9 @@ static void read_dirty(struct closure *c
if (delay > 0 &&
(KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)) {
- w->private = NULL;
-
- closure_delay(&dc->writeback, delay);
- continue_at(cl, read_dirty, dirty_wq);
- }
+ jiffies_to_msecs(delay) > 50))
+ while (delay)
+ delay = schedule_timeout(delay);
dc->last_read = KEY_OFFSET(&w->key);
@@ -354,15 +353,10 @@ static void read_dirty(struct closure *c
pr_debug("%s", pkey(&w->key));
- closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+ down(&dc->in_flight);
+ closure_call(&io->cl, read_dirty_submit, NULL, cl);
delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
- atomic_inc(&dc->in_flight);
-
- if (!closure_wait_event(&dc->writeback_wait, cl,
- atomic_read(&dc->in_flight) < 64))
- continue_at(cl, read_dirty, dirty_wq);
}
if (0) {
@@ -372,11 +366,16 @@ err:
bch_keybuf_del(&dc->writeback_keys, w);
}
- refill_dirty(cl);
+ /*
+ * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+ * freed) before refilling again
+ */
+ continue_at(cl, refill_dirty, dirty_wq);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
+ sema_init(&dc->in_flight, 64);
closure_init_unlocked(&dc->writeback);
init_rwsem(&dc->writeback_lock);
@@ -406,7 +405,7 @@ void bch_writeback_exit(void)
int __init bch_writeback_init(void)
{
- dirty_wq = create_singlethread_workqueue("bcache_writeback");
+ dirty_wq = create_workqueue("bcache_writeback");
if (!dirty_wq)
return -ENOMEM;
next prev parent reply other threads:[~2013-10-03 5:25 UTC|newest]
Thread overview: 57+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-10-03 4:05 [ 00/52] 3.10.15-stable review Greg Kroah-Hartman
2013-10-03 4:05 ` [ 01/52] block: Fix bio_copy_data() Greg Kroah-Hartman
2013-10-03 4:05 ` [ 02/52] sysv: Add forgotten superblock lock init for v7 fs Greg Kroah-Hartman
2013-10-03 4:05 ` [ 03/52] bcache: Fix a dumb journal discard bug Greg Kroah-Hartman
2013-10-03 4:05 ` [ 04/52] bcache: Strip endline when writing the label through sysfs Greg Kroah-Hartman
2013-10-03 4:05 ` [ 05/52] bcache: Fix for when no journal entries are found Greg Kroah-Hartman
2013-10-03 4:05 ` Greg Kroah-Hartman [this message]
2013-10-03 4:05 ` [ 07/52] bcache: Fix a flush/fua performance bug Greg Kroah-Hartman
2013-10-03 4:05 ` [ 08/52] bcache: Fix a dumb CPU spinning bug in writeback Greg Kroah-Hartman
2013-10-03 4:05 ` [ 09/52] bcache: Fix a shrinker deadlock Greg Kroah-Hartman
2013-10-03 4:05 ` [ 10/52] bcache: Fix for handling overlapping extents when reading in a btree node Greg Kroah-Hartman
2013-10-03 4:05 ` [ 11/52] bcache: Fix flushes in writeback mode Greg Kroah-Hartman
2013-10-03 4:05 ` [ 12/52] x86/reboot: Add quirk to make Dell C6100 use reboot=pci automatically Greg Kroah-Hartman
2013-10-03 4:05 ` [ 13/52] tools lib lk: Uninclude linux/magic.h in debugfs.c Greg Kroah-Hartman
2013-10-03 4:05 ` [ 14/52] x86, efi: Dont map Boot Services on i386 Greg Kroah-Hartman
2013-10-03 4:05 ` [ 15/52] mei: make me client counters less error prone Greg Kroah-Hartman
2013-10-03 4:05 ` [ 16/52] mei: bus: stop wait for read during cl state transition Greg Kroah-Hartman
2013-10-03 4:05 ` [ 17/52] mei: cancel stall timers in mei_reset Greg Kroah-Hartman
2013-10-03 4:05 ` [ 18/52] tty: Fix SIGTTOU not sent with tcflush() Greg Kroah-Hartman
2013-10-03 4:05 ` [ 19/52] serial: tegra: fix tty-kref leak Greg Kroah-Hartman
2013-10-03 4:05 ` [ 20/52] serial: pch_uart: fix tty-kref leak in rx-error path Greg Kroah-Hartman
2013-10-03 4:05 ` [ 21/52] serial: pch_uart: fix tty-kref leak in dma-rx path Greg Kroah-Hartman
2013-10-03 4:05 ` [ 22/52] ARM: 7837/3: fix Thumb-2 bug in AES assembler code Greg Kroah-Hartman
2013-10-03 4:05 ` [ 23/52] staging: vt6656: [BUG] main_usb.c oops on device_close move flag earlier Greg Kroah-Hartman
2013-10-03 4:05 ` [ 24/52] staging: vt6656: [BUG] iwctl_siwencodeext return if device not open Greg Kroah-Hartman
2013-10-03 4:05 ` [ 25/52] drm/i915/tv: clear adjusted_mode.flags Greg Kroah-Hartman
2013-10-03 4:05 ` [ 26/52] xhci: Ensure a command structure points to the correct trb on the command ring Greg Kroah-Hartman
2013-10-03 4:05 ` [ 27/52] xhci: Fix oops happening after address device timeout Greg Kroah-Hartman
2013-10-03 4:05 ` [ 28/52] USB: fix PM config symbol in uhci-hcd, ehci-hcd, and xhci-hcd Greg Kroah-Hartman
2013-10-03 4:05 ` [ 29/52] xhci: Fix race between ep halt and URB cancellation Greg Kroah-Hartman
2013-10-03 4:05 ` [ 30/52] USB: OHCI: accept very late isochronous URBs Greg Kroah-Hartman
2013-10-03 4:05 ` [ 31/52] USB: UHCI: " Greg Kroah-Hartman
2013-10-03 4:05 ` [ 32/52] USB: Fix breakage in ffs_fs_mount() Greg Kroah-Hartman
2013-10-03 4:05 ` [ 33/52] fsl/usb: Resolve PHY_CLK_VLD instability issue for ULPI phy Greg Kroah-Hartman
2013-10-03 4:05 ` [ 34/52] usb: dwc3: pci: add support for BayTrail Greg Kroah-Hartman
2013-10-03 4:05 ` [ 35/52] usb: dwc3: add support for Merrifield Greg Kroah-Hartman
2013-10-03 4:05 ` [ 36/52] usb/core/devio.c: Dont reject control message to endpoint with wrong direction bit Greg Kroah-Hartman
2013-10-03 4:05 ` [ 37/52] driver core : Fix use after free of dev->parent in device_shutdown Greg Kroah-Hartman
2013-10-03 4:05 ` [ 38/52] dm snapshot: workaround for a false positive lockdep warning Greg Kroah-Hartman
2013-10-03 4:05 ` [ 39/52] dm-snapshot: fix performance degradation due to small hash size Greg Kroah-Hartman
2013-10-03 4:05 ` [ 40/52] dm mpath: disable WRITE SAME if it fails Greg Kroah-Hartman
2013-10-03 4:05 ` [ 41/52] dm-raid: silence compiler warning on rebuilds_per_group Greg Kroah-Hartman
2013-10-03 4:06 ` [ 42/52] drm/i915: preserve pipe A quirk in i9xx_set_pipeconf Greg Kroah-Hartman
2013-10-03 4:06 ` [ 43/52] drm/i915/dp: increase i2c-over-aux retry interval on AUX DEFER Greg Kroah-Hartman
2013-10-03 4:06 ` [ 44/52] drm/radeon: avoid UVD corruption on AGP cards using GPU gart Greg Kroah-Hartman
2013-10-03 4:06 ` [ 45/52] drm/radeon: Make r100_cp_ring_info() and radeon_ring_gfx() safe (v2) Greg Kroah-Hartman
2013-10-03 4:06 ` [ 46/52] drm/radeon: disable tests/benchmarks if accel is disabled Greg Kroah-Hartman
2013-10-03 4:06 ` [ 47/52] drm/radeon: add missing hdmi callbacks for rv6xx Greg Kroah-Hartman
2013-10-03 4:06 ` [ 48/52] drm/radeon: fix hdmi audio on DCE3.0/3.1 asics Greg Kroah-Hartman
2013-10-03 4:06 ` [ 49/52] ARM: mxs: stub out mxs_pm_init for !CONFIG_PM Greg Kroah-Hartman
2013-10-03 4:06 ` [ 50/52] hwmon: (applesmc) Check key count before proceeding Greg Kroah-Hartman
2013-10-03 4:06 ` [ 51/52] ALSA: compress: Fix compress device unregister Greg Kroah-Hartman
2013-10-03 4:06 ` [ 52/52] drm/i915: fix gen4 digital port hotplug definitions Greg Kroah-Hartman
2013-10-03 13:32 ` [ 00/52] 3.10.15-stable review Guenter Roeck
2013-10-03 18:41 ` Greg Kroah-Hartman
2013-10-03 22:54 ` Shuah Khan
2013-10-03 23:04 ` Greg Kroah-Hartman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20131003040522.625750151@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=kmo@daterainc.com \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.