public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Joe Thornber <ejt@redhat.com>,
	Mike Snitzer <snitzer@redhat.com>,
	Sasha Levin <alexander.levin@microsoft.com>
Subject: [PATCH 4.9 41/59] dm thin metadata: try to avoid ever aborting transactions
Date: Mon,  8 Oct 2018 20:31:48 +0200	[thread overview]
Message-ID: <20181008175550.722285235@linuxfoundation.org> (raw)
In-Reply-To: <20181008175546.620836256@linuxfoundation.org>

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Joe Thornber <ejt@redhat.com>

[ Upstream commit 3ab91828166895600efd9cdc3a0eb32001f7204a ]

Committing a transaction can consume some metadata of its own, so we now
reserve a small amount of metadata to cover this.  Free metadata
reported by the kernel will not include this reserve.

If any of the reserve has been used after a commit we enter a new
internal state PM_OUT_OF_METADATA_SPACE.  This is reported as
PM_READ_ONLY, so no userland changes are needed.  If the metadata
device is resized the pool will move back to PM_WRITE.

These changes mean we never need to abort and rollback a transaction due
to running out of metadata space.  This is particularly important
because there have been a handful of reports of data corruption against
DM thin-provisioning that can all be attributed to the thin-pool having
run out of metadata space.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/md/dm-thin-metadata.c |   36 ++++++++++++++++++++
 drivers/md/dm-thin.c          |   73 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 100 insertions(+), 9 deletions(-)

--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -190,6 +190,12 @@ struct dm_pool_metadata {
 	sector_t data_block_size;
 
 	/*
+	 * We reserve a section of the metadata for commit overhead.
+	 * All reported space does *not* include this.
+	 */
+	dm_block_t metadata_reserve;
+
+	/*
 	 * Set if a transaction has to be aborted but the attempt to roll back
 	 * to the previous (good) transaction failed.  The only pool metadata
 	 * operation possible in this state is the closing of the device.
@@ -827,6 +833,22 @@ static int __commit_transaction(struct d
 	return dm_tm_commit(pmd->tm, sblock);
 }
 
+static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
+{
+	int r;
+	dm_block_t total;
+	dm_block_t max_blocks = 4096; /* 16M */
+
+	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
+	if (r) {
+		DMERR("could not get size of metadata device");
+		pmd->metadata_reserve = max_blocks;
+	} else {
+		sector_div(total, 10);
+		pmd->metadata_reserve = min(max_blocks, total);
+	}
+}
+
 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 					       sector_t data_block_size,
 					       bool format_device)
@@ -860,6 +882,8 @@ struct dm_pool_metadata *dm_pool_metadat
 		return ERR_PTR(r);
 	}
 
+	__set_metadata_reserve(pmd);
+
 	return pmd;
 }
 
@@ -1831,6 +1855,13 @@ int dm_pool_get_free_metadata_block_coun
 	down_read(&pmd->root_lock);
 	if (!pmd->fail_io)
 		r = dm_sm_get_nr_free(pmd->metadata_sm, result);
+
+	if (!r) {
+		if (*result < pmd->metadata_reserve)
+			*result = 0;
+		else
+			*result -= pmd->metadata_reserve;
+	}
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1943,8 +1974,11 @@ int dm_pool_resize_metadata_dev(struct d
 	int r = -EINVAL;
 
 	down_write(&pmd->root_lock);
-	if (!pmd->fail_io)
+	if (!pmd->fail_io) {
 		r = __resize_space_map(pmd->metadata_sm, new_count);
+		if (!r)
+			__set_metadata_reserve(pmd);
+	}
 	up_write(&pmd->root_lock);
 
 	return r;
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -200,7 +200,13 @@ struct dm_thin_new_mapping;
 enum pool_mode {
 	PM_WRITE,		/* metadata may be changed */
 	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
+
+	/*
+	 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
+	 */
+	PM_OUT_OF_METADATA_SPACE,
 	PM_READ_ONLY,		/* metadata may not be changed */
+
 	PM_FAIL,		/* all I/O fails */
 };
 
@@ -1386,7 +1392,35 @@ static void set_pool_mode(struct pool *p
 
 static void requeue_bios(struct pool *pool);
 
-static void check_for_space(struct pool *pool)
+static bool is_read_only_pool_mode(enum pool_mode mode)
+{
+	return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
+}
+
+static bool is_read_only(struct pool *pool)
+{
+	return is_read_only_pool_mode(get_pool_mode(pool));
+}
+
+static void check_for_metadata_space(struct pool *pool)
+{
+	int r;
+	const char *ooms_reason = NULL;
+	dm_block_t nr_free;
+
+	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
+	if (r)
+		ooms_reason = "Could not get free metadata blocks";
+	else if (!nr_free)
+		ooms_reason = "No free metadata blocks";
+
+	if (ooms_reason && !is_read_only(pool)) {
+		DMERR("%s", ooms_reason);
+		set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
+	}
+}
+
+static void check_for_data_space(struct pool *pool)
 {
 	int r;
 	dm_block_t nr_free;
@@ -1412,14 +1446,16 @@ static int commit(struct pool *pool)
 {
 	int r;
 
-	if (get_pool_mode(pool) >= PM_READ_ONLY)
+	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
 		return -EINVAL;
 
 	r = dm_pool_commit_metadata(pool->pmd);
 	if (r)
 		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
-	else
-		check_for_space(pool);
+	else {
+		check_for_metadata_space(pool);
+		check_for_data_space(pool);
+	}
 
 	return r;
 }
@@ -1485,6 +1521,19 @@ static int alloc_data_block(struct thin_
 		return r;
 	}
 
+	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
+		return r;
+	}
+
+	if (!free_blocks) {
+		/* Let's commit before we use up the metadata reserve. */
+		r = commit(pool);
+		if (r)
+			return r;
+	}
+
 	return 0;
 }
 
@@ -1516,6 +1565,7 @@ static int should_error_unserviceable_bi
 	case PM_OUT_OF_DATA_SPACE:
 		return pool->pf.error_if_no_space ? -ENOSPC : 0;
 
+	case PM_OUT_OF_METADATA_SPACE:
 	case PM_READ_ONLY:
 	case PM_FAIL:
 		return -EIO;
@@ -2479,8 +2529,9 @@ static void set_pool_mode(struct pool *p
 		error_retry_list(pool);
 		break;
 
+	case PM_OUT_OF_METADATA_SPACE:
 	case PM_READ_ONLY:
-		if (old_mode != new_mode)
+		if (!is_read_only_pool_mode(old_mode))
 			notify_of_pool_mode_change(pool, "read-only");
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_read_only;
@@ -3418,6 +3469,10 @@ static int maybe_resize_metadata_dev(str
 		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
 		       dm_device_name(pool->pool_md),
 		       sb_metadata_dev_size, metadata_dev_size);
+
+		if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
+			set_pool_mode(pool, PM_WRITE);
+
 		r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
 		if (r) {
 			metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
@@ -3721,7 +3776,7 @@ static int pool_message(struct dm_target
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
 
-	if (get_pool_mode(pool) >= PM_READ_ONLY) {
+	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
 		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
 		      dm_device_name(pool->pool_md));
 		return -EOPNOTSUPP;
@@ -3795,6 +3850,7 @@ static void pool_status(struct dm_target
 	dm_block_t nr_blocks_data;
 	dm_block_t nr_blocks_metadata;
 	dm_block_t held_root;
+	enum pool_mode mode;
 	char buf[BDEVNAME_SIZE];
 	char buf2[BDEVNAME_SIZE];
 	struct pool_c *pt = ti->private;
@@ -3865,9 +3921,10 @@ static void pool_status(struct dm_target
 		else
 			DMEMIT("- ");
 
-		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+		mode = get_pool_mode(pool);
+		if (mode == PM_OUT_OF_DATA_SPACE)
 			DMEMIT("out_of_data_space ");
-		else if (pool->pf.mode == PM_READ_ONLY)
+		else if (is_read_only_pool_mode(mode))
 			DMEMIT("ro ");
 		else
 			DMEMIT("rw ");



  parent reply	other threads:[~2018-10-08 18:40 UTC|newest]

Thread overview: 65+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-10-08 18:31 [PATCH 4.9 00/59] 4.9.132-stable review Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 01/59] serial: mvebu-uart: Fix reporting of effective CSIZE to userspace Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 02/59] time: Introduce jiffies64_to_nsecs() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 03/59] mac80211: Run TXQ teardown code before de-registering interfaces Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 04/59] KVM: PPC: Book3S HV: Dont truncate HPTE index in xlate function Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 05/59] mac80211: correct use of IEEE80211_VHT_CAP_RXSTBC_X Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 06/59] mac80211_hwsim: " Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 07/59] gpio: adp5588: Fix sleep-in-atomic-context bug Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 08/59] mac80211: mesh: fix HWMP sequence numbering to follow standard Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 09/59] net: hns: add netif_carrier_off before change speed and duplex Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 10/59] cfg80211: nl80211_update_ft_ies() to validate NL80211_ATTR_IE Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 11/59] gpio: Fix crash due to registration race Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 12/59] ARC: atomics: unbork atomic_fetch_##op() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 13/59] RAID10 BUG_ON in raise_barrier when force is true and conf->barrier is 0 Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 14/59] i2c: uniphier: issue STOP only for last message or I2C_M_STOP Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 15/59] i2c: uniphier-f: " Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 16/59] net: cadence: Fix a sleep-in-atomic-context bug in macb_halt_tx() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 17/59] fs/cifs: dont translate SFM_SLASH (U+F026) to backslash Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 18/59] cfg80211: fix a type issue in ieee80211_chandef_to_operating_class() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 19/59] mac80211: fix a race between restart and CSA flows Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 20/59] mac80211: Fix station bandwidth setting after channel switch Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 21/59] mac80211: dont Tx a deauth frame if the AP forbade Tx Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 22/59] mac80211: shorten the IBSS debug messages Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 23/59] tools/vm/slabinfo.c: fix sign-compare warning Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 24/59] tools/vm/page-types.c: fix "defined but not used" warning Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 25/59] mm: madvise(MADV_DODUMP): allow hugetlbfs pages Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 26/59] HID: add support for Apple Magic Keyboards Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 27/59] usb: gadget: fotg210-udc: Fix memory leak of fotg210->ep[i] Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 28/59] pinctrl: msm: Really mask level interrupts to prevent latching Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 29/59] HID: hid-saitek: Add device ID for RAT 7 Contagion Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 30/59] perf evsel: Fix potential null pointer dereference in perf_evsel__new_idx() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 31/59] perf probe powerpc: Ignore SyS symbols irrespective of endianness Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 32/59] RDMA/ucma: check fd type in ucma_migrate_id() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 33/59] USB: yurex: Check for truncation in yurex_read() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 34/59] nvmet-rdma: fix possible bogus dereference under heavy load Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 35/59] net/mlx5: Consider PCI domain in search for next dev Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 36/59] drm/nouveau/TBDdevinit: dont fail when PMU/PRE_OS is missing from VBIOS Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 37/59] dm raid: fix rebuild of specific devices by updating superblock Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 38/59] fs/cifs: suppress a string overflow warning Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 39/59] net: ena: fix driver when PAGE_SIZE == 64kB Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 40/59] perf/x86/intel: Add support/quirk for the MISPREDICT bit on Knights Landing CPUs Greg Kroah-Hartman
2018-10-08 18:31 ` Greg Kroah-Hartman [this message]
2018-10-08 18:31 ` [PATCH 4.9 42/59] arch/hexagon: fix kernel/dma.c build warning Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 43/59] hexagon: modify ffs() and fls() to return int Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 44/59] arm64: jump_label.h: use asm_volatile_goto macro instead of "asm goto" Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 45/59] r8169: Clear RTL_FLAG_TASK_*_PENDING when clearing RTL_FLAG_TASK_ENABLED Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 46/59] s390/qeth: use vzalloc for QUERY OAT buffer Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 47/59] s390/qeth: dont dump past end of unknown HW header Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 48/59] cifs: read overflow in is_valid_oplock_break() Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 49/59] xen/manage: dont complain about an empty value in control/sysrq node Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 50/59] xen: avoid crash in disable_hotplug_cpu Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 51/59] xen: fix GCC warning and remove duplicate EVTCHN_ROW/EVTCHN_COL usage Greg Kroah-Hartman
2018-10-08 18:31 ` [PATCH 4.9 52/59] sysfs: Do not return POSIX ACL xattrs via listxattr Greg Kroah-Hartman
2018-10-08 18:32 ` [PATCH 4.9 53/59] smb2: fix missing files in root share directory listing Greg Kroah-Hartman
2018-10-08 18:32 ` [PATCH 4.9 54/59] ALSA: hda/realtek - Cannot adjust speakers volume on Dell XPS 27 7760 Greg Kroah-Hartman
2018-10-08 18:32 ` [PATCH 4.9 55/59] crypto: qat - Fix KASAN stack-out-of-bounds bug in adf_probe() Greg Kroah-Hartman
2018-10-08 18:32 ` [PATCH 4.9 56/59] crypto: mxs-dcp - Fix wait logic on chan threads Greg Kroah-Hartman
2018-10-08 18:32 ` [PATCH 4.9 57/59] gpiolib: Free the last requested descriptor Greg Kroah-Hartman
2018-10-08 18:32 ` [PATCH 4.9 58/59] proc: restrict kernel stack dumps to root Greg Kroah-Hartman
2018-10-08 18:32 ` [PATCH 4.9 59/59] ocfs2: fix locking for res->tracking and dlm->tracking_list Greg Kroah-Hartman
2018-10-08 23:22 ` [PATCH 4.9 00/59] 4.9.132-stable review Shuah Khan
2018-10-09  1:30 ` Nathan Chancellor
2018-10-09  9:26   ` Greg Kroah-Hartman
2018-10-09 21:05 ` Guenter Roeck
2018-10-10  4:15 ` Naresh Kamboju

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181008175550.722285235@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=alexander.levin@microsoft.com \
    --cc=ejt@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=snitzer@redhat.com \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox