From: Fam Zheng <famz@redhat.com>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>,
qemu-block@nongnu.org, rjones@redhat.com,
John Snow <jsnow@redhat.com>, Jeff Cody <jcody@redhat.com>,
Markus Armbruster <armbru@redhat.com>,
Max Reitz <mreitz@redhat.com>,
stefanha@redhat.com, den@openvz.org, pbonzini@redhat.com,
berrange@redhat.com
Subject: [Qemu-devel] [PATCH v7 06/20] raw-posix: Add image locking support
Date: Mon, 8 Aug 2016 21:13:19 +0800 [thread overview]
Message-ID: <1470662013-19785-7-git-send-email-famz@redhat.com> (raw)
In-Reply-To: <1470662013-19785-1-git-send-email-famz@redhat.com>
virtlockd in libvirt locks the first byte, we lock byte 1 to avoid
the intervene.
Both file and host device protocols are covered.
The complication is with reopen. We have three different locking states,
namely "unlocked", "shared locked" and "exclusively locked".
When we reopen, the new fd may need a new locking mode. Moving away to or from
exclusive is a bit tricky because we cannot do it atomically. This patch solves
it by dup() s->fd to s->lock_fd and avoid close(), so that there isn't a racy
window where we drop the lock on one fd before acquiring the exclusive lock on
the other.
To make the logic easier to manage, and allow better reuse, the code is
internally organized by state transition table (old_lock -> new_lock).
Signed-off-by: Fam Zheng <famz@redhat.com>
---
v7: Fix raw_reopen_to_unlock wrong assertion.
---
block/raw-posix.c | 298 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 297 insertions(+), 1 deletion(-)
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6ed7547..371667d 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -133,6 +133,7 @@ do { \
typedef struct BDRVRawState {
int fd;
+ int lock_fd;
int type;
int open_flags;
size_t buf_align;
@@ -149,6 +150,7 @@ typedef struct BDRVRawState {
typedef struct BDRVRawReopenState {
int fd;
+ int lock_fd;
int open_flags;
} BDRVRawReopenState;
@@ -367,6 +369,38 @@ static void raw_parse_flags(int bdrv_flags, int *open_flags)
}
}
+
+static int raw_lockf_fd(int fd, BdrvLockfCmd cmd)
+{
+ assert(fd >= 0);
+ /* Locking byte 1 avoids interfereing with virtlockd. */
+ switch (cmd) {
+ case BDRV_LOCKF_EXCLUSIVE:
+ return qemu_lock_fd(fd, 1, 1, true);
+ case BDRV_LOCKF_SHARED:
+ return qemu_lock_fd(fd, 1, 1, false);
+ case BDRV_LOCKF_UNLOCK:
+ return qemu_unlock_fd(fd, 1, 1);
+ default:
+ abort();
+ }
+}
+
+static int raw_lockf(BlockDriverState *bs, BdrvLockfCmd cmd)
+{
+
+ BDRVRawState *s = bs->opaque;
+
+ if (s->lock_fd < 0) {
+ s->lock_fd = qemu_dup(s->fd);
+ if (s->lock_fd < 0) {
+ return s->lock_fd;
+ }
+ }
+
+ return raw_lockf_fd(s->lock_fd, cmd);
+}
+
#ifdef CONFIG_LINUX_AIO
static bool raw_use_aio(int bdrv_flags)
{
@@ -433,6 +467,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
raw_parse_flags(bdrv_flags, &s->open_flags);
s->fd = -1;
+ s->lock_fd = -1;
fd = qemu_open(filename, s->open_flags, 0644);
if (fd < 0) {
ret = -errno;
@@ -529,6 +564,253 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
return raw_open_common(bs, options, flags, 0, errp);
}
+typedef enum {
+ RAW_REOPEN_PREPARE,
+ RAW_REOPEN_COMMIT,
+ RAW_REOPEN_ABORT
+} RawReopenOperation;
+
+typedef int (*RawReopenFunc)(BDRVReopenState *state,
+ RawReopenOperation op,
+ BdrvLockfCmd old_lock,
+ BdrvLockfCmd new_lock,
+ Error **errp);
+
+static int
+raw_reopen_identical(BDRVReopenState *state,
+ RawReopenOperation op,
+ BdrvLockfCmd old_lock,
+ BdrvLockfCmd new_lock,
+ Error **errp)
+{
+ assert(old_lock == new_lock);
+ return 0;
+}
+
+static int
+raw_reopen_from_unlock(BDRVReopenState *state,
+ RawReopenOperation op,
+ BdrvLockfCmd old_lock,
+ BdrvLockfCmd new_lock,
+ Error **errp)
+{
+ BDRVRawReopenState *raw_s = state->opaque;
+ int ret = 0;
+
+ assert(old_lock != new_lock);
+ assert(old_lock == BDRV_LOCKF_UNLOCK);
+ switch (op) {
+ case RAW_REOPEN_PREPARE:
+ ret = raw_lockf_fd(raw_s->lock_fd, new_lock);
+ if (ret) {
+ error_setg_errno(errp, -ret, "Failed to lock new fd");
+ }
+ break;
+ case RAW_REOPEN_COMMIT:
+ case RAW_REOPEN_ABORT:
+ break;
+ }
+
+ return ret;
+}
+
+static int
+raw_reopen_to_unlock(BDRVReopenState *state,
+ RawReopenOperation op,
+ BdrvLockfCmd old_lock,
+ BdrvLockfCmd new_lock,
+ Error **errp)
+{
+ BDRVRawState *s = state->bs->opaque;
+ int ret = 0;
+
+ assert(old_lock != new_lock);
+ assert(new_lock == BDRV_LOCKF_UNLOCK);
+ switch (op) {
+ case RAW_REOPEN_PREPARE:
+ break;
+ case RAW_REOPEN_COMMIT:
+ if (s->lock_fd >= 0) {
+ qemu_close(s->lock_fd);
+ s->lock_fd = -1;
+ }
+ break;
+ case RAW_REOPEN_ABORT:
+ break;
+ }
+
+ return ret;
+}
+
+static int
+raw_reopen_upgrade(BDRVReopenState *state,
+ RawReopenOperation op,
+ BdrvLockfCmd old_lock,
+ BdrvLockfCmd new_lock,
+ Error **errp)
+{
+ BDRVRawReopenState *raw_s = state->opaque;
+ BDRVRawState *s = state->bs->opaque;
+ int ret = 0, ret2;
+
+ assert(old_lock == BDRV_LOCKF_SHARED);
+ assert(new_lock == BDRV_LOCKF_EXCLUSIVE);
+ switch (op) {
+ case RAW_REOPEN_PREPARE:
+ ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
+ if (ret) {
+ error_setg_errno(errp, -ret, "Failed to lock new fd (shared)");
+ break;
+ }
+ ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_UNLOCK);
+ if (ret) {
+ error_setg_errno(errp, -ret, "Failed to unlock old fd");
+ goto restore;
+ }
+ ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_EXCLUSIVE);
+ if (ret) {
+ error_setg_errno(errp, -ret, "Failed to lock new fd (exclusive)");
+ goto restore;
+ }
+ break;
+ case RAW_REOPEN_COMMIT:
+ break;
+ case RAW_REOPEN_ABORT:
+ raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
+ ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
+ if (ret) {
+ error_report("Failed to restore lock on old fd");
+ }
+ break;
+ }
+
+ return ret;
+restore:
+ ret2 = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
+ if (ret2) {
+ error_report("Failed to restore old lock");
+ }
+ return ret;
+}
+
+static int
+raw_reopen_downgrade(BDRVReopenState *state,
+ RawReopenOperation op,
+ BdrvLockfCmd old_lock,
+ BdrvLockfCmd new_lock,
+ Error **errp)
+{
+ BDRVRawReopenState *raw_s = state->opaque;
+ BDRVRawState *s = state->bs->opaque;
+ int ret = 0;
+
+ assert(old_lock == BDRV_LOCKF_EXCLUSIVE);
+ assert(new_lock == BDRV_LOCKF_SHARED);
+ switch (op) {
+ case RAW_REOPEN_PREPARE:
+ break;
+ case RAW_REOPEN_COMMIT:
+ ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
+ if (ret) {
+ error_report("Failed to downgrade old lock");
+ break;
+ }
+ ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
+ if (ret) {
+ error_report("Failed to lock new fd");
+ break;
+ }
+ break;
+ case RAW_REOPEN_ABORT:
+ break;
+ }
+
+ return ret;
+}
+
+/**
+ * Transactionally moving between three possible locking states is tricky and
+ * must be done carefully. That is mostly because downgrading an exclusive lock
+ * to shared or unlocked is not guaranteed to be revertable. As a result, in
+ * such cases we have to defer the downgraing to "commit", given that no revert
+ * will happen after that point, and that downgrading a lock should never fail.
+ *
+ * On the other hand, upgrading a lock (e.g. from unlocked or shared to
+ * exclusive lock) must happen in "prepare" because it may fail.
+ *
+ * Manage the operation matrix with this state transition table to make
+ * fulfulling above conditions easier.
+ */
+static const struct RawReopenFuncRecord {
+ BdrvLockfCmd old_lock;
+ BdrvLockfCmd new_lock;
+ RawReopenFunc func;
+ bool need_lock_fd;
+} reopen_functions[] = {
+ {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false},
+ {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true},
+ {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true},
+ {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
+ {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false},
+ {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true},
+ {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
+ {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true},
+ {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, false},
+};
+
+static int raw_reopen_handle_lock(BDRVReopenState *state,
+ RawReopenOperation op,
+ Error **errp)
+{
+ BDRVRawReopenState *raw_s = state->opaque;
+ BDRVRawState *s = state->bs->opaque;
+ BdrvLockfCmd old_lock, new_lock;
+ const struct RawReopenFuncRecord *rec;
+ int ret;
+
+ old_lock = bdrv_get_locking_cmd(bdrv_get_flags(state->bs));
+ new_lock = bdrv_get_locking_cmd(state->flags);
+
+ for (rec = &reopen_functions[0];
+ rec < &reopen_functions[ARRAY_SIZE(reopen_functions)];
+ rec++) {
+ if (rec->old_lock == old_lock && rec->new_lock == new_lock) {
+ break;
+ }
+ }
+ assert(rec != &reopen_functions[ARRAY_SIZE(reopen_functions)]);
+
+ switch (op) {
+ case RAW_REOPEN_PREPARE:
+ if (rec->need_lock_fd) {
+ ret = qemu_dup(raw_s->fd);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to dup new fd");
+ return ret;
+ }
+ raw_s->lock_fd = ret;
+ }
+ return rec->func(state, op, old_lock, new_lock, errp);
+ case RAW_REOPEN_COMMIT:
+ rec->func(state, op, old_lock, new_lock, errp);
+ if (rec->need_lock_fd) {
+ if (s->lock_fd >= 0) {
+ qemu_close(s->lock_fd);
+ }
+ s->lock_fd = raw_s->lock_fd;
+ }
+ break;
+ case RAW_REOPEN_ABORT:
+ rec->func(state, op, old_lock, new_lock, errp);
+ if (rec->need_lock_fd && raw_s->lock_fd >= 0) {
+ qemu_close(raw_s->lock_fd);
+ raw_s->lock_fd = -1;
+ }
+ break;
+ }
+ return 0;
+}
+
static int raw_reopen_prepare(BDRVReopenState *state,
BlockReopenQueue *queue, Error **errp)
{
@@ -607,6 +889,10 @@ static int raw_reopen_prepare(BDRVReopenState *state,
}
}
+ if (!ret) {
+ ret = raw_reopen_handle_lock(state, RAW_REOPEN_PREPARE, errp);
+ }
+
return ret;
}
@@ -617,6 +903,8 @@ static void raw_reopen_commit(BDRVReopenState *state)
s->open_flags = raw_s->open_flags;
+ raw_reopen_handle_lock(state, RAW_REOPEN_COMMIT, NULL);
+
qemu_close(s->fd);
s->fd = raw_s->fd;
@@ -634,6 +922,8 @@ static void raw_reopen_abort(BDRVReopenState *state)
return;
}
+ raw_reopen_handle_lock(state, RAW_REOPEN_ABORT, NULL);
+
if (raw_s->fd >= 0) {
qemu_close(raw_s->fd);
raw_s->fd = -1;
@@ -1321,6 +1611,10 @@ static void raw_close(BlockDriverState *bs)
qemu_close(s->fd);
s->fd = -1;
}
+ if (s->lock_fd >= 0) {
+ qemu_close(s->lock_fd);
+ s->lock_fd = -1;
+ }
}
static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -1874,7 +2168,7 @@ BlockDriver bdrv_file = {
.bdrv_get_info = raw_get_info,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
-
+ .bdrv_lockf = raw_lockf,
.create_opts = &raw_create_opts,
};
@@ -2324,6 +2618,8 @@ static BlockDriver bdrv_host_device = {
#ifdef __linux__
.bdrv_aio_ioctl = hdev_aio_ioctl,
#endif
+
+ .bdrv_lockf = raw_lockf,
};
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
--
2.7.4
next prev parent reply other threads:[~2016-08-08 13:14 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-08-08 13:13 [Qemu-devel] [PATCH v7 00/20] block: Image locking series for 2.8 Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 01/20] block: Add flag bits for image locking Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 02/20] qapi: Add lock-mode in blockdev-add options Fam Zheng
2016-09-06 16:18 ` Kevin Wolf
2016-09-07 2:19 ` Fam Zheng
2016-09-22 14:58 ` Eric Blake
2016-09-23 4:01 ` Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 03/20] block: Add and parse "lock-mode" option for image locking Fam Zheng
2016-09-06 16:33 ` Kevin Wolf
2016-09-07 2:18 ` Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 04/20] block: Introduce image file locking Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 05/20] osdep: Add qemu_lock_fd and qemu_unlock_fd Fam Zheng
2016-08-08 13:13 ` Fam Zheng [this message]
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 07/20] qemu-io: Add "-L" option for BDRV_O_NO_LOCK Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 08/20] qemu-img: Add "-L" option to sub commands Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 09/20] qemu-img: Update documentation of "-L" option Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 10/20] qemu-nbd: Add "--no-lock/-L" option Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 11/20] block: Don't lock drive-backup target image in none mode Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 12/20] qemu-iotests: 046: Move version detection out from verify_io Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 13/20] qemu-iotests: Wait for QEMU processes before checking image in 091 Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 14/20] qemu-iotests: 030: Disable image locking when checking test image Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 15/20] iotests: 087: Disable image locking in cases where file is shared Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 16/20] iotests: 130: Check image info locklessly Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 17/20] iotests: Disable image locking in 085 Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 18/20] tests: Use null-co:// instead of /dev/null Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 19/20] block: Turn on image locking by default Fam Zheng
2016-08-08 13:13 ` [Qemu-devel] [PATCH v7 20/20] qemu-iotests: Add test case 153 for image locking Fam Zheng
2016-08-08 13:59 ` [Qemu-devel] [PATCH v7 00/20] block: Image locking series for 2.8 no-reply
2016-08-09 4:42 ` Fam Zheng
2016-09-06 1:49 ` Fam Zheng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1470662013-19785-7-git-send-email-famz@redhat.com \
--to=famz@redhat.com \
--cc=armbru@redhat.com \
--cc=berrange@redhat.com \
--cc=den@openvz.org \
--cc=jcody@redhat.com \
--cc=jsnow@redhat.com \
--cc=kwolf@redhat.com \
--cc=mreitz@redhat.com \
--cc=pbonzini@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=rjones@redhat.com \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).