From: Shaurya Rane <ssrane_b23@ee.vjti.ac.in>
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Cc: manfred@colorfullife.com, viro@zeniv.linux.org.uk,
brauner@kernel.org, chuck.lever@oracle.com, jlayton@kernel.org,
rstoyanov@fedoraproject.org, ptikhomirov@virtuozzo.com,
Shaurya Rane <ssrane_b23@ee.vjti.ac.in>
Subject: [RFC PATCH 3/3] ipc/mqueue: implement fcntl(F_MQ_PEEK) for non-destructive message inspection
Date: Thu, 26 Mar 2026 00:30:25 +0530 [thread overview]
Message-ID: <20260325190025.40312-4-ssrane_b23@ee.vjti.ac.in> (raw)
In-Reply-To: <20260325190025.40312-1-ssrane_b23@ee.vjti.ac.in>
Add support for F_MQ_PEEK, a new fcntl command that reads a POSIX
message queue message by index without removing it from the queue.
Background:
CRIU (Checkpoint/Restore In Userspace) supports live container migration
and process checkpoint/restore. POSIX message queues are a widely-used
IPC mechanism, but CRIU cannot checkpoint processes that hold open mqueue
file descriptors: there is no kernel interface to inspect queued messages
non-destructively. The SysV IPC analogue (MSG_COPY for msgrcv) was
introduced specifically for CRIU in commit 4a674f34ba04 ("ipc: introduce
message queue copy feature"). This patch provides the equivalent for
POSIX mqueues.
Implementation:
The queue stores messages in a red-black tree (info->msg_tree) keyed
by priority, with each tree node holding a FIFO list of messages at
that priority level. mq_peek_at_offset() walks this structure in
receive order (highest priority first, FIFO within priority) to locate
the message at the requested index without modifying any state.
Message payload is copied into a kvmalloc'd kernel buffer under
info->lock using pure memcpy() (no page faults possible). This
correctly handles multi-segment messages by walking the msg_msgseg
chain. The lock is released before copy_to_user() transfers the
kernel buffer to userspace.
A new include/linux/mqueue.h kernel header is added to declare
do_mq_peek() for use from fs/fcntl.c, following the same pattern as
include/linux/memfd.h for memfd_fcntl().
Concurrency:
The snapshot is consistent within the spin_lock() critical section.
Between two F_MQ_PEEK calls the queue may change (messages may be sent
or received). This is documented snapshot semantics, analogous to
/proc entries. CRIU freezes the target process via ptrace before
dumping, so in practice the queue is stable for the entire checkpoint
sequence.
Link: https://github.com/checkpoint-restore/criu/issues/2285
Signed-off-by: Shaurya Rane <ssrane_b23@ee.vjti.ac.in>
---
fs/fcntl.c | 4 ++
include/linux/mqueue.h | 19 ++++++
ipc/mqueue.c | 129 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 152 insertions(+)
create mode 100644 include/linux/mqueue.h
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f93dbca08435..32d0dcc8e544 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -24,6 +24,7 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/memfd.h>
+#include <linux/mqueue.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/rw_hint.h>
@@ -563,6 +564,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
return -EFAULT;
err = fcntl_setdeleg(fd, filp, &deleg);
break;
+ case F_MQ_PEEK:
+ err = do_mq_peek(filp, argp);
+ break;
default:
break;
}
diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
new file mode 100644
index 000000000000..a725fcf90d39
--- /dev/null
+++ b/include/linux/mqueue.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MQUEUE_H
+#define __LINUX_MQUEUE_H
+
+#include <uapi/linux/mqueue.h>
+
+struct file;
+
+#ifdef CONFIG_POSIX_MQUEUE /* real implementation lives in ipc/mqueue.c */
+long do_mq_peek(struct file *filp, struct mq_peek_attr __user *uattr);
+#else /* !CONFIG_POSIX_MQUEUE */
+static inline long do_mq_peek(struct file *filp,
+ struct mq_peek_attr __user *uattr)
+{
+ return -EBADF; /* CONFIG_POSIX_MQUEUE off: stub always fails */
+}
+#endif /* CONFIG_POSIX_MQUEUE */
+
+#endif /* __LINUX_MQUEUE_H */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index bb7c9e5d2b90..5e73864a9657 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -286,6 +286,135 @@ static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
return msg;
}
+/*
+ * mq_peek_at_offset - locate a message by receive-order index.
+ * @info:   queue to inspect; the caller must hold info->lock.
+ * @offset: zero-based position, where 0 is the next message that
+ *          mq_receive() would dequeue.
+ *
+ * Tree nodes are visited from the rightmost one (highest priority) down to
+ * lower priorities via rb_prev(), and the messages of each node are
+ * visited in list (FIFO) order.  Queue state is not modified.
+ *
+ * Returns the message at @offset, or NULL if @offset is negative or
+ * @offset >= mq_curmsgs.
+ */
+static struct msg_msg *mq_peek_at_offset(struct mqueue_inode_info *info,
+					 int offset)
+{
+	struct posix_msg_tree_node *leaf;
+	struct rb_node *node;
+	struct msg_msg *msg;
+	int count = 0;
+
+	/*
+	 * Reject out-of-range requests up front: a miss would otherwise
+	 * walk every queued message while the caller holds the spinlock.
+	 * Assumes mq_curmsgs is kept in step with the tree under
+	 * info->lock.
+	 */
+	if (offset < 0 || offset >= info->attr.mq_curmsgs)
+		return NULL;
+
+	for (node = info->msg_tree_rightmost; node; node = rb_prev(node)) {
+		leaf = rb_entry(node, struct posix_msg_tree_node, rb_node);
+		list_for_each_entry(msg, &leaf->msg_list, m_list) {
+			if (count == offset)
+				return msg;
+			count++;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * mq_msg_copy_to_buf - flatten a message payload into a kernel buffer.
+ * @msg:     message to read from.
+ * @buf:     destination buffer.
+ * @buf_len: capacity of @buf in bytes.
+ *
+ * At most min(@buf_len, msg->m_ts) bytes are copied: first the chunk
+ * stored directly behind the msg_msg header, then the chunks stored behind
+ * each msg_msgseg on the msg->next chain.  Only memcpy() is used, so this
+ * is safe to call under info->lock.
+ *
+ * Returns the number of bytes copied.
+ */
+static size_t mq_msg_copy_to_buf(struct msg_msg *msg, void *buf, size_t buf_len)
+{
+	size_t remaining = min(buf_len, msg->m_ts);
+	size_t chunk = min(remaining, DATALEN_MSG);
+	struct msg_msgseg *seg = msg->next;
+	char *dst = buf;
+
+	/* Head chunk: bytes placed right after the msg_msg header. */
+	memcpy(dst, msg + 1, chunk);
+	dst += chunk;
+	remaining -= chunk;
+
+	/* Tail chunks: one per segment until the request is satisfied. */
+	while (seg && remaining > 0) {
+		chunk = min(remaining, DATALEN_SEG);
+		memcpy(dst, seg + 1, chunk);
+		dst += chunk;
+		remaining -= chunk;
+		seg = seg->next;
+	}
+	return dst - (char *)buf;
+}
+
+/*
+ * do_mq_peek - implement fcntl(F_MQ_PEEK).
+ * @filp:  open file; must be a message queue opened for reading.
+ * @uattr: user pointer to the request; msg_prio is filled in on success.
+ *
+ * Copy the message at receive-order position @attr.offset to the user
+ * buffer @attr.buf without dequeuing it.  Position 0 is the message the
+ * next mq_receive() would return (highest priority, FIFO within a
+ * priority).
+ *
+ * The message is located and copied inside one spin_lock() critical
+ * section, so a single call sees a consistent message.  The queue may
+ * change between two F_MQ_PEEK calls; these are snapshot semantics
+ * analogous to /proc entries.
+ *
+ * A message longer than @attr.buf_len is truncated to @attr.buf_len bytes.
+ * NOTE(review): the truncation is silent, whereas mq_receive() fails with
+ * EMSGSIZE when the buffer is too small -- confirm this is intended.
+ *
+ * Returns the number of bytes copied on success, or:
+ *   -EBADF   @filp is not a message queue or was not opened for reading
+ *   -EFAULT  a user-space copy failed
+ *   -EINVAL  negative offset, zero buf_len or NULL buf
+ *   -ENOMEM  the kernel bounce buffer could not be allocated
+ *   -ENOMSG  offset >= mq_curmsgs
+ */
+long do_mq_peek(struct file *filp, struct mq_peek_attr __user *uattr)
+{
+	struct mqueue_inode_info *info;
+	struct mq_peek_attr attr;
+	struct msg_msg *msg;
+	size_t klen;
+	void *kbuf;
+	long ret = -ENOMSG;
+
+	if (filp->f_op != &mqueue_file_operations ||
+	    !(filp->f_mode & FMODE_READ))
+		return -EBADF;
+
+	if (copy_from_user(&attr, uattr, sizeof(attr)))
+		return -EFAULT;
+
+	if (attr.offset < 0 || !attr.buf_len || !attr.buf)
+		return -EINVAL;
+
+	info = MQUEUE_I(file_inode(filp));
+
+	/*
+	 * The bounce buffer is allocated before the spinlock is taken.
+	 * It never needs to be larger than mq_msgsize, since no message
+	 * can exceed that.
+	 */
+	klen = min_t(size_t, attr.buf_len, info->attr.mq_msgsize);
+	kbuf = kvmalloc(klen, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	/*
+	 * Under the lock only memcpy() runs (no page faults); the transfer
+	 * to userspace happens once the lock has been dropped.
+	 */
+	spin_lock(&info->lock);
+	msg = mq_peek_at_offset(info, attr.offset);
+	if (msg) {
+		ret = mq_msg_copy_to_buf(msg, kbuf, klen);
+		attr.msg_prio = msg->m_type;
+	}
+	spin_unlock(&info->lock);
+
+	/* ret is still -ENOMSG here when no message was found. */
+	if (ret >= 0 &&
+	    (copy_to_user(attr.buf, kbuf, ret) ||
+	     copy_to_user(uattr, &attr, sizeof(attr))))
+		ret = -EFAULT;
+
+	kvfree(kbuf);
+	return ret;
+}
+
static struct inode *mqueue_get_inode(struct super_block *sb,
struct ipc_namespace *ipc_ns, umode_t mode,
struct mq_attr *attr)
--
2.34.1
prev parent reply other threads:[~2026-03-25 19:01 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-25 19:00 [RFC PATCH 0/3] ipc/mqueue: add fcntl(F_MQ_PEEK) for non-destructive message inspection Shaurya Rane
2026-03-25 19:00 ` [RFC PATCH 1/3] mqueue: uapi: add struct mq_peek_attr and F_MQ_PEEK Shaurya Rane
2026-03-25 19:00 ` [RFC PATCH 2/3] msg: move struct msg_msgseg and DATALEN_* to include/linux/msg.h Shaurya Rane
2026-03-25 19:00 ` Shaurya Rane [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260325190025.40312-4-ssrane_b23@ee.vjti.ac.in \
--to=ssrane_b23@ee.vjti.ac.in \
--cc=brauner@kernel.org \
--cc=chuck.lever@oracle.com \
--cc=jlayton@kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=manfred@colorfullife.com \
--cc=ptikhomirov@virtuozzo.com \
--cc=rstoyanov@fedoraproject.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox