linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: David Herrmann <dh.herrmann@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: Matthew Wilcox <matthew@wil.cx>, Ryan Lortie <desrt@desrt.ca>,
	Hugh Dickins <hughd@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>, Kay Sievers <kay@vrfy.org>,
	dri-devel@lists.freedesktop.org, Daniel Mack <zonque@gmail.com>,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	Karol Lewandowski <k.lewandowsk@samsung.com>,
	Lennart Poettering <lennart@poettering.net>,
	Greg Kroah-Hartman <greg@kroah.com>, Tejun Heo <tj@kernel.org>,
	"Michael Kerrisk \(man-pages\)" <mtk.manpages@gmail.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>
Subject: [PATCH 3/6] shm: add memfd_create() syscall
Date: Wed, 19 Mar 2014 20:06:48 +0100	[thread overview]
Message-ID: <1395256011-2423-4-git-send-email-dh.herrmann@gmail.com> (raw)
In-Reply-To: <1395256011-2423-1-git-send-email-dh.herrmann@gmail.com>

memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
that you can pass to mmap(). It explicitly allows sealing and
avoids any connection to user-visible mount-points. Thus, it's not
subject to quotas on mounted file-systems and can be used like
malloc()'ed memory, except that you get a file-descriptor to it.

memfd_create() does not create a front-FD, but instead returns the raw
shmem file, so calls like ftruncate() can be used. Calls like fstat()
also return proper information and report the file as a regular file.
Sealing is explicitly supported on memfds.

Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
subject to quotas and the like.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
---
 arch/x86/syscalls/syscall_32.tbl |  1 +
 arch/x86/syscalls/syscall_64.tbl |  1 +
 include/linux/syscalls.h         |  1 +
 include/uapi/linux/memfd.h       |  9 ++++++
 kernel/sys_ni.c                  |  1 +
 mm/shmem.c                       | 67 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 80 insertions(+)
 create mode 100644 include/uapi/linux/memfd.h

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 96bc506..c943b8a 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -359,3 +359,4 @@
 350	i386	finit_module		sys_finit_module
 351	i386	sched_setattr		sys_sched_setattr
 352	i386	sched_getattr		sys_sched_getattr
+353	i386	memfd_create		sys_memfd_create
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index a12bddc..e9d56a8 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -322,6 +322,7 @@
 313	common	finit_module		sys_finit_module
 314	common	sched_setattr		sys_sched_setattr
 315	common	sched_getattr		sys_sched_getattr
+316	common	memfd_create		sys_memfd_create
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a747a77..124b838 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -791,6 +791,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
+asmlinkage long sys_memfd_create(const char *uname_ptr, u64 size, u64 flags);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
 asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
new file mode 100644
index 0000000..d74cc89
--- /dev/null
+++ b/include/uapi/linux/memfd.h
@@ -0,0 +1,9 @@
+#ifndef _UAPI_LINUX_MEMFD_H
+#define _UAPI_LINUX_MEMFD_H
+
+#include <linux/types.h>
+
+/* flags for memfd_create(2); MFD_CLOEXEC requests O_CLOEXEC on the new fd */
+#define MFD_CLOEXEC		0x0001
+
+#endif /* _UAPI_LINUX_MEMFD_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052..53e05af 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -193,6 +193,7 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
 
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
diff --git a/mm/shmem.c b/mm/shmem.c
index 44d7f3b..48feb42 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt;
 #include <linux/highmem.h>
 #include <linux/seq_file.h>
 #include <linux/magic.h>
+#include <linux/syscalls.h>
 #include <linux/fcntl.h>
+#include <uapi/linux/memfd.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -3039,6 +3041,71 @@ out4:
 	return error;
 }
 
+/* maximum length of user-supplied memfd names, including terminating zero */
+#define MFD_MAX_NAMELEN 256
+
+SYSCALL_DEFINE3(memfd_create,
+		const char*, uname,
+		u64, size,
+		u64, flags)
+{
+	struct file *shm;
+	char *name;
+	int fd, r;
+	long len;
+
+	if (flags & ~(u64)MFD_CLOEXEC)
+		return -EINVAL;
+	if ((u64)(loff_t)size != size || (loff_t)size < 0)
+		return -EINVAL;
+
+	/* len includes the terminating zero; len <= 0 is treated as a fault */
+	len = strnlen_user(uname, MFD_MAX_NAMELEN);
+	if (len <= 0)
+		return -EFAULT;
+	else if (len > MFD_MAX_NAMELEN)
+		return -EINVAL;
+
+	name = kmalloc(len + 6, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	strcpy(name, "memfd:");
+	if (copy_from_user(&name[6], uname, len)) {
+		r = -EFAULT;
+		goto err_name;
+	}
+
+	/* uname may have changed since strnlen_user(); recheck the zero byte */
+	if (name[len + 6 - 1]) {
+		r = -EFAULT;
+		goto err_name;
+	}
+
+	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+	if (fd < 0) {
+		r = fd;
+		goto err_name;
+	}
+
+	shm = shmem_file_setup(name, size, 0);
+	if (IS_ERR(shm)) {
+		r = PTR_ERR(shm);
+		goto err_fd;
+	}
+	shm->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+
+	fd_install(fd, shm);
+	kfree(name);
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);
+err_name:
+	kfree(name);
+	return r;
+}
+
 #else /* !CONFIG_SHMEM */
 
 /*
-- 
1.9.0

  parent reply	other threads:[~2014-03-19 19:06 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-19 19:06 [PATCH 0/6] File Sealing & memfd_create() David Herrmann
2014-03-19 19:06 ` [PATCH 1/6] fs: fix i_writecount on shmem and friends David Herrmann
2014-03-19 19:06 ` [PATCH 2/6] shm: add sealing API David Herrmann
2014-03-19 19:06 ` David Herrmann [this message]
2014-03-20  8:47   ` [PATCH 3/6] shm: add memfd_create() syscall Cyrill Gorcunov
2014-03-20  9:01     ` Pavel Emelyanov
2014-03-20 11:29       ` David Herrmann
2014-03-20 11:50         ` Pavel Emelyanov
2014-03-20 19:22   ` John Stultz
2014-04-02 13:38   ` Konstantin Khlebnikov
2014-04-02 14:18     ` David Herrmann
2014-04-02 14:52       ` Konstantin Khlebnikov
2014-04-10 19:07     ` Andy Lutomirski
2014-03-19 19:06 ` [PATCH 4/6] selftests: add memfd_create() + sealing tests David Herrmann
2014-03-19 19:06 ` [PATCH man-pages 5/6] fcntl.2: document SHMEM_SET/GET_SEALS commands David Herrmann
2014-03-19 19:06 ` [PATCH man-pages 6/6] memfd_create.2: add memfd_create() man-page David Herrmann
2014-03-20  2:55 ` [PATCH 0/6] File Sealing & memfd_create() Greg Kroah-Hartman
2014-03-20  3:49 ` Linus Torvalds
2014-03-20  8:07   ` David Herrmann
2014-03-20 14:41     ` One Thousand Gnomes
2014-03-20 15:12       ` David Herrmann
2014-03-20 15:26         ` One Thousand Gnomes
2014-03-20 15:32 ` tytso
2014-03-20 15:39   ` One Thousand Gnomes
2014-03-20 15:48   ` David Herrmann
2014-03-20 16:38     ` tytso
2014-04-10 19:14       ` Andy Lutomirski
2014-04-10 20:32         ` Theodore Ts'o
2014-04-10 20:37           ` Andy Lutomirski
2014-04-10 20:49             ` David Herrmann
2014-04-10 21:16               ` Andy Lutomirski
2014-04-10 22:57                 ` David Herrmann
2014-04-10 23:05                   ` Andy Lutomirski
2014-04-10 23:16                     ` David Herrmann
2014-04-10 23:32                       ` Andy Lutomirski
2014-04-20 15:03             ` Pavel Machek
2014-06-17  9:48             ` Florian Weimer
2014-06-17 16:21               ` Andy Lutomirski
2014-04-10 14:45   ` Colin Walters
2014-04-10 19:15     ` Andy Lutomirski
2014-04-10 19:45       ` Colin Walters
2014-04-11  6:09         ` Alex Elsayed
2014-04-08 13:00 ` Florian Weimer
2014-04-09 21:31   ` David Herrmann
2014-04-22  9:10     ` Florian Weimer
2014-04-22 11:55       ` David Herrmann
2014-04-22 12:44         ` Florian Weimer
2014-04-22 12:55           ` David Herrmann
2014-04-10 19:17   ` Andy Lutomirski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1395256011-2423-4-git-send-email-dh.herrmann@gmail.com \
    --to=dh.herrmann@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=desrt@desrt.ca \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=greg@kroah.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=k.lewandowsk@samsung.com \
    --cc=kay@vrfy.org \
    --cc=lennart@poettering.net \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=matthew@wil.cx \
    --cc=mtk.manpages@gmail.com \
    --cc=tj@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=zonque@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).