* [PATCH RFC] syslog ns proof of concept
@ 2012-11-17 0:25 Serge Hallyn
2012-11-17 3:14 ` Eric W. Biederman
0 siblings, 1 reply; 5+ messages in thread
From: Serge Hallyn @ 2012-11-17 0:25 UTC (permalink / raw)
To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Cc: Daniel Lezcano, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
Stéphane Graber
Introduce a system log namespace. The syslog ns is tied to a user
namespace. You must create a new user namespace before you can create a
new syslog ns. The syslog ns is created through a new command (11) to
the __NR_syslog system call.
Once a task enters a new syslog ns, its "dmesg", "dmesg -c" and
/dev/kmsg actions affect only itself, so that user-created syslog
messages no longer are confusingly combined in the host's syslog.
"printk" itself always goes to the initial syslog_ns, and consoles
belong only to the initial syslog_ns. However printks relating to a
specific network namespace, for instance, can now be targeted to the
syslog ns for the user ns which owns the network ns, aiding in debugging
in a container.
This patch is on top of the user namespace enhanced kernel at
git://kernel.ubuntu.com/serge/quantal-userns. It is good enough to
compile with stock ubuntu kernel options, boot, launch other syslog
namespaces and exercise them. It will need help before it will compile
with funky options like CONFIG_PRINTK=n. This is only being sent out to
get feedback on the general idea.
Comments greatly appreciated.
(See https://wiki.ubuntu.com/LxcSyslogNs for background).
Signed-off-by: Serge Hallyn <serge.hallyn-GeWIH/nMZzLQT0dZR+AlfA@public.gmane.org>
---
fs/proc/kmsg.c | 12 +-
include/linux/printk.h | 1 -
include/linux/syslog.h | 70 +++++-
include/linux/user_namespace.h | 1 +
kernel/printk.c | 530 +++++++++++++++++++++++-----------------
kernel/sysctl.c | 3 +-
kernel/user.c | 3 +
kernel/user_namespace.c | 3 +
8 files changed, 392 insertions(+), 231 deletions(-)
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bd4b5a7..3ba594c 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -13,6 +13,8 @@
#include <linux/proc_fs.h>
#include <linux/fs.h>
#include <linux/syslog.h>
+#include <linux/cred.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -21,12 +23,12 @@ extern wait_queue_head_t log_wait;
static int kmsg_open(struct inode * inode, struct file * file)
{
- return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
+ return do_syslog(file->f_cred->user_ns->syslog_ns, SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
}
static int kmsg_release(struct inode * inode, struct file * file)
{
- (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
+ (void) do_syslog(file->f_cred->user_ns->syslog_ns, SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
return 0;
}
@@ -34,15 +36,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
if ((file->f_flags & O_NONBLOCK) &&
- !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ !do_syslog(file->f_cred->user_ns->syslog_ns, SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
return -EAGAIN;
- return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
+ return do_syslog(file->f_cred->user_ns->syslog_ns, SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
}
static unsigned int kmsg_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &log_wait, wait);
- if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+ if (do_syslog(file->f_cred->user_ns->syslog_ns, SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
return POLLIN | POLLRDNORM;
return 0;
}
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9afc01e..70f8380 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -134,7 +134,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msec);
extern int printk_delay_msec;
-extern int dmesg_restrict;
extern int kptr_restrict;
void log_buf_kexec_setup(void);
diff --git a/include/linux/syslog.h b/include/linux/syslog.h
index 3891139..b653870 100644
--- a/include/linux/syslog.h
+++ b/include/linux/syslog.h
@@ -44,9 +44,77 @@
/* Return size of the log buffer */
#define SYSLOG_ACTION_SIZE_BUFFER 10
+#define SYSLOG_ACTION_NEW_NS 11
+
#define SYSLOG_FROM_CALL 0
#define SYSLOG_FROM_FILE 1
-int do_syslog(int type, char __user *buf, int count, bool from_file);
+/*
+ * Flags carried by a log record. Declared in this header because
+ * struct syslog_ns stores one of them (syslog_prev) next to its read cursor.
+ */
+enum log_flags {
+ LOG_NOCONS = 1, /* already flushed, do not print to console */
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_PREFIX = 4, /* text started with a prefix */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
+
+
+/*
+ * State of one syslog namespace: a ring buffer of log records plus the
+ * read, write and clear cursors into it. A user namespace refers to its
+ * syslog namespace through user_namespace->syslog_ns.
+ */
+struct syslog_ns {
+ unsigned buf_len; /* size of buf, in bytes */
+ char *buf; /* allocated ring buffer */
+
+ /* the next printk record to read by syslog(READ) or /proc/kmsg */
+ u64 syslog_seq;
+ u32 syslog_idx;
+ enum log_flags syslog_prev; /* flags of the record read before it */
+ size_t syslog_partial; /* bytes of that record already handed out */
+
+ /* index and sequence number of the first record stored in the buffer */
+ u64 first_seq;
+ u32 first_idx;
+
+ /* index and sequence number of the next record to store in the buffer */
+ u64 next_seq;
+ u32 next_idx;
+
+ /* the next printk record to read after the last 'clear' command */
+ u64 clear_seq;
+ u32 clear_idx;
+
+ /*
+ * When nonzero, SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER
+ * are subject to the capability check like every other action.
+ */
+ int dmesg_restrict;
+
+ /*
+ * user namespace which owns this ns. It and its ancestors have
+ * privilege over the syslog_ns. The userns pins the syslog_ns, so
+ * syslog_ns can't pin user_ns. It doesn't need to as we'll only
+ * use ->owner when a task in the syslog_ns (which must be in ->owner
+ * or a child thereof, therefore keeping ->owner alive) is calling
+ * do_syslog().
+ */
+ struct user_namespace *owner;
+ struct kref kref; /* reference count; see get_syslog_ns()/put_syslog_ns() */
+};
+
+/*
+ * Take a reference on @ns. A NULL @ns is accepted and handed straight back,
+ * so the result can be assigned without a separate NULL check.
+ */
+static inline struct syslog_ns *get_syslog_ns(struct syslog_ns *ns)
+{
+	if (!ns)
+		return NULL;
+
+	kref_get(&ns->kref);
+	return ns;
+}
+
+/*
+ * kref release callback for a syslog_ns: frees the ring buffer first, then
+ * the namespace structure that embeds @kref.
+ */
+static inline void free_syslog_ns(struct kref *kref)
+{
+	struct syslog_ns *dead;
+
+	dead = container_of(kref, struct syslog_ns, kref);
+	kfree(dead->buf);
+	kfree(dead);
+}
+
+/*
+ * Drop a reference on @ns, with free_syslog_ns() registered as the release
+ * callback. Passing NULL is a no-op.
+ */
+static inline void put_syslog_ns(struct syslog_ns *ns)
+{
+	if (!ns)
+		return;
+
+	kref_put(&ns->kref, free_syslog_ns);
+}
+int do_syslog(struct syslog_ns *, int type, char __user *buf, int count, bool from_file);
+
+extern struct syslog_ns init_syslog_ns;
#endif /* _LINUX_SYSLOG_H */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index b9bd2e6..8aebb8b 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -26,6 +26,7 @@ struct user_namespace {
kuid_t owner;
kgid_t group;
unsigned int proc_inum;
+ struct syslog_ns *syslog_ns;
};
extern struct user_namespace init_user_ns;
diff --git a/kernel/printk.c b/kernel/printk.c
index 2d607f4..d5fc682 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,9 @@
#include <linux/notifier.h>
#include <linux/rculist.h>
#include <linux/poll.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
@@ -193,13 +196,6 @@ static int console_may_schedule;
* separated by ',', and find the message after the ';' character.
*/
-enum log_flags {
- LOG_NOCONS = 1, /* already flushed, do not print to console */
- LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_PREFIX = 4, /* text started with a prefix */
- LOG_CONT = 8, /* text is a fragment of a continuation line */
-};
-
struct log {
u64 ts_nsec; /* timestamp in nanoseconds */
u16 len; /* length of entire record */
@@ -217,28 +213,11 @@ struct log {
static DEFINE_RAW_SPINLOCK(logbuf_lock);
#ifdef CONFIG_PRINTK
-/* the next printk record to read by syslog(READ) or /proc/kmsg */
-static u64 syslog_seq;
-static u32 syslog_idx;
-static enum log_flags syslog_prev;
-static size_t syslog_partial;
-
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
/* the next printk record to write to the console */
static u64 console_seq;
static u32 console_idx;
static enum log_flags console_prev;
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
-static u32 clear_idx;
#define PREFIX_MAX 32
#define LOG_LINE_MAX 1024 - PREFIX_MAX
@@ -249,12 +228,29 @@ static u32 clear_idx;
#else
#define LOG_ALIGN __alignof__(struct log)
#endif
+
+/* log_buf for init_syslog_ns */
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
-static char *log_buf = __log_buf;
-static u32 log_buf_len = __LOG_BUF_LEN;
-/* cpu currently holding logbuf_lock */
+/*
+ * The initial syslog namespace: backed by the static __log_buf, owned by
+ * init_user_ns. dmesg_restrict starts out as the build-time
+ * CONFIG_SECURITY_DMESG_RESTRICT choice.
+ */
+struct syslog_ns init_syslog_ns = {
+ .buf = __log_buf,
+ .buf_len = __LOG_BUF_LEN,
+ .owner = &init_user_ns,
+ .kref = {
+ /* starts at 2 so the static instance is never released */
+ .refcount = ATOMIC_INIT(2), // one for init_user_ns, one for cont
+ },
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+ .dmesg_restrict = 1,
+#else
+ .dmesg_restrict = 0,
+#endif
+};
+
+#define LOG_BUF_MASK(ns) ((ns)->buf_len-1)
+#define LOG_BUF(ns, idx) ((ns)->buf[(idx) & LOG_BUF_MASK(ns)])
+
+/* cpu currently holding lock */
static volatile unsigned int logbuf_cpu = UINT_MAX;
/* human readable text of the record */
@@ -270,23 +266,23 @@ static char *log_dict(const struct log *msg)
}
/* get record by index; idx must point to valid msg */
-static struct log *log_from_idx(u32 idx)
+static struct log *log_from_idx(struct syslog_ns *ns, u32 idx)
{
- struct log *msg = (struct log *)(log_buf + idx);
+ struct log *msg = (struct log *)(ns->buf + idx);
/*
* A length == 0 record is the end of buffer marker. Wrap around and
* read the message at the start of the buffer.
*/
if (!msg->len)
- return (struct log *)log_buf;
+ return (struct log *)ns->buf;
return msg;
}
/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
+static u32 log_next(struct syslog_ns *ns, u32 idx)
{
- struct log *msg = (struct log *)(log_buf + idx);
+ struct log *msg = (struct log *)(ns->buf + idx);
/* length == 0 indicates the end of the buffer; wrap */
/*
@@ -295,14 +291,14 @@ static u32 log_next(u32 idx)
* return the one after that.
*/
if (!msg->len) {
- msg = (struct log *)log_buf;
+ msg = (struct log *)ns->buf;
return msg->len;
}
return idx + msg->len;
}
/* insert record into the buffer, discard old ones, update heads */
-static void log_store(int facility, int level,
+static void log_store(struct syslog_ns *ns, int facility, int level,
enum log_flags flags, u64 ts_nsec,
const char *dict, u16 dict_len,
const char *text, u16 text_len)
@@ -315,34 +311,34 @@ static void log_store(int facility, int level,
pad_len = (-size) & (LOG_ALIGN - 1);
size += pad_len;
- while (log_first_seq < log_next_seq) {
+ while (ns->first_seq < ns->next_seq) {
u32 free;
- if (log_next_idx > log_first_idx)
- free = max(log_buf_len - log_next_idx, log_first_idx);
+ if (ns->next_idx > ns->first_idx)
+ free = max(ns->buf_len - ns->next_idx, ns->first_idx);
else
- free = log_first_idx - log_next_idx;
+ free = ns->first_idx - ns->next_idx;
if (free > size + sizeof(struct log))
break;
/* drop old messages until we have enough contiuous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
+ ns->first_idx = log_next(ns, ns->first_idx);
+ ns->first_seq++;
}
- if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
+ if (ns->next_idx + size + sizeof(struct log) >= ns->buf_len) {
/*
* This message + an additional empty header does not fit
* at the end of the buffer. Add an empty header with len == 0
* to signify a wrap around.
*/
- memset(log_buf + log_next_idx, 0, sizeof(struct log));
- log_next_idx = 0;
+ memset(ns->buf + ns->next_idx, 0, sizeof(struct log));
+ ns->next_idx = 0;
}
/* fill message */
- msg = (struct log *)(log_buf + log_next_idx);
+ msg = (struct log *)(ns->buf + ns->next_idx);
memcpy(log_text(msg), text, text_len);
msg->text_len = text_len;
memcpy(log_dict(msg), dict, dict_len);
@@ -358,8 +354,8 @@ static void log_store(int facility, int level,
msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
/* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
+ ns->next_idx += msg->len;
+ ns->next_seq++;
}
/* /dev/kmsg - userspace message inject/listen interface */
@@ -437,6 +433,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
char cont = '-';
size_t len;
ssize_t ret;
+ struct syslog_ns *ns = file->f_cred->user_ns->syslog_ns;
if (!user)
return -EBADF;
@@ -445,7 +442,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
if (ret)
return ret;
raw_spin_lock_irq(&logbuf_lock);
- while (user->seq == log_next_seq) {
+ while (user->seq == ns->next_seq) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
raw_spin_unlock_irq(&logbuf_lock);
@@ -454,22 +451,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
raw_spin_unlock_irq(&logbuf_lock);
ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
+ user->seq != ns->next_seq);
if (ret)
goto out;
raw_spin_lock_irq(&logbuf_lock);
}
- if (user->seq < log_first_seq) {
+ if (user->seq < ns->first_seq) {
/* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->idx = ns->first_idx;
+ user->seq = ns->first_seq;
ret = -EPIPE;
raw_spin_unlock_irq(&logbuf_lock);
goto out;
}
- msg = log_from_idx(user->idx);
+ msg = log_from_idx(ns, user->idx);
ts_usec = msg->ts_nsec;
do_div(ts_usec, 1000);
@@ -530,7 +527,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
user->buf[len++] = '\n';
}
- user->idx = log_next(user->idx);
+ user->idx = log_next(ns, user->idx);
user->seq++;
raw_spin_unlock_irq(&logbuf_lock);
@@ -553,6 +550,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
+ struct syslog_ns *ns = file->f_cred->user_ns->syslog_ns;
if (!user)
return -EBADF;
@@ -563,8 +561,8 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->idx = ns->first_idx;
+ user->seq = ns->first_seq;
break;
case SEEK_DATA:
/*
@@ -572,13 +570,13 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
- user->seq = clear_seq;
+ user->idx = ns->clear_idx;
+ user->seq = ns->clear_seq;
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ user->idx = ns->next_idx;
+ user->seq = ns->next_seq;
break;
default:
ret = -EINVAL;
@@ -591,6 +589,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
int ret = 0;
+ struct syslog_ns *ns = file->f_cred->user_ns->syslog_ns;
if (!user)
return POLLERR|POLLNVAL;
@@ -598,9 +597,9 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
raw_spin_lock_irq(&logbuf_lock);
- if (user->seq < log_next_seq) {
+ if (user->seq < ns->next_seq) {
/* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
+ if (user->seq < ns->first_seq)
ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
ret = POLLIN|POLLRDNORM;
}
@@ -613,6 +612,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
{
struct devkmsg_user *user;
int err;
+ struct syslog_ns *ns = file->f_cred->user_ns->syslog_ns;
/* write-only does not need any file context */
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
@@ -629,8 +629,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
raw_spin_lock_irq(&logbuf_lock);
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->idx = ns->first_idx;
+ user->seq = ns->first_seq;
raw_spin_unlock_irq(&logbuf_lock);
file->private_data = user;
@@ -669,10 +669,12 @@ const struct file_operations kmsg_fops = {
*/
void log_buf_kexec_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
+ struct syslog_ns *ns = &init_syslog_ns;
+
+ VMCOREINFO_SYMBOL(logbuf_lock);
+ VMCOREINFO_SYMBOL(ns->buf_len);
+ VMCOREINFO_SYMBOL(ns->first_idx);
+ VMCOREINFO_SYMBOL(ns->next_idx);
/*
* Export struct log size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
@@ -695,7 +697,7 @@ static int __init log_buf_len_setup(char *str)
if (size)
size = roundup_pow_of_two(size);
- if (size > log_buf_len)
+ if (size > __LOG_BUF_LEN)
new_log_buf_len = size;
return 0;
@@ -729,14 +731,14 @@ void __init setup_log_buf(int early)
}
raw_spin_lock_irqsave(&logbuf_lock, flags);
- log_buf_len = new_log_buf_len;
- log_buf = new_log_buf;
+ init_syslog_ns.buf_len = new_log_buf_len;
+ init_syslog_ns.buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_next_idx;
- memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+ free = __LOG_BUF_LEN - init_syslog_ns.next_idx;
+ /* copy the early records into the new buffer, not over the .buf pointer field */
+ memcpy(init_syslog_ns.buf, __log_buf, __LOG_BUF_LEN);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("log_buf_len: %d\n", init_syslog_ns.buf_len);
pr_info("early log buf free: %d(%d%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
}
@@ -794,21 +796,15 @@ static inline void boot_delay_msec(void)
}
#endif
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
-
-static int syslog_action_restricted(int type)
+static int syslog_action_restricted(struct syslog_ns *ns, int type)
{
- if (dmesg_restrict)
+ if (ns->dmesg_restrict)
return 1;
/* Unless restricted, we allow "read all" and "get buffer size" for everybody */
return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
}
-static int check_syslog_permissions(int type, bool from_file)
+static int check_syslog_permissions(struct syslog_ns *ns, int type, bool from_file)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -817,11 +813,22 @@ static int check_syslog_permissions(int type, bool from_file)
if (from_file && type != SYSLOG_ACTION_OPEN)
return 0;
- if (syslog_action_restricted(type)) {
- if (capable(CAP_SYSLOG))
+ /*
+ * we need to check for priv against init_user_ns for
+ * SYSLOG_ACTION_CONSOLE.
+ */
+ if (type == SYSLOG_ACTION_CONSOLE_OFF || type == SYSLOG_ACTION_CONSOLE_ON
+ || type == SYSLOG_ACTION_CONSOLE_LEVEL)
+ ns = &init_syslog_ns;
+
+ if (type == SYSLOG_ACTION_NEW_NS) // will be at create_syslog_ns()
+ return 0;
+
+ if (syslog_action_restricted(ns, type)) {
+ if (ns_capable(ns->owner, CAP_SYSLOG))
return 0;
/* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
- if (capable(CAP_SYS_ADMIN)) {
+ if (ns_capable(ns->owner, CAP_SYS_ADMIN)) {
printk_once(KERN_WARNING "%s (%d): "
"Attempt to access syslog with CAP_SYS_ADMIN "
"but no CAP_SYSLOG (deprecated).\n",
@@ -937,7 +944,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
return len;
}
-static int syslog_print(char __user *buf, int size)
+static int syslog_print(struct syslog_ns *ns, char __user *buf, int size)
{
char *text;
struct log *msg;
@@ -952,33 +959,33 @@ static int syslog_print(char __user *buf, int size)
size_t skip;
raw_spin_lock_irq(&logbuf_lock);
- if (syslog_seq < log_first_seq) {
+ if (ns->syslog_seq < ns->first_seq) {
/* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_prev = 0;
- syslog_partial = 0;
+ ns->syslog_seq = ns->first_seq;
+ ns->syslog_idx = ns->first_idx;
+ ns->syslog_prev = 0;
+ ns->syslog_partial = 0;
}
- if (syslog_seq == log_next_seq) {
+ if (ns->syslog_seq == ns->next_seq) {
raw_spin_unlock_irq(&logbuf_lock);
break;
}
- skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, syslog_prev, true, text,
+ skip = ns->syslog_partial;
+ msg = log_from_idx(ns, ns->syslog_idx);
+ n = msg_print_text(msg, ns->syslog_prev, true, text,
LOG_LINE_MAX + PREFIX_MAX);
- if (n - syslog_partial <= size) {
+ if (n - ns->syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
- syslog_prev = msg->flags;
- n -= syslog_partial;
- syslog_partial = 0;
+ ns->syslog_idx = log_next(ns, ns->syslog_idx);
+ ns->syslog_seq++;
+ ns->syslog_prev = msg->flags;
+ n -= ns->syslog_partial;
+ ns->syslog_partial = 0;
} else if (!len){
/* partial read(), remember position */
n = size;
- syslog_partial += n;
+ ns->syslog_partial += n;
} else
n = 0;
raw_spin_unlock_irq(&logbuf_lock);
@@ -1001,7 +1008,7 @@ static int syslog_print(char __user *buf, int size)
return len;
}
-static int syslog_print_all(char __user *buf, int size, bool clear)
+static int syslog_print_all(struct syslog_ns *ns, char __user *buf, int size, bool clear)
{
char *text;
int len = 0;
@@ -1017,48 +1024,48 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u32 idx;
enum log_flags prev;
- if (clear_seq < log_first_seq) {
+ if (ns->clear_seq < ns->first_seq) {
/* messages are gone, move to first available one */
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
+ ns->clear_seq = ns->first_seq;
+ ns->clear_idx = ns->first_idx;
}
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- seq = clear_seq;
- idx = clear_idx;
+ seq = ns->clear_seq;
+ idx = ns->clear_idx;
prev = 0;
- while (seq < log_next_seq) {
- struct log *msg = log_from_idx(idx);
+ while (seq < ns->next_seq) {
+ struct log *msg = log_from_idx(ns, idx);
len += msg_print_text(msg, prev, true, NULL, 0);
prev = msg->flags;
- idx = log_next(idx);
+ idx = log_next(ns, idx);
seq++;
}
/* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
+ seq = ns->clear_seq;
+ idx = ns->clear_idx;
prev = 0;
- while (len > size && seq < log_next_seq) {
- struct log *msg = log_from_idx(idx);
+ while (len > size && seq < ns->next_seq) {
+ struct log *msg = log_from_idx(ns, idx);
len -= msg_print_text(msg, prev, true, NULL, 0);
prev = msg->flags;
- idx = log_next(idx);
+ idx = log_next(ns, idx);
seq++;
}
/* last message fitting into this dump */
- next_seq = log_next_seq;
+ next_seq = ns->next_seq;
len = 0;
prev = 0;
while (len >= 0 && seq < next_seq) {
- struct log *msg = log_from_idx(idx);
+ struct log *msg = log_from_idx(ns, idx);
int textlen;
textlen = msg_print_text(msg, prev, true, text,
@@ -1067,7 +1074,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
len = textlen;
break;
}
- idx = log_next(idx);
+ idx = log_next(ns, idx);
seq++;
prev = msg->flags;
@@ -1078,18 +1085,18 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
len += textlen;
raw_spin_lock_irq(&logbuf_lock);
- if (seq < log_first_seq) {
+ if (seq < ns->first_seq) {
/* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
+ seq = ns->first_seq;
+ idx = ns->first_idx;
prev = 0;
}
}
}
if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
+ ns->clear_seq = ns->next_seq;
+ ns->clear_idx = ns->next_idx;
}
raw_spin_unlock_irq(&logbuf_lock);
@@ -1097,13 +1104,68 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return len;
}
-int do_syslog(int type, char __user *buf, int len, bool from_file)
+/*
+ * Release callback taking the kref embedded in a syslog_ns: frees the ring
+ * buffer and then the namespace itself.
+ *
+ * NOTE(review): this has the same body as free_syslog_ns() in
+ * <linux/syslog.h>, which is the callback put_syslog_ns() passes to
+ * kref_put() -- confirm whether this copy is still needed.
+ */
+void free_syslog(struct kref *kref)
+{
+	struct syslog_ns *dead;
+
+	dead = container_of(kref, struct syslog_ns, kref);
+	kfree(dead->buf);
+	kfree(dead);
+}
+
+static DEFINE_SPINLOCK(syslog_ns_lock);
+
+/*
+ * create_syslog_ns - give the caller's user namespace a private syslog ns
+ *
+ * Backs SYSLOG_ACTION_NEW_NS. The caller must be in a non-initial user
+ * namespace that still shares its syslog ns with its parent user namespace,
+ * and must hold CAP_SYSLOG in that user namespace. The new ns starts with a
+ * zeroed ring buffer of __LOG_BUF_LEN bytes and inherits dmesg_restrict from
+ * the ns it replaces.
+ *
+ * Returns 0 on success, -EINVAL if the user namespace is the initial one or
+ * already has its own syslog ns, -EPERM without CAP_SYSLOG, -ENOMEM if an
+ * allocation fails.
+ */
+static int create_syslog_ns(void)
+{
+	struct user_namespace *userns = current_user_ns();
+	struct syslog_ns *oldns, *newns;
+	int err;
+
+	/*
+	 * A syslog ns belongs to a user ns, so it can only be unshared while
+	 * the user ns still shares its parent's syslog ns. This unlocked test
+	 * is only an early exit; it is repeated under syslog_ns_lock below.
+	 */
+	if (userns == &init_user_ns ||
+	    userns->syslog_ns != userns->parent->syslog_ns)
+		return -EINVAL;
+
+	if (!ns_capable(userns, CAP_SYSLOG))
+		return -EPERM;
+
+	/*
+	 * Allocate before taking the spinlock so the allocations may sleep;
+	 * GFP_ATOMIC is a poor fit for a buffer of __LOG_BUF_LEN bytes.
+	 */
+	newns = kzalloc(sizeof(*newns), GFP_KERNEL);
+	if (!newns)
+		return -ENOMEM;
+	newns->buf_len = __LOG_BUF_LEN; // should be smaller? XXX
+	newns->buf = kzalloc(newns->buf_len, GFP_KERNEL);
+	if (!newns->buf) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+	kref_init(&newns->kref);
+	/*
+	 * Plain pointer, no get_user_ns(): the user ns pins its syslog ns, so
+	 * a reference in the other direction would form a cycle, and the
+	 * release callback never drops a user ns reference.
+	 */
+	newns->owner = userns;
+
+	spin_lock(&syslog_ns_lock);
+	oldns = userns->syslog_ns;
+	if (oldns != userns->parent->syslog_ns) {
+		/* lost a race: another task already unshared this user ns */
+		spin_unlock(&syslog_ns_lock);
+		err = -EINVAL;
+		goto out_free;
+	}
+	newns->dmesg_restrict = oldns->dmesg_restrict;
+	userns->syslog_ns = newns;
+	spin_unlock(&syslog_ns_lock);
+
+	/*
+	 * Drop the reference this user ns held on the syslog ns it shared
+	 * with its parent (presumably taken when the user ns was created).
+	 */
+	put_syslog_ns(oldns);
+	return 0;
+
+out_free:
+	kfree(newns->buf);
+	kfree(newns);
+	return err;
+}
+
+int do_syslog(struct syslog_ns *ns, int type, char __user *buf, int len, bool from_file)
{
bool clear = false;
static int saved_console_loglevel = -1;
int error;
- error = check_syslog_permissions(type, from_file);
+ error = check_syslog_permissions(ns, type, from_file);
if (error)
goto out;
@@ -1128,10 +1190,10 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
goto out;
}
error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
+ ns->syslog_seq != ns->next_seq);
if (error)
goto out;
- error = syslog_print(buf, len);
+ error = syslog_print(ns, buf, len);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
@@ -1149,11 +1211,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
error = -EFAULT;
goto out;
}
- error = syslog_print_all(buf, len, clear);
+ error = syslog_print_all(ns, buf, len, clear);
break;
/* Clear ring buffer */
case SYSLOG_ACTION_CLEAR:
- syslog_print_all(NULL, 0, true);
+ syslog_print_all(ns, NULL, 0, true);
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1183,12 +1245,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
raw_spin_lock_irq(&logbuf_lock);
- if (syslog_seq < log_first_seq) {
+ if (ns->syslog_seq < ns->first_seq) {
/* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_prev = 0;
- syslog_partial = 0;
+ ns->syslog_seq = ns->first_seq;
+ ns->syslog_idx = ns->first_idx;
+ ns->syslog_prev = 0;
+ ns->syslog_partial = 0;
}
if (from_file) {
/*
@@ -1196,28 +1258,31 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_idx - syslog_idx;
+ error = ns->next_idx - ns->syslog_idx;
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
- enum log_flags prev = syslog_prev;
+ u64 seq = ns->syslog_seq;
+ u32 idx = ns->syslog_idx;
+ enum log_flags prev = ns->syslog_prev;
error = 0;
- while (seq < log_next_seq) {
- struct log *msg = log_from_idx(idx);
+ while (seq < ns->next_seq) {
+ struct log *msg = log_from_idx(ns, idx);
error += msg_print_text(msg, prev, true, NULL, 0);
- idx = log_next(idx);
+ idx = log_next(ns, idx);
seq++;
prev = msg->flags;
}
- error -= syslog_partial;
+ error -= ns->syslog_partial;
}
raw_spin_unlock_irq(&logbuf_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
- error = log_buf_len;
+ error = ns->buf_len;
+ break;
+ case SYSLOG_ACTION_NEW_NS:
+ error = create_syslog_ns();
break;
default:
error = -EINVAL;
@@ -1229,7 +1294,7 @@ out:
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
- return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
+ return do_syslog(current_user_ns()->syslog_ns, type, buf, len, SYSLOG_FROM_CALL);
}
static bool __read_mostly ignore_loglevel;
@@ -1375,30 +1440,27 @@ static inline void printk_delay(void)
}
}
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
static struct cont {
char buf[LOG_LINE_MAX];
- size_t len; /* length == 0 means unused buffer */
- size_t cons; /* bytes written to console */
- struct task_struct *owner; /* task of first print*/
- u64 ts_nsec; /* time of first print */
- u8 level; /* log level of first message */
- u8 facility; /* log level of first message */
- enum log_flags flags; /* prefix, newline flags */
- bool flushed:1; /* buffer sealed and committed */
-} cont;
+ size_t len; /* length == 0 means unused buffer */
+ size_t cons; /* bytes written to console */
+ struct task_struct *owner; /* task of first print*/
+ u64 ts_nsec; /* time of first print */
+ u8 level; /* log level of first message */
+ u8 facility; /* log level of first message */
+ enum log_flags flags; /* prefix, newline flags */
+ bool flushed:1; /* buffer sealed and committed */
+ struct syslog_ns *ns; /* namespace this msg belongs to */
+} cont = {
+ .ns = &init_syslog_ns,
+};
-static void cont_flush(enum log_flags flags)
+static void cont_flush(struct syslog_ns *ns, enum log_flags flags)
{
if (cont.flushed)
- return;
+ goto out;
if (cont.len == 0)
- return;
+ goto out;
if (cont.cons) {
/*
@@ -1406,7 +1468,7 @@ static void cont_flush(enum log_flags flags)
* console; wait for the console to pick up the rest of the
* line. LOG_NOCONS suppresses a duplicated output.
*/
- log_store(cont.facility, cont.level, flags | LOG_NOCONS,
+ log_store(cont.ns, cont.facility, cont.level, flags | LOG_NOCONS,
cont.ts_nsec, NULL, 0, cont.buf, cont.len);
cont.flags = flags;
cont.flushed = true;
@@ -1415,22 +1477,30 @@ static void cont_flush(enum log_flags flags)
* If no fragment of this line ever reached the console,
* just submit it to the store and free the buffer.
*/
- log_store(cont.facility, cont.level, flags, 0,
+ log_store(cont.ns, cont.facility, cont.level, flags, 0,
NULL, 0, cont.buf, cont.len);
cont.len = 0;
}
+
+out:
+ if (cont.ns != ns) {
+ put_syslog_ns(cont.ns);
+ cont.ns = get_syslog_ns(ns);
+ }
}
-static bool cont_add(int facility, int level, const char *text, size_t len)
+static bool cont_add(struct syslog_ns *ns, int facility, int level, const char *text, size_t len)
{
if (cont.len && cont.flushed)
return false;
if (cont.len + len > sizeof(cont.buf)) {
/* the line gets too long, split it up in separate records */
- cont_flush(LOG_CONT);
+ cont_flush(ns, LOG_CONT);
return false;
- }
+ } else if (cont.len && cont.ns != ns)
+ cont_flush(ns, 0);
+
if (!cont.len) {
cont.facility = facility;
@@ -1440,13 +1510,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
cont.flags = 0;
cont.cons = 0;
cont.flushed = false;
+ if (cont.ns != ns) {
+ put_syslog_ns(cont.ns);
+ cont.ns = get_syslog_ns(ns);
+ }
}
memcpy(cont.buf + cont.len, text, len);
cont.len += len;
if (cont.len > (sizeof(cont.buf) * 80) / 100)
- cont_flush(LOG_CONT);
+ cont_flush(ns, LOG_CONT);
return true;
}
@@ -1479,7 +1553,7 @@ static size_t cont_print_text(char *text, size_t size)
return textlen;
}
-asmlinkage int vprintk_emit(int facility, int level,
+int nsvprintk_emit(struct syslog_ns *ns, int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
@@ -1528,7 +1602,7 @@ asmlinkage int vprintk_emit(int facility, int level,
recursion_bug = 0;
printed_len += strlen(recursion_msg);
/* emit KERN_CRIT message */
- log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ log_store(ns, 0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
NULL, 0, recursion_msg, printed_len);
}
@@ -1576,11 +1650,11 @@ asmlinkage int vprintk_emit(int facility, int level,
* or another task also prints continuation lines.
*/
if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
- cont_flush(LOG_NEWLINE);
+ cont_flush(ns, LOG_NEWLINE);
/* buffer line if possible, otherwise store it right away */
- if (!cont_add(facility, level, text, text_len))
- log_store(facility, level, lflags | LOG_CONT, 0,
+ if (!cont_add(ns, facility, level, text, text_len))
+ log_store(ns, facility, level, lflags | LOG_CONT, 0,
dict, dictlen, text, text_len);
} else {
bool stored = false;
@@ -1593,12 +1667,12 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (cont.len && cont.owner == current) {
if (!(lflags & LOG_PREFIX))
- stored = cont_add(facility, level, text, text_len);
- cont_flush(LOG_NEWLINE);
+ stored = cont_add(ns, facility, level, text, text_len);
+ cont_flush(ns, LOG_NEWLINE);
}
if (!stored)
- log_store(facility, level, lflags, 0,
+ log_store(ns, facility, level, lflags, 0,
dict, dictlen, text, text_len);
}
printed_len += text_len;
@@ -1620,6 +1694,14 @@ out_restore_irqs:
return printed_len;
}
+
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
+{
+ return nsvprintk_emit(&init_syslog_ns, facility, level, dict, dictlen,
+ fmt, args);
+}
EXPORT_SYMBOL(vprintk_emit);
asmlinkage int vprintk(const char *fmt, va_list args)
@@ -1690,23 +1772,18 @@ EXPORT_SYMBOL(printk);
#define LOG_LINE_MAX 0
#define PREFIX_MAX 0
#define LOG_LINE_MAX 0
-static u64 syslog_seq;
-static u32 syslog_idx;
static u64 console_seq;
static u32 console_idx;
-static enum log_flags syslog_prev;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
static enum log_flags console_prev;
static struct cont {
size_t len;
size_t cons;
u8 level;
bool flushed:1;
+ struct syslog_ns *ns;
} cont;
-static struct log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
+static struct log *log_from_idx(struct syslog_ns *ns, u32 idx) { return NULL; }
+static u32 log_next(struct syslog_ns *ns, u32 idx) { return 0; }
static void call_console_drivers(int level, const char *text, size_t len) {}
static size_t msg_print_text(const struct log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
@@ -1988,7 +2065,7 @@ static void console_cont_flush(char *text, size_t size)
raw_spin_lock_irqsave(&logbuf_lock, flags);
- if (!cont.len)
+ if (!cont.len || cont.ns != &init_syslog_ns)
goto out;
/*
@@ -1996,7 +2073,7 @@ static void console_cont_flush(char *text, size_t size)
* busy. The earlier ones need to be printed before this one, we
* did not flush any fragment so far, so just let it queue up.
*/
- if (console_seq < log_next_seq && !cont.cons)
+ if (console_seq < init_syslog_ns.next_seq && !cont.cons)
goto out;
len = cont_print_text(text, size);
@@ -2031,6 +2108,7 @@ void console_unlock(void)
unsigned long flags;
bool wake_klogd = false;
bool retry;
+ struct syslog_ns *ns = &init_syslog_ns;
if (console_suspended) {
up(&console_sem);
@@ -2048,28 +2126,28 @@ again:
int level;
raw_spin_lock_irqsave(&logbuf_lock, flags);
- if (seen_seq != log_next_seq) {
+ if (seen_seq != ns->next_seq) {
wake_klogd = true;
- seen_seq = log_next_seq;
+ seen_seq = ns->next_seq;
}
- if (console_seq < log_first_seq) {
+ if (console_seq < ns->first_seq) {
/* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
+ console_seq = ns->first_seq;
+ console_idx = ns->first_idx;
console_prev = 0;
}
skip:
- if (console_seq == log_next_seq)
+ if (console_seq == ns->next_seq)
break;
- msg = log_from_idx(console_idx);
+ msg = log_from_idx(ns, console_idx);
if (msg->flags & LOG_NOCONS) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it.
*/
- console_idx = log_next(console_idx);
+ console_idx = log_next(ns, console_idx);
console_seq++;
/*
* We will get here again when we register a new
@@ -2084,7 +2162,7 @@ skip:
level = msg->level;
len = msg_print_text(msg, console_prev, false,
text, sizeof(text));
- console_idx = log_next(console_idx);
+ console_idx = log_next(ns, console_idx);
console_seq++;
console_prev = msg->flags;
raw_spin_unlock(&logbuf_lock);
@@ -2111,7 +2189,7 @@ skip:
* flush, no worries.
*/
raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
+ retry = console_seq != ns->next_seq;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
if (retry && console_trylock())
@@ -2237,6 +2315,7 @@ void register_console(struct console *newcon)
int i;
unsigned long flags;
struct console *bcon = NULL;
+ struct syslog_ns *ns = &init_syslog_ns;
/*
* before we register a new CON_BOOT console, make sure we don't
@@ -2347,9 +2426,9 @@ void register_console(struct console *newcon)
* for us.
*/
raw_spin_lock_irqsave(&logbuf_lock, flags);
- console_seq = syslog_seq;
- console_idx = syslog_idx;
- console_prev = syslog_prev;
+ console_seq = ns->syslog_seq;
+ console_idx = ns->syslog_idx;
+ console_prev = ns->syslog_prev;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
@@ -2573,6 +2652,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
unsigned long flags;
+ struct syslog_ns *ns = &init_syslog_ns;
if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
return;
@@ -2586,10 +2666,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
dumper->active = true;
raw_spin_lock_irqsave(&logbuf_lock, flags);
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ dumper->cur_seq = ns->clear_seq;
+ dumper->cur_idx = ns->clear_idx;
+ dumper->next_seq = ns->next_seq;
+ dumper->next_idx = ns->next_idx;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/* invoke dumper which will iterate over records */
@@ -2626,24 +2706,25 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
struct log *msg;
size_t l = 0;
bool ret = false;
+ struct syslog_ns *ns = &init_syslog_ns;
if (!dumper->active)
goto out;
- if (dumper->cur_seq < log_first_seq) {
+ if (dumper->cur_seq < ns->first_seq) {
/* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ dumper->cur_seq = ns->first_seq;
+ dumper->cur_idx = ns->first_idx;
}
/* last entry */
- if (dumper->cur_seq >= log_next_seq)
+ if (dumper->cur_seq >= ns->next_seq)
goto out;
- msg = log_from_idx(dumper->cur_idx);
+ msg = log_from_idx(ns, dumper->cur_idx);
l = msg_print_text(msg, 0, syslog, line, size);
- dumper->cur_idx = log_next(dumper->cur_idx);
+ dumper->cur_idx = log_next(ns, dumper->cur_idx);
dumper->cur_seq++;
ret = true;
out:
@@ -2713,15 +2794,16 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
enum log_flags prev;
size_t l = 0;
bool ret = false;
+ struct syslog_ns *ns = &init_syslog_ns;
if (!dumper->active)
goto out;
raw_spin_lock_irqsave(&logbuf_lock, flags);
- if (dumper->cur_seq < log_first_seq) {
+ if (dumper->cur_seq < ns->first_seq) {
/* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ dumper->cur_seq = ns->first_seq;
+ dumper->cur_idx = ns->first_idx;
}
/* last entry */
@@ -2735,10 +2817,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
idx = dumper->cur_idx;
prev = 0;
while (seq < dumper->next_seq) {
- struct log *msg = log_from_idx(idx);
+ struct log *msg = log_from_idx(ns, idx);
l += msg_print_text(msg, prev, true, NULL, 0);
- idx = log_next(idx);
+ idx = log_next(ns, idx);
seq++;
prev = msg->flags;
}
@@ -2748,10 +2830,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
idx = dumper->cur_idx;
prev = 0;
while (l > size && seq < dumper->next_seq) {
- struct log *msg = log_from_idx(idx);
+ struct log *msg = log_from_idx(ns, idx);
l -= msg_print_text(msg, prev, true, NULL, 0);
- idx = log_next(idx);
+ idx = log_next(ns, idx);
seq++;
prev = msg->flags;
}
@@ -2763,10 +2845,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
l = 0;
prev = 0;
while (seq < dumper->next_seq) {
- struct log *msg = log_from_idx(idx);
+ struct log *msg = log_from_idx(ns, idx);
l += msg_print_text(msg, prev, syslog, buf + l, size - l);
- idx = log_next(idx);
+ idx = log_next(ns, idx);
seq++;
prev = msg->flags;
}
@@ -2794,10 +2876,12 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ struct syslog_ns *ns = &init_syslog_ns;
+
+ dumper->cur_seq = ns->clear_seq;
+ dumper->cur_idx = ns->clear_idx;
+ dumper->next_seq = ns->next_seq;
+ dumper->next_idx = ns->next_idx;
}
/**
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65ea..3d5f19f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
+#include <linux/syslog.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -712,7 +713,7 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "dmesg_restrict",
- .data = &dmesg_restrict,
+ .data = &init_syslog_ns.dmesg_restrict,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax_sysadmin,
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e..bd176cc 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -18,6 +18,8 @@
#include <linux/user_namespace.h>
#include <linux/proc_fs.h>
+struct syslog_ns;
+extern struct syslog_ns init_syslog_ns;
/*
* userns count is 1 for root user, 1 for init_uts_ns,
* and 1 for... ?
@@ -53,6 +55,7 @@ struct user_namespace init_user_ns = {
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
+ .syslog_ns = &init_syslog_ns,
};
EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index dafa125..f62c8a9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
+#include <linux/syslog.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
@@ -84,6 +85,7 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
+ ns->syslog_ns = get_syslog_ns(parent_ns->syslog_ns);
/* Leave the new user namespace reference ns on new */
set_cred_user_ns(new, ns);
@@ -111,6 +113,7 @@ void free_user_ns(struct kref *kref)
struct user_namespace *parent, *ns =
container_of(kref, struct user_namespace, kref);
+ put_syslog_ns(ns->syslog_ns);
parent = ns->parent;
proc_free_inum(ns->proc_inum);
kmem_cache_free(user_ns_cachep, ns);
--
1.7.10.4
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH RFC] syslog ns proof of concept
2012-11-17 0:25 [PATCH RFC] syslog ns proof of concept Serge Hallyn
@ 2012-11-17 3:14 ` Eric W. Biederman
[not found] ` <87haoo3opt.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Eric W. Biederman @ 2012-11-17 3:14 UTC (permalink / raw)
To: Serge Hallyn
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Daniel Lezcano, Stéphane Graber
Serge Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org> writes:
> Introduce a system log namespace. The syslog ns is tied to a user
> namespace. You must create a new user namespace before you can create a
> new sylog ns. The syslog ns is created through a new command (11) to
> the __NR_syslog system call.
>
> Once a task enters a new syslog ns, it's "dmesg", "dmesg -c" and
> /dev/kmsg actions affect only itself, so that user-created syslog
> messages no longer are confusingly combined in the host's syslog.
> "printk" itself always goes to the initial syslog_ns, and consoles
> belong only to the initial syslog_ns. However printks relating to a
> specific network namespace, for instance, can now be targeted to the
> syslog ns for the user ns which owns the network ns, aiding in debugging
> in a container.
>
> This patch is on top of the user namespace enhanced kernel at
> git://kernel.ubuntu.com/serge/quantal-userns. It is good enough to
> compile with stock ubuntu kernel options, boot, launch other syslog
> namespaces and exercise them. It will need help before it will compile
> with funky options like CONFIG_PRINTK=n. This is only being sent out to
> get feedback on the general idea.
>
> Comments greatly appreciated.
>
> (See https://wiki.ubuntu.com/LxcSyslogNs for background).
Overall I would say the goal sounds well thought out.
I am not a fan of how this ties into the user namespace. I would prefer
closer or looser ties. The recursive reference count loop where a
userns refers to a syslogns and that syslogns refers to the same userns
is unpleasant.
The important case as I understand it is to handle injection of messages
into dmesg by userspace?
I would really like to see how messages from networking devices and
netfilter would be handled. Right now one of the ugliest bits of
lowering the permissions in the network namespace is what to do about the
commands that set the message loglevel.
In general unless we can safely and sanely direct kernel messages into
this new dmesg I don't actually see the point of having another ring
buffer in the kernel. If the only success is userspace, having the
syslog facility simply be unavailable seems more palatable.
Eric
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH RFC] syslog ns proof of concept
[not found] ` <87haoo3opt.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
@ 2012-11-17 4:02 ` Serge E. Hallyn
[not found] ` <20121117040200.GA24079-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Serge E. Hallyn @ 2012-11-17 4:02 UTC (permalink / raw)
To: Eric W. Biederman
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Daniel Lezcano, Stéphane Graber
Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> Serge Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org> writes:
>
> > Introduce a system log namespace. The syslog ns is tied to a user
> > namespace. You must create a new user namespace before you can create a
> > new sylog ns. The syslog ns is created through a new command (11) to
> > the __NR_syslog system call.
> >
> > Once a task enters a new syslog ns, it's "dmesg", "dmesg -c" and
> > /dev/kmsg actions affect only itself, so that user-created syslog
> > messages no longer are confusingly combined in the host's syslog.
> > "printk" itself always goes to the initial syslog_ns, and consoles
> > belong only to the initial syslog_ns. However printks relating to a
> > specific network namespace, for instance, can now be targeted to the
> > syslog ns for the user ns which owns the network ns, aiding in debugging
> > in a container.
> >
> > This patch is on top of the user namespace enhanced kernel at
> > git://kernel.ubuntu.com/serge/quantal-userns. It is good enough to
> > compile with stock ubuntu kernel options, boot, launch other syslog
> > namespaces and exercise them. It will need help before it will compile
> > with funky options like CONFIG_PRINTK=n. This is only being sent out to
> > get feedback on the general idea.
> >
> > Comments greatly appreciated.
> >
> > (See https://wiki.ubuntu.com/LxcSyslogNs for background).
>
> Overall I would say the goal sounds well thought out.
>
> I am not a fan of how this ties into the user namespace. I would prefer
> closer or looser ties. The recursive reference count loop where a
> userns refers to a syslogns and that syslogns refers to the same userns
> is unpleasant.
We could make the nsproxy point to the syslog_ns, but this seemed simpler.
Note that the syslog_ns does not need to pin the user_ns, since by design
the user_ns owning a syslog_ns can't go away if the syslog_ns is still
alive.
But yes, the question of "what should point to the syslog_ns" is what has
kept a syslog_ns from being seriously proposed since February 2010 :)
Hm, wait. A nagging feeling made me look back, and I see that I do in
fact pin the user_ns from the syslog_ns. I didn't mean to (and I don't
release it :) and we don't need to. When a syslog_ns is created, it
can only be inherited by child user_ns's, and its owner, the parent user_ns,
can never go away until the child user_ns's go away.
> The important case as I understand it is to handle injection of messages
> into dmesg by userspace?
1. injection of messages into dmesg by userspace, 2. clearing of messages
by userspace, but also 3. allowing appropriate kernel printks to be
targeted to containers.
> I would really like to see how messages from networking devices and
> netfilter would be handled. Right now one of the ugliest bits of
It would simply replace a
printk(KERN_NOTICE "doing something\n");
with
nsprintk(net->user_ns->syslog_ns, KERN_NOTICE "doing something\n");
I'm not yet clear on whether we'd want nsprintk to print to both the
init_syslog_ns (with a ns prefix) and the child ns.
> lowering the permissions in the network namespace is what do about the
> commands that set the message loglevel.
Here I'm not sure what you mean.
> In general unless we can safely and sanely direct kernel messages into
> this new dmesg I don't actually see the point of having another ring
> buffer in the kernel. If the only success is userspace having the
> syslog facility simply be unavailable seems more palatable.
No, I didn't do any in this patch, but directing kernel messages into the
new dmesg was definitely a goal and should be trivial now.
-serge
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH RFC] syslog ns proof of concept
[not found] ` <20121117040200.GA24079-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
@ 2012-11-17 6:08 ` Eric W. Biederman
[not found] ` <87pq3c223i.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Eric W. Biederman @ 2012-11-17 6:08 UTC (permalink / raw)
To: Serge E. Hallyn
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Daniel Lezcano, Stéphane Graber
"Serge E. Hallyn" <serge-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org> writes:
> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> Serge Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org> writes:
>>
>> > Introduce a system log namespace. The syslog ns is tied to a user
>> > namespace. You must create a new user namespace before you can create a
>> > new sylog ns. The syslog ns is created through a new command (11) to
>> > the __NR_syslog system call.
>> >
>> > Once a task enters a new syslog ns, it's "dmesg", "dmesg -c" and
>> > /dev/kmsg actions affect only itself, so that user-created syslog
>> > messages no longer are confusingly combined in the host's syslog.
>> > "printk" itself always goes to the initial syslog_ns, and consoles
>> > belong only to the initial syslog_ns. However printks relating to a
>> > specific network namespace, for instance, can now be targeted to the
>> > syslog ns for the user ns which owns the network ns, aiding in debugging
>> > in a container.
>> >
>> > This patch is on top of the user namespace enhanced kernel at
>> > git://kernel.ubuntu.com/serge/quantal-userns. It is good enough to
>> > compile with stock ubuntu kernel options, boot, launch other syslog
>> > namespaces and exercise them. It will need help before it will compile
>> > with funky options like CONFIG_PRINTK=n. This is only being sent out to
>> > get feedback on the general idea.
>> >
>> > Comments greatly appreciated.
>> >
>> > (See https://wiki.ubuntu.com/LxcSyslogNs for background).
>>
>> Overall I would say the goal sounds well thought out.
>>
>> I am not a fan of how this ties into the user namespace. I would prefer
>> closer or looser ties. The recursive reference count loop where a
>> userns refers to a syslogns and that syslogns refers to the same userns
>> is unpleasant.
>
> We could make the nsproxy point to the syslog_ns, but this seemed simpler.
> Note that the syslog_ns does not need to pin the user_ns, since by design
> the user_ns owning a syslog_ns can't go away if the syslog_ns is still
> alive.
>
> But yes, the question of "what should point to the syslog_ns" is what has
> kept a syslog_ns from being seriously proposed since february 2010 :)
>
> Hm, wait. A nagging feeling made me look back, and I see that I do in
> fact pin the user_ns from the syslog_ns. I didn't mean to (and I don't
> release it :) and we don't need to. When a syslog_ns is created, it
> can only be inherited by child user_ns's, and its owner, the parent user_ns,
> can never go away until the child user_ns's go away.
There is an argument to be made that syslog messages are the kind of
security identifiers like uids, gids, and keys that should be part of a
user namespace. I'm not fully convinced, but there are some DOS attacks
that this would naturally prevent.
>> The important case as I understand it is to handle injection of messages
>> into dmesg by userspace?
>
> 1. injection of messages into dmesg by userspace, 2. clearing of messages
> by userspace, but also 3. allowing appropriate kernel printks to be
> targeted to containers.
>
>> I would really like to see how messages from networking devices and
>> netfilter would be handled. Right now one of the ugliest bits of
>
> It would simply replace a
> printk(KERN_NOTICE "doing something\n");
> with
> nsprintk(net->user_ns->syslog_ns, KERN_NOTICE "doing something\n");
>
> I'm not yet clear on whether we'd want nsprintk to print to both the
> init_syslog_ns (with a ns prefix) and the child ns.
There are some specialized forms of printk like dev_printk and in
particular netdev_printk that it would be very interesting if they
did the work behind the scenes. So that you could code the obvious
thing and it would do the right thing automatically.
>> lowering the permissions in the network namespace is what do about the
>> commands that set the message loglevel.
>
> Here I'm not sure what you mean.
There is a possible DOS attack that by turning on debug messages in a
user namespace you can overwhelm syslog.
>> In general unless we can safely and sanely direct kernel messages into
>> this new dmesg I don't actually see the point of having another ring
>> buffer in the kernel. If the only success is userspace having the
>> syslog facility simply be unavailable seems more palatable.
>
> No I didn't do any in this patch, but directing kernel messages into the
> new dmesg was definately a goal and should be trivial now.
Getting the semantics of which kernel messages should be directed at the
new ring buffer and what that means seems to me to be a key factor in
seeing how practical this is. Otherwise this seems to call out for a
change in userspace.
Certainly inside a user namespace now you can't destructively touch the
kernel's syslog at all.
Eric
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH RFC] syslog ns proof of concept
[not found] ` <87pq3c223i.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
@ 2012-11-19 14:18 ` Serge E. Hallyn
0 siblings, 0 replies; 5+ messages in thread
From: Serge E. Hallyn @ 2012-11-19 14:18 UTC (permalink / raw)
To: Eric W. Biederman
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Stéphane Graber, Daniel Lezcano
Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> "Serge E. Hallyn" <serge-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org> writes:
>
> > Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> >> Serge Hallyn <serge.hallyn-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org> writes:
> >>
> >> > Introduce a system log namespace. The syslog ns is tied to a user
> >> > namespace. You must create a new user namespace before you can create a
> >> > new sylog ns. The syslog ns is created through a new command (11) to
> >> > the __NR_syslog system call.
> >> >
> >> > Once a task enters a new syslog ns, it's "dmesg", "dmesg -c" and
> >> > /dev/kmsg actions affect only itself, so that user-created syslog
> >> > messages no longer are confusingly combined in the host's syslog.
> >> > "printk" itself always goes to the initial syslog_ns, and consoles
> >> > belong only to the initial syslog_ns. However printks relating to a
> >> > specific network namespace, for instance, can now be targeted to the
> >> > syslog ns for the user ns which owns the network ns, aiding in debugging
> >> > in a container.
> >> >
> >> > This patch is on top of the user namespace enhanced kernel at
> >> > git://kernel.ubuntu.com/serge/quantal-userns. It is good enough to
> >> > compile with stock ubuntu kernel options, boot, launch other syslog
> >> > namespaces and exercise them. It will need help before it will compile
> >> > with funky options like CONFIG_PRINTK=n. This is only being sent out to
> >> > get feedback on the general idea.
> >> >
> >> > Comments greatly appreciated.
> >> >
> >> > (See https://wiki.ubuntu.com/LxcSyslogNs for background).
> >>
> >> Overall I would say the goal sounds well thought out.
> >>
> >> I am not a fan of how this ties into the user namespace. I would prefer
> >> closer or looser ties. The recursive reference count loop where a
> >> userns refers to a syslogns and that syslogns refers to the same userns
> >> is unpleasant.
> >
> > We could make the nsproxy point to the syslog_ns, but this seemed simpler.
> > Note that the syslog_ns does not need to pin the user_ns, since by design
> > the user_ns owning a syslog_ns can't go away if the syslog_ns is still
> > alive.
> >
> > But yes, the question of "what should point to the syslog_ns" is what has
> > kept a syslog_ns from being seriously proposed since february 2010 :)
> >
> > Hm, wait. A nagging feeling made me look back, and I see that I do in
> > fact pin the user_ns from the syslog_ns. I didn't mean to (and I don't
> > release it :) and we don't need to. When a syslog_ns is created, it
> > can only be inherited by child user_ns's, and its owner, the parent user_ns,
> > can never go away until the child user_ns's go away.
>
> There is an argument to be made that syslog messages are the kind of
> security identifiers like uid, gids, and keys that should be part of a
> user namespace. I'm not fully convinced but there are some DOS attacks
> that would naturally prevent.
I can't really think of a good case for not putting the syslogns straight
into the userns (i.e. not having a separate syslogns), so I'd say let's
go that route.
There is a big locking bug (besides syslog_ns pinning user_ns) in my
patch - something needs to be done with struct cont, which pins the
syslog_ns. So either when a user_ns is freed we need to flush struct
cont if it is pinning this user_ns, or the struct cont should
explicitly pin the user_ns.
> >> The important case as I understand it is to handle injection of messages
> >> into dmesg by userspace?
> >
> > 1. injection of messages into dmesg by userspace, 2. clearing of messages
> > by userspace, but also 3. allowing appropriate kernel printks to be
> > targeted to containers.
> >
> >> I would really like to see how messages from networking devices and
> >> netfilter would be handled. Right now one of the ugliest bits of
> >
> > It would simply replace a
> > printk(KERN_NOTICE "doing something\n");
> > with
> > nsprintk(net->user_ns->syslog_ns, KERN_NOTICE "doing something\n");
> >
> > I'm not yet clear on whether we'd want nsprintk to print to both the
> > init_syslog_ns (with a ns prefix) and the child ns.
>
> There are some specialized forms of printk like dev_printk and in
> particular netdev_printk that it would be very interesting if they
> did the work behind the scenes. So that you could code the obvious
> thing and it would do the right thing automatically.
Agreed.
> >> lowering the permissions in the network namespace is what do about the
> >> commands that set the message loglevel.
> >
> > Here I'm not sure what you mean.
>
> There is a possible DOS attack that by turning on debug messages in a
> user namespace you can overwhelm syslog.
Oh, I see.
> >> In general unless we can safely and sanely direct kernel messages into
> >> this new dmesg I don't actually see the point of having another ring
> >> buffer in the kernel. If the only success is userspace having the
> >> syslog facility simply be unavailable seems more palatable.
> >
> > No I didn't do any in this patch, but directing kernel messages into the
> > new dmesg was definately a goal and should be trivial now.
>
> Getting the semantics of which kernel messages should be directed at the
> new ring buffer and what that means seems to me to be a key factor in
> seeing how practical this is. Otherwise this seems to call out for a
> change in userspace.
Ok, I was hoping that once there was a trivial to use nsprintk the
appropriate users would be converted by others :), but I can take a
look at converting compelling users before I resend.
> Certainly inside a user namespace now you can't destructively touch the
> kernel's syslog at all.
That should be true, yes.
thanks,
-serge
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2012-11-19 14:18 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-11-17 0:25 [PATCH RFC] syslog ns proof of concept Serge Hallyn
2012-11-17 3:14 ` Eric W. Biederman
[not found] ` <87haoo3opt.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2012-11-17 4:02 ` Serge E. Hallyn
[not found] ` <20121117040200.GA24079-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>
2012-11-17 6:08 ` Eric W. Biederman
[not found] ` <87pq3c223i.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2012-11-19 14:18 ` Serge E. Hallyn
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.