public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] x86/msr: Read MSRs individually
@ 2023-05-23 19:49 Tim Wiederhake
  2023-05-23 19:49 ` [PATCH 2/2] x86/msr: Allow unprivileged read access to some MSRs Tim Wiederhake
  2023-05-30 10:23 ` [PATCH v2] " Tim Wiederhake
  0 siblings, 2 replies; 6+ messages in thread
From: Tim Wiederhake @ 2023-05-23 19:49 UTC (permalink / raw)
  To: Borislav Petkov, Dave Hansen, H. Peter Anvin, Ingo Molnar,
	Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel, x86
  Cc: Tim Wiederhake

Reading from /dev/cpu/*/msr with buffer size > 8 would read the data
of the same msr repeatedly instead of the data for consecutive msrs,
as one might expect.

Solve by restricting MSR reads to one per call.

Signed-off-by: Tim Wiederhake <twiederh@redhat.com>
---
 arch/x86/kernel/msr.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bb17d37db01..058f2b67d0c7 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -58,24 +58,17 @@ static ssize_t msr_read(struct file *file, char __user *buf,
 	u32 reg = *ppos;
 	int cpu = iminor(file_inode(file));
 	int err = 0;
-	ssize_t bytes = 0;
 
-	if (count % 8)
+	if (count < 8)
 		return -EINVAL;	/* Invalid chunk size */
 
-	for (; count; count -= 8) {
-		err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
-		if (err)
-			break;
-		if (copy_to_user(tmp, &data, 8)) {
-			err = -EFAULT;
-			break;
-		}
-		tmp += 2;
-		bytes += 8;
-	}
+	err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
+	if (err)
+		return err;
+	if (copy_to_user(tmp, &data, 8))
+		return -EFAULT;
 
-	return bytes ? bytes : err;
+	return 8;
 }
 
 static int filter_write(u32 reg)
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/2] x86/msr: Allow unprivileged read access to some MSRs
  2023-05-23 19:49 [PATCH 1/2] x86/msr: Read MSRs individually Tim Wiederhake
@ 2023-05-23 19:49 ` Tim Wiederhake
  2023-05-23 20:31   ` H. Peter Anvin
  2023-05-30 10:23 ` [PATCH v2] " Tim Wiederhake
  1 sibling, 1 reply; 6+ messages in thread
From: Tim Wiederhake @ 2023-05-23 19:49 UTC (permalink / raw)
  To: Borislav Petkov, Dave Hansen, H. Peter Anvin, Ingo Molnar,
	Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel, x86
  Cc: Tim Wiederhake

Delaying access control allows unprivileged processes to
read specific MSRs, such as IA32_CORE_CAPABILITIES and
IA32_ARCH_CAPABILITIES. This is helpful for e.g. qemu and
libvirt who require the raw MSR content to calculate host
CPU capabilities. Other programs might be interested in
IA32_EFER for x86-64-v1 detection.

Signed-off-by: Tim Wiederhake <twiederh@redhat.com>
---
 arch/x86/kernel/msr.c | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 058f2b67d0c7..9485aa7f8161 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -50,6 +50,23 @@ enum allow_write_msrs {
 
 static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
 
+static int filter_read(struct file *file, u32 reg)
+{
+	if (file->private_data)
+		return 0;
+
+	switch (reg) {
+	case MSR_IA32_CORE_CAPS:
+	case MSR_IA32_ARCH_CAPABILITIES:
+	case MSR_EFER:
+		return 0;
+	default:
+		break;
+	}
+
+	return -EPERM;
+}
+
 static ssize_t msr_read(struct file *file, char __user *buf,
 			size_t count, loff_t *ppos)
 {
@@ -59,6 +76,10 @@ static ssize_t msr_read(struct file *file, char __user *buf,
 	int cpu = iminor(file_inode(file));
 	int err = 0;
 
+	err = filter_read(file, reg);
+	if (err)
+		return err;
+
 	if (count < 8)
 		return -EINVAL;	/* Invalid chunk size */
 
@@ -71,7 +92,7 @@ static ssize_t msr_read(struct file *file, char __user *buf,
 	return 8;
 }
 
-static int filter_write(u32 reg)
+static int filter_write(struct file *file, u32 reg)
 {
 	/*
 	 * MSRs writes usually happen all at once, and can easily saturate kmsg.
@@ -83,6 +104,9 @@ static int filter_write(u32 reg)
 	 */
 	static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1);
 
+	if (!file->private_data)
+		return -EPERM;
+
 	switch (allow_writes) {
 	case MSR_WRITES_ON:  return 0;
 	case MSR_WRITES_OFF: return -EPERM;
@@ -113,7 +137,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
 	if (err)
 		return err;
 
-	err = filter_write(reg);
+	err = filter_write(file, reg);
 	if (err)
 		return err;
 
@@ -156,6 +180,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
 			err = -EFAULT;
 			break;
 		}
+		err = filter_read(file, regs[1]);
+		if (err)
+			return err;
 		err = rdmsr_safe_regs_on_cpu(cpu, regs);
 		if (err)
 			break;
@@ -176,7 +203,7 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
 		if (err)
 			break;
 
-		err = filter_write(regs[1]);
+		err = filter_write(file, regs[1]);
 		if (err)
 			return err;
 
@@ -202,8 +229,7 @@ static int msr_open(struct inode *inode, struct file *file)
 	unsigned int cpu = iminor(file_inode(file));
 	struct cpuinfo_x86 *c;
 
-	if (!capable(CAP_SYS_RAWIO))
-		return -EPERM;
+	file->private_data = (void *)(capable(CAP_SYS_RAWIO));
 
 	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
 		return -ENXIO;	/* No such CPU */
@@ -245,6 +271,8 @@ static int msr_device_destroy(unsigned int cpu)
 
 static char *msr_devnode(const struct device *dev, umode_t *mode)
 {
+	if (mode)
+		*mode = 0644;
 	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
 }
 
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] x86/msr: Allow unprivileged read access to some MSRs
  2023-05-23 19:49 ` [PATCH 2/2] x86/msr: Allow unprivileged read access to some MSRs Tim Wiederhake
@ 2023-05-23 20:31   ` H. Peter Anvin
  0 siblings, 0 replies; 6+ messages in thread
From: H. Peter Anvin @ 2023-05-23 20:31 UTC (permalink / raw)
  To: Tim Wiederhake, Borislav Petkov, Dave Hansen, Ingo Molnar,
	Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel, x86

On May 23, 2023 12:49:49 PM PDT, Tim Wiederhake <twiederh@redhat.com> wrote:
>Delaying access control allows unprivileged processes to
>read specific MSRs, such as IA32_CORE_CAPABILITIES and
>IA32_ARCH_CAPABILITIES. This is helpful for e.g. qemu and
>libvirt who require the raw MSR content to calculate host
>CPU capabilities. Other programs might be interested in
>IA32_EFER for x86-64-v1 detection.
>
>Signed-off-by: Tim Wiederhake <twiederh@redhat.com>
>---
> arch/x86/kernel/msr.c | 38 +++++++++++++++++++++++++++++++++-----
> 1 file changed, 33 insertions(+), 5 deletions(-)
>
>diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
>index 058f2b67d0c7..9485aa7f8161 100644
>--- a/arch/x86/kernel/msr.c
>+++ b/arch/x86/kernel/msr.c
>@@ -50,6 +50,23 @@ enum allow_write_msrs {
> 
> static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
> 
>+static int filter_read(struct file *file, u32 reg)
>+{
>+	if (file->private_data)
>+		return 0;
>+
>+	switch (reg) {
>+	case MSR_IA32_CORE_CAPS:
>+	case MSR_IA32_ARCH_CAPABILITIES:
>+	case MSR_EFER:
>+		return 0;
>+	default:
>+		break;
>+	}
>+
>+	return -EPERM;
>+}
>+
> static ssize_t msr_read(struct file *file, char __user *buf,
> 			size_t count, loff_t *ppos)
> {
>@@ -59,6 +76,10 @@ static ssize_t msr_read(struct file *file, char __user *buf,
> 	int cpu = iminor(file_inode(file));
> 	int err = 0;
> 
>+	err = filter_read(file, reg);
>+	if (err)
>+		return err;
>+
> 	if (count < 8)
> 		return -EINVAL;	/* Invalid chunk size */
> 
>@@ -71,7 +92,7 @@ static ssize_t msr_read(struct file *file, char __user *buf,
> 	return 8;
> }
> 
>-static int filter_write(u32 reg)
>+static int filter_write(struct file *file, u32 reg)
> {
> 	/*
> 	 * MSRs writes usually happen all at once, and can easily saturate kmsg.
>@@ -83,6 +104,9 @@ static int filter_write(u32 reg)
> 	 */
> 	static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1);
> 
>+	if (!file->private_data)
>+		return -EPERM;
>+
> 	switch (allow_writes) {
> 	case MSR_WRITES_ON:  return 0;
> 	case MSR_WRITES_OFF: return -EPERM;
>@@ -113,7 +137,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
> 	if (err)
> 		return err;
> 
>-	err = filter_write(reg);
>+	err = filter_write(file, reg);
> 	if (err)
> 		return err;
> 
>@@ -156,6 +180,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
> 			err = -EFAULT;
> 			break;
> 		}
>+		err = filter_read(file, regs[1]);
>+		if (err)
>+			return err;
> 		err = rdmsr_safe_regs_on_cpu(cpu, regs);
> 		if (err)
> 			break;
>@@ -176,7 +203,7 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
> 		if (err)
> 			break;
> 
>-		err = filter_write(regs[1]);
>+		err = filter_write(file, regs[1]);
> 		if (err)
> 			return err;
> 
>@@ -202,8 +229,7 @@ static int msr_open(struct inode *inode, struct file *file)
> 	unsigned int cpu = iminor(file_inode(file));
> 	struct cpuinfo_x86 *c;
> 
>-	if (!capable(CAP_SYS_RAWIO))
>-		return -EPERM;
>+	file->private_data = (void *)(capable(CAP_SYS_RAWIO));
> 
> 	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
> 		return -ENXIO;	/* No such CPU */
>@@ -245,6 +271,8 @@ static int msr_device_destroy(unsigned int cpu)
> 
> static char *msr_devnode(const struct device *dev, umode_t *mode)
> {
>+	if (mode)
>+		*mode = 0644;
> 	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
> }
> 

I believe the preferred way to do this is to export "cooked" information in sysfs. /dev/msr really should be considered a legacy, very low-level interface suitable only for restricted environments.

When I wrote this driver, it was a maxim that "root owns the system"; there was no way to restrict privileged interfaces like /dev/(k)mem, /dev/ioports, or kernel module installation from root; any of these interfaces can be trivially used to take full command of the hardware.

This is no longer the case, and it is better for the kernel to only export known good information at the semantic level. There is no inherent guarantee that these registers will not contain security-sensitive information in the future.

At this point, /dev/msr should be considered a debugging-only interface (for which it is quite useful still.)

So NAK on allowing any kind of access without CAP_SYS_RAWIO.

NAK on your other patch, too, because it is based on an invalid assumption: read/write to /dev/msr does *not* advance the file pointer. This is analogous to /dev/ioports, rather than /dev/mem.

Incidentally, you have touched on a potential issue here: I suspect that we should check CAP_SYS_RAWIO for each access – the old policy of "if you can open it you can give someone else a file descriptor" probably no longer makes any sense, at least without some kind of mechanism to filter access control for a specific fd. (Then again, the idiom of "open then drop privileges" might justify caching the capability.)




^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v2] x86/msr: Allow unprivileged read access to some MSRs
  2023-05-23 19:49 [PATCH 1/2] x86/msr: Read MSRs individually Tim Wiederhake
  2023-05-23 19:49 ` [PATCH 2/2] x86/msr: Allow unprivileged read access to some MSRs Tim Wiederhake
@ 2023-05-30 10:23 ` Tim Wiederhake
  2023-05-30 16:56   ` Jim Mattson
  2023-05-30 17:19   ` Dave Hansen
  1 sibling, 2 replies; 6+ messages in thread
From: Tim Wiederhake @ 2023-05-30 10:23 UTC (permalink / raw)
  To: Borislav Petkov, Dave Hansen, H. Peter Anvin, Ingo Molnar,
	Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel, x86
  Cc: Tim Wiederhake

Software such as qemu and libvirt require the raw content of some MSRs
to calculate host CPU capabilities. This is currently done through
/dev/cpu/*/msr which is locked behind both CAP_SYS_RAWIO and file mode
0600, allowing only root to read and write MSRs.

Expose some non-security sensitive MSRs through sysfs to allow access
for unprivileged processes. This also helps other programs that are
interested in IA32_EFER for x86-64-v1 detection.

Signed-off-by: Tim Wiederhake <twiederh@redhat.com>
---
Changes to v1 (https://lkml.org/lkml/2023/5/23/1230):
* removed patch to limit reads to /dev/cpu/*/msr to 8 bytes per read
* removed CAP_SYS_RAWIO-less access to /dev/cpu/*/msr
* introduced sysfs interface to msrs

With this sysfs-based, unrestricted read access to some select msrs in
place, a later patch could introduce checks for CAP_SYS_RAWIO for every
access to /dev/cpu/*/msr as mentioned in the feedback to v1.
---
 arch/x86/kernel/msr.c | 45 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bb17d37db01..3c8354f3c2bd 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -50,6 +50,31 @@ enum allow_write_msrs {
 
 static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
 
+struct allow_read_msrs {
+	const char *procname;
+	u32 index;
+	u32 value[2];
+};
+
+static struct allow_read_msrs allow_reads[] = {
+	{
+		.procname = "ia32_core_caps",
+		.index = MSR_IA32_CORE_CAPS,
+	},
+	{
+		.procname = "ia32_arch_capabilities",
+		.index = MSR_IA32_ARCH_CAPABILITIES,
+	},
+	{
+		.procname = "efer",
+		.index = MSR_EFER,
+	},
+};
+
+static struct ctl_table msr_files[ARRAY_SIZE(allow_reads) + 1];
+
+static struct ctl_table_header *msr_files_header;
+
 static ssize_t msr_read(struct file *file, char __user *buf,
 			size_t count, loff_t *ppos)
 {
@@ -258,6 +283,25 @@ static char *msr_devnode(const struct device *dev, umode_t *mode)
 static int __init msr_init(void)
 {
 	int err;
+	int i, j;
+
+	for (i = 0, j = 0; i < ARRAY_SIZE(allow_reads); ++i) {
+		err = rdmsr_safe_on_cpu(0, allow_reads[i].index,
+					&allow_reads[i].value[0],
+					&allow_reads[i].value[1]);
+		if (err)
+			continue;
+		msr_files[j].procname = allow_reads[i].procname;
+		msr_files[j].data = &allow_reads[i].value;
+		msr_files[j].maxlen = 2 * sizeof(u32);
+		msr_files[j].mode = 0444;
+		msr_files[j].proc_handler = proc_doulongvec_minmax;
+		++j;
+	}
+
+	msr_files_header = register_sysctl("vm/msr", msr_files);
+	if (!msr_files_header)
+		return -ENOMEM;
 
 	if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
 		pr_err("unable to get major %d for msr\n", MSR_MAJOR);
@@ -287,6 +331,7 @@ module_init(msr_init);
 
 static void __exit msr_exit(void)
 {
+	unregister_sysctl_table(msr_files_header);
 	cpuhp_remove_state(cpuhp_msr_state);
 	class_destroy(msr_class);
 	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] x86/msr: Allow unprivileged read access to some MSRs
  2023-05-30 10:23 ` [PATCH v2] " Tim Wiederhake
@ 2023-05-30 16:56   ` Jim Mattson
  2023-05-30 17:19   ` Dave Hansen
  1 sibling, 0 replies; 6+ messages in thread
From: Jim Mattson @ 2023-05-30 16:56 UTC (permalink / raw)
  To: Tim Wiederhake
  Cc: Borislav Petkov, Dave Hansen, H. Peter Anvin, Ingo Molnar,
	Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel, x86

On Tue, May 30, 2023 at 3:28 AM Tim Wiederhake <twiederh@redhat.com> wrote:
>
> Software such as qemu and libvirt require the raw content of some MSRs

Note that KVM doesn't return the raw value of IA32_ARCH_CAPABILITIES.
First, it filters out unsupported bits, and then it massages the
result a bit. See kvm_get_arch_capabilities(). Isn't this what qemu
actually wants?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] x86/msr: Allow unprivileged read access to some MSRs
  2023-05-30 10:23 ` [PATCH v2] " Tim Wiederhake
  2023-05-30 16:56   ` Jim Mattson
@ 2023-05-30 17:19   ` Dave Hansen
  1 sibling, 0 replies; 6+ messages in thread
From: Dave Hansen @ 2023-05-30 17:19 UTC (permalink / raw)
  To: Tim Wiederhake, Borislav Petkov, Dave Hansen, H. Peter Anvin,
	Ingo Molnar, Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel,
	x86

On 5/30/23 03:23, Tim Wiederhake wrote:
> Expose some non-security sensitive MSRs through sysfs to allow access
> for unprivileged processes. This also helps other programs that are
> interested in IA32_EFER for x86-64-v1 detection.

Did you mean "sysfs" or "sysctl"?

I'm still on the fence about whether we should do this.  This seems
_marginally_ better than the /dev approach.

But whatever we do we need some *VERY* explicit, tight rules about what
can be exposed via this interface in the future.  We absolutely can't
have folks adding to this in the future without following those rules.

A lot of this is implicit in the implementation and even the ABI, but
let's say them out loud, please:

  * The MSRs must be read-only.  If they are read-write, the snapshot
    can get out of date. This can be guaranteed by either:
   * Never being written at runtime after they are snapshotted, or
     preferably:
   * Being defined to be read-only (wrmsr just doesn't work)
  * The MSRs must have the same exact value on all CPUs (because
    there is only one file per MSR)
  * The value must be static.  Not only read-only from the software
    point of view, but the hardware and hypervisor must also promise not
    to change it.

The first two seem doable.  I'm not sure how we deal with the third,
though, especially in the case of microcode updates or clever hypervisors.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-05-30 17:19 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-23 19:49 [PATCH 1/2] x86/msr: Read MSRs individually Tim Wiederhake
2023-05-23 19:49 ` [PATCH 2/2] x86/msr: Allow unprivileged read access to some MSRs Tim Wiederhake
2023-05-23 20:31   ` H. Peter Anvin
2023-05-30 10:23 ` [PATCH v2] " Tim Wiederhake
2023-05-30 16:56   ` Jim Mattson
2023-05-30 17:19   ` Dave Hansen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox