public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [git pull] x86 PAT changes
@ 2008-04-24 22:56 Ingo Molnar
  2008-04-25 23:43 ` Linus Torvalds
  2008-04-26 13:40 ` Gabriel C
  0 siblings, 2 replies; 15+ messages in thread
From: Ingo Molnar @ 2008-04-24 22:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Andrew Morton, H. Peter Anvin, Thomas Gleixner

Linus, please pull the x86-pat git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86-pat.git for-linus

this adds the second (and final) phase of the x86 PAT changes. Due to 
generic impact (the drivers/char/mem.c and include/asm-generic/iomap.h 
changes) this is offered as a separate tree.

The new CONFIG_NONPROMISC_DEVMEM defaults to disabled so PAT will be 
disabled, to stay compatible and to be a bit cautious/gradual.

Thanks,

	Ingo

------------------>
Arjan van de Ven (1):
      x86: introduce /dev/mem restrictions with a config option

Ingo Molnar (2):
      pat: cleanups
      /dev/mem: make promisc the default

Venki Pallipadi (1):
      devmem: add range_is_allowed() check to mmap of /dev/mem

venkatesh.pallipadi@intel.com (4):
      x86: PAT avoid aliasing in /dev/mem read/write
      x86: PAT phys_mem_access_prot_allowed for dev/mem mmap
      x86: PAT use reserve free memtype in mmap of /dev/mem
      generic: add ioremap_wc() interface wrapper

 arch/x86/Kconfig.debug      |   11 +++
 arch/x86/mm/init_32.c       |   19 +++++
 arch/x86/mm/init_64.c       |   20 +++++
 arch/x86/mm/ioremap.c       |   29 +++++++
 arch/x86/mm/pat.c           |  175 +++++++++++++++++++++++++++++++++++++++----
 drivers/char/mem.c          |  133 +++++++++++++++++++++++++--------
 include/asm-generic/iomap.h |    4 +
 include/asm-x86/io.h        |    8 ++
 include/asm-x86/io_32.h     |    6 --
 include/asm-x86/io_64.h     |    6 --
 include/asm-x86/page.h      |    1 +
 include/asm-x86/pgtable.h   |    9 ++
 12 files changed, 363 insertions(+), 58 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 610aaec..239fd9f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,6 +5,17 @@ config TRACE_IRQFLAGS_SUPPORT
 
 source "lib/Kconfig.debug"
 
+config NONPROMISC_DEVMEM
+	bool "Disable promiscuous /dev/mem"
+	help
+	  The /dev/mem file by default only allows userspace access to PCI
+	  space and the BIOS code and data regions. This is sufficient for
+	  dosemu and X and all common users of /dev/mem. With this config
+	  option, you allow userspace access to all of memory, including
+	  kernel and userspace memory. Accidental access to this is
+	  obviously disasterous, but specific access can be used by people
+	  debugging the kernel.
+
 config EARLY_PRINTK
 	bool "Early printk" if EMBEDDED
 	default y
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ec62da..39852d5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -227,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr)
 	return 0;
 }
 
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+	if (pagenr <= 256)
+		return 1;
+	if (!page_is_ram(pagenr))
+		return 1;
+	return 0;
+}
+
 #ifdef CONFIG_HIGHMEM
 pte_t *kmap_pte;
 pgprot_t kmap_prot;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1ff7906..49c274e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -664,6 +664,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+	if (pagenr <= 256)
+		return 1;
+	if (!page_is_ram(pagenr))
+		return 1;
+	return 0;
+}
+
+
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
 			 kcore_modules, kcore_vsyscall;
 
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3a4baf9..caac7d5 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -336,6 +336,35 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(unsigned long phys)
+{
+	void *addr;
+	unsigned long start = phys & PAGE_MASK;
+
+	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+	if (page_is_ram(start >> PAGE_SHIFT))
+		return __va(phys);
+
+	addr = (void *)ioremap(start, PAGE_SIZE);
+	if (addr)
+		addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+
+	return addr;
+}
+
+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+	if (page_is_ram(phys >> PAGE_SHIFT))
+		return;
+
+	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+	return;
+}
+
 #ifdef CONFIG_X86_32
 
 int __initdata early_ioremap_debug;
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 72c0f60..ef8b64b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/gfp.h>
 #include <linux/fs.h>
+#include <linux/bootmem.h>
 
 #include <asm/msr.h>
 #include <asm/tlbflush.h>
@@ -21,6 +22,7 @@
 #include <asm/cacheflush.h>
 #include <asm/fcntl.h>
 #include <asm/mtrr.h>
+#include <asm/io.h>
 
 int pat_wc_enabled = 1;
 
@@ -190,6 +192,21 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
 	return 0;
 }
 
+/*
+ * req_type typically has one of the:
+ * - _PAGE_CACHE_WB
+ * - _PAGE_CACHE_WC
+ * - _PAGE_CACHE_UC_MINUS
+ * - _PAGE_CACHE_UC
+ *
+ * req_type will have a special case value '-1', when requester want to inherit
+ * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
+ *
+ * If ret_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If ret_type is non-null, function will return
+ * available type in ret_type in case of no error. In case of any error
+ * it will return a negative return value.
+ */
 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 			unsigned long *ret_type)
 {
@@ -200,9 +217,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	/* Only track when pat_wc_enabled */
 	if (!pat_wc_enabled) {
-		if (ret_type)
-			*ret_type = req_type;
-
+		/* This is identical to page table setting without PAT */
+		if (ret_type) {
+			if (req_type == -1) {
+				*ret_type = _PAGE_CACHE_WB;
+			} else {
+				*ret_type = req_type;
+			}
+		}
 		return 0;
 	}
 
@@ -214,8 +236,29 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		return 0;
 	}
 
-	req_type &= _PAGE_CACHE_MASK;
-	err = pat_x_mtrr_type(start, end, req_type, &actual_type);
+	if (req_type == -1) {
+		/*
+		 * Special case where caller wants to inherit from mtrr or
+		 * existing pat mapping, defaulting to UC_MINUS in case of
+		 * no match.
+		 */
+		u8 mtrr_type = mtrr_type_lookup(start, end);
+		if (mtrr_type == 0xFE) { /* MTRR match error */
+			err = -1;
+		}
+
+		if (mtrr_type == MTRR_TYPE_WRBACK) {
+			req_type = _PAGE_CACHE_WB;
+			actual_type = _PAGE_CACHE_WB;
+		} else {
+			req_type = _PAGE_CACHE_UC_MINUS;
+			actual_type = _PAGE_CACHE_UC_MINUS;
+		}
+	} else {
+		req_type &= _PAGE_CACHE_MASK;
+		err = pat_x_mtrr_type(start, end, req_type, &actual_type);
+	}
+
 	if (err) {
 		if (ret_type)
 			*ret_type = actual_type;
@@ -241,7 +284,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		struct memtype *saved_ptr;
 
 		if (parse->start >= end) {
-			printk("New Entry\n");
+			pr_debug("New Entry\n");
 			list_add(&new_entry->nd, parse->nd.prev);
 			new_entry = NULL;
 			break;
@@ -343,7 +386,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 				break;
 			}
 
-			printk("Overlap at 0x%Lx-0x%Lx\n",
+			printk(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
 			       saved_ptr->start, saved_ptr->end);
 			/* No conflict. Go ahead and add this new entry */
 			list_add(&new_entry->nd, &saved_ptr->nd);
@@ -353,7 +396,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	}
 
 	if (err) {
-		printk(
+		printk(KERN_INFO
 	"reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
 			start, end, cattr_name(new_entry->type),
 			cattr_name(req_type));
@@ -365,16 +408,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	if (new_entry) {
 		/* No conflict. Not yet added to the list. Add to the tail */
 		list_add_tail(&new_entry->nd, &memtype_list);
-		printk("New Entry\n");
-  	}
+		pr_debug("New Entry\n");
+	}
 
 	if (ret_type) {
-		printk(
+		pr_debug(
 	"reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
 			start, end, cattr_name(actual_type),
 			cattr_name(req_type), cattr_name(*ret_type));
 	} else {
-		printk(
+		pr_debug(
 	"reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
 			start, end, cattr_name(actual_type),
 			cattr_name(req_type));
@@ -411,11 +454,115 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);
 
 	if (err) {
-		printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n",
+		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
 			current->comm, current->pid, start, end);
 	}
 
-	printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end);
+	pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
 	return err;
 }
 
+
+/*
+ * /dev/mem mmap interface. The memtype used for mapping varies:
+ * - Use UC for mappings with O_SYNC flag
+ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
+ *   inherit the memtype from existing mapping.
+ * - Else use UC_MINUS memtype (for backward compatibility with existing
+ *   X drivers.
+ */
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+				unsigned long size, pgprot_t vma_prot)
+{
+	return vma_prot;
+}
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+				unsigned long size, pgprot_t *vma_prot)
+{
+	u64 offset = ((u64) pfn) << PAGE_SHIFT;
+	unsigned long flags = _PAGE_CACHE_UC_MINUS;
+	unsigned long ret_flags;
+	int retval;
+
+	if (file->f_flags & O_SYNC) {
+		flags = _PAGE_CACHE_UC;
+	}
+
+#ifdef CONFIG_X86_32
+	/*
+	 * On the PPro and successors, the MTRRs are used to set
+	 * memory types for physical addresses outside main memory,
+	 * so blindly setting UC or PWT on those pages is wrong.
+	 * For Pentiums and earlier, the surround logic should disable
+	 * caching for the high addresses through the KEN pin, but
+	 * we maintain the tradition of paranoia in this code.
+	 */
+	if (!pat_wc_enabled &&
+	    ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
+		test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
+		test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
+		test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
+	   (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+		flags = _PAGE_CACHE_UC;
+	}
+#endif
+
+	/*
+	 * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+	 * Without O_SYNC, we want to get
+	 * - WB for WB-able memory and no other conflicting mappings
+	 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+	 * - Inherit from confliting mappings otherwise
+	 */
+	if (flags != _PAGE_CACHE_UC_MINUS) {
+		retval = reserve_memtype(offset, offset + size, flags, NULL);
+	} else {
+		retval = reserve_memtype(offset, offset + size, -1, &ret_flags);
+	}
+
+	if (retval < 0)
+		return 0;
+
+	flags = ret_flags;
+
+	if (pfn <= max_pfn_mapped &&
+            ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+		free_memtype(offset, offset + size);
+		printk(KERN_INFO
+		"%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
+			current->comm, current->pid,
+			cattr_name(flags),
+			offset, offset + size);
+		return 0;
+	}
+
+	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+			     flags);
+	return 1;
+}
+
+void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+	u64 addr = (u64)pfn << PAGE_SHIFT;
+	unsigned long flags;
+	unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+
+	reserve_memtype(addr, addr + size, want_flags, &flags);
+	if (flags != want_flags) {
+		printk(KERN_INFO
+		"%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
+			current->comm, current->pid,
+			cattr_name(want_flags),
+			addr, addr + size,
+			cattr_name(flags));
+	}
+}
+
+void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+	u64 addr = (u64)pfn << PAGE_SHIFT;
+
+	free_memtype(addr, addr + size);
+}
+
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 20070b7..e83623e 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -41,36 +41,7 @@
  */
 static inline int uncached_access(struct file *file, unsigned long addr)
 {
-#if defined(__i386__) && !defined(__arch_um__)
-	/*
-	 * On the PPro and successors, the MTRRs are used to set
-	 * memory types for physical addresses outside main memory,
-	 * so blindly setting PCD or PWT on those pages is wrong.
-	 * For Pentiums and earlier, the surround logic should disable
-	 * caching for the high addresses through the KEN pin, but
-	 * we maintain the tradition of paranoia in this code.
-	 */
-	if (file->f_flags & O_SYNC)
-		return 1;
- 	return !( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
-		  test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
-		  test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
-		  test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability) )
-	  && addr >= __pa(high_memory);
-#elif defined(__x86_64__) && !defined(__arch_um__)
-	/* 
-	 * This is broken because it can generate memory type aliases,
-	 * which can cause cache corruptions
-	 * But it is only available for root and we have to be bug-to-bug
-	 * compatible with i386.
-	 */
-	if (file->f_flags & O_SYNC)
-		return 1;
-	/* same behaviour as i386. PAT always set to cached and MTRRs control the
-	   caching behaviour. 
-	   Hopefully a full PAT implementation will fix that soon. */	   
-	return 0;
-#elif defined(CONFIG_IA64)
+#if defined(CONFIG_IA64)
 	/*
 	 * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases.
 	 */
@@ -108,6 +79,36 @@ static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
 }
 #endif
 
+#ifdef CONFIG_NONPROMISC_DEVMEM
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	u64 from = ((u64)pfn) << PAGE_SHIFT;
+	u64 to = from + size;
+	u64 cursor = from;
+
+	while (cursor < to) {
+		if (!devmem_is_allowed(pfn)) {
+			printk(KERN_INFO
+		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+				current->comm, from, to);
+			return 0;
+		}
+		cursor += PAGE_SIZE;
+		pfn++;
+	}
+	return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	return 1;
+}
+#endif
+
+void __attribute__((weak)) unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+}
+
 /*
  * This funcion reads the *physical* memory. The f_pos points directly to the 
  * memory location. 
@@ -150,15 +151,25 @@ static ssize_t read_mem(struct file * file, char __user * buf,
 
 		sz = min_t(unsigned long, sz, count);
 
+		if (!range_is_allowed(p >> PAGE_SHIFT, count))
+			return -EPERM;
+
 		/*
 		 * On ia64 if a page has been mapped somewhere as
 		 * uncached, then it must also be accessed uncached
 		 * by the kernel or data corruption may occur
 		 */
 		ptr = xlate_dev_mem_ptr(p);
+		if (!ptr)
+			return -EFAULT;
 
-		if (copy_to_user(buf, ptr, sz))
+		if (copy_to_user(buf, ptr, sz)) {
+			unxlate_dev_mem_ptr(p, ptr);
 			return -EFAULT;
+		}
+
+		unxlate_dev_mem_ptr(p, ptr);
+
 		buf += sz;
 		p += sz;
 		count -= sz;
@@ -207,20 +218,32 @@ static ssize_t write_mem(struct file * file, const char __user * buf,
 
 		sz = min_t(unsigned long, sz, count);
 
+		if (!range_is_allowed(p >> PAGE_SHIFT, sz))
+			return -EPERM;
+
 		/*
 		 * On ia64 if a page has been mapped somewhere as
 		 * uncached, then it must also be accessed uncached
 		 * by the kernel or data corruption may occur
 		 */
 		ptr = xlate_dev_mem_ptr(p);
+		if (!ptr) {
+			if (written)
+				break;
+			return -EFAULT;
+		}
 
 		copied = copy_from_user(ptr, buf, sz);
 		if (copied) {
 			written += sz - copied;
+			unxlate_dev_mem_ptr(p, ptr);
 			if (written)
 				break;
 			return -EFAULT;
 		}
+
+		unxlate_dev_mem_ptr(p, ptr);
+
 		buf += sz;
 		p += sz;
 		count -= sz;
@@ -231,6 +254,12 @@ static ssize_t write_mem(struct file * file, const char __user * buf,
 	return written;
 }
 
+int __attribute__((weak)) phys_mem_access_prot_allowed(struct file *file,
+	unsigned long pfn, unsigned long size, pgprot_t *vma_prot)
+{
+	return 1;
+}
+
 #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
 static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				     unsigned long size, pgprot_t vma_prot)
@@ -271,6 +300,35 @@ static inline int private_mapping_ok(struct vm_area_struct *vma)
 }
 #endif
 
+void __attribute__((weak))
+map_devmem(unsigned long pfn, unsigned long len, pgprot_t prot)
+{
+	/* nothing. architectures can override. */
+}
+
+void __attribute__((weak))
+unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t prot)
+{
+	/* nothing. architectures can override. */
+}
+
+static void mmap_mem_open(struct vm_area_struct *vma)
+{
+	map_devmem(vma->vm_pgoff,  vma->vm_end - vma->vm_start,
+			vma->vm_page_prot);
+}
+
+static void mmap_mem_close(struct vm_area_struct *vma)
+{
+	unmap_devmem(vma->vm_pgoff,  vma->vm_end - vma->vm_start,
+			vma->vm_page_prot);
+}
+
+static struct vm_operations_struct mmap_mem_ops = {
+	.open  = mmap_mem_open,
+	.close = mmap_mem_close
+};
+
 static int mmap_mem(struct file * file, struct vm_area_struct * vma)
 {
 	size_t size = vma->vm_end - vma->vm_start;
@@ -281,17 +339,28 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma)
 	if (!private_mapping_ok(vma))
 		return -ENOSYS;
 
+	if (!range_is_allowed(vma->vm_pgoff, size))
+		return -EPERM;
+
+	if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
+						&vma->vm_page_prot))
+		return -EINVAL;
+
 	vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
 						 size,
 						 vma->vm_page_prot);
 
+	vma->vm_ops = &mmap_mem_ops;
+
 	/* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */
 	if (remap_pfn_range(vma,
 			    vma->vm_start,
 			    vma->vm_pgoff,
 			    size,
-			    vma->vm_page_prot))
+			    vma->vm_page_prot)) {
+		unmap_devmem(vma->vm_pgoff, size, vma->vm_page_prot);
 		return -EAGAIN;
+	}
 	return 0;
 }
 
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 67dc84c..76b0cc5 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -60,6 +60,10 @@ extern void iowrite32_rep(void __iomem *port, const void *buf, unsigned long cou
 extern void __iomem *ioport_map(unsigned long port, unsigned int nr);
 extern void ioport_unmap(void __iomem *);
 
+#ifndef ARCH_HAS_IOREMAP_WC
+#define ioremap_wc ioremap_nocache
+#endif
+
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h
index 7b292d3..d5b11f6 100644
--- a/include/asm-x86/io.h
+++ b/include/asm-x86/io.h
@@ -1,3 +1,6 @@
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
 #define ARCH_HAS_IOREMAP_WC
 
 #ifdef CONFIG_X86_32
@@ -5,7 +8,12 @@
 #else
 # include "io_64.h"
 #endif
+
+extern void *xlate_dev_mem_ptr(unsigned long phys);
+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+
 extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 				unsigned long prot_val);
 extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
 
+#endif /* _ASM_X86_IO_H */
diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h
index 509045f..6e73467 100644
--- a/include/asm-x86/io_32.h
+++ b/include/asm-x86/io_32.h
@@ -49,12 +49,6 @@
 #include <linux/vmalloc.h>
 
 /*
- * Convert a physical pointer to a virtual kernel pointer for /dev/mem
- * access
- */
-#define xlate_dev_mem_ptr(p)	__va(p)
-
-/*
  * Convert a virtual cached pointer to an uncached pointer
  */
 #define xlate_dev_kmem_ptr(p)	p
diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h
index c2f5eef..0930bed 100644
--- a/include/asm-x86/io_64.h
+++ b/include/asm-x86/io_64.h
@@ -308,12 +308,6 @@ extern int iommu_bio_merge;
 #define BIO_VMERGE_BOUNDARY iommu_bio_merge
 
 /*
- * Convert a physical pointer to a virtual kernel pointer for /dev/mem
- * access
- */
-#define xlate_dev_mem_ptr(p)	__va(p)
-
-/*
  * Convert a virtual cached pointer to an uncached pointer
  */
 #define xlate_dev_kmem_ptr(p)	p
diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h
index 6724a4b..b381f4a 100644
--- a/include/asm-x86/page.h
+++ b/include/asm-x86/page.h
@@ -47,6 +47,7 @@
 #ifndef __ASSEMBLY__
 
 extern int page_is_ram(unsigned long pagenr);
+extern int devmem_is_allowed(unsigned long pagenr);
 
 extern unsigned long max_pfn_mapped;
 
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index f1d9f4a..1902f0a 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -289,6 +289,15 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 
 #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
 
+#ifndef __ASSEMBLY__
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+                              unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+                              unsigned long size, pgprot_t *vma_prot);
+#endif
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-24 22:56 [git pull] x86 PAT changes Ingo Molnar
@ 2008-04-25 23:43 ` Linus Torvalds
  2008-04-26  0:06   ` H. Peter Anvin
  2008-04-26  9:57   ` [git pull] x86 PAT changes Ingo Molnar
  2008-04-26 13:40 ` Gabriel C
  1 sibling, 2 replies; 15+ messages in thread
From: Linus Torvalds @ 2008-04-25 23:43 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Andrew Morton, H. Peter Anvin, Thomas Gleixner



On Fri, 25 Apr 2008, Ingo Molnar wrote:
> Linus, please pull the x86-pat git tree from:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86-pat.git for-linus
> 
> this adds the second (and final) phase of the x86 PAT changes. Due to 
> generic impact (the drivers/char/mem.c and include/asm-generic/iomap.h 
> changes) this is offered as a separate tree.

So why is PAT dependent on NONPROMISC_DEVMEM here?

It seems pointless and wrong. Somebody might want to have both the full 
/dev/mem and PAT.

Also, it causes this message for me on one of my machines:

	EXT3-fs: mounted filesystem with ordered data mode.
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	mtrr: no more MTRRs available
	Overlap at 0xd0000000-0xe0000000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0380000
	EXT3 FS on dm-0, internal journal
	...
	eth0: no IPv6 routers present
	Overlap at 0xd0000000-0xe0000000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0380000
	Overlap at 0xd0000000-0xe0000000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0380000
	Overlap at 0xd0000000-0xe0000000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0380000
	Overlap at 0xd0000000-0xe0000000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0380000
	Overlap at 0xd0000000-0xe0000000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0380000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xd0000000-0xd0020000
	mtrr: no more MTRRs available
	Overlap at 0xd0000000-0xe0000000
	Overlap at 0xe0300000-0xe0400000
	Overlap at 0xe0300000-0xe0380000

which is a bit annoying. Forgotten debug printk, perhaps?

			Linus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-25 23:43 ` Linus Torvalds
@ 2008-04-26  0:06   ` H. Peter Anvin
  2008-04-26  1:12     ` Linus Torvalds
  2008-04-26  9:57   ` [git pull] x86 PAT changes Ingo Molnar
  1 sibling, 1 reply; 15+ messages in thread
From: H. Peter Anvin @ 2008-04-26  0:06 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ingo Molnar, linux-kernel, Andrew Morton, Thomas Gleixner

Linus Torvalds wrote:
> 
> So why is PAT dependent on NONPROMISC_DEVMEM here?
> 
> It seems pointless and wrong. Somebody might want to have both the full 
> /dev/mem and PAT.
> 

The problem is that that can create cached/uncached aliases, which can 
cause some processors to lock up (especially AMD is known to have a lot 
of errata in this area.)

	-hpa

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-26  0:06   ` H. Peter Anvin
@ 2008-04-26  1:12     ` Linus Torvalds
  2008-04-26  8:56       ` Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: Linus Torvalds @ 2008-04-26  1:12 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Ingo Molnar, linux-kernel, Andrew Morton, Thomas Gleixner



On Fri, 25 Apr 2008, H. Peter Anvin wrote:
> 
> The problem is that that can create cached/uncached aliases, which can cause
> some processors to lock up (especially AMD is known to have a lot of errata in
> this area.)

Umm.. I don't think you understand. Right now, NONPROMISC_DEVMEM doesn't 
just disable mmap() on /dev/mem, it disables totally regular reads and 
writes too. That seems pretty damn excessive.

If it was just mmap(), I don't think it would matter much. I don't think 
we traditionally even supported mmap() on real RAM (because the page 
counting would get confused), and that actually got supported only thanks 
to VM changes that made it possible.

But read/write has always been supported, and shouldn't cause any 
cached/uncached aliases!

		Linus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-26  1:12     ` Linus Torvalds
@ 2008-04-26  8:56       ` Ingo Molnar
  2008-04-26 16:54         ` Pallipadi, Venkatesh
  0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2008-04-26  8:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: H. Peter Anvin, linux-kernel, Andrew Morton, Thomas Gleixner


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Fri, 25 Apr 2008, H. Peter Anvin wrote:
> > 
> > The problem is that that can create cached/uncached aliases, which 
> > can cause some processors to lock up (especially AMD is known to 
> > have a lot of errata in this area.)
> 
> Umm.. I don't think you understand. Right now, NONPROMISC_DEVMEM 
> doesn't just disable mmap() on /dev/mem, it disables totally regular 
> reads and writes too. That seems pretty damn excessive.
> 
> If it was just mmap(), I don't think it would matter much. I don't 
> think we traditionally even supported mmap() on real RAM (because the 
> page counting would get confused), and that actually got supported 
> only thanks to VM changes that made it possible.
> 
> But read/write has always been supported, and shouldn't cause any 
> cached/uncached aliases!

You are right, there should be no architectural need to make PAT 
dependent on nonpromisc-devmem, and thus the patch below should be safe.

In theory even mmap() of /dev/mem should be safe this way - as all 
memtypes are properly tracked.

The thinking behind this dependency was three-fold:

- historic: from the days when the PAT patchset didnt do fully correct 
  tracking yet

- practical: that PAT would be utilized in newer distros on newer 
  systems - with older distros on older systems not really wanting 
  (or needing) neither /dev/mem restrictions nor PAT

- paranoia: one less degree of freedom to take into account

	Ingo

----------------------->
Subject: x86 PAT: decouple from nonpromisc devmem
From: Ingo Molnar <mingo@elte.hu>
Date: Sat Apr 26 10:26:52 CEST 2008

Linus pointed it out that PAT should not depend on NONPROMISC_DEVMEM.

Also make PAT non-default.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: linux-x86.q/arch/x86/Kconfig
===================================================================
--- linux-x86.q.orig/arch/x86/Kconfig
+++ linux-x86.q/arch/x86/Kconfig
@@ -1042,9 +1042,9 @@ config MTRR
 	  See <file:Documentation/mtrr.txt> for more information.
 
 config X86_PAT
-	def_bool y
+	bool
 	prompt "x86 PAT support"
-	depends on MTRR && NONPROMISC_DEVMEM
+	depends on MTRR
 	help
 	  Use PAT attributes to setup page level cache control.
 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-25 23:43 ` Linus Torvalds
  2008-04-26  0:06   ` H. Peter Anvin
@ 2008-04-26  9:57   ` Ingo Molnar
  1 sibling, 0 replies; 15+ messages in thread
From: Ingo Molnar @ 2008-04-26  9:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Andrew Morton, H. Peter Anvin, Thomas Gleixner


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> Also, it causes this message for me on one of my machines:
> 
> 	EXT3-fs: mounted filesystem with ordered data mode.
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	mtrr: no more MTRRs available
> 	Overlap at 0xd0000000-0xe0000000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0380000
> 	EXT3 FS on dm-0, internal journal
> 	...
> 	eth0: no IPv6 routers present
> 	Overlap at 0xd0000000-0xe0000000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0380000
> 	Overlap at 0xd0000000-0xe0000000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0380000
> 	Overlap at 0xd0000000-0xe0000000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0380000
> 	Overlap at 0xd0000000-0xe0000000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0380000
> 	Overlap at 0xd0000000-0xe0000000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0380000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xd0000000-0xd0020000
> 	mtrr: no more MTRRs available
> 	Overlap at 0xd0000000-0xe0000000
> 	Overlap at 0xe0300000-0xe0400000
> 	Overlap at 0xe0300000-0xe0380000
> 
> which is a bit annoying. Forgotten debug printk, perhaps?

yeah - i already toned it down to KERN_INFO once in commit 28eb559b5 - 
but it should be pr_debug() instead. The idea in pat.c is to only print 
out if it's a material condition that the user should know about. I've 
queued up the patch below.

	Ingo

-------->
Subject: x86 PAT: tone down debugging messages
From: Ingo Molnar <mingo@elte.hu>
Date: Sat Apr 26 11:40:31 CEST 2008

Linus reported these excessive debug printouts:

>       Overlap at 0xe0300000-0xe0400000
>       Overlap at 0xe0300000-0xe0380000
>       Overlap at 0xe0300000-0xe0400000
>       Overlap at 0xe0300000-0xe0400000
>       Overlap at 0xe0300000-0xe0400000
>       Overlap at 0xe0300000-0xe0400000
>       Overlap at 0xe0300000-0xe0400000

turn that into a pr_debug().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pat.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-x86.q/arch/x86/mm/pat.c
===================================================================
--- linux-x86.q.orig/arch/x86/mm/pat.c
+++ linux-x86.q/arch/x86/mm/pat.c
@@ -334,7 +334,7 @@ int reserve_memtype(u64 start, u64 end, 
 				break;
 			}
 
-			printk("Overlap at 0x%Lx-0x%Lx\n",
+			pr_debug("Overlap at 0x%Lx-0x%Lx\n",
 			       saved_ptr->start, saved_ptr->end);
 			/* No conflict. Go ahead and add this new entry */
 			list_add(&new_entry->nd, saved_ptr->nd.prev);

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-24 22:56 [git pull] x86 PAT changes Ingo Molnar
  2008-04-25 23:43 ` Linus Torvalds
@ 2008-04-26 13:40 ` Gabriel C
  2008-04-26 14:42   ` Ingo Molnar
  1 sibling, 1 reply; 15+ messages in thread
From: Gabriel C @ 2008-04-26 13:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, linux-kernel, Andrew Morton, H. Peter Anvin,
	Thomas Gleixner

Ingo Molnar wrote:
> Linus, please pull the x86-pat git tree from:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86-pat.git for-linus
> 
> this adds the second (and final) phase of the x86 PAT changes. Due to 
> generic impact (the drivers/char/mem.c and include/asm-generic/iomap.h 
> changes) this is offered as a separate tree.
> 
> The new CONFIG_NONPROMISC_DEVMEM defaults to disabled so PAT will be 
> disabled, to stay compatible and to be a bit cautious/gradual.
> 

Hi Ingo,

When using an 64bit ( didn't got time to test 32bit now ) kernel[1] with PAT enabled , kvm-intel
does not work anymore. 

When modprobing kvm-intel , kvm is saying VT extension is disable by BIOS which isn't true.
When disabling PAT again ( no changes to BIOS ) kvm-intel works again here.

Is that an known problem ?

If you need more infos just let me know.

Gabriel

[1]  2.6.25-05096-gb1721d0-dirty
     with following patches :
     http://lkml.org/lkml/2008/4/26/37
     http://lkml.org/lkml/2008/4/26/24
     http://lkml.org/lkml/2008/4/25/107



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-26 13:40 ` Gabriel C
@ 2008-04-26 14:42   ` Ingo Molnar
  2008-04-26 15:37     ` Gabriel C
  0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2008-04-26 14:42 UTC (permalink / raw)
  To: Gabriel C
  Cc: Linus Torvalds, linux-kernel, Andrew Morton, H. Peter Anvin,
	Thomas Gleixner, Pallipadi, Venkatesh, Arjan van de Ven,
	Siddha, Suresh B, Avi Kivity


* Gabriel C <nix.or.die@googlemail.com> wrote:

> Hi Ingo,
> 
> When using an 64bit ( didn't got time to test 32bit now ) kernel[1] 
> with PAT enabled , kvm-intel does not work anymore.
> 
> When modprobing kvm-intel , kvm is saying VT extension is disable by 
> BIOS which isn't true. When disabling PAT again ( no changes to BIOS ) 
> kvm-intel works again here.
> 
> Is that an known problem ?

no, that side-effect was not known. Cc:-ed more folks.

> If you need more infos just let me know.
> 
> Gabriel
> 
> [1]  2.6.25-05096-gb1721d0-dirty
>      with following patches :
>      http://lkml.org/lkml/2008/4/26/37
>      http://lkml.org/lkml/2008/4/26/24
>      http://lkml.org/lkml/2008/4/25/107

thanks - that should be enough for now. We'll try to reproduce these 
problems.

A blind guess: maybe it's the CONFIG_NONPROMISC_DEVMEM somehow breaks 
Qemu. With the patch below you'd be able to disable NONPROMISC_DEVMEM 
without disabling PAT.

	Ingo

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4d350b5..4aa4180 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1049,9 +1049,9 @@ config MTRR
 	  See <file:Documentation/mtrr.txt> for more information.
 
 config X86_PAT
-	def_bool y
+	bool
 	prompt "x86 PAT support"
-	depends on MTRR && NONPROMISC_DEVMEM
+	depends on MTRR
 	help
 	  Use PAT attributes to setup page level cache control.
 

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-26 14:42   ` Ingo Molnar
@ 2008-04-26 15:37     ` Gabriel C
  2008-04-26 15:41       ` Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: Gabriel C @ 2008-04-26 15:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, linux-kernel, Andrew Morton, H. Peter Anvin,
	Thomas Gleixner, Pallipadi, Venkatesh, Arjan van de Ven,
	Siddha, Suresh B, Avi Kivity

Ingo Molnar wrote:
> * Gabriel C <nix.or.die@googlemail.com> wrote:
> 
>> Hi Ingo,
>>
>> When using an 64bit ( didn't got time to test 32bit now ) kernel[1] 
>> with PAT enabled , kvm-intel does not work anymore.
>>
>> When modprobing kvm-intel , kvm is saying VT extension is disable by 
>> BIOS which isn't true. When disabling PAT again ( no changes to BIOS ) 
>> kvm-intel works again here.
>>
>> Is that an known problem ?
> 
> no, that side-effect was not known. Cc:-ed more folks.

It seems like the update triggers some compiler bug.

Setting CC_OPTIMIZE_FOR_SIZE=n fixed that btw. 

I don't have the time right now but later on today I will compile 
gcc43_svn_branch and test again with CC_OPTIMIZE_FOR_SIZE=y and see what I get.

Ingo sorry for the noise.

Gabriel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-26 15:37     ` Gabriel C
@ 2008-04-26 15:41       ` Ingo Molnar
  2008-04-26 16:43         ` Linus Torvalds
  0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2008-04-26 15:41 UTC (permalink / raw)
  To: Gabriel C
  Cc: Linus Torvalds, linux-kernel, Andrew Morton, H. Peter Anvin,
	Thomas Gleixner, Pallipadi, Venkatesh, Arjan van de Ven,
	Siddha, Suresh B, Avi Kivity


* Gabriel C <nix.or.die@googlemail.com> wrote:

> Ingo Molnar wrote:
> > * Gabriel C <nix.or.die@googlemail.com> wrote:
> > 
> >> Hi Ingo,
> >>
> >> When using an 64bit ( didn't got time to test 32bit now ) kernel[1] 
> >> with PAT enabled , kvm-intel does not work anymore.
> >>
> >> When modprobing kvm-intel , kvm is saying VT extension is disable by 
> >> BIOS which isn't true. When disabling PAT again ( no changes to BIOS ) 
> >> kvm-intel works again here.
> >>
> >> Is that an known problem ?
> > 
> > no, that side-effect was not known. Cc:-ed more folks.
> 
> It seems like the update triggers some compiler bug.
> 
> Setting CC_OPTIMIZE_FOR_SIZE=n fixed that btw.
> 
> I don't have the time right now but later on today I will compile 
> gcc43_svn_branch and test again with CC_OPTIMIZE_FOR_SIZE=y and see 
> what I get.
> 
> Ingo sorry for the noise.

no problem and it's not noise even if it turns out to be a compiler 
problem - we very much want to know about them when they affect the 
kernel.

	Ingo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-26 15:41       ` Ingo Molnar
@ 2008-04-26 16:43         ` Linus Torvalds
  0 siblings, 0 replies; 15+ messages in thread
From: Linus Torvalds @ 2008-04-26 16:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Gabriel C, linux-kernel, Andrew Morton, H. Peter Anvin,
	Thomas Gleixner, Pallipadi, Venkatesh, Arjan van de Ven,
	Siddha, Suresh B, Avi Kivity



On Sat, 26 Apr 2008, Ingo Molnar wrote:
> * Gabriel C <nix.or.die@googlemail.com> wrote:
> >
> > It seems like the update triggers some compiler bug.
> > 
> > Setting CC_OPTIMIZE_FOR_SIZE=n fixed that btw.
> > 
> > I don't have the time right now but later on today I will compile 
> > gcc43_svn_branch and test again with CC_OPTIMIZE_FOR_SIZE=y and see 
> > what I get.
> > 
> > Ingo sorry for the noise.
> 
> no problem and it's not noise even if it turns out to be a compiler 
> problem - we very much want to know about them when they affect the 
> kernel.

Indeed. Often these kinds of compiler issues are also actually our bugs, 
where some code generation thing just happened to hide the fact that we 
did something wrong.

So Gabriel, it would be wondeful if you could try to pinpoint where the 
problem happens. That said, especially if you are using something like a 
self-compiled gcc from the current SVN tree, that obviously does make it a 
*lot* more likely that you are hitting a real compiler bug.

(But even then, if you can pinpoint it, I bet the gcc people would be 
really happy to hear about it.)

			Linus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [git pull] x86 PAT changes
  2008-04-26  8:56       ` Ingo Molnar
@ 2008-04-26 16:54         ` Pallipadi, Venkatesh
  2008-04-26 17:15           ` Linus Torvalds
  0 siblings, 1 reply; 15+ messages in thread
From: Pallipadi, Venkatesh @ 2008-04-26 16:54 UTC (permalink / raw)
  To: Ingo Molnar, Linus Torvalds
  Cc: H. Peter Anvin, linux-kernel, Andrew Morton, Thomas Gleixner

 

>-----Original Message-----
>From: linux-kernel-owner@vger.kernel.org 
>[mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Ingo Molnar
>Sent: Saturday, April 26, 2008 1:56 AM
>To: Linus Torvalds
>Cc: H. Peter Anvin; linux-kernel@vger.kernel.org; Andrew 
>Morton; Thomas Gleixner
>Subject: Re: [git pull] x86 PAT changes
>
>
>* Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Fri, 25 Apr 2008, H. Peter Anvin wrote:
>> > 
>> > The problem is that that can create cached/uncached aliases, which 
>> > can cause some processors to lock up (especially AMD is known to 
>> > have a lot of errata in this area.)
>> 
>> Umm.. I don't think you understand. Right now, NONPROMISC_DEVMEM 
>> doesn't just disable mmap() on /dev/mem, it disables totally regular 
>> reads and writes too. That seems pretty damn excessive.
>> 
>> If it was just mmap(), I don't think it would matter much. I don't 
>> think we traditionally even supported mmap() on real RAM 
>(because the 
>> page counting would get confused), and that actually got supported 
>> only thanks to VM changes that made it possible.
>> 
>> But read/write has always been supported, and shouldn't cause any 
>> cached/uncached aliases!
>
>You are right, there should be no architectural need to make PAT 
>dependent on nonpromisc-devmem, and thus the patch below 
>should be safe.
>

Agreed that NONPROMISC_DEVMEM is not really needed for read/write. But,
we will still need it for /dev/mem.

The problem with /dev/mem maps of RAM is situation like this:
1) drivers does vmalloc(), followed by set_memory_uc.
2) User does a /dev/mem map of that vmalloced physical address. User
will get a UC mapping for /dev/mem.
3) driver changes the memory to set_memory_wb and frees the memory.
4) user mapping for this address is still UC which will lead to
aliasing.

Read/write is ok, as they will just use __va for RAM to access and that
will always be consistent.

Thanks,
Venki

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [git pull] x86 PAT changes
  2008-04-26 16:54         ` Pallipadi, Venkatesh
@ 2008-04-26 17:15           ` Linus Torvalds
  2008-04-26 18:32             ` Venki Pallipadi
  0 siblings, 1 reply; 15+ messages in thread
From: Linus Torvalds @ 2008-04-26 17:15 UTC (permalink / raw)
  To: Pallipadi, Venkatesh
  Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, Andrew Morton,
	Thomas Gleixner



On Sat, 26 Apr 2008, Pallipadi, Venkatesh wrote:
> 
> Agreed that NONPROMISC_DEVMEM is not really needed for read/write. But,
> we will still need it for /dev/mem.

If so, just disable it unconditionally for mmap.

As mentioned, that's really just a return to original Linux /dev/mmap 
semantics: long ago (well, not _that_ long ago) we never used to be able 
to mmap() normal kernel memory, because the page counts would get screwed 
up on pages that weren't marked PG_Reserved.

So the traditional Linux behavior for mmap() on /dev/mem was always to 
only allow it on memory that either had no "struct page *" backing at all, 
or that was marked PG_Reserved (ie the ISA hole ay 640k-1M and things like 
the BIOS tables etc).

Going back to that doesn't sound horrible.

		Linus

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [git pull] x86 PAT changes
  2008-04-26 17:15           ` Linus Torvalds
@ 2008-04-26 18:32             ` Venki Pallipadi
  2008-04-26 19:07               ` [patch] x86, PAT: disable /dev/mem mmap RAM with PAT Ingo Molnar
  0 siblings, 1 reply; 15+ messages in thread
From: Venki Pallipadi @ 2008-04-26 18:32 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Pallipadi, Venkatesh, Ingo Molnar, H. Peter Anvin, linux-kernel,
	Andrew Morton, Thomas Gleixner

On Sat, Apr 26, 2008 at 10:15:33AM -0700, Linus Torvalds wrote:
> 
> 
> On Sat, 26 Apr 2008, Pallipadi, Venkatesh wrote:
> > 
> > Agreed that NONPROMISC_DEVMEM is not really needed for read/write. But,
> > we will still need it for /dev/mem.
> 
> If so, just disable it unconditionally for mmap.
> 
> As mentioned, that's really just a return to original Linux /dev/mmap 
> semantics: long ago (well, not _that_ long ago) we never used to be able 
> to mmap() normal kernel memory, because the page counts would get screwed 
> up on pages that weren't marked PG_Reserved.
> 
> So the traditional Linux behavior for mmap() on /dev/mem was always to 
> only allow it on memory that either had no "struct page *" backing at all, 
> or that was marked PG_Reserved (ie the ISA hole ay 640k-1M and things like 
> the BIOS tables etc).
> 
> Going back to that doesn't sound horrible.
> 

OK. Below is the quick to disable /dev/mem mmap of RAM with PAT.
This should go along with Ingo's patch that removes PAT dependency on
NONPROMISC_DEVMEM.  It makes things safer and eliminates aliasing.
Still somewhat unclean as the range_is_allowed is duplicated.
And also, just compile tested right now.


Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>

---
 arch/x86/mm/pat.c |   31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

Index: linux-2.6/arch/x86/mm/pat.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/pat.c	2008-04-26 09:34:31.000000000 -0700
+++ linux-2.6/arch/x86/mm/pat.c	2008-04-26 11:25:57.000000000 -0700
@@ -16,6 +16,7 @@
 #include <asm/msr.h>
 #include <asm/tlbflush.h>
 #include <asm/processor.h>
+#include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/pat.h>
 #include <asm/e820.h>
@@ -477,6 +478,33 @@ pgprot_t phys_mem_access_prot(struct fil
 	return vma_prot;
 }
 
+#ifdef CONFIG_NONPROMISC_DEVMEM
+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	u64 from = ((u64)pfn) << PAGE_SHIFT;
+	u64 to = from + size;
+	u64 cursor = from;
+
+	while (cursor < to) {
+		if (!devmem_is_allowed(pfn)) {
+			printk(KERN_INFO
+		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+				current->comm, from, to);
+			return 0;
+		}
+		cursor += PAGE_SIZE;
+		pfn++;
+	}
+	return 1;
+}
+#endif /* CONFIG_NONPROMISC_DEVMEM */
+
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t *vma_prot)
 {
@@ -485,6 +513,9 @@ int phys_mem_access_prot_allowed(struct 
 	unsigned long ret_flags;
 	int retval;
 
+	if (!range_is_allowed(pfn, size))
+		return 0;
+
 	if (file->f_flags & O_SYNC) {
 		flags = _PAGE_CACHE_UC;
 	}


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [patch] x86, PAT: disable /dev/mem mmap RAM with PAT
  2008-04-26 18:32             ` Venki Pallipadi
@ 2008-04-26 19:07               ` Ingo Molnar
  0 siblings, 0 replies; 15+ messages in thread
From: Ingo Molnar @ 2008-04-26 19:07 UTC (permalink / raw)
  To: Venki Pallipadi
  Cc: Linus Torvalds, H. Peter Anvin, linux-kernel, Andrew Morton,
	Thomas Gleixner


* Venki Pallipadi <venkatesh.pallipadi@intel.com> wrote:

> OK. Below is the quick to disable /dev/mem mmap of RAM with PAT. This 
> should go along with Ingo's patch that removes PAT dependency on 
> NONPROMISC_DEVMEM.  It makes things safer and eliminates aliasing. 
> Still somewhat unclean as the range_is_allowed is duplicated. And 
> also, just compile tested right now.

thanks, i've queued up the patch below. I'll do some testing and then 
send it to Linus.

	Ingo

--------------->
Subject: x86, PAT: disable /dev/mem mmap RAM with PAT
From: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Date: Sat, 26 Apr 2008 11:32:12 -0700

disable /dev/mem mmap of RAM with PAT. It makes things safer and
eliminates aliasing. A future improvement would be to avoid the
range_is_allowed duplication.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pat.c |   31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

Index: linux-x86.q/arch/x86/mm/pat.c
===================================================================
--- linux-x86.q.orig/arch/x86/mm/pat.c
+++ linux-x86.q/arch/x86/mm/pat.c
@@ -16,6 +16,7 @@
 #include <asm/msr.h>
 #include <asm/tlbflush.h>
 #include <asm/processor.h>
+#include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/pat.h>
 #include <asm/e820.h>
@@ -477,6 +478,33 @@ pgprot_t phys_mem_access_prot(struct fil
 	return vma_prot;
 }
 
+#ifdef CONFIG_NONPROMISC_DEVMEM
+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	u64 from = ((u64)pfn) << PAGE_SHIFT;
+	u64 to = from + size;
+	u64 cursor = from;
+
+	while (cursor < to) {
+		if (!devmem_is_allowed(pfn)) {
+			printk(KERN_INFO
+		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+				current->comm, from, to);
+			return 0;
+		}
+		cursor += PAGE_SIZE;
+		pfn++;
+	}
+	return 1;
+}
+#endif /* CONFIG_NONPROMISC_DEVMEM */
+
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t *vma_prot)
 {
@@ -485,6 +513,9 @@ int phys_mem_access_prot_allowed(struct 
 	unsigned long ret_flags;
 	int retval;
 
+	if (!range_is_allowed(pfn, size))
+		return 0;
+
 	if (file->f_flags & O_SYNC) {
 		flags = _PAGE_CACHE_UC;
 	}

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2008-04-26 19:07 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-24 22:56 [git pull] x86 PAT changes Ingo Molnar
2008-04-25 23:43 ` Linus Torvalds
2008-04-26  0:06   ` H. Peter Anvin
2008-04-26  1:12     ` Linus Torvalds
2008-04-26  8:56       ` Ingo Molnar
2008-04-26 16:54         ` Pallipadi, Venkatesh
2008-04-26 17:15           ` Linus Torvalds
2008-04-26 18:32             ` Venki Pallipadi
2008-04-26 19:07               ` [patch] x86, PAT: disable /dev/mem mmap RAM with PAT Ingo Molnar
2008-04-26  9:57   ` [git pull] x86 PAT changes Ingo Molnar
2008-04-26 13:40 ` Gabriel C
2008-04-26 14:42   ` Ingo Molnar
2008-04-26 15:37     ` Gabriel C
2008-04-26 15:41       ` Ingo Molnar
2008-04-26 16:43         ` Linus Torvalds

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox