* [PATCH] VM routine fixes
@ 2004-11-08 14:32 dhowells
2004-11-09 12:55 ` Christoph Hellwig
2004-11-11 22:31 ` Andrew Morton
0 siblings, 2 replies; 18+ messages in thread
From: dhowells @ 2004-11-08 14:32 UTC (permalink / raw)
To: torvalds, akpm, davidm; +Cc: linux-kernel, uclinux-dev
The attached patch fixes a number of problems in the VM routines:
(1) Some inline funcs don't compile if CONFIG_MMU is not set.
(2) swapper_pml4 needn't exist if CONFIG_MMU is not set.
(3) __free_pages_ok() doesn't counter set_page_refs() different behaviour if
CONFIG_MMU is not set.
(4) swsusp.c invokes TLB flushing functions without including the header file
that declares them.
Signed-Off-By: dhowells@redhat.com
---
diffstat mmu-fixes-2610rc1mm3.diff
include/linux/mm.h | 10 ++++++++++
init/Kconfig | 5 +++--
kernel/power/swsusp.c | 1 +
kernel/sysctl.c | 4 ++++
mm/Makefile | 4 ++--
mm/bootmem.c | 10 ++++++----
mm/internal.h | 13 +++++++++++++
mm/page_alloc.c | 13 +++++++++++--
mm/tiny-shmem.c | 2 ++
9 files changed, 52 insertions(+), 10 deletions(-)
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/include/linux/mm.h linux-2.6.10-rc1-mm3-frv/include/linux/mm.h
--- /warthog/kernels/linux-2.6.10-rc1-mm3/include/linux/mm.h 2004-11-05 13:15:50.000000000 +0000
+++ linux-2.6.10-rc1-mm3-frv/include/linux/mm.h 2004-11-05 17:44:09.120160519 +0000
@@ -37,6 +37,10 @@ extern int sysctl_legacy_va_layout;
#include <asm/processor.h>
#include <asm/atomic.h>
+#ifndef CONFIG_MMU
+#define swapper_pml4 NULL
+#endif
+
#ifndef MM_VM_SIZE
#define MM_VM_SIZE(mm) TASK_SIZE
#endif
@@ -640,6 +674,7 @@ extern void remove_shrinker(struct shrin
* inlining and the symmetry break with pte_alloc_map() that does all
* of this out-of-line.
*/
+#ifdef CONFIG_MMU
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
if (pgd_none(*pgd))
@@ -660,6 +695,7 @@ static inline pgd_t *pgd_alloc_k(struct
return __pgd_alloc(mm, pml4, address);
return pml4_pgd_offset_k(pml4, address);
}
+#endif
extern void free_area_init(unsigned long * zones_size);
extern void free_area_init_node(int nid, pg_data_t *pgdat,
@@ -682,12 +718,14 @@ struct vm_area_struct *vma_prio_tree_nex
for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \
(vma = vma_prio_tree_next(vma, iter)); )
+#ifdef CONFIG_MMU
static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
struct list_head *list)
{
vma->shared.vm_set.parent = NULL;
list_add_tail(&vma->shared.vm_set.list, list);
}
+#endif
/* mmap.c */
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
@@ -801,6 +839,7 @@ static inline void __vm_stat_account(str
}
#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_MMU
static inline void vm_stat_account(struct vm_area_struct *vma)
{
__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
@@ -812,6 +851,7 @@ static inline void vm_stat_unaccount(str
__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-vma_pages(vma));
}
+#endif
#ifndef CONFIG_DEBUG_PAGEALLOC
static inline void
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/init/Kconfig linux-2.6.10-rc1-mm3-frv/init/Kconfig
--- /warthog/kernels/linux-2.6.10-rc1-mm3/init/Kconfig 2004-11-05 13:15:51.000000000 +0000
+++ linux-2.6.10-rc1-mm3-frv/init/Kconfig 2004-11-05 14:13:04.481446957 +0000
@@ -327,8 +327,9 @@ config CC_OPTIMIZE_FOR_SIZE
If unsure, say N.
config SHMEM
- default y
- bool "Use full shmem filesystem" if EMBEDDED && MMU
+ bool "Use full shmem filesystem"
+ default y if EMBEDDED
+ depends on MMU
help
The shmem is an internal filesystem used to manage shared memory.
It is backed by swap and manages resource limits. It is also exported
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/kernel/power/swsusp.c linux-2.6.10-rc1-mm3-frv/kernel/power/swsusp.c
--- /warthog/kernels/linux-2.6.10-rc1-mm3/kernel/power/swsusp.c 2004-11-05 13:15:51.000000000 +0000
+++ linux-2.6.10-rc1-mm3-frv/kernel/power/swsusp.c 2004-11-05 14:13:04.000000000 +0000
@@ -67,6 +67,7 @@
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
#include <asm/io.h>
#include "power.h"
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/kernel/sysctl.c linux-2.6.10-rc1-mm3-frv/kernel/sysctl.c
--- /warthog/kernels/linux-2.6.10-rc1-mm3/kernel/sysctl.c 2004-11-05 13:15:51.000000000 +0000
+++ linux-2.6.10-rc1-mm3-frv/kernel/sysctl.c 2004-11-05 17:36:16.857528949 +0000
@@ -755,6 +755,7 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
+#ifdef CONFIG_MMU
{
.ctl_name = VM_MAX_MAP_COUNT,
.procname = "max_map_count",
@@ -763,6 +764,7 @@ static ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec
},
+#endif
{
.ctl_name = VM_LAPTOP_MODE,
.procname = "laptop_mode",
@@ -816,6 +818,7 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_jiffies,
},
#endif
+#ifdef CONFIG_MMU
{
.ctl_name = VM_HEAP_STACK_GAP,
.procname = "heap-stack-gap",
@@ -824,6 +827,7 @@ static ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#endif
{ .ctl_name = 0 }
};
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/mm/bootmem.c linux-2.6.10-rc1-mm3-frv/mm/bootmem.c
--- /warthog/kernels/linux-2.6.10-rc1-mm3/mm/bootmem.c 2004-11-05 13:15:52.000000000 +0000
+++ linux-2.6.10-rc1-mm3-frv/mm/bootmem.c 2004-11-05 14:13:04.617435471 +0000
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <asm/dma.h>
#include <asm/io.h>
+#include "internal.h"
/*
* Access to this subsystem has to be serialized externally. (this is
@@ -280,17 +281,18 @@ static unsigned long __init free_all_boo
for (i = 0; i < idx; ) {
unsigned long v = ~map[i / BITS_PER_LONG];
if (gofast && v == ~0UL) {
- int j;
+ int j, order;
count += BITS_PER_LONG;
__ClearPageReserved(page);
- set_page_count(page, 1);
+ order = ffs(BITS_PER_LONG) - 1;
+ set_page_refs(page, order);
for (j = 1; j < BITS_PER_LONG; j++) {
if (j + 16 < BITS_PER_LONG)
prefetchw(page + j + 16);
__ClearPageReserved(page + j);
}
- __free_pages(page, ffs(BITS_PER_LONG)-1);
+ __free_pages(page, order);
i += BITS_PER_LONG;
page += BITS_PER_LONG;
} else if (v) {
@@ -299,7 +301,7 @@ static unsigned long __init free_all_boo
if (v & m) {
count++;
__ClearPageReserved(page);
- set_page_count(page, 1);
+ set_page_refs(page, 0);
__free_page(page);
}
}
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/mm/internal.h linux-2.6.10-rc1-mm3-frv/mm/internal.h
--- /warthog/kernels/linux-2.6.10-rc1-mm3/mm/internal.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-rc1-mm3-frv/mm/internal.h 2004-11-05 14:13:04.621435134 +0000
@@ -0,0 +1,13 @@
+/* internal.h: mm/ internal definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* page_alloc.c */
+extern void set_page_refs(struct page *page, int order);
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/mm/Makefile linux-2.6.10-rc1-mm3-frv/mm/Makefile
--- /warthog/kernels/linux-2.6.10-rc1-mm3/mm/Makefile 2004-10-19 10:42:19.000000000 +0100
+++ linux-2.6.10-rc1-mm3-frv/mm/Makefile 2004-11-05 14:13:04.624434880 +0000
@@ -5,10 +5,10 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o prio_tree.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- page_alloc.o page-writeback.o pdflush.o prio_tree.o \
+ page_alloc.o page-writeback.o pdflush.o \
readahead.o slab.o swap.o truncate.o vmscan.o \
$(mmu-y)
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/mm/page_alloc.c linux-2.6.10-rc1-mm3-frv/mm/page_alloc.c
--- /warthog/kernels/linux-2.6.10-rc1-mm3/mm/page_alloc.c 2004-11-05 13:15:52.000000000 +0000
+++ linux-2.6.10-rc1-mm3-frv/mm/page_alloc.c 2004-11-05 14:13:04.654432347 +0000
@@ -35,6 +35,7 @@
#include <linux/nodemask.h>
#include <asm/tlbflush.h>
+#include "internal.h"
nodemask_t node_online_map = NODE_MASK_NONE;
nodemask_t node_possible_map = NODE_MASK_ALL;
@@ -326,6 +328,13 @@ void __free_pages_ok(struct page *page,
arch_free_page(page, order);
mod_page_state(pgfree, 1 << order);
+
+#ifndef CONFIG_MMU
+ if (order > 0)
+ for (i = 1 ; i < (1 << order) ; ++i)
+ __put_page(page + i);
+#endif
+
for (i = 0 ; i < (1 << order) ; ++i)
free_pages_check(__FUNCTION__, page + i);
list_add(&page->lru, &list);
@@ -366,7 +375,7 @@ expand(struct zone *zone, struct page *p
return page;
}
-static inline void set_page_refs(struct page *page, int order)
+void set_page_refs(struct page *page, int order)
{
#ifdef CONFIG_MMU
set_page_count(page, 1);
@@ -376,9 +385,10 @@ static inline void set_page_refs(struct
/*
* We need to reference all the pages for this order, otherwise if
* anyone accesses one of the pages with (get/put) it will be freed.
+ * - eg: access_process_vm()
*/
for (i = 0; i < (1 << order); i++)
- set_page_count(page+i, 1);
+ set_page_count(page + i, 1);
#endif /* CONFIG_MMU */
}
diff -uNrp /warthog/kernels/linux-2.6.10-rc1-mm3/mm/tiny-shmem.c linux-2.6.10-rc1-mm3-frv/mm/tiny-shmem.c
--- /warthog/kernels/linux-2.6.10-rc1-mm3/mm/tiny-shmem.c 2004-10-27 17:32:38.000000000 +0100
+++ linux-2.6.10-rc1-mm3-frv/mm/tiny-shmem.c 2004-11-05 14:13:05.000000000 +0000
@@ -112,7 +112,9 @@ int shmem_zero_setup(struct vm_area_stru
if (vma->vm_file)
fput(vma->vm_file);
vma->vm_file = file;
+#ifdef CONFIG_MMU
vma->vm_ops = &generic_file_vm_ops;
+#endif
return 0;
}
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-08 14:32 [PATCH] VM routine fixes dhowells
@ 2004-11-09 12:55 ` Christoph Hellwig
2004-11-09 13:53 ` David Howells
2004-11-11 22:31 ` Andrew Morton
1 sibling, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2004-11-09 12:55 UTC (permalink / raw)
To: dhowells; +Cc: torvalds, akpm, davidm, linux-kernel, uclinux-dev
> +++ linux-2.6.10-rc1-mm3-frv/include/linux/mm.h 2004-11-05 17:44:09.120160519 +0000
> @@ -37,6 +37,10 @@ extern int sysctl_legacy_va_layout;
> #include <asm/processor.h>
> #include <asm/atomic.h>
>
> +#ifndef CONFIG_MMU
> +#define swapper_pml4 NULL
> +#endif
> +
> #ifndef MM_VM_SIZE
> #define MM_VM_SIZE(mm) TASK_SIZE
> #endif
> @@ -640,6 +674,7 @@ extern void remove_shrinker(struct shrin
> * inlining and the symmetry break with pte_alloc_map() that does all
> * of this out-of-line.
> */
> +#ifdef CONFIG_MMU
> static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
> {
> if (pgd_none(*pgd))
> @@ -660,6 +695,7 @@ static inline pgd_t *pgd_alloc_k(struct
> return __pgd_alloc(mm, pml4, address);
> return pml4_pgd_offset_k(pml4, address);
> }
> +#endif
Please don't stick CONFIG_MMU all over the place but keep them in as small
as possible blocks.
> +#ifdef CONFIG_MMU
> {
> .ctl_name = VM_MAX_MAP_COUNT,
> .procname = "max_map_count",
> @@ -763,6 +764,7 @@ static ctl_table vm_table[] = {
> .mode = 0644,
> .proc_handler = &proc_dointvec
> },
> +#endif
As I told you before please move registration of MMU-only sysctls
to a MMU-only file in mm/
> +extern void set_page_refs(struct page *page, int order);
this should probably be an inline.
> mod_page_state(pgfree, 1 << order);
> +
> +#ifndef CONFIG_MMU
> + if (order > 0)
> + for (i = 1 ; i < (1 << order) ; ++i)
> + __put_page(page + i);
> +#endif
> +
this is nasty. The right thing would probably to swich !MMU arches
to use the compount-page mechanism from the hugetlb code for this.
> --- /warthog/kernels/linux-2.6.10-rc1-mm3/mm/tiny-shmem.c 2004-10-27 17:32:38.000000000 +0100
> +++ linux-2.6.10-rc1-mm3-frv/mm/tiny-shmem.c 2004-11-05 14:13:05.000000000 +0000
> @@ -112,7 +112,9 @@ int shmem_zero_setup(struct vm_area_stru
> if (vma->vm_file)
> fput(vma->vm_file);
> vma->vm_file = file;
> +#ifdef CONFIG_MMU
> vma->vm_ops = &generic_file_vm_ops;
> +#endif
And you ocmpletely ignored the previous comment here aswell.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-09 12:55 ` Christoph Hellwig
@ 2004-11-09 13:53 ` David Howells
2004-11-09 14:01 ` Christoph Hellwig
0 siblings, 1 reply; 18+ messages in thread
From: David Howells @ 2004-11-09 13:53 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: torvalds, akpm, davidm, linux-kernel, uclinux-dev
> Please don't stick CONFIG_MMU all over the place but keep them in as small
> as possible blocks.
You seem to have changed your mind. Last I heard from you wanted them in as
few large blocks as possible. Now you want them in many small blocks. If you
want it changing, please feel free to supply me with a patch.
> As I told you before please move registration of MMU-only sysctls
> to a MMU-only file in mm/
No. These belong in the vm_table. It doesn't seem especially straightforward
to do what you want. If you want it doing your way, then feel free to send
Andrew a patch.
> > +extern void set_page_refs(struct page *page, int order);
>
> this should probably be an inline.
Probably. I'll deal with that after Andrew/Linus take my patches, if they take
my patches.
> this is nasty. The right thing would probably to swich !MMU arches
> to use the compount-page mechanism from the hugetlb code for this.
Supply me with a patch and I'll test it. I don't know how the compound-page
stuff works, but it's quite possibly the wrong way to do it. There is no MMU
available, so you can't generate adjacency that way.
> And you ocmpletely ignored the previous comment here aswell.
No, I didn't; you're wrong.
Feel free to supply a patch to change it to what you think is correct. I'll
test it for you.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-09 13:53 ` David Howells
@ 2004-11-09 14:01 ` Christoph Hellwig
2004-11-10 13:37 ` David Howells
0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2004-11-09 14:01 UTC (permalink / raw)
To: David Howells
Cc: Christoph Hellwig, torvalds, akpm, davidm, linux-kernel,
uclinux-dev
On Tue, Nov 09, 2004 at 01:53:06PM +0000, David Howells wrote:
>
> > Please don't stick CONFIG_MMU all over the place but keep them in as small
> > as possible blocks.
>
> You seem to have changed your mind. Last I heard from you wanted them in as
> few large blocks as possible. Now you want them in many small blocks. If you
> want it changing, please feel free to supply me with a patch.
sorry, typo. Should have read as large as possible - small doesn't make
sense with the first half of the sentence anyway.
> > As I told you before please move registration of MMU-only sysctls
> > to a MMU-only file in mm/
>
> No. These belong in the vm_table. It doesn't seem especially straightforward
> to do what you want. If you want it doing your way, then feel free to send
> Andrew a patch.
It is. You can register multiple tables for the same hierachy.
> > this is nasty. The right thing would probably to swich !MMU arches
> > to use the compount-page mechanism from the hugetlb code for this.
>
> Supply me with a patch and I'll test it. I don't know how the compound-page
> stuff works, but it's quite possibly the wrong way to do it. There is no MMU
> available, so you can't generate adjacency that way.
so look at the code. It's not about the MMU at all, it's about handling
the Linux page structures. What you're doing is a big mess and should
not be merged. So if you want to fix the issue do it properly.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-09 14:01 ` Christoph Hellwig
@ 2004-11-10 13:37 ` David Howells
2004-11-10 19:01 ` Andrew Morton
0 siblings, 1 reply; 18+ messages in thread
From: David Howells @ 2004-11-10 13:37 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: torvalds, akpm, davidm, linux-kernel, uclinux-dev
Christoph Hellwig <hch@infradead.org> wrote:
> > > this is nasty. The right thing would probably to swich !MMU arches
> > > to use the compount-page mechanism from the hugetlb code for this.
> >
> > Supply me with a patch and I'll test it. I don't know how the
> > compound-page stuff works, but it's quite possibly the wrong way to do
> > it. There is no MMU available, so you can't generate adjacency that way.
>
> so look at the code. It's not about the MMU at all, it's about handling
> the Linux page structures. What you're doing is a big mess and should
> not be merged. So if you want to fix the issue do it properly.
It's not a big mess, and nor is it of my making; I'm just trying to make it
work. It's two tiny sections in page_alloc.c that have #ifdef variations
conditional on CONFIG_MMU. That's it. Full stop. Furthermore, it's not really
a mess at all. It's the least variation required to make the page allocator
also work in !MMU situations.
It's also a lot more efficient than the compound page stuff - though I'll
grant it won't necessarily catch the same errors - for the following reasons:
(1) get_page() and put_page() acquire extra conditional expressions.
(3) put_page() is out of line, not inline. This probably makes it less
efficient, especially given register clobberage.
(2) put_page() potentially has to access three struct pages per call, not
just one.
(4) prep_compound_page() and destroy_compount_page() are much bigger and more
complicated than the !MMU variations to which you're objecting.
I hope you also noticed that these are themselves #ifdef'd...
Compound pages seem to be in some way tied to the TLB entry coverage sizes
available (for hugetlb), so it's not obvious that it's permitted to have
compound pages not of these sizes, and as I need to allocate arbitrary
sizes...
If I am correct about this, then the !MMU problem would still exist - just
with adjacent sets of compound pages rather than adjacent sets of pages.
Compound pages might be nice, but they're overkill.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-10 13:37 ` David Howells
@ 2004-11-10 19:01 ` Andrew Morton
2004-11-11 1:17 ` David Woodhouse
0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2004-11-10 19:01 UTC (permalink / raw)
To: David Howells; +Cc: hch, torvalds, davidm, linux-kernel, uclinux-dev
David Howells <dhowells@redhat.com> wrote:
>
> Compound pages seem to be in some way tied to the TLB entry coverage sizes
> available (for hugetlb), so it's not obvious that it's permitted to have
> compound pages not of these sizes, and as I need to allocate arbitrary
> sizes...
We've considered enabling compound pages permanently. We thought sparc64
might want that, and it simplifies coverage testing. But the
conditionality has been left in for now as a microoptimisation.
> If I am correct about this, then the !MMU problem would still exist - just
> with adjacent sets of compound pages rather than adjacent sets of pages.
Why _does_ !CONFIG_MMU futz around with page counts in such weird ways
anyway? Why does it have requirements for higher-order pages which differ
from !CONFIG_MMU?
If someone could explain the reasoning behind the current code, and the FRV
enhancements then perhaps we could work something out.
> Compound pages might be nice, but they're overkill.
I don't expect they have significant performance overhead, because they'll
only add cycles for higher-order pages. They're rare, and are already
rather inefficient.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-10 19:01 ` Andrew Morton
@ 2004-11-11 1:17 ` David Woodhouse
2004-11-11 2:26 ` Andrew Morton
0 siblings, 1 reply; 18+ messages in thread
From: David Woodhouse @ 2004-11-11 1:17 UTC (permalink / raw)
To: Andrew Morton; +Cc: dhowells, hch, torvalds, davidm, linux-kernel, uclinux-dev
On Wed, 2004-11-10 at 11:01 -0800, Andrew Morton wrote:
> Why _does_ !CONFIG_MMU futz around with page counts in such weird ways
> anyway? Why does it have requirements for higher-order pages which differ
> from !CONFIG_MMU?
Because in the absence of an MMU, an mmap of a large region (like an
executable) has to be satisfied by a large enough allocation followed by
a read.
> If someone could explain the reasoning behind the current code, and the FRV
> enhancements then perhaps we could work something out.
I think these parts aren't FRV-specific; they're the fixes required to
do proper shared readable mmap with !CONFIG_MMU. That was a prerequisite
for the ELF-FDPIC executable format, which allows real shared libraries
on uClinux.
--
dwmw2
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-11 1:17 ` David Woodhouse
@ 2004-11-11 2:26 ` Andrew Morton
2004-11-11 11:29 ` David Howells
0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2004-11-11 2:26 UTC (permalink / raw)
To: David Woodhouse
Cc: dhowells, hch, torvalds, davidm, linux-kernel, uclinux-dev,
Hiroyuki KAMEZAWA
David Woodhouse <dwmw2@infradead.org> wrote:
>
> On Wed, 2004-11-10 at 11:01 -0800, Andrew Morton wrote:
> > Why _does_ !CONFIG_MMU futz around with page counts in such weird ways
> > anyway? Why does it have requirements for higher-order pages which differ
> > from !CONFIG_MMU?
>
> Because in the absence of an MMU, an mmap of a large region (like an
> executable) has to be satisfied by a large enough allocation followed by
> a read.
That's currently implemented via this:
/*
* We need to reference all the pages for this order, otherwise if
* anyone accesses one of the pages with (get/put) it will be freed.
*/
for (i = 0; i < (1 << order); i++)
set_page_count(page+i, 1);
I assume the CONFIG_MMU logic assumes that it's legal to physically free a
single page from inside the middle of a higher-order page. I wonder if the
no-buddy-bitmap patches allow that? And if they've been tested with that?
See, if we enable the compound page logic if !CONFIG_MMU then all this
stuff just goes away and the page refcounting is controlled purely by the
head page. A get_page() or a put_page() against any of the constituent
pages will manipulate the head page's refcount.
> > If someone could explain the reasoning behind the current code, and the FRV
> > enhancements then perhaps we could work something out.
>
> I think these parts aren't FRV-specific; they're the fixes required to
> do proper shared readable mmap with !CONFIG_MMU. That was a prerequisite
> for the ELF-FDPIC executable format, which allows real shared libraries
> on uClinux.
>
OK.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-11 2:26 ` Andrew Morton
@ 2004-11-11 11:29 ` David Howells
2004-11-11 11:43 ` Andrew Morton
0 siblings, 1 reply; 18+ messages in thread
From: David Howells @ 2004-11-11 11:29 UTC (permalink / raw)
To: Andrew Morton
Cc: David Woodhouse, hch, torvalds, davidm, linux-kernel, uclinux-dev,
Hiroyuki KAMEZAWA
> > On Wed, 2004-11-10 at 11:01 -0800, Andrew Morton wrote:
> > > Why _does_ !CONFIG_MMU futz around with page counts in such weird ways
> > > anyway? Why does it have requirements for higher-order pages which
> > > differ from !CONFIG_MMU?
The problem is ptrace() and various /proc/<pid/ files; or more properly, the
problem is access_process_vm() and suchlike.
When one process goes rooting around in another process's memory map, it
increments the refcount on the page it is looking at to stop it going away.
The problem on uClinux is that a process can have multipage chunks mapped into
userspace, and access_process_vm() can look at a page in the middle of such a
span.
Without this change, bad_page() is called when access_process_vm() calls a
put_page() on a page in the middle because its refcount would say it's no
longer in use and then put_page() would then attempt to free the page.
access_process_vm() guards against the target page going away due to the
target process exiting at an inconvenient moment by pinning the target mm and
holding its semaphore.
> I assume the CONFIG_MMU logic assumes that it's legal to physically free a
> single page from inside the middle of a higher-order page.
No, it isn't. munmap() is prohibited from releasing anything other than a
complete mmap() on uClinux.
> See, if we enable the compound page logic if !CONFIG_MMU then all this
> stuff just goes away and the page refcounting is controlled purely by the
> head page. A get_page() or a put_page() against any of the constituent
> pages will manipulate the head page's refcount.
That's true to a point, and probably a reasonable idea, assuming that compound
pages aren't limited in size to TLB-specific values. If they are, then it
makes no real difference.
> > > If someone could explain the reasoning behind the current code, and the
> > > FRV enhancements then perhaps we could work something out.
> >
> > I think these parts aren't FRV-specific;
They aren't FRV specific. They were uClinux enhancements before I got my hands
on it. They are even in your -mm3 kernel. All I did was to make the freeing
case work properly.
> > they're the fixes required to do proper shared readable mmap with
> > !CONFIG_MMU.
That's not 100% true. They're required to do multipage mmap with !MMU. As I've
said they predate my involvement with the nommu stuff in 2.6 and 2.4 both.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-11 11:29 ` David Howells
@ 2004-11-11 11:43 ` Andrew Morton
2004-11-11 12:03 ` David Howells
0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2004-11-11 11:43 UTC (permalink / raw)
To: David Howells
Cc: dwmw2, hch, torvalds, davidm, linux-kernel, uclinux-dev,
kamezawa.hiroyu
David Howells <dhowells@redhat.com> wrote:
>
>
> > > On Wed, 2004-11-10 at 11:01 -0800, Andrew Morton wrote:
> > > > Why _does_ !CONFIG_MMU futz around with page counts in such weird ways
> > > > anyway? Why does it have requirements for higher-order pages which
> > > > differ from !CONFIG_MMU?
>
> The problem is ptrace() and various /proc/<pid/ files; or more properly, the
> problem is access_process_vm() and suchlike.
>
> When one process goes rooting around in another process's memory map, it
> increments the refcount on the page it is looking at to stop it going away.
>
> The problem on uClinux is that a process can have multipage chunks mapped into
> userspace, and access_process_vm() can look at a page in the middle of such a
> span.
>
> Without this change, bad_page() is called when access_process_vm() calls a
> put_page() on a page in the middle because its refcount would say it's no
> longer in use and then put_page() would then attempt to free the page.
>
> access_process_vm() guards against the target page going away due to the
> target process exiting at an inconvenient moment by pinning the target mm and
> holding its semaphore.
Compound pages will fix all that up. You take a ref on any of the subpages
and that will pin the entire higher-order page.
> > I assume the CONFIG_MMU logic assumes that it's legal to physically free a
> > single page from inside the middle of a higher-order page.
>
> No, it isn't. munmap() is prohibited from releasing anything other than a
> complete mmap() on uClinux.
hrm. I'd have thought that such a restriction would be unnecessary if we
get the page refcounting done right. With, say, compound pages!
Maybe the restriction is there for other reasons.
> > See, if we enable the compound page logic if !CONFIG_MMU then all this
> > stuff just goes away and the page refcounting is controlled purely by the
> > head page. A get_page() or a put_page() against any of the constituent
> > pages will manipulate the head page's refcount.
>
> That's true to a point, and probably a reasonable idea, assuming that compound
> pages aren't limited in size to TLB-specific values. If they are, then it
> makes no real difference.
Compound pages are just a way of handling refcounting of a higher-order
page. Nothing to do with TLBs at all.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-11 11:43 ` Andrew Morton
@ 2004-11-11 12:03 ` David Howells
0 siblings, 0 replies; 18+ messages in thread
From: David Howells @ 2004-11-11 12:03 UTC (permalink / raw)
To: Andrew Morton
Cc: dwmw2, hch, torvalds, davidm, linux-kernel, uclinux-dev,
kamezawa.hiroyu
> > No, it isn't. munmap() is prohibited from releasing anything other than a
> > complete mmap() on uClinux.
>
> hrm. I'd have thought that such a restriction would be unnecessary if we
> get the page refcounting done right. With, say, compound pages!
I don't think you want to go there. Don't forget in a lot of cases you're
dealing with shared objects, and so you can't just go punching holes in the
middle thereof.
> Compound pages are just a way of handling refcounting of a higher-order
> page. Nothing to do with TLBs at all.
In that case, I'll look at making compound pages mandatory if you wish. I'd
rather avoid doing so because they incur additional overheads, and it'll be on
MMU and !MMU both, but it does make multi-order page handling appear more
robust.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-08 14:32 [PATCH] VM routine fixes dhowells
2004-11-09 12:55 ` Christoph Hellwig
@ 2004-11-11 22:31 ` Andrew Morton
2004-11-12 10:33 ` David Howells
1 sibling, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2004-11-11 22:31 UTC (permalink / raw)
To: dhowells; +Cc: torvalds, davidm, linux-kernel, uclinux-dev
dhowells@redhat.com wrote:
>
> The attached patch fixes a number of problems in the VM routines:
>
> (1) Some inline funcs don't compile if CONFIG_MMU is not set.
>
> (2) swapper_pml4 needn't exist if CONFIG_MMU is not set.
>
> (3) __free_pages_ok() doesn't counter set_page_refs() different behaviour if
> CONFIG_MMU is not set.
>
> (4) swsusp.c invokes TLB flushing functions without including the header file
> that declares them.
>
I spy a secret, uncommented, unchangelogged hunk:
> @@ -112,7 +112,9 @@ int shmem_zero_setup(struct vm_area_stru
> if (vma->vm_file)
> fput(vma->vm_file);
> vma->vm_file = file;
> +#ifdef CONFIG_MMU
> vma->vm_ops = &generic_file_vm_ops;
> +#endif
> return 0;
> }
>
What's happening there?
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-11 22:31 ` Andrew Morton
@ 2004-11-12 10:33 ` David Howells
2004-11-12 10:38 ` Andrew Morton
0 siblings, 1 reply; 18+ messages in thread
From: David Howells @ 2004-11-12 10:33 UTC (permalink / raw)
To: Andrew Morton; +Cc: torvalds, davidm, linux-kernel, uclinux-dev
> I spy a secret, uncommented, unchangelogged hunk:
Hardly secret. After all, I broadcast it to two mailing lists.
> > +#ifdef CONFIG_MMU
> > vma->vm_ops = &generic_file_vm_ops;
> > +#endif
> ...
> What's happening there?
The vm_area_struct doesn't have an ops member when !MMU.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-12 10:33 ` David Howells
@ 2004-11-12 10:38 ` Andrew Morton
2004-11-12 11:05 ` David Howells
0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2004-11-12 10:38 UTC (permalink / raw)
To: David Howells; +Cc: torvalds, davidm, linux-kernel, uclinux-dev
David Howells <dhowells@redhat.com> wrote:
>
> > > +#ifdef CONFIG_MMU
> > > vma->vm_ops = &generic_file_vm_ops;
> > > +#endif
> > ...
> > What's happening there?
>
> The vm_area_struct doesn't have an ops member when !MMU.
What is the reason for this change?
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-12 10:38 ` Andrew Morton
@ 2004-11-12 11:05 ` David Howells
2004-11-14 5:07 ` Linus Torvalds
0 siblings, 1 reply; 18+ messages in thread
From: David Howells @ 2004-11-12 11:05 UTC (permalink / raw)
To: Andrew Morton; +Cc: torvalds, davidm, linux-kernel, uclinux-dev
> > The vm_area_struct doesn't have an ops member when !MMU.
>
> What is the reason for this change?
vm_area_struct not having an ops member? There's no real need for ops on
uClinux since almost all of the ops are irrelevant. The only one that could be
relevant is vmops->close(). I suppose it might make sense to have this. It
would indicate to a chardev for instance that the last mapping upon it has
been gone, but then fops->release() is probably more appropriate still.
vmops can be added back in for the !MMU case if it's deemed appropriate.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-12 11:05 ` David Howells
@ 2004-11-14 5:07 ` Linus Torvalds
2004-11-15 13:14 ` David Howells
0 siblings, 1 reply; 18+ messages in thread
From: Linus Torvalds @ 2004-11-14 5:07 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, davidm, linux-kernel, uclinux-dev
On Fri, 12 Nov 2004, David Howells wrote:
>
> vm_area_struct not having an ops member? There's no real need for ops on
> uClinux since almost all of the ops are irrelevant.
I don't think that is a valid argument.
If uClinux wants to be a different source-base, then go wild. But if you
want to integrate into the standard kernel, there are other priorities.
One of them is that it has to integrate cleanly. And that means that we
don't do micro-optimizations that make the non-MMU case affect mainline
code unless there is a damn good reason.
Linus
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-14 5:07 ` Linus Torvalds
@ 2004-11-15 13:14 ` David Howells
2004-11-15 15:35 ` Linus Torvalds
0 siblings, 1 reply; 18+ messages in thread
From: David Howells @ 2004-11-15 13:14 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Andrew Morton, davidm, linux-kernel, uclinux-dev
> > vm_area_struct not having an ops member? There's no real need for ops on
> > uClinux since almost all of the ops are irrelevant.
>
> I don't think that is a valid argument.
>
> If uClinux wants to be a different source-base, then go wild. But if you
> want to integrate into the standard kernel, there are other priorities.
> One of them is that it has to integrate cleanly. And that means that we
> don't do micro-optimizations that make the non-MMU case affect mainline
> code unless there is a damn good reason.
So not having an MMU, page tables or PTEs or any requirement for operations
that act upon them is not enough?
Do you then want me to use the MMU version of struct vm_area_struct for both
MMU and !MMU? It can be done, almost. I'll need to add one member variable (a
refcount).
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] VM routine fixes
2004-11-15 13:14 ` David Howells
@ 2004-11-15 15:35 ` Linus Torvalds
0 siblings, 0 replies; 18+ messages in thread
From: Linus Torvalds @ 2004-11-15 15:35 UTC (permalink / raw)
To: David Howells; +Cc: Andrew Morton, davidm, linux-kernel, uclinux-dev
On Mon, 15 Nov 2004, David Howells wrote:
>
> So not having an MMU, page tables or PTEs or any requirement for operations
> that act upon them is not enough?
No. It's a matter of abstraction. If you can _abstract_ the thing away,
that's fine. I don't want more #ifdef's in source code, but you can have a
totally different file that doesnt' do the things that aren't appropriate
for non-MMU.
Yes, we've already got #ifdef's in code, but the point is that we don't
add them unless there is serious _need_. And even then it's a sign of
trouble. In this case, the sign of trouble is bigger than the need.
uClinux might as well have a dummy "struct vm_operations", if only to make
the damn thing look more like real Linux.
Linus
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2004-11-15 15:36 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-11-08 14:32 [PATCH] VM routine fixes dhowells
2004-11-09 12:55 ` Christoph Hellwig
2004-11-09 13:53 ` David Howells
2004-11-09 14:01 ` Christoph Hellwig
2004-11-10 13:37 ` David Howells
2004-11-10 19:01 ` Andrew Morton
2004-11-11 1:17 ` David Woodhouse
2004-11-11 2:26 ` Andrew Morton
2004-11-11 11:29 ` David Howells
2004-11-11 11:43 ` Andrew Morton
2004-11-11 12:03 ` David Howells
2004-11-11 22:31 ` Andrew Morton
2004-11-12 10:33 ` David Howells
2004-11-12 10:38 ` Andrew Morton
2004-11-12 11:05 ` David Howells
2004-11-14 5:07 ` Linus Torvalds
2004-11-15 13:14 ` David Howells
2004-11-15 15:35 ` Linus Torvalds
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).