From: Christoph Hellwig <hch@lst.de>
To: Andrew Morton <akpm@zip.com.au>
Cc: linux-mm@kvack.org
Date: Mon, 2 Sep 2002 21:04:43 +0200 [thread overview]
Message-ID: <20020902210443.A32010@lst.de> (raw)
In-Reply-To: <3D73B7F1.2EB3131E@zip.com.au>; from akpm@zip.com.au on Mon, Sep 02, 2002 at 12:11:45PM -0700
On Mon, Sep 02, 2002 at 12:11:45PM -0700, Andrew Morton wrote:
> Christoph Hellwig wrote:
> >
> > On Mon, Sep 02, 2002 at 11:50:33AM -0700, Andrew Morton wrote:
> > > Christoph Hellwig wrote:
> > > >
> > > > This patch was done after Linus requested it when I intended to split
> > > > madvise out of filemap.c. We extend splitvma() in mmap.c to take
> > > > another argument that specifies whether to split above or below the
> > > > address given, and thus can use it in those functions, cleaning them up
> > > > a lot and removing most of their code.
> > > >
> > >
> > > This description seems to have leaked from a different patch.
> > >
> > > Your patch purely shuffles code about, yes?
> >
> > No, it makes madvise/mlock/mprotect use split_vma (which evolved from
> > splitvma). There is no change in behaviour (verified by ltp testruns),
> > but the implementation is very different, and lots of code is gone.
>
> did you send the right patch?
>
> mnm:/usr/src/25> grep split patches/madvise-move.patch
> - * We can potentially split a vm area into separate
> + * We can potentially split a vm area into separate
>
> mnm:/usr/src/25> diffstat patches/madvise-move.patch
> Makefile | 2
> filemap.c | 332 ------------------------------------------------------------
> madvise.c | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 342 insertions(+), 332 deletions(-)
Sorry, that was the first patch I sent to Linus, before I made the changes
I explained.
Here's the right one:
include/linux/mm.h | 7 -
mm/Makefile | 2
mm/filemap.c | 332 -----------------------------------------------------
mm/madvise.c | 238 +++++++++++++++++++++++++++++++++++++
mm/mlock.c | 158 ++++---------------------
mm/mmap.c | 37 +++--
mm/mprotect.c | 218 +++++++++++-----------------------
7 files changed, 365 insertions, 627 deletions
--- 1.70/include/linux/mm.h Thu Aug 15 21:55:18 2002
+++ edited/include/linux/mm.h Sun Aug 18 16:14:27 2002
@@ -483,6 +483,7 @@
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
+extern struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr);
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
@@ -495,11 +496,11 @@
return vma;
}
-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
+extern int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+ unsigned long addr, int new_below);
extern struct page * vmalloc_to_page(void *addr);
extern unsigned long get_page_cache_size(void);
#endif /* __KERNEL__ */
-
-#endif
+#endif /* _LINUX_MM_H */
--- 1.12/mm/Makefile Tue Jul 16 23:46:26 2002
+++ edited/mm/Makefile Sun Aug 18 11:25:20 2002
@@ -16,6 +16,6 @@
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
- pdflush.o page-writeback.o rmap.o
+ pdflush.o page-writeback.o rmap.o madvise.o
include $(TOPDIR)/Rules.make
--- 1.127/mm/filemap.c Thu Aug 15 14:24:40 2002
+++ edited/mm/filemap.c Sun Aug 18 11:25:20 2002
@@ -1376,337 +1376,7 @@
return 0;
}
-static inline void setup_read_behavior(struct vm_area_struct * vma,
- int behavior)
-{
- VM_ClearReadHint(vma);
- switch(behavior) {
- case MADV_SEQUENTIAL:
- vma->vm_flags |= VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- vma->vm_flags |= VM_RAND_READ;
- break;
- default:
- break;
- }
- return;
-}
-
-static long madvise_fixup_start(struct vm_area_struct * vma,
- unsigned long end, int behavior)
-{
- struct vm_area_struct * n;
- struct mm_struct * mm = vma->vm_mm;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_end = end;
- setup_read_behavior(n, behavior);
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&mm->page_table_lock);
- vma->vm_start = end;
- __insert_vm_struct(mm, n);
- spin_unlock(&mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-static long madvise_fixup_end(struct vm_area_struct * vma,
- unsigned long start, int behavior)
-{
- struct vm_area_struct * n;
- struct mm_struct * mm = vma->vm_mm;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_start = start;
- n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
- setup_read_behavior(n, behavior);
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- lock_vma_mappings(vma);
- spin_lock(&mm->page_table_lock);
- vma->vm_end = start;
- __insert_vm_struct(mm, n);
- spin_unlock(&mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-static long madvise_fixup_middle(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, int behavior)
-{
- struct vm_area_struct * left, * right;
- struct mm_struct * mm = vma->vm_mm;
-
- left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!left)
- return -EAGAIN;
- right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!right) {
- kmem_cache_free(vm_area_cachep, left);
- return -EAGAIN;
- }
- *left = *vma;
- *right = *vma;
- left->vm_end = start;
- right->vm_start = end;
- right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
- left->vm_raend = 0;
- right->vm_raend = 0;
- if (vma->vm_file)
- atomic_add(2, &vma->vm_file->f_count);
-
- if (vma->vm_ops && vma->vm_ops->open) {
- vma->vm_ops->open(left);
- vma->vm_ops->open(right);
- }
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
- vma->vm_raend = 0;
- lock_vma_mappings(vma);
- spin_lock(&mm->page_table_lock);
- vma->vm_start = start;
- vma->vm_end = end;
- setup_read_behavior(vma, behavior);
- __insert_vm_struct(mm, left);
- __insert_vm_struct(mm, right);
- spin_unlock(&mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-/*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
- */
-static long madvise_behavior(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, int behavior)
-{
- int error = 0;
-
- /* This caps the number of vma's this process can own */
- if (vma->vm_mm->map_count > MAX_MAP_COUNT)
- return -ENOMEM;
-
- if (start == vma->vm_start) {
- if (end == vma->vm_end) {
- setup_read_behavior(vma, behavior);
- vma->vm_raend = 0;
- } else
- error = madvise_fixup_start(vma, end, behavior);
- } else {
- if (end == vma->vm_end)
- error = madvise_fixup_end(vma, start, behavior);
- else
- error = madvise_fixup_middle(vma, start, end, behavior);
- }
-
- return error;
-}
-
-/*
- * Schedule all required I/O operations, then run the disk queue
- * to make sure they are started. Do not wait for completion.
- */
-static long madvise_willneed(struct vm_area_struct * vma,
- unsigned long start, unsigned long end)
-{
- long error = -EBADF;
- struct file * file;
- unsigned long size, rlim_rss;
-
- /* Doesn't work if there's no mapped file. */
- if (!vma->vm_file)
- return error;
- file = vma->vm_file;
- size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
-
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-
- /* Make sure this doesn't exceed the process's max rss. */
- error = -EIO;
- rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
- LONG_MAX; /* default: see resource.h */
- if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
- return error;
-
- do_page_cache_readahead(file, start, end - start);
- return 0;
-}
-
-/*
- * Application no longer needs these pages. If the pages are dirty,
- * it's OK to just throw them away. The app will be more careful about
- * data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for refill_inactive to actually free
- * these pages later if no one else has touched them in the meantime,
- * although we could add these pages to a global reuse list for
- * refill_inactive to pick up before reclaiming other pages.
- *
- * NB: This interface discards data rather than pushes it out to swap,
- * as some implementations do. This has performance implications for
- * applications like large transactional databases which want to discard
- * pages in anonymous maps after committing to backing store the data
- * that was kept in them. There is no reason to write this data out to
- * the swap area if the application is discarding it.
- *
- * An interface that causes the system to free clean pages and flush
- * dirty pages is already available as msync(MS_INVALIDATE).
- */
-static long madvise_dontneed(struct vm_area_struct * vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_flags & VM_LOCKED)
- return -EINVAL;
-
- zap_page_range(vma, start, end - start);
- return 0;
-}
-
-static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
-{
- long error = -EBADF;
-
- switch (behavior) {
- case MADV_NORMAL:
- case MADV_SEQUENTIAL:
- case MADV_RANDOM:
- error = madvise_behavior(vma, start, end, behavior);
- break;
-
- case MADV_WILLNEED:
- error = madvise_willneed(vma, start, end);
- break;
-
- case MADV_DONTNEED:
- error = madvise_dontneed(vma, start, end);
- break;
-
- default:
- error = -EINVAL;
- break;
- }
-
- return error;
-}
-
-/*
- * The madvise(2) system call.
- *
- * Applications can use madvise() to advise the kernel how it should
- * handle paging I/O in this VM area. The idea is to help the kernel
- * use appropriate read-ahead and caching techniques. The information
- * provided is advisory only, and can be safely disregarded by the
- * kernel without affecting the correct operation of the application.
- *
- * behavior values:
- * MADV_NORMAL - the default behavior is to read clusters. This
- * results in some read-ahead and read-behind.
- * MADV_RANDOM - the system should read the minimum amount of data
- * on any access, since it is unlikely that the appli-
- * cation will need more than what it asks for.
- * MADV_SEQUENTIAL - pages in the given range will probably be accessed
- * once, so they can be aggressively read ahead, and
- * can be freed soon after they are accessed.
- * MADV_WILLNEED - the application is notifying the system to read
- * some pages ahead.
- * MADV_DONTNEED - the application is finished with the given range,
- * so the kernel can free resources associated with it.
- *
- * return values:
- * zero - success
- * -EINVAL - start + len < 0, start is not page-aligned,
- * "behavior" is not a valid value, or application
- * is attempting to release locked or shared pages.
- * -ENOMEM - addresses in the specified range are not currently
- * mapped, or are outside the AS of the process.
- * -EIO - an I/O error occurred while paging in data.
- * -EBADF - map exists, but area maps something that isn't a file.
- * -EAGAIN - a kernel resource was temporarily unavailable.
- */
-asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
-{
- unsigned long end;
- struct vm_area_struct * vma;
- int unmapped_error = 0;
- int error = -EINVAL;
-
- down_write(&current->mm->mmap_sem);
-
- if (start & ~PAGE_MASK)
- goto out;
- len = (len + ~PAGE_MASK) & PAGE_MASK;
- end = start + len;
- if (end < start)
- goto out;
-
- error = 0;
- if (end == start)
- goto out;
-
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- */
- vma = find_vma(current->mm, start);
- for (;;) {
- /* Still start < end. */
- error = -ENOMEM;
- if (!vma)
- goto out;
-
- /* Here start < vma->vm_end. */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- }
-
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = madvise_vma(vma, start, end,
- behavior);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
- }
-
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = madvise_vma(vma, start, vma->vm_end, behavior);
- if (error)
- goto out;
- start = vma->vm_end;
- vma = vma->vm_next;
- }
-
-out:
- up_write(&current->mm->mmap_sem);
- return error;
-}
-
-static inline
-struct page *__read_cache_page(struct address_space *mapping,
+static inline struct page *__read_cache_page(struct address_space *mapping,
unsigned long index,
int (*filler)(void *,struct page*),
void *data)
--- 1.3/mm/mlock.c Tue Feb 5 08:45:30 2002
+++ edited/mm/mlock.c Sun Aug 18 16:02:43 2002
@@ -2,147 +2,49 @@
* linux/mm/mlock.c
*
* (C) Copyright 1995 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
*/
-#include <linux/slab.h>
-#include <linux/shm.h>
-#include <linux/mman.h>
-#include <linux/smp_lock.h>
-#include <linux/pagemap.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags)
-{
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_flags = newflags;
- spin_unlock(&vma->vm_mm->page_table_lock);
- return 0;
-}
-
-static inline int mlock_fixup_start(struct vm_area_struct * vma,
- unsigned long end, int newflags)
-{
- struct vm_area_struct * n;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_end = end;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_start = end;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-static inline int mlock_fixup_end(struct vm_area_struct * vma,
- unsigned long start, int newflags)
-{
- struct vm_area_struct * n;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_start = start;
- n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_end = start;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-static inline int mlock_fixup_middle(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, int newflags)
-{
- struct vm_area_struct * left, * right;
+#include <linux/mman.h>
+#include <linux/mm.h>
- left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!left)
- return -EAGAIN;
- right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!right) {
- kmem_cache_free(vm_area_cachep, left);
- return -EAGAIN;
- }
- *left = *vma;
- *right = *vma;
- left->vm_end = start;
- right->vm_start = end;
- right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
- vma->vm_flags = newflags;
- left->vm_raend = 0;
- right->vm_raend = 0;
- if (vma->vm_file)
- atomic_add(2, &vma->vm_file->f_count);
-
- if (vma->vm_ops && vma->vm_ops->open) {
- vma->vm_ops->open(left);
- vma->vm_ops->open(right);
- }
- vma->vm_raend = 0;
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_flags = newflags;
- __insert_vm_struct(current->mm, left);
- __insert_vm_struct(current->mm, right);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
static int mlock_fixup(struct vm_area_struct * vma,
unsigned long start, unsigned long end, unsigned int newflags)
{
- int pages, retval;
+ struct mm_struct * mm = vma->vm_mm;
+ int pages, error;
if (newflags == vma->vm_flags)
return 0;
- if (start == vma->vm_start) {
- if (end == vma->vm_end)
- retval = mlock_fixup_all(vma, newflags);
- else
- retval = mlock_fixup_start(vma, end, newflags);
- } else {
- if (end == vma->vm_end)
- retval = mlock_fixup_end(vma, start, newflags);
- else
- retval = mlock_fixup_middle(vma, start, end, newflags);
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ return -EAGAIN;
}
- if (!retval) {
- /* keep track of amount of locked VM */
- pages = (end - start) >> PAGE_SHIFT;
- if (newflags & VM_LOCKED) {
- pages = -pages;
- make_pages_present(start, end);
- }
- vma->vm_mm->locked_vm -= pages;
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ return -EAGAIN;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ vma->vm_flags = newflags;
+ spin_unlock(&mm->page_table_lock);
+
+ /*
+ * Keep track of amount of locked VM.
+ */
+ pages = (end - start) >> PAGE_SHIFT;
+ if (newflags & VM_LOCKED) {
+ pages = -pages;
+ make_pages_present(start, end);
}
- return retval;
+
+ vma->vm_mm->locked_vm -= pages;
+ return 0;
}
static int do_mlock(unsigned long start, size_t len, int on)
--- 1.45/mm/mmap.c Fri Aug 2 16:24:26 2002
+++ edited/mm/mmap.c Sun Aug 18 16:13:02 2002
@@ -1043,10 +1043,11 @@
}
/*
- * Split a vma into two pieces at address 'addr', the original vma
- * will contain the first part, a new vma is allocated for the tail.
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
*/
-static int splitvma(struct mm_struct *mm, struct vm_area_struct *mpnt, unsigned long addr)
+int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+ unsigned long addr, int new_below)
{
struct vm_area_struct *new;
@@ -1058,22 +1059,28 @@
return -ENOMEM;
/* most fields are the same, copy all, and then fixup */
- *new = *mpnt;
+ *new = *vma;
+
+ if (new_below) {
+ vma->vm_start = new->vm_end = addr;
+ vma->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+ } else {
+ new->vm_start = vma->vm_end = addr;
+ new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+ }
- new->vm_start = addr;
- new->vm_pgoff = mpnt->vm_pgoff + ((addr - mpnt->vm_start) >> PAGE_SHIFT);
new->vm_raend = 0;
- if (mpnt->vm_file)
- get_file(mpnt->vm_file);
- if (mpnt->vm_ops && mpnt->vm_ops->open)
- mpnt->vm_ops->open(mpnt);
- mpnt->vm_end = addr; /* Truncate area */
+ if (new->vm_file)
+ get_file(new->vm_file);
+
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
spin_lock(&mm->page_table_lock);
- lock_vma_mappings(mpnt);
+ lock_vma_mappings(vma);
__insert_vm_struct(mm, new);
- unlock_vma_mappings(mpnt);
+ unlock_vma_mappings(vma);
spin_unlock(&mm->page_table_lock);
return 0;
@@ -1110,7 +1117,7 @@
* If we need to split any vma, do it now to save pain later.
*/
if (start > mpnt->vm_start) {
- if (splitvma(mm, mpnt, start))
+ if (split_vma(mm, mpnt, start, 0))
return -ENOMEM;
prev = mpnt;
mpnt = mpnt->vm_next;
@@ -1119,7 +1126,7 @@
/* Does it split the last one? */
last = find_vma(mm, end);
if (last && end > last->vm_start) {
- if (splitvma(mm, last, end))
+ if (split_vma(mm, last, end, 0))
return -ENOMEM;
}
--- 1.14/mm/mprotect.c Mon Jul 29 21:23:46 2002
+++ edited/mm/mprotect.c Sun Aug 18 16:20:40 2002
@@ -2,13 +2,14 @@
* mm/mprotect.c
*
* (C) Copyright 1994 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
*
* Address space accounting code <alan@redhat.com>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
+
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
@@ -100,158 +101,59 @@
spin_unlock(&current->mm->page_table_lock);
return;
}
-
-static inline int mprotect_fixup_all(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- int newflags, pgprot_t prot)
+/*
+ * Try to merge a vma with the previous one, return 1 if successful or 0 if it
+ * was impossible.
+ */
+static int mprotect_attemp_merge(struct vm_area_struct * vma,
+ struct vm_area_struct * prev,
+ unsigned long end, int newflags)
{
- struct vm_area_struct * prev = *pprev;
struct mm_struct * mm = vma->vm_mm;
- if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
- !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+ if (!prev || !vma)
+ return 0;
+ if (prev->vm_end != vma->vm_start)
+ return 0;
+ if (!can_vma_merge(prev, newflags))
+ return 0;
+ if (vma->vm_file || (vma->vm_flags & VM_SHARED))
+ return 0;
+
+ /*
+ * If the whole area changes to the protection of the previous one
+ * we can just get rid of it.
+ */
+ if (end == vma->vm_end) {
spin_lock(&mm->page_table_lock);
- prev->vm_end = vma->vm_end;
+ prev->vm_end = end;
__vma_unlink(mm, vma, prev);
spin_unlock(&mm->page_table_lock);
kmem_cache_free(vm_area_cachep, vma);
mm->map_count--;
+ return 1;
+ }
- return 0;
- }
-
+ /*
+ * Otherwise extend it.
+ */
spin_lock(&mm->page_table_lock);
- vma->vm_flags = newflags;
- vma->vm_page_prot = prot;
- spin_unlock(&mm->page_table_lock);
-
- *pprev = vma;
-
- return 0;
-}
-
-static inline int mprotect_fixup_start(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- unsigned long end,
- int newflags, pgprot_t prot)
-{
- struct vm_area_struct * n, * prev = *pprev;
-
- *pprev = vma;
-
- if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
- !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
- spin_lock(&vma->vm_mm->page_table_lock);
- prev->vm_end = end;
- vma->vm_start = end;
- spin_unlock(&vma->vm_mm->page_table_lock);
-
- return 0;
- }
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -ENOMEM;
- *n = *vma;
- n->vm_end = end;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- n->vm_page_prot = prot;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
+ prev->vm_end = end;
vma->vm_start = end;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
-
- return 0;
-}
-
-static inline int mprotect_fixup_end(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- unsigned long start,
- int newflags, pgprot_t prot)
-{
- struct vm_area_struct * n;
-
- n = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!n)
- return -ENOMEM;
- *n = *vma;
- n->vm_start = start;
- n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- n->vm_page_prot = prot;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_end = start;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
-
- *pprev = n;
-
- return 0;
+ spin_unlock(&mm->page_table_lock);
+ return 1;
}
-static inline int mprotect_fixup_middle(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- unsigned long start, unsigned long end,
- int newflags, pgprot_t prot)
-{
- struct vm_area_struct * left, * right;
-
- left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!left)
- return -ENOMEM;
- right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!right) {
- kmem_cache_free(vm_area_cachep, left);
- return -ENOMEM;
- }
- *left = *vma;
- *right = *vma;
- left->vm_end = start;
- right->vm_start = end;
- right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
- left->vm_raend = 0;
- right->vm_raend = 0;
- if (vma->vm_file)
- atomic_add(2,&vma->vm_file->f_count);
- if (vma->vm_ops && vma->vm_ops->open) {
- vma->vm_ops->open(left);
- vma->vm_ops->open(right);
- }
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
- vma->vm_raend = 0;
- vma->vm_page_prot = prot;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_flags = newflags;
- __insert_vm_struct(current->mm, left);
- __insert_vm_struct(current->mm, right);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
-
- *pprev = right;
- return 0;
-}
static int mprotect_fixup(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
unsigned long start, unsigned long end, unsigned int newflags)
{
+ struct mm_struct * mm = vma->vm_mm;
+ unsigned long charged = 0;
pgprot_t newprot;
int error;
- unsigned long charged = 0;
if (newflags == vma->vm_flags) {
*pprev = vma;
@@ -266,29 +168,46 @@
* FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
* a MAP_NORESERVE private mapping to writable will now reserve.
*/
- if ((newflags & VM_WRITE) &&
- !(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
- charged = (end - start) >> PAGE_SHIFT;
- if (!vm_enough_memory(charged))
- return -ENOMEM;
- newflags |= VM_ACCOUNT;
+ if (newflags & VM_WRITE) {
+ if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
+ charged = (end - start) >> PAGE_SHIFT;
+ if (!vm_enough_memory(charged))
+ return -ENOMEM;
+ newflags |= VM_ACCOUNT;
+ }
}
+
newprot = protection_map[newflags & 0xf];
+
if (start == vma->vm_start) {
- if (end == vma->vm_end)
- error = mprotect_fixup_all(vma, pprev, newflags, newprot);
- else
- error = mprotect_fixup_start(vma, pprev, end, newflags, newprot);
- } else if (end == vma->vm_end)
- error = mprotect_fixup_end(vma, pprev, start, newflags, newprot);
- else
- error = mprotect_fixup_middle(vma, pprev, start, end, newflags, newprot);
- if (error) {
- vm_unacct_memory(charged);
- return error;
+ /*
+ * Try to merge with the previous vma.
+ */
+ if (mprotect_attemp_merge(vma, *pprev, end, newflags))
+ return 0;
+ } else {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto fail;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto fail;
}
+
+ spin_lock(&mm->page_table_lock);
+ vma->vm_flags = newflags;
+ vma->vm_page_prot = newprot;
+ spin_unlock(&mm->page_table_lock);
+
change_protection(vma, start, end, newprot);
return 0;
+
+fail:
+ vm_unacct_memory(charged);
+ return error;
}
asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
@@ -352,6 +271,7 @@
goto out;
}
}
+
if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) &&
!prev->vm_file && !(prev->vm_flags & VM_SHARED)) {
spin_lock(&prev->vm_mm->page_table_lock);
--- 1.0/mm/madvise.c Thu Dec 13 11:34:58 2001
+++ edited/mm/madvise.c Sun Aug 18 14:28:08 2002
@@ -0,0 +1,238 @@
+/*
+ * linux/mm/madvise.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 2002 Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
+ unsigned long end, int behavior)
+{
+ struct mm_struct * mm = vma->vm_mm;
+ int error;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ return -EAGAIN;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ return -EAGAIN;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ vma->vm_raend = 0;
+ VM_ClearReadHint(vma);
+
+ switch (behavior) {
+ case MADV_SEQUENTIAL:
+ vma->vm_flags |= VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ vma->vm_flags |= VM_RAND_READ;
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return 0;
+}
+
+/*
+ * Schedule all required I/O operations, then run the disk queue
+ * to make sure they are started. Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ long error = -EBADF;
+ struct file * file;
+ unsigned long size, rlim_rss;
+
+ /* Doesn't work if there's no mapped file. */
+ if (!vma->vm_file)
+ return error;
+ file = vma->vm_file;
+ size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ /* Make sure this doesn't exceed the process's max rss. */
+ error = -EIO;
+ rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
+ LONG_MAX; /* default: see resource.h */
+ if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
+ return error;
+
+ do_page_cache_readahead(file, start, end - start);
+ return 0;
+}
+
+/*
+ * Application no longer needs these pages. If the pages are dirty,
+ * it's OK to just throw them away. The app will be more careful about
+ * data it wants to keep. Be sure to free swap resources too. The
+ * zap_page_range call sets things up for refill_inactive to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * refill_inactive to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do. This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them. There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_flags & VM_LOCKED)
+ return -EINVAL;
+
+ zap_page_range(vma, start, end - start);
+ return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+ unsigned long end, int behavior)
+{
+ long error = -EBADF;
+
+ switch (behavior) {
+ case MADV_NORMAL:
+ case MADV_SEQUENTIAL:
+ case MADV_RANDOM:
+ error = madvise_behavior(vma, start, end, behavior);
+ break;
+
+ case MADV_WILLNEED:
+ error = madvise_willneed(vma, start, end);
+ break;
+
+ case MADV_DONTNEED:
+ error = madvise_dontneed(vma, start, end);
+ break;
+
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area. The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques. The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ * MADV_NORMAL - the default behavior is to read clusters. This
+ * results in some read-ahead and read-behind.
+ * MADV_RANDOM - the system should read the minimum amount of data
+ * on any access, since it is unlikely that the appli-
+ * cation will need more than what it asks for.
+ * MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ * once, so they can be aggressively read ahead, and
+ * can be freed soon after they are accessed.
+ * MADV_WILLNEED - the application is notifying the system to read
+ * some pages ahead.
+ * MADV_DONTNEED - the application is finished with the given range,
+ * so the kernel can free resources associated with it.
+ *
+ * return values:
+ * zero - success
+ * -EINVAL - start + len < 0, start is not page-aligned,
+ * "behavior" is not a valid value, or application
+ * is attempting to release locked or shared pages.
+ * -ENOMEM - addresses in the specified range are not currently
+ * mapped, or are outside the AS of the process.
+ * -EIO - an I/O error occurred while paging in data.
+ * -EBADF - map exists, but area maps something that isn't a file.
+ * -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ down_write(&current->mm->mmap_sem);
+
+ if (start & ~PAGE_MASK)
+ goto out;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ */
+ vma = find_vma(current->mm, start);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = madvise_vma(vma, start, end,
+ behavior);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = madvise_vma(vma, start, vma->vm_end, behavior);
+ if (error)
+ goto out;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+out:
+ up_write(&current->mm->mmap_sem);
+ return error;
+}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/
next prev parent reply other threads:[~2002-09-02 19:04 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2002-09-02 17:43 Christoph Hellwig
2002-09-02 18:50 ` Andrew Morton
2002-09-02 18:41 ` Christoph Hellwig
2002-09-02 19:11 ` Andrew Morton
2002-09-02 19:04 ` Christoph Hellwig [this message]
2002-09-02 21:17 ` Andrew Morton
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20020902210443.A32010@lst.de \
--to=hch@lst.de \
--cc=akpm@zip.com.au \
--cc=linux-mm@kvack.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.