linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC] madvise(MADV_TRUNCATE)
@ 2005-10-26 22:49 Badari Pulavarty
  2005-10-27  8:38 ` Andi Kleen
  2005-10-28  3:46 ` Jeff Dike
  0 siblings, 2 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-26 22:49 UTC (permalink / raw)
  To: Hugh Dickins, akpm, andrea; +Cc: Jeff Dike, dvhltc, linux-mm

[-- Attachment #1: Type: text/plain, Size: 528 bytes --]

Hi All,

Based on comments from Hugh & Andrea, I took a shot at implementing
madvise(MADV_TRUNCATE) - which truncates a range of pages in the file.
(basically provides the ability to punch a hole in the file).

Basically, I added "truncate_range" inode operation to provide
opportunity for the filesystem to zero the blocks and/or free
them up. 

I also attempted to implement shmem_truncate_range() which 
needs lots of testing before I work out bugs :(

I would really appreciate your comments on my approach.

Thanks,
Badari



[-- Attachment #2: madvise-truncate3.patch --]
[-- Type: text/x-patch, Size: 22709 bytes --]

diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-alpha/mman.h linux-2.6.14-rc5-madv/include/asm-alpha/mman.h
--- linux-2.6.14-rc5/include/asm-alpha/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-alpha/mman.h	2005-10-26 15:48:48.000000000 -0700
@@ -42,6 +42,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define	MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_TRUNCATE	7		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-arm/mman.h linux-2.6.14-rc5-madv/include/asm-arm/mman.h
--- linux-2.6.14-rc5/include/asm-arm/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-arm/mman.h	2005-10-26 15:48:58.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-arm26/mman.h linux-2.6.14-rc5-madv/include/asm-arm26/mman.h
--- linux-2.6.14-rc5/include/asm-arm26/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-arm26/mman.h	2005-10-26 15:48:53.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-cris/mman.h linux-2.6.14-rc5-madv/include/asm-cris/mman.h
--- linux-2.6.14-rc5/include/asm-cris/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-cris/mman.h	2005-10-26 15:49:02.000000000 -0700
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-frv/mman.h linux-2.6.14-rc5-madv/include/asm-frv/mman.h
--- linux-2.6.14-rc5/include/asm-frv/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-frv/mman.h	2005-10-26 15:49:11.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-h8300/mman.h linux-2.6.14-rc5-madv/include/asm-h8300/mman.h
--- linux-2.6.14-rc5/include/asm-h8300/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-h8300/mman.h	2005-10-26 15:49:15.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-i386/mman.h linux-2.6.14-rc5-madv/include/asm-i386/mman.h
--- linux-2.6.14-rc5/include/asm-i386/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-i386/mman.h	2005-10-26 15:49:20.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-ia64/mman.h linux-2.6.14-rc5-madv/include/asm-ia64/mman.h
--- linux-2.6.14-rc5/include/asm-ia64/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-ia64/mman.h	2005-10-26 15:49:26.000000000 -0700
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-m32r/mman.h linux-2.6.14-rc5-madv/include/asm-m32r/mman.h
--- linux-2.6.14-rc5/include/asm-m32r/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-m32r/mman.h	2005-10-26 15:49:31.000000000 -0700
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-m68k/mman.h linux-2.6.14-rc5-madv/include/asm-m68k/mman.h
--- linux-2.6.14-rc5/include/asm-m68k/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-m68k/mman.h	2005-10-26 15:49:35.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-mips/mman.h linux-2.6.14-rc5-madv/include/asm-mips/mman.h
--- linux-2.6.14-rc5/include/asm-mips/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-mips/mman.h	2005-10-26 15:49:41.000000000 -0700
@@ -65,6 +65,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-parisc/mman.h linux-2.6.14-rc5-madv/include/asm-parisc/mman.h
--- linux-2.6.14-rc5/include/asm-parisc/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-parisc/mman.h	2005-10-26 15:49:49.000000000 -0700
@@ -38,6 +38,7 @@
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_TRUNCATE	8		/* truncate range of pages */
 
 /* The range 12-64 is reserved for page size specification. */
 #define MADV_4K_PAGES   12              /* Use 4K pages  */
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-powerpc/mman.h linux-2.6.14-rc5-madv/include/asm-powerpc/mman.h
--- linux-2.6.14-rc5/include/asm-powerpc/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-powerpc/mman.h	2005-10-26 15:49:53.000000000 -0700
@@ -44,6 +44,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-s390/mman.h linux-2.6.14-rc5-madv/include/asm-s390/mman.h
--- linux-2.6.14-rc5/include/asm-s390/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-s390/mman.h	2005-10-26 15:50:08.000000000 -0700
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL        0x2             /* read-ahead aggressively */
 #define MADV_WILLNEED  0x3              /* pre-fault pages */
 #define MADV_DONTNEED  0x4              /* discard these pages */
+#define MADV_TRUNCATE  0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-sh/mman.h linux-2.6.14-rc5-madv/include/asm-sh/mman.h
--- linux-2.6.14-rc5/include/asm-sh/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-sh/mman.h	2005-10-26 15:50:15.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-sparc/mman.h linux-2.6.14-rc5-madv/include/asm-sparc/mman.h
--- linux-2.6.14-rc5/include/asm-sparc/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-sparc/mman.h	2005-10-26 15:50:31.000000000 -0700
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_TRUNCATE	0x6		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-sparc64/mman.h linux-2.6.14-rc5-madv/include/asm-sparc64/mman.h
--- linux-2.6.14-rc5/include/asm-sparc64/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-sparc64/mman.h	2005-10-26 15:50:25.000000000 -0700
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_TRUNCATE	0x6		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-v850/mman.h linux-2.6.14-rc5-madv/include/asm-v850/mman.h
--- linux-2.6.14-rc5/include/asm-v850/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-v850/mman.h	2005-10-26 15:50:39.000000000 -0700
@@ -32,6 +32,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-x86_64/mman.h linux-2.6.14-rc5-madv/include/asm-x86_64/mman.h
--- linux-2.6.14-rc5/include/asm-x86_64/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-x86_64/mman.h	2005-10-26 15:50:43.000000000 -0700
@@ -36,6 +36,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-xtensa/mman.h linux-2.6.14-rc5-madv/include/asm-xtensa/mman.h
--- linux-2.6.14-rc5/include/asm-xtensa/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/asm-xtensa/mman.h	2005-10-26 15:50:46.000000000 -0700
@@ -72,6 +72,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/linux/fs.h linux-2.6.14-rc5-madv/include/linux/fs.h
--- linux-2.6.14-rc5/include/linux/fs.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/linux/fs.h	2005-10-25 08:59:52.000000000 -0700
@@ -995,6 +995,7 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
+	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 struct seq_file;
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/linux/mm.h linux-2.6.14-rc5-madv/include/linux/mm.h
--- linux-2.6.14-rc5/include/linux/mm.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/include/linux/mm.h	2005-10-26 10:15:05.000000000 -0700
@@ -704,6 +704,7 @@ static inline void unmap_shared_mapping_
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
+extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
@@ -865,6 +866,7 @@ extern unsigned long do_brk(unsigned lon
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
+extern void truncate_inode_pages_range(struct address_space *, loff_t, loff_t);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/madvise.c linux-2.6.14-rc5-madv/mm/madvise.c
--- linux-2.6.14-rc5/mm/madvise.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/mm/madvise.c	2005-10-26 15:12:24.000000000 -0700
@@ -140,6 +140,33 @@ static long madvise_dontneed(struct vm_a
 	return 0;
 }
 
+static long madvise_truncate(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	struct address_space *mapping;
+        loff_t offset, endoff;
+	int error = 0;
+
+	if (!vma->vm_file || !vma->vm_file->f_mapping 
+		|| !vma->vm_file->f_mapping->host) {
+			return -EINVAL;
+	}
+
+	mapping = vma->vm_file->f_mapping;
+	if (mapping == &swapper_space) {
+		return -EINVAL;
+	}
+
+	offset = (loff_t)(start - vma->vm_start);
+	endoff = (loff_t)(end - vma->vm_start);
+	printk("call vmtruncate_range(%p, %x %x)\n", mapping, 
+			(unsigned int)offset, (unsigned int)endoff);
+	down(&mapping->host->i_sem);
+	error = vmtruncate_range(mapping->host, offset, endoff);
+	up(&mapping->host->i_sem);
+	return error;
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -152,6 +179,9 @@ madvise_vma(struct vm_area_struct *vma, 
 	case MADV_RANDOM:
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
+	case MADV_TRUNCATE:
+		error = madvise_truncate(vma, start, end);
+		break;
 
 	case MADV_WILLNEED:
 		error = madvise_willneed(vma, prev, start, end);
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/memory.c linux-2.6.14-rc5-madv/mm/memory.c
--- linux-2.6.14-rc5/mm/memory.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/mm/memory.c	2005-10-26 15:35:15.000000000 -0700
@@ -1597,6 +1597,28 @@ out_busy:
 
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * If the underlying filesystem is not going to provide 
+	 * a way to truncate a range of blocks (punch a hole) - 
+	 * we should return failure right now.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+		
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(vmtruncate_range);
+
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/shmem.c linux-2.6.14-rc5-madv/mm/shmem.c
--- linux-2.6.14-rc5/mm/shmem.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/mm/shmem.c	2005-10-26 15:37:47.000000000 -0700
@@ -616,6 +616,168 @@ done2:
 	}
 }
 
+/*
+ * WIP ! WIP !! WIP !!!
+ *
+ * The idea is to free up the swap entries for the given range (start, end)
+ * in the file. 
+ *
+ * This is based on shmem_truncate() and I need to merge both of them
+ * into common routine.
+ */
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	unsigned long idx;
+	unsigned long size;
+	unsigned long limit;
+	unsigned long stage;
+	unsigned long diroff;
+	struct page **dir;
+	struct page *topdir;
+	struct page *middir;
+	struct page *subdir;
+	swp_entry_t *ptr;
+	LIST_HEAD(pages_to_free);
+	long nr_pages_to_free = 0;
+	long nr_swaps_freed = 0;
+	int offset;
+	int freed;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (idx >= info->next_index)
+		return;
+
+	limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	spin_lock(&info->lock);
+	info->flags |= SHMEM_TRUNCATE;
+	if (limit > info->next_index)
+		limit = info->next_index;
+	topdir = info->i_indirect;
+#if 0
+	if (topdir && idx <= SHMEM_NR_DIRECT) {
+		info->i_indirect = NULL;
+		nr_pages_to_free++;
+		list_add(&topdir->lru, &pages_to_free);
+	}
+#endif
+	spin_unlock(&info->lock);
+
+	if (info->swapped && idx < SHMEM_NR_DIRECT) {
+		ptr = info->i_direct;
+		size = limit;
+		if (size > SHMEM_NR_DIRECT)
+			size = SHMEM_NR_DIRECT;
+		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
+	}
+	if (!topdir)
+		goto done2;
+
+	BUG_ON(limit <= SHMEM_NR_DIRECT);
+	limit -= SHMEM_NR_DIRECT;
+	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
+	offset = idx % ENTRIES_PER_PAGE;
+	idx -= offset;
+
+	dir = shmem_dir_map(topdir);
+	stage = ENTRIES_PER_PAGEPAGE/2;
+	if (idx < ENTRIES_PER_PAGEPAGE/2) {
+		middir = topdir;
+		diroff = idx/ENTRIES_PER_PAGE;
+	} else {
+		dir += ENTRIES_PER_PAGE/2;
+		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
+		while (stage <= idx)
+			stage += ENTRIES_PER_PAGEPAGE;
+		middir = *dir;
+		if (*dir) {
+			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
+				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
+			if (!diroff && !offset) {
+				*dir = NULL;
+				nr_pages_to_free++;
+				list_add(&middir->lru, &pages_to_free);
+			}
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(middir);
+		} else {
+			diroff = 0;
+			offset = 0;
+			idx = stage;
+		}
+	}
+
+	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
+		if (unlikely(idx == stage)) {
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(topdir) +
+			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+			while (!*dir) {
+				dir++;
+				idx += ENTRIES_PER_PAGEPAGE;
+				if (idx >= limit)
+					goto done1;
+			}
+			stage = idx + ENTRIES_PER_PAGEPAGE;
+			middir = *dir;
+			*dir = NULL;
+			nr_pages_to_free++;
+			list_add(&middir->lru, &pages_to_free);
+			shmem_dir_unmap(dir);
+			cond_resched();
+			dir = shmem_dir_map(middir);
+			diroff = 0;
+		}
+		subdir = dir[diroff];
+		if (subdir && subdir->nr_swapped) {
+			size = limit - idx;
+			if (size > ENTRIES_PER_PAGE)
+				size = ENTRIES_PER_PAGE;
+			freed = shmem_map_and_free_swp(subdir,
+						offset, size, &dir);
+			if (!dir)
+				dir = shmem_dir_map(middir);
+			nr_swaps_freed += freed;
+			if (offset)
+				spin_lock(&info->lock);
+			subdir->nr_swapped -= freed;
+			if (offset)
+				spin_unlock(&info->lock);
+			BUG_ON(subdir->nr_swapped > offset);
+		}
+		if (offset)
+			offset = 0;
+		else if (subdir) {
+			dir[diroff] = NULL;
+			nr_pages_to_free++;
+			list_add(&subdir->lru, &pages_to_free);
+		}
+	}
+done1:
+	shmem_dir_unmap(dir);
+done2:
+	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
+		truncate_inode_pages_range(inode->i_mapping, start, end);
+	}
+
+	spin_lock(&info->lock);
+	info->flags &= ~SHMEM_TRUNCATE;
+	info->swapped -= nr_swaps_freed;
+	if (nr_pages_to_free)
+		shmem_free_blocks(inode, nr_pages_to_free);
+	shmem_recalc_inode(inode);
+	spin_unlock(&info->lock);
+
+	/*
+	 * Empty swap vector directory pages to be freed?
+	 */
+	if (!list_empty(&pages_to_free)) {
+		pages_to_free.prev->next = NULL;
+		shmem_free_pages(pages_to_free.next);
+	}
+}
+
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2083,6 +2245,7 @@ static struct file_operations shmem_file
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
+	.truncate_range	= shmem_truncate_range,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/truncate.c linux-2.6.14-rc5-madv/mm/truncate.c
--- linux-2.6.14-rc5/mm/truncate.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-madv/mm/truncate.c	2005-10-26 10:14:43.000000000 -0700
@@ -113,7 +113,8 @@ invalidate_complete_page(struct address_
  *
  * Called under (and serialised by) inode->i_sem.
  */
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart,
+		loff_t end)
 {
 	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
@@ -126,7 +127,8 @@ void truncate_inode_pages(struct address
 
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
@@ -142,6 +144,8 @@ void truncate_inode_pages(struct address
 			}
 			truncate_complete_page(mapping, page);
 			unlock_page(page);
+			if (next > end)
+				break;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
@@ -176,12 +180,20 @@ void truncate_inode_pages(struct address
 			next++;
 			truncate_complete_page(mapping, page);
 			unlock_page(page);
+			if (next > end)
+				break;
 		}
 		pagevec_release(&pvec);
 	}
 }
 
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	return truncate_inode_pages_range(mapping, lstart, ~0UL);
+}
+
 EXPORT_SYMBOL(truncate_inode_pages);
+EXPORT_SYMBOL(truncate_inode_pages_range);
 
 /**
  * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-26 22:49 [RFC] madvise(MADV_TRUNCATE) Badari Pulavarty
@ 2005-10-27  8:38 ` Andi Kleen
  2005-10-27 13:17   ` Andrea Arcangeli
  2005-10-28  3:46 ` Jeff Dike
  1 sibling, 1 reply; 86+ messages in thread
From: Andi Kleen @ 2005-10-27  8:38 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Hugh Dickins, akpm, andrea, Jeff Dike, dvhltc, linux-mm

On Thursday 27 October 2005 00:49, Badari Pulavarty wrote:

>
> I would really appreciate your comments on my approach.

(from a high level point of view) It sounds very scary. Traditionally
a lot of code had special case handling to avoid truncate
races, and it might need a lot of auditing to make sure
everybody else can handle arbitrary punch hole too.

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27  8:38 ` Andi Kleen
@ 2005-10-27 13:17   ` Andrea Arcangeli
  2005-10-27 15:00     ` Badari Pulavarty
  0 siblings, 1 reply; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-27 13:17 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Badari Pulavarty, Hugh Dickins, akpm, Jeff Dike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 10:38:51AM +0200, Andi Kleen wrote:
> On Thursday 27 October 2005 00:49, Badari Pulavarty wrote:
> 
> >
> > I would really appreciate your comments on my approach.
> 
> (from a high level point of view) It sounds very scary. Traditionally
> a lot of code had special case handling to avoid truncate
> races, and it might need a lot of auditing to make sure
> everybode else can handle arbitary punch hole too.

-ENOSYS is returned for all fs but tmpfs (the short term big need of
this feature). so as long as tmpfs works and -ENOSYS is returned to the
other fs, complexity should remain reasonably low, and for the long term
the API sounds nicer than a local tmpfs hack like MADV_DISCARD.

Patch looks good to me, thanks Badari for taking care of this!

I'll try to give it some testing and I'll let you know if I run into
troubles.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 13:17   ` Andrea Arcangeli
@ 2005-10-27 15:00     ` Badari Pulavarty
  2005-10-27 15:11       ` Andrea Arcangeli
  0 siblings, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-27 15:00 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andi Kleen, Hugh Dickins, akpm, Jeff Dike, dvhltc, linux-mm

On Thu, 2005-10-27 at 15:17 +0200, Andrea Arcangeli wrote:
> On Thu, Oct 27, 2005 at 10:38:51AM +0200, Andi Kleen wrote:
> > On Thursday 27 October 2005 00:49, Badari Pulavarty wrote:
> > 
> > >
> > > I would really appreciate your comments on my approach.
> > 
> > (from a high level point of view) It sounds very scary. Traditionally
> > a lot of code had special case handling to avoid truncate
> > races, and it might need a lot of auditing to make sure
> > everybode else can handle arbitary punch hole too.
> 
> -ENOSYS is returned for all fs but tmpfs (the short term big need of
> this feature). so as long as tmpfs works and -ENOSYS is returned to the
> other fs, complexity should remain reasonably low, and for the long term
> the API sounds nicer than a local tmpfs hack like MADV_DISCARD.
> 
> Patch looks good to me, thanks Baudari for taking care of this!
> 
> I'll try to give it some testing and I'll let you know if I run into
> troubles.

Thank you for taking a look at it. I am hoping this would satisfy
Jeff's UML requirement too.

BTW, my initial testing found no bugs so far - thats why I am scared :(
But again, I am sure my testing is not covering cases where shared 
memory segments got swapped out. I need to do a closer audit to make
sure that I am indeed freeing up all the swap entries.

And also, I am not sure we should allow using this interface for
truncating up. 

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 15:00     ` Badari Pulavarty
@ 2005-10-27 15:11       ` Andrea Arcangeli
  2005-10-27 18:20         ` Andrew Morton
  0 siblings, 1 reply; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-27 15:11 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Andi Kleen, Hugh Dickins, akpm, Jeff Dike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 08:00:12AM -0700, Badari Pulavarty wrote:
> BTW, my initial testing found no bugs so far - thats why I am scared :(
> But again, I am sure my testing is not covering cases where shared 
> memory segments got swapped out. I need to do a closer audit to make
> sure that I am indeed freeing up all the swap entries.

Freeing swap entries is the most important thing and at the same time
the most complex in the patch (that's why the previous MADV_DISCARD was
so simple ;).

> And also, I am not sure we should allow using this interface for
> truncating up. 

I guess we can allow using the interface for truncating up too.
Currently you can map beyond the end of the i_size but it will SIGBUS if
you touch it. So if you want to suddenly have more mmap space to store
data, you can first MADV_TRUNCATE it, and then it won't sigbus anymore
(and it will be recorded on disk/swap depending if it's a real fs or
tmpfs) up to the highest point of the truncate range.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 15:11       ` Andrea Arcangeli
@ 2005-10-27 18:20         ` Andrew Morton
  2005-10-27 18:35           ` Badari Pulavarty
  2005-10-27 20:04           ` Andrea Arcangeli
  0 siblings, 2 replies; 86+ messages in thread
From: Andrew Morton @ 2005-10-27 18:20 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

err, guys.

Andrea Arcangeli <andrea@suse.de> wrote:
>
> ...
>
> tmpfs (the short term big need of this feature).
> 
> ...
>
> Freeing swap entries is the most important thing and at the same time
> the most complex in the patch (that's why the previous MADV_DISCARD was
> so simple ;).
> 

I think there's something you're not telling us!

googling MADV_DISCARD comes up with basically nothing.  MADV_TRUNCATE comes
up with precisely nothing.

Why does tmpfs need this feature?  What's the requirement here?  Please
spill the beans ;)


Comment on the patch: doing it via madvise sneakily gets around the
problems with partial-page truncation (we don't currently have a way to
release anything but the tail-end of a page's blocks).

But if we start adding infrastructure of this sort people are, reasonably,
going to want to add sys_holepunch(fd, start, len) and it's going to get
complexer.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 18:20         ` Andrew Morton
@ 2005-10-27 18:35           ` Badari Pulavarty
  2005-10-27 18:50             ` Andrew Morton
  2005-10-27 20:04           ` Andrea Arcangeli
  1 sibling, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-27 18:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, ak, hugh, jdike, dvhltc, linux-mm

On Thu, 2005-10-27 at 11:20 -0700, Andrew Morton wrote:
> err, guys.
> 
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> > ...
> >
> > tmpfs (the short term big need of this feature).
> > 
> > ...
> >
> > Freeing swap entries is the most important thing and at the same time
> > the most complex in the patch (that's why the previous MADV_DISCARD was
> > so simple ;).
> > 
> 
> I think there's something you're not telling us!
> 
> googling MADV_DISCARD comes up with basically nothing.  MADV_TRUNCATE comes
> up with precisely nothing.

I sent out a patch (linux-mm) for review madvise(MADV_DISCARD) to drop
the pagecache pages for shared memory segments. Andrea & Hugh commented
that - it's not good enough, since:

(1) It doesn't work on shmfs, if the blocks are swapped out.
(2) it doesn't work on real filesystems and corrupts stuff (because
we are thrashing pagecache without filesystem knowledge).

> 
> Why does tmpfs need this feature?  What's the requirement here?  Please
> spill the beans ;)

I have 2 reasons (I don't know if Andrea has more uses/reasons):

(1) Our database folks want to drop parts of shared memory segments
when they see memory pressure or memory hotplug/virtualization stuff.
madvise(DONTNEED) is not really releasing the pagecache pages. So 
they want madvise(DISCARD).

(2) Jeff Dike wants to use this for UML.

> 
> 
> Comment on the patch: doing it via madvise sneakily gets around the
> problems with partial-page truncation (we don't currently have a way to
> release anything but the tail-end of a page's blocks).
> 
> But if we start adding infrastructure of this sort people are, reasonably,
> going to want to add sys_holepunch(fd, start, len) and it's going to get
> complexer.

Please advise on what you would prefer. A small extension to madvise()
to solve few problems right now OR lets do real sys_holepunch() and
bite the bullet (even though we may not get any more users for it).

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 18:35           ` Badari Pulavarty
@ 2005-10-27 18:50             ` Andrew Morton
  2005-10-27 19:40               ` Gerrit Huizenga
                                 ` (2 more replies)
  0 siblings, 3 replies; 86+ messages in thread
From: Andrew Morton @ 2005-10-27 18:50 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: andrea, ak, hugh, jdike, dvhltc, linux-mm

Badari Pulavarty <pbadari@us.ibm.com> wrote:
>
> I have 2 reasons (I don't know if Andrea has more uses/reasons):
>
> (1) Our database folks want to drop parts of shared memory segments
> when they see memory pressure

How do they "see memory pressure"?

The kernel's supposed to write the memory out to swap under memory
pressure, so why is a manual interface needed?

> or memory hotplug/virtualization stuff.

Really?  Are you sure?  Is this the only means by which the memory hotplug
developers can free up shmem pages?  I think not...

> madvise(DONTNEED) is not really releasing the pagecache pages. So 
> they want madvise(DISCARD).
>
> (2) Jeff Dike wants to use this for UML.

Why?  For what purpose?   Will he only ever want it for shmem segments?

> Please advise on what you would prefer. A small extension to madvise()
>  to solve few problems right now OR lets do real sys_holepunch() and
>  bite the bullet (even though we may not get any more users for it).

I don't think that the benefits for a full holepunch would be worth the
complexity - nasty, complex, rarely-tested changes to every filesystem.  So
let's not go there.

If we take the position that this is a shmem-specific thing and we don't
intend to extend it to real/regular filesystems then perhaps a new syscall
would be more appropriate.  On x86 that'd probably be another entry in the
sys_shm() switch statement.  Maybe?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 18:50             ` Andrew Morton
@ 2005-10-27 19:40               ` Gerrit Huizenga
  2005-10-27 19:56                 ` Andi Kleen
  2005-10-27 20:05               ` Theodore Ts'o
  2005-10-27 20:22               ` Jeff Dike
  2 siblings, 1 reply; 86+ messages in thread
From: Gerrit Huizenga @ 2005-10-27 19:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Badari Pulavarty, andrea, ak, hugh, jdike, dvhltc, linux-mm

On Thu, 27 Oct 2005 11:50:50 PDT, Andrew Morton wrote:
> Badari Pulavarty <pbadari@us.ibm.com> wrote:
> >
> > I have 2 reasons (I don't know if Andrea has more uses/reasons):
> >
> > (1) Our database folks want to drop parts of shared memory segments
> > when they see memory pressure
> 
> How do they "see memory pressure"?
> 
> The kernel's supposed to write the memory out to swap under memory
> pressure, so why is a manual interface needed?
> 
> > or memory hotplug/virtualization stuff.
> 
> Really?  Are you sure?  Is this the only means by which the memory hotplug
> developers can free up shmem pages?  I think not...
 
On pSeries, an LPAR can shrink the amount of memory/number of processors
available to an OS instance.  The most convenient way for this to happen
for some applications is to tell them that their world has shrunk, so
they can consciously resize their various data pools, mmap segments,
buffers, pre-fault rates, heaps, etc. in some uniform way.  Once they
have been told the world is going to shrink the LPAR can more easily
find free pages to scavenge without sending the machine into paroxysms
of paging and thrashing.

> > madvise(DONTNEED) is not really releasing the pagecache pages. So 
> > they want madvise(DISCARD).
> >
> > (2) Jeff Dike wants to use this for UML.
> 
> Why?  For what purpose?   Will he only ever want it for shmem segments?

 I don't know Jeff's purpose, but this allows some large applications
 to mmap a ridiculously large mmap segment which doesn't have to be
 remapped every time the underlying hardware changes.  At the same time,
 some applications (DB2 is the prime example here, but Java wants this
 as well) know when pages are no longer needed and would like to free
 them.

 In Java, for instance, the heap can do a two hand sweep and compress,
 moving active pages from one side of the heap to the other periodically.
 (Actually the heap management is a bit more complex than that, but...)
 The overall heap is a large virtual address space but in reality
 when pages are freed from it, the application really believes those
 pages can go away and should not be cached or preserved for that section.
 The physical pages can be re-used immediately and re-faulted (possibly
 ZFOD) if necessary afterwards.

> > Please advise on what you would prefer. A small extension to madvise()
> >  to solve few problems right now OR lets do real sys_holepunch() and
> >  bite the bullet (even though we may not get any more users for it).
> 
> I don't think that the benefits for a full holepunch would be worth the
> complexity - nasty, complex, rarely-tested changes to every filesystem.  So
> let's not go there.
> 
> If we take the position that this is a shmem-specific thing and we don't
> intend to extend it to real/regular filesystems then perhaps a new syscall
> would be more appropriate.  On x86 that'd probably be another entry in the
> sys_shm() switch statement.  Maybe?

 I believe Java uses mmap() today for this; DB2 probably uses both mmap()
 and shm*().

gerrit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 19:40               ` Gerrit Huizenga
@ 2005-10-27 19:56                 ` Andi Kleen
  2005-10-27 23:21                   ` Darren Hart
  0 siblings, 1 reply; 86+ messages in thread
From: Andi Kleen @ 2005-10-27 19:56 UTC (permalink / raw)
  To: Gerrit Huizenga
  Cc: Andrew Morton, Badari Pulavarty, andrea, hugh, jdike, dvhltc,
	linux-mm

On Thursday 27 October 2005 21:40, Gerrit Huizenga wrote:

>  I believe Java uses mmap() today for this; DB2 probably uses both mmap()
>  and shm*().

In the java case the memory should be anonymous, no? This means just plain
munmap would work. Or do I miss something?

-Andi

 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 18:20         ` Andrew Morton
  2005-10-27 18:35           ` Badari Pulavarty
@ 2005-10-27 20:04           ` Andrea Arcangeli
  2005-10-27 20:50             ` Andrew Morton
  2005-10-27 23:28             ` Peter Chubb
  1 sibling, 2 replies; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-27 20:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 11:20:54AM -0700, Andrew Morton wrote:
> googling MADV_DISCARD comes up with basically nothing.  MADV_TRUNCATE comes
> up with precisely nothing.
> 
> Why does tmpfs need this feature?  What's the requirement here?  Please
> spill the beans ;)

MADV_TRUNCATE is a name I made up myself last month. During a
presentation at suse labs conf some people at SUSE even complained that
it may not be the right name (they intended the word truncate as
reducing the i_size), but it made sense to me since internally
what it does is a truncate_range (plus truncate also increases the size,
it's not only a "truncate" anyway).

The idea is to implement a sys_truncate_range, but using the mappings so
the user doesn't need to keep track of which parts of the file have to
be truncated, and it only needs to know which part of the address space
is obsolete. This will be the first API that allows to re-create holes
in files.

I'm not a buzzword(tm) producer, so if you don't like the name feel free
to rename it, I don't actually care about names. For now MADV_TRUNCATE
is a placeholder name, which quite clearly explains what the syscall
does.

> Comment on the patch: doing it via madvise sneakily gets around the
> problems with partial-page truncation (we don't currently have a way to
> release anything but the tail-end of a page's blocks).
> 
> But if we start adding infrastructure of this sort people are, reasonably,
> going to want to add sys_holepunch(fd, start, len) and it's going to get
> complexer.

Yes, I also wanted to add both a sys_truncate_range and a MADV_TRUNCATE,
but the partner only needs MADV_TRUNCATE and they don't care about the
sys_truncate_range, so it got higher prio.

When I received MADV_DISCARD patch I suggested Badari to actually
implement the MADV_TRUNCATE, in the short term we only care about tmpfs
of course (the same would apply to a sys_truncate_range), but I think
the MADV_TRUNCATE API is cleaner for the long term than a tmpfs specific
hack.

Some app allocates large tmpfs files, then when some task quits and some
client disconnects, some memory can be released. However the only way to
release tmpfs-swap is to MADV_TRUNCATE.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 18:50             ` Andrew Morton
  2005-10-27 19:40               ` Gerrit Huizenga
@ 2005-10-27 20:05               ` Theodore Ts'o
  2005-10-27 20:16                 ` Andrea Arcangeli
  2005-10-28  1:42                 ` Badari Pulavarty
  2005-10-27 20:22               ` Jeff Dike
  2 siblings, 2 replies; 86+ messages in thread
From: Theodore Ts'o @ 2005-10-27 20:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Badari Pulavarty, andrea, ak, hugh, jdike, dvhltc, linux-mm

This is somewhat related to something which the JVM folks have been
pestering us (i.e., anyone within the LTC who will listen :-) for a
while now, which is a way to very _quickly_ (i.e., faster than munmap)
tell the kernel that a certain range of pages are not used any more by
the JVM, because the garbage collector has finished, and the indicated
region of memory is unused "oldspace".  

If those pages are needed the kernel is free to grab them for another
purpose without writing them back to swap, and any attempt to read
from said memory afterwards should result in undefined behaviour.  In
practice, the JVM should never (absent bugs) try to read or write from
such pages before it tells the kernel that it cares about a region of
memory again (i.e., when the garbage collector runs again and needs to
use that section of memory for memory allocations, at which point it
won't care what the old memory values were).

The JVM folks have tried using munmap, but it's too slow and if the
system isn't under memory pressure (as would be the case when an
application is correctly tuned for the machine and in benchmark
situations :-), completely unnecessary, since the pages will have to be
mmaped back in after the next GC anyway.  So currently today, the JVM
folks simply do not release oldspace memory back to the system at all
after a GC.

What would be nice is if there were some way that a VMA could be
marked, "contents are unimportant", so that if there is a need for any
pages, the pages can be assumed to be clean and can simply be reused
for another purpose once they are deactivated without needing to waste
any swap bandwidth writing out pages whose contents are unimportant
and not in use by the JVM.  Then when the region is marked as being in
use again, and when it is touched, we simply map in the zero page COW.

That way, if the system is operating with plenty of memory, the
performance impact is minimal (simply setting and clearing a bit in the VMA).
But if the system is under memory pressure, the JVM is being a good
citizen and allowing its memory pages to be used for other purposes.

Does this sound like an idea that would be workable?  I'm not a VM
expert, but it doesn't sound like it's that hard, and I don't see any
obvious flaws with this plan.

						- Ted

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 20:05               ` Theodore Ts'o
@ 2005-10-27 20:16                 ` Andrea Arcangeli
  2005-10-28  1:42                 ` Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-27 20:16 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andrew Morton, Badari Pulavarty, ak, hugh, jdike, dvhltc,
	linux-mm

Hi Ted,

On Thu, Oct 27, 2005 at 04:05:15PM -0400, Theodore Ts'o wrote:
> Does this sound like an idea that would be workable?  I'm not a VM
> expert, but it doesn't sound like it's that hard, and I don't see any
> obvious flaws with this plan.

AFAIK, the closest thing today is a MADV_DONTNEED.

Actually our MADV_DONTNEED is equivalent to the Slowlaris MADV_FREE. Our
MADV_DONTNEED is too aggressive for anonymous memory (IIRC their
MADV_DONTNEED is not destructive).

So in short our linux MADV_DONTNEED already does what you suggested for
anonymous memory and it effectively implements the MADV_FREE (actually
our MADV_DONTNEED also works on non-anonymous vmas, but it's not
destructive for the non anonymous vmas, our MADV_DONTNEED is destructive
only for the anonymous vmas).

Our MADV_DONTNEED is a bit heavy though, not as heavy as an munmap but
quite heavy too since it will walk all the pagetables for the region you
unmap. No syscall is still cheaper than MADV_DONTNEED ;)

I think we should rename our MADV_DONTNEED to MADV_FREE since we already
match the semantics of MADV_FREE, the only difference is that our
MADV_DONTNEED doesn't return -EINVAL if the mapping is not anonymous
(i.e. filebacked).

The place where MADV_TRUNCATE kicks in is for the filebacked vmas, for
the anonymous vmas our MADV_DONTNEED already works.

Not sure if we should change MADV_TRUNCATE to transparently fallback to
MADV_FREE for the anonymous vmas. That would provide an universal
destructive API that works anywhere (as long as there's some vma mapped
in the region). OTOH forcing people to use MADV_TRUNCATE for filebacked
vmas, and MADV_FREE for anonymous vmas would be more strict behaviour,
but then it's less handy to use.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 18:50             ` Andrew Morton
  2005-10-27 19:40               ` Gerrit Huizenga
  2005-10-27 20:05               ` Theodore Ts'o
@ 2005-10-27 20:22               ` Jeff Dike
  2 siblings, 0 replies; 86+ messages in thread
From: Jeff Dike @ 2005-10-27 20:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Badari Pulavarty, andrea, ak, hugh, dvhltc, linux-mm,
	Theodore Ts'o

On Thu, Oct 27, 2005 at 11:50:50AM -0700, Andrew Morton wrote:
> > or memory hotplug/virtualization stuff.
> 
> Really?  Are you sure?  Is this the only means by which the memory hotplug
> developers can free up shmem pages?  I think not...
> 
> > madvise(DONTNEED) is not really releasing the pagecache pages. So 
> > they want madvise(DISCARD).
> >
> > (2) Jeff Dike wants to use this for UML.
> 
> Why?  For what purpose?   Will he only ever want it for shmem segments?

I want this for memory hotplug.  This isn't the only possible
mechanism.  Others that will work are
	sys_punch
	a special driver that frees memory when its map count goes to
zero

I kludged the second into shmfs, but I wouldn't recommend it to
anyone.

madvise(DONT_NEED) doesn't work because it only actually frees memory
when called on anonymous pages.  I need dirty file-backed pages to be
freed as though they are clean.

An shmem-only implementation would work for me.  tmpfs is noticeably
faster as backing for UML memory than a disk-based filesystem.
However, if a disk-backed filesystem is faster than tmpfs, then I'll
start wanting something more like sys_punch :-)

Ted's comment about freeing oldmemory might also be interesting for
UML.  In that case, __free_pages might invoke some host mechanism to
free the pages on the host.  The mechanism would have to be fast, and
I'm not sure how well it would do in practice because freed pages are
pretty likely to be reallocated quickly.  This could help when a bunch
of dirty anonymous pages get freed when a large process exits.  But if
the system is under any kind of memory pressure, freed pages will just
get reused immediately, so freeing them on the host would be
pointless.

				Jeff

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 20:04           ` Andrea Arcangeli
@ 2005-10-27 20:50             ` Andrew Morton
  2005-10-27 21:37               ` Andrea Arcangeli
  2005-10-27 22:32               ` Badari Pulavarty
  2005-10-27 23:28             ` Peter Chubb
  1 sibling, 2 replies; 86+ messages in thread
From: Andrew Morton @ 2005-10-27 20:50 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

Andrea Arcangeli <andrea@suse.de> wrote:
>
> On Thu, Oct 27, 2005 at 11:20:54AM -0700, Andrew Morton wrote:
> > googling MADV_DISCARD comes up with basically nothing.  MADV_TRUNCATE comes
> > up with precisely nothing.
> > 
> > Why does tmpfs need this feature?  What's the requirement here?  Please
> > spill the beans ;)
> 
> MADV_TRUNCATE is a name I made up myself last month.

You misunderstand.  I'm unconcerned about the names.  My reasons for
googling was to wonder "wtf is this feature for?".  And it came up blank.

> ...
> but the partner only needs MADV_TRUNCATE and they don't care about the
> sys_truncate_range, so it got higher prio.

This is what I'm asking about.  What's the requirement?  What's the
application?  What's the workload?  What's the testcase?  All that old
stuff.  This should have been the very, very first thing which Badari
presented to us.

> I think
> the MADV_TRUNCATE API is cleaner for the long term than a tmpfs specific
> hack.

Why?

If we do it this way then we should do it for other filesystems.  And then
we should do it for files which _aren't_ mmapped.  And then we should do it
on a finer-than-PAGE_SIZE granularity.

IOW: we're unlikely to implement MADV_TRUNCATE for anything other than
tmpfs, in which case MADV_TRUNCATE will remain a tmpfs specific hack, no?

> Some app allocates large tmpfs files, then when some task quits and some
> client disconnect, some memory can be released. However the only way to
> release tmpfs-swap is to MADV_TRUNCATE.

Or to swap it out.


I think we need to restart this discussion.  Can we please have a
*detailed* description of the problem?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 20:50             ` Andrew Morton
@ 2005-10-27 21:37               ` Andrea Arcangeli
  2005-10-27 22:23                 ` Andrew Morton
  2005-10-27 22:32               ` Badari Pulavarty
  1 sibling, 1 reply; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-27 21:37 UTC (permalink / raw)
  To: Andrew Morton; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 01:50:58PM -0700, Andrew Morton wrote:
> This is what I'm asking about.  What's the requirement?  What's the
> application?  What's the workload?  What's the testcase?  All that old
> stuff.  This should have been the very, very first thing which Badari
> presented to us.

I mentioned the reason we need that feature at the end of the last email.

> If we do it this way then we should do it for other filesystems.  And then

Why do you think so? Even O_DIRECT and the acl were not supported by all
the fs immediately, what's wrong with that? This is normal procedure as
far as I can tell. If -ENOSYS is returned, it means the app should
fallback to some other way to do the truncate by hand (depending on the
app, bzero could work or some other app can be ok with doing nothing at
all if -ENOSYS is returned).

> we should do it for files which _aren't_ mmapped.  And then we should do it
> on a finer-than-PAGE_SIZE granularity.

I agree with this. I also suggested doing all of it, not just the mmap
interface. However the only thing they care about is the mmap interface,
and this is why this is coming first. Also note, my MADV_TRUNCATE is by
coincidence needed by IBM too, the testcase I was trying to improve was
not an IBM workload, I learnt about the IBM effort only a few days ago.
But others happen to need it for the very same reason (no, not Oracle,
but Oracle would benefit from it too of course).

> IOW: we're unlikely to implement MADV_TRUNCATE for anything other than
> tmpfs, in which case MADV_TRUNCATE will remain a tmpfs specific hack, no?

In 2.6 yes. But in the future it's an API we can extend to work on more
fs with well defined semantics.

What's the benefit in having MADV_DISCARD that works on tmpfs, and then
some day in the future to add a MADV_TRUNCATE that works on other fs too?

The retval of MADV_TRUNCATE will still be an error in both cases for
older kernels. So we may go for the more generic API in the first place
IMHO.

The less MADV_MESS there is the better and the more explicit the name is
the better too.

> Or to swap it out.

Ok, the whole point is to release the swap. This stuff has already been
completely in swap for ages, nobody touched it for ages, but it's bad for
performance and for swap fragmentation if after a peak of load 16G
remains always in swap when in fact the app could release all the
swap after the load went down (if only it could use MADV_TRUNCATE).
 
At some point during the lifetime of the application thousands of clients
connect, each one allocates from tmpfs, then when the load goes down we
want to free the swap that contains no useful info anymore. Perhaps such
a peak load will never happen again in the lifetime of the application,
and we want to have swap available for other usages. munmap isn't
enough, that's tmpfs backed storage, only truncate can release the swap.

> I think we need to restart this discussion.  Can we please have a

Sure no problem.

> *detailed* description of the problem?

Hope the above clarifies some more bits.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 21:37               ` Andrea Arcangeli
@ 2005-10-27 22:23                 ` Andrew Morton
  2005-10-27 23:05                   ` Badari Pulavarty
  2005-10-28  0:22                   ` Andrea Arcangeli
  0 siblings, 2 replies; 86+ messages in thread
From: Andrew Morton @ 2005-10-27 22:23 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

Andrea Arcangeli <andrea@suse.de> wrote:
>
> On Thu, Oct 27, 2005 at 01:50:58PM -0700, Andrew Morton wrote:
> > This is what I'm asking about.  What's the requirement?  What's the
> > application?  What's the workload?  What's the testcase?  All that old
> > stuff.  This should have been the very, very first thing which Badari
> > presented to us.
> 
> I mentioned the reason we need that feature at the end of the last email.

It's slowly becoming clearer ;)

> > If we do it this way then we should do it for other filesystems.  And then
> 
> Why do you think so? Even O_DIRECT and the acl were not supported by all
> the fs immediately, what's wrong with that? This is normal procedure as
> far as I can tell. If -ENOSYS is returned, it means the app should
> fallback to some other way to do the truncate by hand (depending on the
> app, bzero could work or some other app can be ok with doing nothing at
> all if -ENOSYS is returned).

But in the case of O_DIRECT and acls we had a plan, from day one, to extend
the capability to many (ideally all) filesystems.

We have no such plan for holepunching!

Maybe we _should_ have such a plan, but we've never discussed it.

If we _do_ have such a plan (or might in the future) then what would the
API look like?  I think sys_holepunch(fd, start, len), so we should start
out with that.

If we don't have such a plan, and we don't think that we ever will have
such a plan, then what should the API look like?

Using madvise is very weird, because people will ask "why do I need to mmap
my file before I can stick a hole in it?"

None of the other madvise operations call into the filesystem in this manner.

A broad question is: is this capability an MM operation or a filesystem
operation?  truncate, for example, is a filesystem operation which
sometimes has MM side-effects.  madvise is an mm operation and with this
patch, it gains FS side-effects, only they're really, really significant
ones.

So I'm struggling to work out where all this is headed, and how we should
think about it all.

> > we should do it for files which _aren't_ mmapped.  And then we should do it
> > on a finer-than-PAGE_SIZE granularity.
> 
> I agree with this. I also suggested doing all of it, not just the mmap
> interface.

Right.  Sometime, maybe.  There's been _some_ demand for holepunching, but
it's been fairly minor and is probably a distraction from this immediate
and specific customer requirement.

> However the only thing they care about is the mmap interface,
> and this is why this is coming first. Also note, my MADV_TRUNCATE is by
> coincidence needed by IBM too, the testcase I was trying to improve was
> not an IBM workload, I learnt about the IBM effort only a few days ago.
> But others happen to need it for the very same reason (no, not Oracle,
> but Oracle would benefit from it too of course).
> 
> > IOW: we're unlikely to implement MADV_TRUNCATE for anything other than
> > tmpfs, in which case MADV_TRUNCATE will remain a tmpfs specific hack, no?
> 
> In 2.6 yes. But in the future it's an API we can extend to work on more
> fs with well defined semantics.

Right.  And in the future I think it would be designed as a generalisation
of sys_ftruncate().

> What's the benefit in having MADV_DISCARD that works on tmpfs, and then
> some day in the future to add a MADV_TRUNCATE that works on other fs too?
> 
> The retval of MADV_TRUNCATE will still be an error in both cases for
> older kernels. So we may go for the more generic API in the first place
> IMHO.
> 
> The less MADV_MESS there is the better and the more explicit the name is
> the better too.
> 
> > Or to swap it out.
> 
> Ok, the whole point is to release the swap. This stuff is already in
> completely swap for ages, nobody touched it for ages, but it's bad for
> performance and for swap fragmentation if after a peak of load 16G
> remains always in swap when infact the app could release all the
> swap after the load went down (if only it could use MADV_TRUNCATE).

ah-hah.

hm.   Tossing ideas out here:

- Implement the internal infrastructure as you have it

- View it as a filesystem operation which has MM side-effects.

- Initially access it via sys_ipc()  (or madvise, I guess.  Both are a bit odd)

- Later access it via sys_[hole]punch()

Alternatively, access it via sys_[hole]punch() immediately, but I'm not
sure that userspace can get access to the shm area's fd?


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 20:50             ` Andrew Morton
  2005-10-27 21:37               ` Andrea Arcangeli
@ 2005-10-27 22:32               ` Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-27 22:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, ak, hugh, jdike, dvhltc, linux-mm

On Thu, 2005-10-27 at 13:50 -0700, Andrew Morton wrote:

> I think we need to restart this discussion.  Can we please have a
> *detailed* description of the problem?

Andrew,

Sorry for replying late, I am just relaxing and watching fun :)

Here are the reasons I believe our database folks want this.
(I am not a database person, if you need more info I can go back
and ask them).

1) In most customer environments, they run multiple instances
of DB2 in the system (single OS) to serve different databases.
At the time of starting these instances, they size their buffers,
shared memory segments etc and hope to run with it. Depending
on the load & access patterns on different databases - they 
would like to grow and shrink their buffers. 

Currently, they are using /proc/meminfo to notice the memory
usages (and pressure - they want a better way and discussion
for another topic) and they want to release part of their
shared memory segments (drop them to floor and free up the
swap entries - since they already did whatever they need to
do with those).

So, I proposed madvise(DISCARD) functionality AND I care
about ONLY shared memory segments. (I don't remember they 
wanting this for mmap(), could be wrong - but I am definite
about file-backed mmap()s).

2) In virtualized environments, they want to react "nicely"
to changes in the memory configuration - by releasing the
portions of segments they don't need. (Its not a hotplug
remove - where hotplug is trying to free up a particular
memory region). They would like to size their resources
depending on memory add/remove events.

Does this help ?

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 22:23                 ` Andrew Morton
@ 2005-10-27 23:05                   ` Badari Pulavarty
  2005-10-27 23:16                     ` Andrew Morton
  2005-10-28  0:22                   ` Andrea Arcangeli
  1 sibling, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-27 23:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, ak, hugh, jdike, dvhltc, linux-mm

On Thu, 2005-10-27 at 15:23 -0700, Andrew Morton wrote:

> 
> hm.   Tossing ideas out here:
> 
> - Implement the internal infrastructure as you have it
> 
> - View it as a filesystem operation which has MM side-effects.
> 
> - Initially access it via sys_ipc()  (or madvise, I guess.  Both are a bit odd)
> 
> - Later access it via sys_[hole]punch()

Thats exactly what my patch provides. Do you really want to see this
through sys_ipc() or shmctl() ? I personally think madvise() or
sys_holepunch are the closest (since they work on a range).

What else I need to do to make it more palatable ?

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 23:05                   ` Badari Pulavarty
@ 2005-10-27 23:16                     ` Andrew Morton
  2005-10-27 23:33                       ` Peter Chubb
  0 siblings, 1 reply; 86+ messages in thread
From: Andrew Morton @ 2005-10-27 23:16 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: andrea, ak, hugh, jdike, dvhltc, linux-mm

Badari Pulavarty <pbadari@us.ibm.com> wrote:
>
> On Thu, 2005-10-27 at 15:23 -0700, Andrew Morton wrote:
> 
> > 
> > hm.   Tossing ideas out here:
> > 
> > - Implement the internal infrastructure as you have it
> > 
> > - View it as a filesystem operation which has MM side-effects.
> > 
> > - Initially access it via sys_ipc()  (or madvise, I guess.  Both are a bit odd)
> > 
> > - Later access it via sys_[hole]punch()
> 
> Thats exactly what my patch provides. Do you really want to see this
> through sys_ipc() or shmctl() ? I personally think madvise() or
> sys_holepunch are the closest (since they work on a range).

Well I do think madvise() is an unnatural interface to what is mainly a
filesystem operation.

It's just that this initial requirement is actually a need for the
operation's MM side-effects, so we're incorrectly thinking of it as an MM
operation.  I think.

> What else I need to do to make it more palatable ?

Can we do sys_fholepunch(int fd, loff_t offset, loff_t length)?  That
requires that your applications know both the fd and the file offset.  

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 19:56                 ` Andi Kleen
@ 2005-10-27 23:21                   ` Darren Hart
  0 siblings, 0 replies; 86+ messages in thread
From: Darren Hart @ 2005-10-27 23:21 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Gerrit Huizenga, Andrew Morton, Badari Pulavarty, andrea, hugh,
	jdike, linux-mm

Andi Kleen wrote:
> On Thursday 27 October 2005 21:40, Gerrit Huizenga wrote:
> 
> 
>> I believe Java uses mmap() today for this; DB2 probably uses both mmap()
>> and shm*().
> 
> 
> In the java case the memory should be anonymous, no? This means just plain
> munmap would work. Or do I miss something?

I believe it was mentioned earlier (Andrea in reply to Ted) that 
madvise(MADV_DONTNEED) would work in the anonymous case.

> 
> -Andi
> 
>  
> 


-- 
Darren Hart
IBM Linux Technology Center
Linux Kernel Team
Phone: 503 578 3185
   T/L: 775 3185

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 20:04           ` Andrea Arcangeli
  2005-10-27 20:50             ` Andrew Morton
@ 2005-10-27 23:28             ` Peter Chubb
  2005-10-27 23:49               ` Andrew Morton
  1 sibling, 1 reply; 86+ messages in thread
From: Peter Chubb @ 2005-10-27 23:28 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Andrew Morton, pbadari, ak, hugh, jdike, dvhltc, linux-mm

>>>>> "Andrea" == Andrea Arcangeli <andrea@suse.de> writes:

Andrea> On Thu, Oct 27, 2005 at 11:20:54AM -0700, Andrew Morton wrote:

Andrea> The idea is to implement a sys_truncate_range, but using the
Andrea> mappings so the user doesn't need to keep track of which parts
Andrea> of the file have to be truncated, and it only needs to know
Andrea> which part of the address space is obsolete. This will be the
Andrea> first API that allows to re-create holes in files.

The preexisting art is for the SysVr4 fcntl(fd, F_FREESP, &lk);
which frees space in the file covered by the struct flock * third
argument.   Depending on the filesystem, this may or may not work in
the middle of a file: it does for XFS, and could for tmpfs.  It always
works at the end of a file.  So that should be `first API in Linux'

Peter C


-- 
Dr Peter Chubb  http://www.gelato.unsw.edu.au  peterc AT gelato.unsw.edu.au
The technical we do immediately,  the political takes *forever*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 23:16                     ` Andrew Morton
@ 2005-10-27 23:33                       ` Peter Chubb
  0 siblings, 0 replies; 86+ messages in thread
From: Peter Chubb @ 2005-10-27 23:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Badari Pulavarty, andrea, ak, hugh, jdike, dvhltc, linux-mm

>>>>> "Andrew" == Andrew Morton <akpm@osdl.org> writes:


Andrew> Can we do sys_fholepunch(int fd, loff_t offset, loff_t
Andrew> length)?  That requires that your applications know both the
Andrew> fd and the file offset.

Can we copy the SvR4 fcntl(int fd, F_FREESP, struct flock *lkp) ??
It'd ease the  porting burden for some things.


-- 
Dr Peter Chubb  http://www.gelato.unsw.edu.au  peterc AT gelato.unsw.edu.au
The technical we do immediately,  the political takes *forever*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 23:28             ` Peter Chubb
@ 2005-10-27 23:49               ` Andrew Morton
  2005-10-27 23:56                 ` Nathan Scott
  2005-10-27 23:59                 ` Peter Chubb
  0 siblings, 2 replies; 86+ messages in thread
From: Andrew Morton @ 2005-10-27 23:49 UTC (permalink / raw)
  To: Peter Chubb; +Cc: andrea, pbadari, ak, hugh, jdike, dvhltc, linux-mm

Peter Chubb <peterc@gelato.unsw.edu.au> wrote:
>
> >>>>> "Andrea" == Andrea Arcangeli <andrea@suse.de> writes:
> 
> Andrea> On Thu, Oct 27, 2005 at 11:20:54AM -0700, Andrew Morton wrote:
> 
> Andrea> The idea is to implement a sys_truncate_range, but using the
> Andrea> mappings so the user doesn't need to keep track of which parts
> Andrea> of the file have to be truncated, and it only needs to know
> Andrea> which part of the address space is obsolete. This will be the
> Andrea> first API that allows to re-create holes in files.
> 
> The preexisting art is for the SysVr4 fcntl(fd, F_FREESP, &lk);
> which frees space in the file covered by the struct flock * third
> argument.

Thanks.  That's a rather klunky API but it'd be straightforward enough to
implement.

However if we did this we'd need to do a 64-bit version as well, using
flock64.  Which means we really needn't bother with the 32-bit version,
which means we're not svr4-compatible, unless svr4 also has a 64-bit
version??

>   Depending on the filesystem, this may or may not work in
> the middle of a file: it does for XFS, and could for tmpfs.  It always
> works at the end of a file.  So that should be `first API in Linux'

Sounds sane.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 23:49               ` Andrew Morton
@ 2005-10-27 23:56                 ` Nathan Scott
  2005-10-28  0:15                   ` Andrea Arcangeli
  2005-10-27 23:59                 ` Peter Chubb
  1 sibling, 1 reply; 86+ messages in thread
From: Nathan Scott @ 2005-10-27 23:56 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Chubb, andrea, pbadari, ak, hugh, jdike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 04:49:59PM -0700, Andrew Morton wrote:
> Peter Chubb <peterc@gelato.unsw.edu.au> wrote:
> > The preexisting art is for the SysVr4 fcntl(fd, F_FREESP, &lk);
> > which frees space in the file covered by the struct flock * third
> > argument.
> 
> Thanks.  That's a rather klunky API but it'd be straightforward enough to
> implement.
> 
> However if we did this we'd need to do a 64-bit version as well, using
> flock64.  Which means we really needn't bother with the 32-bit version,
> which means we're not svr4-compatible, unless svr4 also has a 64-bit
> version??

There is, at least on IRIX (F_FREESP64).  Agreed on the API klunkiness
though ... its really not pretty. :|  Personally, I'd recommend going
with a sane API, and perhaps emulating the other on top of it if need
be.

cheers.

-- 
Nathan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 23:49               ` Andrew Morton
  2005-10-27 23:56                 ` Nathan Scott
@ 2005-10-27 23:59                 ` Peter Chubb
  1 sibling, 0 replies; 86+ messages in thread
From: Peter Chubb @ 2005-10-27 23:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Chubb, andrea, pbadari, ak, hugh, jdike, dvhltc, linux-mm

>>>>> "Andrew" == Andrew Morton <akpm@osdl.org> writes:


Andrew> However if we did this we'd need to do a 64-bit version as
Andrew> well, using flock64.  Which means we really needn't bother
Andrew> with the 32-bit version, which means we're not
Andrew> svr4-compatible, unless svr4 also has a 64-bit version??

Yes it does.

-- 
Dr Peter Chubb  http://www.gelato.unsw.edu.au  peterc AT gelato.unsw.edu.au
The technical we do immediately,  the political takes *forever*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 23:56                 ` Nathan Scott
@ 2005-10-28  0:15                   ` Andrea Arcangeli
  0 siblings, 0 replies; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-28  0:15 UTC (permalink / raw)
  To: Nathan Scott
  Cc: Andrew Morton, Peter Chubb, pbadari, ak, hugh, jdike, dvhltc,
	linux-mm

On Fri, Oct 28, 2005 at 09:56:00AM +1000, Nathan Scott wrote:
> There is, at least on IRIX (F_FREESP64).  Agreed on the API klunkiness
> though ... its really not pretty. :|  Personally, I'd recommend going
> with a sane API, and perhaps emulating the other on top of it if need
> be.

That's fine with me as a replacement for truncate_range, this is such a
corner case usage that this api is probably ok. however this is a
separate thing from the madvise one. The madvise one is the only one
where I'm aware of a real life need (of course madvise can be also
replaced by the F_FREESP64 but see below).

I had a specific requirement of using virtual addresses of mapped tmpfs
files, and not physical offsets and filedescriptors. At first I
suggested adding a sys_truncate_range but apparently they don't
know where the file maps to (or they would need to translate it and
that's not cheap), but the kernel can find it faster than userland (or
at least not slower than userland) by using the vmas.

About madvise not being used to do fs actions I agree about that,
however madvise is already destructive in terms of anonymous memory, so
it doesn't make an huge difference to me. I just didn't imagine a better
way to do that using the virtual range. There's nothing fundamentally
wrong in using MADV_TRUNCATE to do that. This involves a fs callback but
it certainly is a mm operation too given it changes the view of the
address space the same way MADV_DONTNEED (aka MADV_FREE) does for
anonymous memory.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 22:23                 ` Andrew Morton
  2005-10-27 23:05                   ` Badari Pulavarty
@ 2005-10-28  0:22                   ` Andrea Arcangeli
  2005-10-28  0:32                     ` Andrew Morton
  1 sibling, 1 reply; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-28  0:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 03:23:40PM -0700, Andrew Morton wrote:
> It's slowly becoming clearer ;)

;)

> But in the case of O_DIRECT and acls we had a plan, from day one, to extend
> the capability to many (ideally all) filesystems.

for acl I'm unsure if we really hoped for all fs to have it, it's
similar for holepunching, vfat simply can't get it ;)

> Right.  Sometime, maybe.  There's been _some_ demand for holepunching, but
> it's been fairly minor and is probably a distraction from this immediate
> and specific customer requirement.

Yes, holepunching in a real fs is a distraction at the moment, tmpfs is
the real need.

> Right.  And in the future I think it would be designed as a generalisation
> of sys_ftruncate().

Except we can't change sys_ftruncate, and they don't have a clue on
what's the fd backing the mapping, nor the offsets.

> - View it as a filesystem operation which has MM side-effects.

I suggested the fs operation too but then it's more efficient to have it
as a mm operation with fs side effects, because they don't immediately
know fd and physical offset of the range. It's possible to fixup in
userland and to use the fs operation but it's more expensive, the vmas
are already in the kernel and we can use them.

> - Initially access it via sys_ipc()  (or madvise, I guess.  Both are a bit odd)
> 
> - Later access it via sys_[hole]punch()
> 
> Alternatively, access it via sys_[hole]punch() immediately, but I'm not
> sure that userspace can get access to the shm area's fd?

The ipc is not a problem, all data is in tmpfs, it's not ipc shared
memory. The problem is translating from virtual range to fd/physical
range, that's something the kernel can do faster internally.

The app is not open source so I have to trust them that it's true the kernel
will look it up faster and cheaper with the vma.

I like both of the fs and mmap apis being available, and for the mmap
one madvise just looks normal. madvise is already destructive for
anonymous memory, we're just going to make it destructive for filebacked
mappings too the same way. In fact perhaps we can make MADV_TRUNCATE
fallback into MADV_FREE (current MADV_DONTNEED) mode when on anonymous
memory. I don't see anything fundamentally wrong with such an API, as
long as kernel internals remains sane (i.e. we don't run into
fundamental locking problems etc..).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  0:22                   ` Andrea Arcangeli
@ 2005-10-28  0:32                     ` Andrew Morton
  2005-10-28  1:10                       ` Andrea Arcangeli
  2005-10-28  1:27                       ` Badari Pulavarty
  0 siblings, 2 replies; 86+ messages in thread
From: Andrew Morton @ 2005-10-28  0:32 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

Andrea Arcangeli <andrea@suse.de> wrote:
>
> > - View it as a filesystem operation which has MM side-effects.
> 
>  I suggested the fs operation too but then it's more efficient to have it
>  as a mm operation with fs side effects, because they don't immediately
>  know fd and physical offset of the range. It's possible to fixup in
>  userland and to use the fs operation but it's more expensive, the vmas
>  are already in the kernel and we can use them.

hm, so we have a somewhat awkward interface to a very specific thing to
benefit a closed-source app.  That'll go down well ;)

ho-hum.  Can we think of a better name than MADV_TRUNCATE please?  Dunno
what - MADV_REMOVE?

I think it'll need to return -EINVAL for nonlinear vma's?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  0:32                     ` Andrew Morton
@ 2005-10-28  1:10                       ` Andrea Arcangeli
  2005-10-28  1:27                       ` Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-28  1:10 UTC (permalink / raw)
  To: Andrew Morton; +Cc: pbadari, ak, hugh, jdike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 05:32:43PM -0700, Andrew Morton wrote:
> hm, so we have a somewhat awkward interface to a very specific thing to
> benefit a closed-source app.  That'll go down well ;)

I know. Many of the database features have benefited closed-source apps
first and only later they have the potential to benefit everything else
too. As far as I don't have to run the closed-source apps myself I'm ok.

Plus the fact Badari also came up with a virtual-range interface with
its first MADV_DISCARD patch makes me suspect they'll also have a
benefit for similar reasons compared to the fs interface.

> what - MADV_REMOVE?

No problem with the name change.

> I think it'll need to return -EINVAL for nonlinear vma's?

That would be fine. For tmpfs it may not be too difficult to free the
swap even when the page offsets are sparse. For real fs it would be more
tricky to support many tiny holes. But the real reason I think -EINVAL
is ok, is that I generally dislike nonlinear related complexity because
I dislike nonlinear in the first place (nonlinear avoids vma overhead at
the expense of screwing up paging scalability, it should be used only in
extreme cases where the mapping is mlocked anyway).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  0:32                     ` Andrew Morton
  2005-10-28  1:10                       ` Andrea Arcangeli
@ 2005-10-28  1:27                       ` Badari Pulavarty
  2005-10-28  2:00                         ` Andrew Morton
  1 sibling, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-28  1:27 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, ak, hugh, jdike, dvhltc, linux-mm

Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> 
>>>- View it as a filesystem operation which has MM side-effects.
>>
>> I suggested the fs operation too but then it's more efficient to have it
>> as a mm operation with fs side effects, because they don't immediately
>> know fd and physical offset of the range. It's possible to fixup in
>> userland and to use the fs operation but it's more expensive, the vmas
>> are already in the kernel and we can use them.
> 
> 
> hm, so we have a somewhat awkward interface to a very specific thing to
> benefit a closed-source app.  That'll go down well ;)
> 

I am not sure how apps can work out (fd, phys off, len) for a given
shared memory segment range easily.

> ho-hum.  Can we think of a better name than MADV_TRUNCATE please?  Dunno
> what - MADV_REMOVE?

how about - MADV_DISCARD :) Just kidding - MADV_REMOVE is a good
name.

I am still not clear on the consensus here - the plan is go forward
with the patch (of course, naming changes) and maybe later add
(fd, offset, len) version of it through sys_holepunch ?

If so, I can quickly redo my patch + I need to work out bugs in
shm_truncate_range().

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-27 20:05               ` Theodore Ts'o
  2005-10-27 20:16                 ` Andrea Arcangeli
@ 2005-10-28  1:42                 ` Badari Pulavarty
  2005-10-28 16:33                   ` Theodore Ts'o
  1 sibling, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-28  1:42 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andrew Morton, andrea, ak, hugh, jdike, dvhltc, linux-mm

Theodore Ts'o wrote:

> This is somewhat related to something which the JVM folks have been
> pestering us (i.e., anyone within the LTC who will listen :-) for a
> while now, which is a way to very _quickly_ (i.e., faster than munmap)
> tell the kernel that a certain range of pages are not used any more by
> the JVM, because the garbage collector has finished, and the indicated
> region of memory is unused "oldspace".  
> 
> If those pages are needed the kernel is free to grab them for another
> purpose without writing them back to swap, and any attempt to read
> from said memory afterwards should result in undefined behaviour.  In
> practice, the JVM should never (absent bugs) try to read or write from
> such pages before it tells the kernel that it cares about a region of
> memory again (i.e., when the garbage collector runs again and needs to
> use that section of memory for memory allocations, at which point it
> won't care what the old memory values).
> 
> The JVM folks have tried using munmap, but it's too slow and if the
> system isn't under memory pressure (as would be the case when an
> application is correctly tuned for the machine and in benchmark
> situations :-), completely unnecessary, since the pages will have to
> mmaped back in after the next GC anyway.  So currently today, the JVM
> folks simply do not release oldspace memory back to the system at all
> after a GC.
> 
> What would be nice would be there is some way that an VMA could be
> marked, "contents are unimportant", so that if there is a need for any
> pages, the pages can be assumed to be clean and can simply be reused
> for another purpose once they are deactivated without needing to waste
> any swap bandwidth writing out pages whose contents are unimportant
> and not in use by the JVM.  Then when the region is marked as being in
> use again, and when it is touched, we simply map in the zero page COW.
> 
> That way, if the system is operating with plenty of memory, the
> performance is minimal (simply setting and clearing a bit in the VMA).
> But if the system is under memory pressure, the JVM is being a good
> citizen and allowing its memory pages to be used for other purposes.
> 
> Does this sound like an idea that would be workable?  I'm not a VM
> expert, but it doesn't sound like it's that hard, and I don't see any
> obvious flaws with this plan.

Ted,

Like Andrea mentioned MADV_DONTNEED should be able to do what JVM
folks want. If they want more than that, get in touch with me.
While doing MADV_REMOVE, I will see if I can satisfy their needs also.

Thanks,
Badari



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  1:27                       ` Badari Pulavarty
@ 2005-10-28  2:00                         ` Andrew Morton
  0 siblings, 0 replies; 86+ messages in thread
From: Andrew Morton @ 2005-10-28  2:00 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: andrea, ak, hugh, jdike, dvhltc, linux-mm

Badari Pulavarty <pbadari@us.ibm.com> wrote:
>
> I am still not clear on the consensus here - the plan is go forward
>  with the patch (of course, naming changes) and maybe later add
>  (fd, offset, len) version of it through sys_holepunch ?

Spose so.  <mutter>.

Please ensure that the changelog captures everything which we've discussed.

>  If so, I can quickly redo my patch + I need to work out bugs in
>  shm_truncate_range().

Don't forget VM_NONLINEAR.   And VM_HUGETLB, VM_IO, VM_whatever come to that.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-26 22:49 [RFC] madvise(MADV_TRUNCATE) Badari Pulavarty
  2005-10-27  8:38 ` Andi Kleen
@ 2005-10-28  3:46 ` Jeff Dike
  2005-10-28 11:03   ` Blaisorblade
                     ` (2 more replies)
  1 sibling, 3 replies; 86+ messages in thread
From: Jeff Dike @ 2005-10-28  3:46 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

On Wed, Oct 26, 2005 at 03:49:55PM -0700, Badari Pulavarty wrote:
> Basically, I added "truncate_range" inode operation to provide
> opportunity for the filesystem to zero the blocks and/or free
> them up. 
> 
> I also attempted to implement shmem_truncate_range() which 
> needs lots of testing before I work out bugs :(

I added memory hotplug to UML to check this out.  It seems to be freeing
pages that are outside the desired range.  I'm doing the simplest possible
thing - grabbing a bunch of pages that are most likely not dirty yet, 
and MADV_TRUNCATEing them one at a time.  Everything in UML goes haywire
after that, and the cases that I've looked at involve pages being suddenly
zero.

UML isn't exactly a minimal test case, but I'll give you what you need
to reproduce this if you want.

				Jeff

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  3:46 ` Jeff Dike
@ 2005-10-28 11:03   ` Blaisorblade
  2005-10-28 13:29     ` Andrea Arcangeli
  2005-10-28 16:16     ` Badari Pulavarty
  2005-10-28 16:19   ` Badari Pulavarty
  2005-10-28 17:55   ` [RFC] madvise(MADV_TRUNCATE) Blaisorblade
  2 siblings, 2 replies; 86+ messages in thread
From: Blaisorblade @ 2005-10-28 11:03 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Badari Pulavarty, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

On Friday 28 October 2005 05:46, Jeff Dike wrote:
> On Wed, Oct 26, 2005 at 03:49:55PM -0700, Badari Pulavarty wrote:
> > Basically, I added "truncate_range" inode operation to provide
> > opportunity for the filesystem to zero the blocks and/or free
> > them up.
> >
> > I also attempted to implement shmem_truncate_range() which
> > needs lots of testing before I work out bugs :(
>
> I added memory hotplug to UML to check this out.  It seems to be freeing
> pages that are outside the desired range.  I'm doing the simplest possible
> thing - grabbing a bunch of pages that are most likely not dirty yet,
> and MADV_TRUNCATEing them one at a time.  Everything in UML goes haywire
> after that, and the cases that I've looked at involve pages being suddenly
> zero.

Thanks for CC'ing me, Jeff.

I've just read the whole thread, and I'd thank you for this effort. I've also 
found a couple of bugs I think (see below).

It seems you completely missed the purpose of vma->vm_pgoff.

Jeff, I think this is enough to explain the problem in UML. See below.

On the plan, however, I have a concern: VM_NONLINEAR.

For now it can be ok to leave madvise(REMOVE) unimplemented for that, but if 
and when I'll get the time to finish the remap_file_pages changes* for UML to 
use it, UML will _require_ this to be implemented too.

However, looking at the patch, the implementation would boil down to something 
like

for each page in range {
	start = page->index;
	end = start + PAGE_SIZE;
	call truncate_inode_pages_range(mapping, start, end);
	inode->i_op->truncate_range(inode, start, end);
}

unmap_mapping_range() should be done at once for the whole range.

While looking at these, here's what I'd call "strange" in the patch:

Also, why is unmap_mapping_range done with the inode semaphore held? I don't
remember the locking rules, but conceptually this has no point, IMHO.

Btw, why don't I see vm_pgoff mentioned in these lines of the patch (nor
anywhere else in the patch)?

You call truncate_inode_pages_range(mapping, offset, endoff), so I think 
you're really burned here.

+offset = (loff_t)(start - vma->vm_start);
+endoff = (loff_t)(end - vma->vm_start);

* UML uses mmap()/munmap()/mprotect() to implement the virtual "hardware MMU",
which means we usually have one vma per page and that we can call hundreds of
unmaps on process exit. Ingo Molnar implemented remap_file_pages() prot
support some time ago (see around 2.6.4/2.6.5 -mm trees) and I recovered and
completed it (and posted it for review) last summer.
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 11:03   ` Blaisorblade
@ 2005-10-28 13:29     ` Andrea Arcangeli
  2005-10-28 16:56       ` Blaisorblade
  2005-10-28 16:16     ` Badari Pulavarty
  1 sibling, 1 reply; 86+ messages in thread
From: Andrea Arcangeli @ 2005-10-28 13:29 UTC (permalink / raw)
  To: Blaisorblade
  Cc: Jeff Dike, Badari Pulavarty, Hugh Dickins, akpm, dvhltc, linux-mm

On Fri, Oct 28, 2005 at 01:03:56PM +0200, Blaisorblade wrote:
> and when I'll get the time to finish the remap_file_pages changes* for UML to 
> use it, UML will _require_ this to be implemented too.

Would it be possible to make remap_file_pages an option? I mean, if
you're doing a xen-like usage, remap_file_pages is a good thing, but if
you're in a multiuser system and you want to be friendly when the system
swaps, remap_file_pages can hurt. The worst is when remap_file_pages
covers huge areas, which forces the vm to walk all the ptes for the
whole vma region for each page that could be mapped by that region.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 11:03   ` Blaisorblade
  2005-10-28 13:29     ` Andrea Arcangeli
@ 2005-10-28 16:16     ` Badari Pulavarty
  2005-10-28 18:40       ` Blaisorblade
  1 sibling, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-28 16:16 UTC (permalink / raw)
  To: Blaisorblade; +Cc: Jeff Dike, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

Blaisorblade wrote:
> On Friday 28 October 2005 05:46, Jeff Dike wrote:
> 
>>On Wed, Oct 26, 2005 at 03:49:55PM -0700, Badari Pulavarty wrote:
>>
>>>Basically, I added "truncate_range" inode operation to provide
>>>opportunity for the filesystem to zero the blocks and/or free
>>>them up.
>>>
>>>I also attempted to implement shmem_truncate_range() which
>>>needs lots of testing before I work out bugs :(
>>
>>I added memory hotplug to UML to check this out.  It seems to be freeing
>>pages that are outside the desired range.  I'm doing the simplest possible
>>thing - grabbing a bunch of pages that are most likely not dirty yet,
>>and MADV_TRUNCATEing them one at a time.  Everything in UML goes harwire
>>after that, and the cases that I've looked at involve pages being suddenly
>>zero.
> 
> 
> Thanks for CC'ing me, Jeff.
> 
> I've just read the whole thread, and I'd thank you for this effort. I've also 
> found a couple of bugs I think (see below).
> 
> It seems you completely missed the purpose of vma->vm_pgoff.
> 
> Jeff, I think this is enough to explain the problem in UML. See below.
> 
> On the plan, however, I have a concern: VM_NONLINEAR.
> 
> For now it can be ok to leave madvise(REMOVE) unimplemented for that, but if 
> and when I'll get the time to finish the remap_file_pages changes* for UML to 
> use it, UML will _require_ this to be implemented too.
> 
> However, looking at the patch, the implementation would boil down to something 
> like
> 
> for each page in range {
> 	start = page->index;
> 	end = start + PAGE_SIZE;
> 	call truncate_inode_pages_range(mapping, offset, end);
> 	inode->i_op->truncate_range(inode, offset, end);
> }
> 
> unmap_mapping_range() should be done at once for the whole range.
> 

patch does

for all the pages in the given vma {
	unmap_mapping_range(mapping, offset, end);
	truncate_inode_pages_range(mapping, offset, end);
	inode->op->truncate_range(inode, offset, end)
}

It operates on a bunch of pages in the given VMA. Since UML has
one page per VMA, it operates on one page at a time - do you
see anything wrong here?

> While looking at these, here's what I'd call "strange" in the patch:
> 
> Also, why is unmap_mapping_range done with the inode semaphore held? I don't 
> remember locking rule but conceptually this has no point, IMHO.

I am not sure either, let me look at it. (I thought we should hold it
for truncate()).

> Btw, why I don't see vm_pgoff mentioned in these lines of the patch (nor 
> anywhere else in the patch)?

vm_pgoff - I don't remember what that is supposed to represent...


> You call truncate_inode_pages_range(mapping, offset, endoff), so I think 
> you're really burned here.
> 
> +offset = (loff_t)(start - vma->vm_start);
> +endoff = (loff_t)(end - vma->vm_start);

"end" here is not the end of the VMA - it's the end of the region we want to
discard (in the UML case it's start + PAGE_SIZE). Anything wrong?
> 
> * UML uses mmap()/munmap()/mprotect() to implement the virtual "hardware MMU", 
> which means we have one vma per page usually and that we can call hundred of 
> unmaps on process exit. Ingo Molnar implemented time ago remap_file_pages() 
> prot support (see around 2.6.4/2.6.5 -mm trees) and I recovered and completed 
> it (and posted for review) during last summer.

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  3:46 ` Jeff Dike
  2005-10-28 11:03   ` Blaisorblade
@ 2005-10-28 16:19   ` Badari Pulavarty
  2005-10-28 17:10     ` Blaisorblade
  2005-10-28 18:42     ` Jeff Dike
  2005-10-28 17:55   ` [RFC] madvise(MADV_TRUNCATE) Blaisorblade
  2 siblings, 2 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-28 16:19 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

Jeff Dike wrote:

> On Wed, Oct 26, 2005 at 03:49:55PM -0700, Badari Pulavarty wrote:
> 
>>Basically, I added "truncate_range" inode operation to provide
>>opportunity for the filesystem to zero the blocks and/or free
>>them up. 
>>
>>I also attempted to implement shmem_truncate_range() which 
>>needs lots of testing before I work out bugs :(
> 
> 
> I added memory hotplug to UML to check this out.  It seems to be freeing
> pages that are outside the desired range.  I'm doing the simplest possible
> thing - grabbing a bunch of pages that are most likely not dirty yet, 
> and MADV_TRUNCATEing them one at a time.  Everything in UML goes harwire
> after that, and the cases that I've looked at involve pages being suddenly
> zero.
> 
> UML isn't exactly a minimal test case, but I'll give you what you need
> to reproduce this if you want.
> 

I cut-n-pasted shmem_truncate_range() from shmem_truncate() and fixed
a few obvious things. It's very likely that I missed a whole bunch of changes.

My touch tests so far don't really verify data after freeing. I was
thinking about writing test cases. If I can use UML to do it, please send it
to me. I would rather test with a real-world case :)

Thanks,
Badari


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  1:42                 ` Badari Pulavarty
@ 2005-10-28 16:33                   ` Theodore Ts'o
  0 siblings, 0 replies; 86+ messages in thread
From: Theodore Ts'o @ 2005-10-28 16:33 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Andrew Morton, andrea, ak, hugh, jdike, dvhltc, linux-mm

On Thu, Oct 27, 2005 at 06:42:36PM -0700, Badari Pulavarty wrote:
> Like Andrea mentioned MADV_DONTNEED should be able to do what JVM
> folks want. If they want more than that, get in touch with me.
> While doing MADV_REMOVE, I will see if I can satsify their needs also.

Well, I asked if what he wanted was simply walking all of the page
tables and marking the indicated pages as "clean", but he claimed that
anything that involved walking the pages tables would be too slow.
But it may be that he was assuming this would be as painful as
munmap(), when of course it wouldn't be.  I don't know if they've
actually benchmarked MADV_DONTNEED or not.

						- Ted

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 13:29     ` Andrea Arcangeli
@ 2005-10-28 16:56       ` Blaisorblade
  0 siblings, 0 replies; 86+ messages in thread
From: Blaisorblade @ 2005-10-28 16:56 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Jeff Dike, Badari Pulavarty, Hugh Dickins, akpm, dvhltc, linux-mm

On Friday 28 October 2005 15:29, Andrea Arcangeli wrote:
> On Fri, Oct 28, 2005 at 01:03:56PM +0200, Blaisorblade wrote:
> > and when I'll get the time to finish the remap_file_pages changes* for
> > UML to use it, UML will _require_ this to be implemented too.

> Would it be possible to make remap_file_pages an option?

Kernel-compile time? Possibly yes...

> I mean, if 
> you're doing a xen-like usage, remap_file_pages is a good thing, but if
> you're in a multiuser system and you want to be friendly when the system
> swaps, remap_file_pages can hurt. The worst is when remap_file_pages
> covers huge large areas, that forces the vm to walk all the ptes for the
> whole vma region for each page that could be mapped by that region.
Insulting 2.4 this way is like when Microsoft said "Win98 could never be
secure or reliable" :-) .

That said, yes, this concern was expressed a bit by Hugh too, some time ago...

I think that resurrecting Rik's rss ulimits would be good. Plus, falling back
from install_page to install_file_pte when the limit is hit.
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 16:19   ` Badari Pulavarty
@ 2005-10-28 17:10     ` Blaisorblade
  2005-10-28 18:28       ` Jeff Dike
  2005-10-28 18:42     ` Jeff Dike
  1 sibling, 1 reply; 86+ messages in thread
From: Blaisorblade @ 2005-10-28 17:10 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Jeff Dike, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1975 bytes --]

On Friday 28 October 2005 18:19, Badari Pulavarty wrote:
> Jeff Dike wrote:

> I cut-n-pasted shmem_truncate_range() from shmem_truncate() and fixed
> few obvious things. Its very likely that, I missed whole bunch of changes.

> My touch tests so far, doesn't really verify data after freeing. I was
> thinking about writing cases. If I can use UML to do it, please send it
> to me. I would rather test with real world case :)
I would call that a _bad_ idea, at this stage.

It may be good when the patch is already really polished, IMHO, but not for 
verifying what's really wrong.

Also, you can gdb an UML running with the patch, to verify what's going on.

But I wouldn't suggest testing this with nested UMLs - using that means 
looking for trouble.

As an example, I'm attaching a test program I wrote during my work on
remap_file_pages - it also has a mechanism for trying memory accesses,
catching SIGSEGV, and reporting whether it was / wasn't received.

Not as nice as the kernel one, but it's little enough for our purposes.

There's much stuff you likely won't need, but it can be useful both as an 
example and as a starting point.

In your case, I'd write code as:

* fill many pages (enough to cover all indirection levels in shmfs) with
numbers. The first with 1, the second with 2, and so on (I'm avoiding 0 on
purpose).

* loop over all them {
	truncate one of them

	loop over all pages to verify only that one is zero'ed (mincore() might help
	too, depending on its implementation).

	optionally, refill it (otherwise we can't check that truncating the next page
	doesn't clear this one).
}
(Possibly, truncate sets of pages, and verify only those ones are truncated).

* verify meanwhile that the tmpfs usage with statfs64() decreases.

Other suggestions?
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

[-- Attachment #2: fremap-test-complete.c.bz2 --]
[-- Type: application/x-bzip2, Size: 5377 bytes --]

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28  3:46 ` Jeff Dike
  2005-10-28 11:03   ` Blaisorblade
  2005-10-28 16:19   ` Badari Pulavarty
@ 2005-10-28 17:55   ` Blaisorblade
  2005-10-28 21:23     ` Theodore Ts'o
  2 siblings, 1 reply; 86+ messages in thread
From: Blaisorblade @ 2005-10-28 17:55 UTC (permalink / raw)
  To: Badari Pulavarty, Theodore Ts'o
  Cc: Jeff Dike, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

> On Thu, Oct 27, 2005 at 06:42:36PM -0700, Badari Pulavarty wrote:
> > Like Andrea mentioned MADV_DONTNEED should be able to do what JVM
> > folks want. If they want more than that, get in touch with me.
> > While doing MADV_REMOVE, I will see if I can satsify their needs also.

> Well, I asked if what he wanted was simply walking all of the page
> tables and marking the indicated pages as "clean",
This idea sounds interesting and kludgy enough :-) .
> but he claimed that 
> anything that involved walking the pages tables would be too slow.
> But it may be that he was assuming this would be as painful as
> munmap(), when of course it wouldn't be.

I am curious what the difference between the two is. I know that we must also
walk the vma tree, and that since we bundle the pointers in the vma the
spatial locality is very poor, but I still don't get this huge loss.

Apart from the CONFIG_PREEMPT excess case, which was just pointed out on LKML:

http://lkml.org/lkml/2005/10/27/215

(possibly point it out to your Java people, and see what they say).
> I don't know if they've 
> actually benchmarked MADV_DONTNEED or not.

-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 17:10     ` Blaisorblade
@ 2005-10-28 18:28       ` Jeff Dike
  2005-10-28 18:44         ` Blaisorblade
  0 siblings, 1 reply; 86+ messages in thread
From: Jeff Dike @ 2005-10-28 18:28 UTC (permalink / raw)
  To: Blaisorblade
  Cc: Badari Pulavarty, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

On Fri, Oct 28, 2005 at 07:10:39PM +0200, Blaisorblade wrote:
> It may be good when the patch is already really polished, IMHO, but not for 
> verifying what's really wrong.
> 
> Also, you can gdb an UML running with the patch, to verify what's going on.
> 
> But I wouldn't suggest testing this with nested UMLs - using that means 
> looking for trouble.

I think he's looking for test cases, not debugging this inside a UML.

If he's debugging on hardware, then nesting UMLs doesn't come into the
picture.

				Jeff

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 16:16     ` Badari Pulavarty
@ 2005-10-28 18:40       ` Blaisorblade
  2005-10-28 18:56         ` Badari Pulavarty
  2005-10-29  0:35         ` Badari Pulavarty
  0 siblings, 2 replies; 86+ messages in thread
From: Blaisorblade @ 2005-10-28 18:40 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Jeff Dike, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

On Friday 28 October 2005 18:16, Badari Pulavarty wrote:
> Blaisorblade wrote:
> > On Friday 28 October 2005 05:46, Jeff Dike wrote:
> >>On Wed, Oct 26, 2005 at 03:49:55PM -0700, Badari Pulavarty wrote:

> > On the plan, however, I have a concern: VM_NONLINEAR.

> > However, looking at the patch, the implementation would boil down to
> > something like
> >
> > for each page in range {
> > 	start = page->index;
> > 	end = start + PAGE_SIZE;
> > 	call truncate_inode_pages_range(mapping, offset, end);
> > 	inode->i_op->truncate_range(inode, offset, end);
> > }
> >
> > unmap_mapping_range() should be done at once for the whole range.
>
> patch does
>
> for all the pages in the given vma {
> 	unmap_mapping_range(mapping, offset, end);
> 	truncate_inode_pages_range(mapping, offset, end);
> 	inode->op->truncate_range(inode, offset, end)
> }

> It operates on bunch of pages in the given VMA. Since UML has
> one page for VMA, it operates on one page at a time - do you
> see anything wrong here ?

My point was the support for VM_NONLINEAR. In the future, UML will have one big
VMA, but different pages will be remapped with different offsets (already in 
mainline) and different protections (I have patches, I sent an earlier 
version, still revising).

In that case, you could really truncate (in one single call) pages which are 
one at the start of the file and one at the end. That's why with VM_NONLINEAR 
it wouldn't work.

However, Jeff pointed out to me that we'd probably call madvise() on the linear
kernel mapping (the kernel maps pages from the RAM file all at once, 
linearly). So you can safely just refuse operating on VM_NONLINEAR vmas.

> > While looking at these, here's what I'd call "strange" in the patch:

> > Also, why is unmap_mapping_range done with the inode semaphore held? I
> > don't remember locking rule but conceptually this has no point, IMHO.

> I am not sure either, let me look at it. (I thought we should hold it
> for truncate()).

Ok, do_truncate() uses the semaphore around the whole ops, because it's 
implemented in a radically different way (through notify_change()).

We don't need IMHO to do things that way; we don't even change i_size - not 
even when at the end of the file, as we don't want SIGBUS.

And anyway FS's must already handle holes at the end of a file.

Btw, when truncating, notify_change does:

        if (ia_valid & ATTR_SIZE)
                down_write(&dentry->d_inode->i_alloc_sem);

(which I suppose is used to protect against concurrent file extensions - page 
allocations in previous holes - and such). You should probably take that too 
(nest it inside mapping->host->i_sem).

Also, vmtruncate is called with the semaphore held because it must call 
truncate_inode_pages(), and because even the calls to i_size_write() must be 
atomic with the rest. But other than that, there's no reason. Especially, 
unmap_mapping_range() does purely pagetable operations.

> > Btw, why I don't see vm_pgoff mentioned in these lines of the patch (nor
> > anywhere else in the patch)?

> vm_pgoff - don't remember what that supposed to represent...

Call mmap() with non-0 pgoff (i.e. offset in the file), say the second file 
page. You're gonna store the pgoff parameter in vma->vm_pgoff (in PAGE_SHIFT 
units).

If I then request you to truncate the first page in the VMA, how does your 
code realize that it should punch the second page rather than the first?

However, Jeff said this _isn't_ the bug he's hitting - in his case the VMA has 
a 0 initial offset (for the same reason we don't need VM_NONLINEAR support).

> > You call truncate_inode_pages_range(mapping, offset, endoff), so I think
> > you're really burned here.

> > +offset = (loff_t)(start - vma->vm_start);
> > +endoff = (loff_t)(end - vma->vm_start);

So they would become:

offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

or with page_offset(). Btw, shouldn't this be done by some macro in 
<linux/pagemap.h>, as page_offset() and linear_page_index()?

Btw, also compare with mm/rmap.c:vma_address()/page_address_in_vma().

> "end" here is not end of VMA - its end of the region we want to discard
> (in UML case its start + PAGE_SIZE). Anything wrong ?

All ok for that, I was complaining about not using ->vm_pgoff.

I had the doubt that vm_pgoff entered the picture later, but I'm sure 
truncate_inode_pages{_range} wants file offsets, so it wasn't something I was 
missing.
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 16:19   ` Badari Pulavarty
  2005-10-28 17:10     ` Blaisorblade
@ 2005-10-28 18:42     ` Jeff Dike
  2005-10-28 18:54       ` Badari Pulavarty
  2005-10-29  0:03       ` Badari Pulavarty
  1 sibling, 2 replies; 86+ messages in thread
From: Jeff Dike @ 2005-10-28 18:42 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

[-- Attachment #1: Type: text/plain, Size: 1321 bytes --]

On Fri, Oct 28, 2005 at 09:19:14AM -0700, Badari Pulavarty wrote:
> My touch tests so far, doesn't really verify data after freeing. I was
> thinking about writing cases. If I can use UML to do it, please send it
> to me. I would rather test with real world case :)

Grab and unpack http://www.user-mode-linux.org/~jdike/truncate.tar.bz2

That will give you a "linux" directory.

Make sure that your /tmp is tmpfs with > 192M of space.

Run UML - from above the linux directory, this would be something like
	linux/2.6/linux-2.6.14-rc5/obj/linux con0=fd:0,fd:1 con1=none con=pts ssl=pts umid=debian mem=192M ubda=linux/debian_22 devfs=nomount

Log in, the root password is "root".

Unplug some memory -
	linux/uml_mconsole debian config mem=-10M

Go back to the UML and try to do something - ps, ls, anything.

It will be hung on handling an infinite page fault loop due to a whole lot
of pages having been zeroed all of a sudden.

This will happen even when you unplug 2 pages (mem=-8K).  Only one of them
will be madvised because the other is used to keep track of the madvised
pages.

I also included my patchset in there (linux/2.6/linux-2.6.14-rc5/patches) if
you want to build UML from source.  Due to my not refreshing the hotplug 
patch before making the tarball, it's not there.  So, I've attached it.

				Jeff

[-- Attachment #2: hotplug-mem --]
[-- Type: text/plain, Size: 3398 bytes --]

Index: linux-2.6.14-rc5/arch/um/drivers/mconsole_kern.c
===================================================================
--- linux-2.6.14-rc5.orig/arch/um/drivers/mconsole_kern.c	2005-10-27 17:56:17.000000000 -0400
+++ linux-2.6.14-rc5/arch/um/drivers/mconsole_kern.c	2005-10-27 23:43:04.000000000 -0400
@@ -20,6 +20,8 @@
 #include "linux/namei.h"
 #include "linux/proc_fs.h"
 #include "linux/syscalls.h"
+#include "linux/list.h"
+#include "linux/mm.h"
 #include "asm/irq.h"
 #include "asm/uaccess.h"
 #include "user_util.h"
@@ -345,6 +347,140 @@ static struct mc_device *mconsole_find_d
 	return(NULL);
 }
 
+#define UNPLUGGED_PER_PAGE \
+	((PAGE_SIZE - sizeof(struct list_head)) / sizeof(unsigned long))
+
+struct unplugged_pages {
+	struct list_head list;
+	void *pages[UNPLUGGED_PER_PAGE];
+};
+
+static unsigned long long unplugged_pages_count = 0;
+static struct list_head unplugged_pages = LIST_HEAD_INIT(unplugged_pages);
+static int unplug_index = UNPLUGGED_PER_PAGE;
+
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
+
+static int mem_config(char *str)
+{
+	unsigned long long diff;
+	int err = -EINVAL, i, add;
+	char *ret;
+
+	if(str[0] != '=')
+		goto out;
+
+	str++;
+	if(str[0] == '-')
+		add = 0;
+	else if(str[0] == '+'){
+		add = 1;
+	}
+	else goto out;
+	
+	str++;
+	diff = memparse(str, &ret);
+	if(*ret != '\0')
+		goto out;
+
+	diff /= PAGE_SIZE;
+
+	for(i = 0; i < diff; i++){
+		struct unplugged_pages *unplugged;
+		void *addr;
+
+		if(add){
+			if(list_empty(&unplugged_pages))
+				break;
+
+			unplugged = list_entry(unplugged_pages.next,
+					       struct unplugged_pages, list);
+			if(unplug_index > 0)
+				addr = unplugged->pages[--unplug_index];
+			else {
+				list_del(&unplugged->list);
+				addr = unplugged;
+				unplug_index = UNPLUGGED_PER_PAGE;
+			}
+				
+			free_page((unsigned long) addr);
+			unplugged_pages_count--;
+		}
+		else {
+			struct page *page;
+			
+			page = alloc_page(GFP_ATOMIC);
+			if(page == NULL)
+				break;
+
+			unplugged = page_address(page);
+			if(unplug_index == UNPLUGGED_PER_PAGE){
+				INIT_LIST_HEAD(&unplugged->list);
+				list_add(&unplugged->list, &unplugged_pages);
+				unplug_index = 0;
+			}
+			else {
+				struct list_head *entry = unplugged_pages.next;
+				addr = unplugged;
+
+				unplugged = list_entry(entry, 
+						       struct unplugged_pages,
+						       list);
+				unplugged->pages[unplug_index++] = addr;
+				err = madvise(addr, PAGE_SIZE, MADV_TRUNCATE);
+				if(err)
+					printk("MADV_TRUNCATE failed\n");
+			}
+
+			unplugged_pages_count++;
+		}
+	}
+
+	err = 0;
+out:
+	return err;
+}
+
+static int mem_get_config(char *name, char *str, int size, char **error_out)
+{
+	char buf[sizeof("18446744073709551615\0")];
+	int len = 0;
+
+	sprintf(buf, "%ld", uml_physmem);
+	CONFIG_CHUNK(str, size, len, buf, 1);
+
+	return len;
+}
+
+static int mem_id(char **str, int *start_out, int *end_out)
+{
+	*start_out = 0;
+	*end_out = 0;
+
+	return 0;
+}
+
+static int mem_remove(int n)
+{
+	return -EBUSY;
+}
+
+static struct mc_device mem_mc = {
+	.name		= "mem",
+	.config		= mem_config,
+	.get_config	= mem_get_config,
+	.id		= mem_id,
+	.remove		= mem_remove,
+};
+
+static int mem_mc_init(void)
+{
+	mconsole_register_dev(&mem_mc);
+	return 0;
+}
+
+__initcall(mem_mc_init);
+
 #define CONFIG_BUF_SIZE 64
 
 static void mconsole_get_config(int (*get_config)(char *, char *, int, 

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 18:28       ` Jeff Dike
@ 2005-10-28 18:44         ` Blaisorblade
  0 siblings, 0 replies; 86+ messages in thread
From: Blaisorblade @ 2005-10-28 18:44 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Badari Pulavarty, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

On Friday 28 October 2005 20:28, Jeff Dike wrote:
> On Fri, Oct 28, 2005 at 07:10:39PM +0200, Blaisorblade wrote:
> > It may be good when the patch is already really polished, IMHO, but not
> > for verifying what's really wrong.
> >
> > Also, you can gdb an UML running with the patch, to verify what's going
> > on.
> >
> > But I wouldn't suggest testing this with nested UMLs - using that means
> > looking for trouble.
>
> I think he's looking for test cases, not debugging this inside a UML.

> If he's debugging on hardware, then nesting UMLs doesn't come into the
> picture.
In fact, I'm suggesting debugging this with UML as the debuggee kernel. I've used
that _a lot_ for my remap_file_pages work. And then the nested UML thing
_does_ come into the picture.
-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 18:42     ` Jeff Dike
@ 2005-10-28 18:54       ` Badari Pulavarty
  2005-10-29  0:03       ` Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-28 18:54 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

Jeff Dike wrote:

> On Fri, Oct 28, 2005 at 09:19:14AM -0700, Badari Pulavarty wrote:
> 
>>My touch tests so far, doesn't really verify data after freeing. I was
>>thinking about writing cases. If I can use UML to do it, please send it
>>to me. I would rather test with real world case :)
> 
> 
> Grab and unpack http://www.user-mode-linux.org/~jdike/truncate.tar.bz2
> 
> That will give you a "linux" directory.
> 
> Make sure that your /tmp is tmpfs with > 192M of space.
> 
> Run UML - from above the linux directory, this would be something like
> 	linux/2.6/linux-2.6.14-rc5/obj/linux con0=fd:0,fd:1 con1=none con=pts ssl=pts umid=debian mem=192M ubda=linux/debian_22 devfs=nomount
> 
> Log in, the root password is "root".
> 
> Unplug some memory -
> 	linux/uml_mconsole debian config mem=-10M
> 
> Go back to the UML and try do to something - ps, ls, anything.
> 
> It will be hung on handling an infinite page fault loop due to a whole lot
> of pages having been zeroed all of a sudden.
> 
> This will happen even when you unplug 2 pages (mem=-8K).  Only one of them
> will be madvised because the other is used to keep track of the madvised
> pages.
> 
> I also included my patchset in there (linux/2.6/linux-2.6.14-rc5/patches) if
> you want to build UML from source.  Due to my not refreshing the hotplug 
> patch before making the tarball, it's not there.  So, I've attached it.
> 

Thank you. It's going to be Monday before I get to it.
I will let you know.

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 18:40       ` Blaisorblade
@ 2005-10-28 18:56         ` Badari Pulavarty
  2005-10-29  0:35         ` Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-28 18:56 UTC (permalink / raw)
  To: Blaisorblade; +Cc: Jeff Dike, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

Blaisorblade wrote:

> On Friday 28 October 2005 18:16, Badari Pulavarty wrote:
> 
>>Blaisorblade wrote:
>>
>>>On Friday 28 October 2005 05:46, Jeff Dike wrote:
>>>
>>>>On Wed, Oct 26, 2005 at 03:49:55PM -0700, Badari Pulavarty wrote:
> 
> 
>>>On the plan, however, I have a concern: VM_NONLINEAR.
> 
> 
>>>However, looking at the patch, the implementation would boil down to
>>>something like
>>>
>>>for each page in range {
>>>	start = page->index;
>>>	end = start + PAGE_SIZE;
>>>	call truncate_inode_pages_range(mapping, offset, end);
>>>	inode->i_op->truncate_range(inode, offset, end);
>>>}
>>>
>>>unmap_mapping_range() should be done at once for the whole range.
>>
>>patch does
>>
>>for all the pages in the given vma {
>>	unmap_mapping_range(mapping, offset, end);
>>	truncate_inode_pages_range(mapping, offset, end);
>>	inode->op->truncate_range(inode, offset, end)
>>}
> 
> 
>>It operates on bunch of pages in the given VMA. Since UML has
>>one page for VMA, it operates on one page at a time - do you
>>see anything wrong here ?
> 
> 
> My point was the support to VM_NONLINEAR. In the future, UML will have one big 
> VMA, but different pages will be remapped with different offsets (already in 
> mainline) and different protections (I have patches, I sent an earlier 
> version, still revising).
> 
> In that case, you could really truncate (in one single call) pages which are 
> one at the start of the file and one at the end. That's why with VM_NONLINEAR 
> it wouldn't work.
> 
> However, Jeff made me note that we'd probably call madvise() on the linear 
> kernel mapping (the kernel maps pages from the RAM file all at once, 
> linearly). So you can safely just refuse operating on VM_NONLINEAR vmas.
> 
> 
>>>While looking at these, here's what I'd call "strange" in the patch:
> 
> 
>>>Also, why is unmap_mapping_range done with the inode semaphore held? I
>>>don't remember locking rule but conceptually this has no point, IMHO.
> 
> 
>>I am not sure either, let me look at it. (I thought we should hold it
>>for truncate()).
> 
> 
> Ok, do_truncate() uses the semaphore around the whole ops, because it's 
> implemented in a radically different way (through notify_change()).
> 
> We don't need IMHO to do things that way; we don't even change i_size - not 
> even when at the end of the file, as we don't want SIGBUS.
> 
> And anyway FS's must already handle holes at the end of a file.
> 
> Btw, when truncating, notify_change does:
> 
>         if (ia_valid & ATTR_SIZE)
>                 down_write(&dentry->d_inode->i_alloc_sem);
> 
> (which I suppose is used to protect against concurrent file extensions - page 
> allocations in previous holes - and such). You should probably take that too 
> (nest it inside mapping->host->i_sem).
> 
> Also, vmtruncate is called with the semaphore held because it must call 
> truncate_inode_pages(), and because even the calls to i_size_write() must be 
> atomic with the rest. But other than that, there's no reason. Especially, 
> unmap_mapping_range() does purely pagetable operations.
> 
> 
>>>Btw, why I don't see vm_pgoff mentioned in these lines of the patch (nor
>>>anywhere else in the patch)?
> 
> 
>>vm_pgoff - don't remember what that supposed to represent...
> 
> 
> Call mmap() with non-0 pgoff (i.e. offset in the file), say the second file 
> page. You're gonna store the pgoff parameter in vma->vm_pgoff (in PAGE_SHIFT 
> units).
> 
> If I then request you to truncate the first page in the VMA, how does your 
> code realize that it should punch the second page rather than the first?
> 
> However, Jeff said this _isn't_ the bug he's hitting - in his case the VMA has 
> a 0 initial offset (for the same reason we don't need VM_NONLINEAR support).
> 
> 
>>>You call truncate_inode_pages_range(mapping, offset, endoff), so I think
>>>you're really burned here.
> 
> 
>>>+offset = (loff_t)(start - vma->vm_start);
>>>+endoff = (loff_t)(end - vma->vm_start);
> 
> 
> So they would become:
> 
> offset = (loff_t)(start - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
> 
> or with page_offset(). Btw, shouldn't this be done by some macro in 
> <linux/pagemap.h>, as page_offset() and linear_page_index()?
> 
> Btw, also compare with mm/rmap.c:vma_address()/page_address_in_vma().
> 
> 
>>"end" here is not end of VMA - its end of the region we want to discard
>>(in UML case its start + PAGE_SIZE). Anything wrong ?
> 
> 
> All ok for that, I was complaining about not using ->vm_pgoff.
> 
> I had the doubt that vm_pgoff entered the picture later, but I'm sure 
> truncate_inode_pages{_range} wants file offsets, so it wasn't something I was 
> missing.

Thank you for your comments. I need some time to digest all these,
make changes and debug the current problem. For now, I am going
to restrict/ignore VM_NONLINEAR case.

I will get back to you on Monday.

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 17:55   ` [RFC] madvise(MADV_TRUNCATE) Blaisorblade
@ 2005-10-28 21:23     ` Theodore Ts'o
  0 siblings, 0 replies; 86+ messages in thread
From: Theodore Ts'o @ 2005-10-28 21:23 UTC (permalink / raw)
  To: Blaisorblade
  Cc: Badari Pulavarty, Jeff Dike, Hugh Dickins, akpm, andrea, dvhltc,
	linux-mm

On Fri, Oct 28, 2005 at 07:55:09PM +0200, Blaisorblade wrote:
> > On Thu, Oct 27, 2005 at 06:42:36PM -0700, Badari Pulavarty wrote:
> > > Like Andrea mentioned MADV_DONTNEED should be able to do what JVM
> > > folks want. If they want more than that, get in touch with me.
> > > While doing MADV_REMOVE, I will see if I can satsify their needs also.
> 
> > Well, I asked if what he wanted was simply walking all of the page
> > tables and marking the indicated pages as "clean",
> This idea sounds interesting and kludgy enough :-) .
> > but he claimed that 
> > anything that involved walking the pages tables would be too slow.
> > But it may be that he was assuming this would be as painful as
> > munmap(), when of course it wouldn't be.
> 
> I am curious which is the difference between the two. I know that we must also 
> walk the vma tree, and that since we bundle the pointers in the vma the 
> spatial locality is very poor, but I still don't get this huge loss.

Because if we do an munmap, we're removing entries from the page table
entries, which means we have to do cross-CPU IPI's to flush TLB's on
all of the CPU's.  That wouldn't be necessary if we're just marking
the pages clean.

						- Ted

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 18:42     ` Jeff Dike
  2005-10-28 18:54       ` Badari Pulavarty
@ 2005-10-29  0:03       ` Badari Pulavarty
  2005-10-29  2:51         ` Jeff Dike
  1 sibling, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-29  0:03 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

On Fri, 2005-10-28 at 14:42 -0400, Jeff Dike wrote:
> On Fri, Oct 28, 2005 at 09:19:14AM -0700, Badari Pulavarty wrote:
> > My touch tests so far, doesn't really verify data after freeing. I was
> > thinking about writing cases. If I can use UML to do it, please send it
> > to me. I would rather test with real world case :)
> 
> Grab and unpack http://www.user-mode-linux.org/~jdike/truncate.tar.bz2

Here is the update on the patch.

I found few bugs in my shmem_truncate_range() (surprise!!)
	- BUG_ON(subdir->nr_swapped > offset);
	- freeing up the "subdir" while it has some more entries
	swapped.

I wrote some tests to force swapping and am working out the bugs.
I haven't tried your test yet, since it's kind of intimidating :(

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-28 18:40       ` Blaisorblade
  2005-10-28 18:56         ` Badari Pulavarty
@ 2005-10-29  0:35         ` Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-29  0:35 UTC (permalink / raw)
  To: Blaisorblade; +Cc: Jeff Dike, Hugh Dickins, akpm, andrea, dvhltc, linux-mm

On Fri, 2005-10-28 at 20:40 +0200, Blaisorblade wrote:

> 
> All ok for that, I was complaining about not using ->vm_pgoff.
> 
> I had the doubt that vm_pgoff entered the picture later, but I'm sure 
> truncate_inode_pages{_range} wants file offsets, so it wasn't something I was 
> missing.

Yep. You are right on -- Jeff's UML problem is due to not handling
vm_pgoff correctly. I was able to reproduce the problem 
(Thank you Jeff for the testcase with instructions).

call vmtruncate_range(ffff81011fec0ff8, a7e3000 a7e4000) pgoff:259

I need to think what I need to do with ->vm_pgoff, before I hack
up something.

Thanks,
Badari


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-29  0:03       ` Badari Pulavarty
@ 2005-10-29  2:51         ` Jeff Dike
  2005-10-31 16:34           ` Badari Pulavarty
                             ` (2 more replies)
  0 siblings, 3 replies; 86+ messages in thread
From: Jeff Dike @ 2005-10-29  2:51 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

On Fri, Oct 28, 2005 at 05:03:21PM -0700, Badari Pulavarty wrote:
> Here is the update on the patch.
> 
> I found few bugs in my shmem_truncate_range() (surprise!!)
> 	- BUG_ON(subdir->nr_swapped > offset);
> 	- freeing up the "subdir" while it has some more entries
> 	swapped.
> 
> I wrote some tests to force swapping and working out the bugs.
> I haven't tried your test yet, since its kind of intimidating :(

Well, then send me the patch since I don't find this the least bit 
intimidating :-)

				Jeff

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-29  2:51         ` Jeff Dike
@ 2005-10-31 16:34           ` Badari Pulavarty
  2005-10-31 19:15           ` Badari Pulavarty
  2005-10-31 19:49           ` [RFC][PATCH] madvise(MADV_TRUNCATE) Badari Pulavarty
  2 siblings, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-31 16:34 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

On Fri, 2005-10-28 at 22:51 -0400, Jeff Dike wrote:
> On Fri, Oct 28, 2005 at 05:03:21PM -0700, Badari Pulavarty wrote:
> > Here is the update on the patch.
> > 
> > I found few bugs in my shmem_truncate_range() (surprise!!)
> > 	- BUG_ON(subdir->nr_swapped > offset);
> > 	- freeing up the "subdir" while it has some more entries
> > 	swapped.
> > 
> > I wrote some tests to force swapping and working out the bugs.
> > I haven't tried your test yet, since its kind of intimidating :(
> 
> Well, then send me the patch since I don't find this the least bit 
> intimidating :-)

Jeff,

I tried your testcase again (tried to remove 8K). I see nothing
wrong from the madvise() side - but after the removal, all commands
hang in UML. A few UML processes keep spinning. Does this mean
anything to you? I can't seem to find out what is wrong with my
code.

(BTW, I wrote a testcase to release few pages and then go back
and touch those pages again - I don't see any problem).

Please let me know.

Thanks,
Badari


top - 03:36:09 up 8 min,  3 users,  load average: 1.26, 0.70, 0.33
Tasks:  70 total,   3 running,  57 sleeping,  10 stopped,   0 zombie
Cpu(s):  8.8% us, 41.2% sy,  0.0% ni, 49.9% id,  0.0% wa,  0.0% hi,
0.0% si
Mem:   4042308k total,   283296k used,  3759012k free,     9728k buffers
Swap:  1052648k total,        0k used,  1052648k free,   149052k cached

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
11826 root      16   0  193m  51m  51m R 62.6  1.3   1:01.44 linux
11834 root      15   0   504  456  440 R 37.3  0.0   0:35.70 linux
    1 root      16   0   720  260  216 S  0.0  0.0   0:00.57 init
    2 root      RT   0     0    0    0 S  0.0  0.0   0:00.00 migration/0

sysrq-t output:

linux         S 0000000000000001     0 11826  10995 11832
(NOTLB)
0000000000000001 0000000000000000 0000000000000001 0000000000000006
       ffffffff8062e480 ffffffffffffffef ffffffff80404544
0000000000000010
       0000000000000202 ffff810111e8dd48
Call Trace:<ffffffff80404544>{_write_lock_irqsave+132}
<ffffffff80404599>{_write_lock_irq+9}
       <ffffffff8013a522>{do_wait+610}
<ffffffff80132430>{default_wake_function+0}
       <ffffffff80132430>{default_wake_function+0}
<ffffffff8013241b>{try_to_wake_up+1083}
       <ffffffff8013b05a>{sys_wait4+42}
<ffffffff80159771>{compat_sys_wait4+49}
       <ffffffff80132460>{wake_up_process+16}
<ffffffff80111a05>{sys_ptrace+2261}
       <ffffffff8012ea5f>{sys32_ptrace+111}
<ffffffff80124efb>{sys32_waitpid+11}
       <ffffffff801235eb>{sysenter_do_call+27}

linux         t ffff810120ee0100     0 11834  11826               12082
(NOTLB)
ffff810109fdbdb8 0000000000000082 0000000000000001 0000000000000046
       ffff810109fdbd08 ffffffff801331e2 0000000000000000
ffff81011b01a600
       ffff81011b01ae08 ffff8101234a0ec0
Call Trace:<ffffffff801331e2>{__wake_up_sync+98}
<ffffffff801422f2>{recalc_sigpending+18}
<ffffffff801433a5>{__dequeue_signal+501} <ffffffff80144939>{ptrace_stop
+313}
       <ffffffff80145997>{get_signal_to_deliver+407}
<ffffffff8010d21d>{do_signal+125}
       <ffffffff80406480>{do_page_fault+2176}
<ffffffff8012e2eb>{restore_i387_ia32+75}
       <ffffffff80129ba1>{ia32_restore_sigcontext+129}
<ffffffff80129cd1>{ia32_restore_sigcontext+433}
       <ffffffff8010d890>{do_notify_resume+48}
<ffffffff8010e036>{int_signal+18}



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC] madvise(MADV_TRUNCATE)
  2005-10-29  2:51         ` Jeff Dike
  2005-10-31 16:34           ` Badari Pulavarty
@ 2005-10-31 19:15           ` Badari Pulavarty
  2005-10-31 19:49           ` [RFC][PATCH] madvise(MADV_TRUNCATE) Badari Pulavarty
  2 siblings, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-31 19:15 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

Hi Jeff,

Okay. Here is the latest.

Please ignore my previous mail. I found a few issues in my code
whereby I was truncating one more page than I needed to
(an off-by-one-byte error). It took me a long time to figure out.

UML testcase is working fine. I will send out the patch
after a little cleanup.

Thanks for your help with *real* testcase :)

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC][PATCH] madvise(MADV_TRUNCATE)
  2005-10-29  2:51         ` Jeff Dike
  2005-10-31 16:34           ` Badari Pulavarty
  2005-10-31 19:15           ` Badari Pulavarty
@ 2005-10-31 19:49           ` Badari Pulavarty
  2005-11-01  0:05             ` Jeff Dike
  2 siblings, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-10-31 19:49 UTC (permalink / raw)
  To: Jeff Dike; +Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

[-- Attachment #1: Type: text/plain, Size: 511 bytes --]

Hi All,

Here is the latest patch. Still not cleaned up - but I thought I would
get more feedback & testing while I finish cleanups (since they are all
cosmetic).

TODO:
	- Change the naming to MADV_FREE (as Andrew suggested)
	- Merge shmem_truncate_range() with shmem_truncate()
	- Disallow VM_NONLINEAR, HUGETLB etc.
	- Take a closer look at i_sem & i_alloc_sem. 
	- comments, white space, tab cleanups.
	- Drop truncate_inode_pages_range() changes - since they
	  are already in -mm tree.

Thanks,
Badari



[-- Attachment #2: madvise-truncate4.patch --]
[-- Type: text/x-patch, Size: 24896 bytes --]

diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-alpha/mman.h linux-2.6.14-rc5.madv/include/asm-alpha/mman.h
--- linux-2.6.14-rc5/include/asm-alpha/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-alpha/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -42,6 +42,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define	MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_TRUNCATE	7		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-arm/mman.h linux-2.6.14-rc5.madv/include/asm-arm/mman.h
--- linux-2.6.14-rc5/include/asm-arm/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-arm/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-arm26/mman.h linux-2.6.14-rc5.madv/include/asm-arm26/mman.h
--- linux-2.6.14-rc5/include/asm-arm26/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-arm26/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-cris/mman.h linux-2.6.14-rc5.madv/include/asm-cris/mman.h
--- linux-2.6.14-rc5/include/asm-cris/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-cris/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-frv/mman.h linux-2.6.14-rc5.madv/include/asm-frv/mman.h
--- linux-2.6.14-rc5/include/asm-frv/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-frv/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-h8300/mman.h linux-2.6.14-rc5.madv/include/asm-h8300/mman.h
--- linux-2.6.14-rc5/include/asm-h8300/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-h8300/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-i386/mman.h linux-2.6.14-rc5.madv/include/asm-i386/mman.h
--- linux-2.6.14-rc5/include/asm-i386/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-i386/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-ia64/mman.h linux-2.6.14-rc5.madv/include/asm-ia64/mman.h
--- linux-2.6.14-rc5/include/asm-ia64/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-ia64/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-m32r/mman.h linux-2.6.14-rc5.madv/include/asm-m32r/mman.h
--- linux-2.6.14-rc5/include/asm-m32r/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-m32r/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-m68k/mman.h linux-2.6.14-rc5.madv/include/asm-m68k/mman.h
--- linux-2.6.14-rc5/include/asm-m68k/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-m68k/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-mips/mman.h linux-2.6.14-rc5.madv/include/asm-mips/mman.h
--- linux-2.6.14-rc5/include/asm-mips/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-mips/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -65,6 +65,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-parisc/mman.h linux-2.6.14-rc5.madv/include/asm-parisc/mman.h
--- linux-2.6.14-rc5/include/asm-parisc/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-parisc/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -38,6 +38,7 @@
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_TRUNCATE	8		/* truncate range of pages */
 
 /* The range 12-64 is reserved for page size specification. */
 #define MADV_4K_PAGES   12              /* Use 4K pages  */
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-powerpc/mman.h linux-2.6.14-rc5.madv/include/asm-powerpc/mman.h
--- linux-2.6.14-rc5/include/asm-powerpc/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-powerpc/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -44,6 +44,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-s390/mman.h linux-2.6.14-rc5.madv/include/asm-s390/mman.h
--- linux-2.6.14-rc5/include/asm-s390/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-s390/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL        0x2             /* read-ahead aggressively */
 #define MADV_WILLNEED  0x3              /* pre-fault pages */
 #define MADV_DONTNEED  0x4              /* discard these pages */
+#define MADV_TRUNCATE  0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-sh/mman.h linux-2.6.14-rc5.madv/include/asm-sh/mman.h
--- linux-2.6.14-rc5/include/asm-sh/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-sh/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-sparc/mman.h linux-2.6.14-rc5.madv/include/asm-sparc/mman.h
--- linux-2.6.14-rc5/include/asm-sparc/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-sparc/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_TRUNCATE	0x6		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-sparc64/mman.h linux-2.6.14-rc5.madv/include/asm-sparc64/mman.h
--- linux-2.6.14-rc5/include/asm-sparc64/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-sparc64/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_TRUNCATE	0x6		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-v850/mman.h linux-2.6.14-rc5.madv/include/asm-v850/mman.h
--- linux-2.6.14-rc5/include/asm-v850/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-v850/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -32,6 +32,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-x86_64/mman.h linux-2.6.14-rc5.madv/include/asm-x86_64/mman.h
--- linux-2.6.14-rc5/include/asm-x86_64/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-x86_64/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -36,6 +36,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/asm-xtensa/mman.h linux-2.6.14-rc5.madv/include/asm-xtensa/mman.h
--- linux-2.6.14-rc5/include/asm-xtensa/mman.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/asm-xtensa/mman.h	2005-10-27 05:22:59.000000000 -0700
@@ -72,6 +72,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_TRUNCATE	0x5		/* truncate range of pages */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/linux/fs.h linux-2.6.14-rc5.madv/include/linux/fs.h
--- linux-2.6.14-rc5/include/linux/fs.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/linux/fs.h	2005-10-27 05:22:59.000000000 -0700
@@ -995,6 +995,7 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
+	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 struct seq_file;
diff -Naurp -X dontdiff linux-2.6.14-rc5/include/linux/mm.h linux-2.6.14-rc5.madv/include/linux/mm.h
--- linux-2.6.14-rc5/include/linux/mm.h	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/include/linux/mm.h	2005-10-27 05:22:59.000000000 -0700
@@ -704,6 +704,7 @@ static inline void unmap_shared_mapping_
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
+extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
@@ -865,6 +866,7 @@ extern unsigned long do_brk(unsigned lon
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
+extern void truncate_inode_pages_range(struct address_space *, loff_t, loff_t);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/madvise.c linux-2.6.14-rc5.madv/mm/madvise.c
--- linux-2.6.14-rc5/mm/madvise.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/mm/madvise.c	2005-10-31 06:10:17.000000000 -0800
@@ -140,6 +140,31 @@ static long madvise_dontneed(struct vm_a
 	return 0;
 }
 
+static long madvise_truncate(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	struct address_space *mapping;
+        loff_t offset, endoff;
+	int error = 0;
+
+	if (!vma->vm_file || !vma->vm_file->f_mapping 
+		|| !vma->vm_file->f_mapping->host) {
+			return -EINVAL;
+	}
+
+	mapping = vma->vm_file->f_mapping;
+	if (mapping == &swapper_space) {
+		return -EINVAL;
+	}
+
+	offset = (loff_t)(start - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
+	endoff = (loff_t)(end - vma->vm_start - 1) + (vma->vm_pgoff << PAGE_SHIFT);
+	printk("call vmtruncate_range(%p, %x %x) pgoff:%x\n", mapping, 
+			(unsigned int)offset, (unsigned int)endoff, vma->vm_pgoff);
+	error = vmtruncate_range(mapping->host, offset, endoff);
+	return error;
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -152,6 +177,9 @@ madvise_vma(struct vm_area_struct *vma, 
 	case MADV_RANDOM:
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
+	case MADV_TRUNCATE:
+		error = madvise_truncate(vma, start, end);
+		break;
 
 	case MADV_WILLNEED:
 		error = madvise_willneed(vma, prev, start, end);
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/memory.c linux-2.6.14-rc5.madv/mm/memory.c
--- linux-2.6.14-rc5/mm/memory.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/mm/memory.c	2005-10-31 03:19:35.000000000 -0800
@@ -1597,6 +1597,32 @@ out_busy:
 
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * If the underlying filesystem is not going to provide 
+	 * a way to truncate a range of blocks (punch a hole) - 
+	 * we should return failure right now.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+		
+	down(&inode->i_sem);
+	down_write(&inode->i_alloc_sem);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+	up_write(&inode->i_alloc_sem);
+	up(&inode->i_sem);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(vmtruncate_range);
+
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/shmem.c linux-2.6.14-rc5.madv/mm/shmem.c
--- linux-2.6.14-rc5/mm/shmem.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/mm/shmem.c	2005-10-31 06:46:13.000000000 -0800
@@ -616,6 +616,184 @@ done2:
 	}
 }
 
+/*
+ * WIP ! WIP !! WIP !!!
+ *
+ * The idea is to free up the swap entries for the given range (start, end)
+ * in the file. 
+ *
+ * This is based on shmem_truncate() and I need to merge both of them
+ * into a common routine.
+ */
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	unsigned long idx;
+	unsigned long size;
+	unsigned long limit;
+	unsigned long stage;
+	unsigned long diroff;
+	struct page **dir;
+	struct page *topdir;
+	struct page *middir;
+	struct page *subdir;
+	swp_entry_t *ptr;
+	LIST_HEAD(pages_to_free);
+	long nr_pages_to_free = 0;
+	long nr_swaps_freed = 0;
+	int offset;
+	int freed;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (idx >= info->next_index)
+		return;
+
+	limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	spin_lock(&info->lock);
+	info->flags |= SHMEM_TRUNCATE;
+	if (limit > info->next_index)
+		limit = info->next_index;
+	topdir = info->i_indirect;
+#if 0
+	if (topdir && idx <= SHMEM_NR_DIRECT) {
+		info->i_indirect = NULL;
+		nr_pages_to_free++;
+		list_add(&topdir->lru, &pages_to_free);
+	}
+#endif
+	spin_unlock(&info->lock);
+
+	if (info->swapped && idx < SHMEM_NR_DIRECT) {
+		ptr = info->i_direct;
+		size = limit;
+		if (size > SHMEM_NR_DIRECT)
+			size = SHMEM_NR_DIRECT;
+#if 0
+printk("freeing swap entries <%d  - %d> limit %d\n", idx, size, limit);
+#endif
+		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
+	}
+	if (!topdir)
+		goto done2;
+
+	BUG_ON(limit <= SHMEM_NR_DIRECT);
+	limit -= SHMEM_NR_DIRECT;
+	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
+	offset = idx % ENTRIES_PER_PAGE;
+	idx -= offset;
+
+	dir = shmem_dir_map(topdir);
+	stage = ENTRIES_PER_PAGEPAGE/2;
+	if (idx < ENTRIES_PER_PAGEPAGE/2) {
+		middir = topdir;
+		diroff = idx/ENTRIES_PER_PAGE;
+	} else {
+		dir += ENTRIES_PER_PAGE/2;
+		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
+		while (stage <= idx)
+			stage += ENTRIES_PER_PAGEPAGE;
+		middir = *dir;
+		if (*dir) {
+			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
+				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
+			if (!diroff && !offset) {
+				*dir = NULL;
+				nr_pages_to_free++;
+#if 0
+printk("added middir page to free list\n");
+#endif
+				list_add(&middir->lru, &pages_to_free);
+			}
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(middir);
+		} else {
+			diroff = 0;
+			offset = 0;
+			idx = stage;
+		}
+	}
+
+	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
+		if (unlikely(idx == stage)) {
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(topdir) +
+			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+			while (!*dir) {
+				dir++;
+				idx += ENTRIES_PER_PAGEPAGE;
+				if (idx >= limit)
+					goto done1;
+			}
+			stage = idx + ENTRIES_PER_PAGEPAGE;
+			middir = *dir;
+			*dir = NULL;
+			nr_pages_to_free++;
+			list_add(&middir->lru, &pages_to_free);
+			shmem_dir_unmap(dir);
+			cond_resched();
+			dir = shmem_dir_map(middir);
+			diroff = 0;
+		}
+		subdir = dir[diroff];
+		if (subdir && subdir->nr_swapped) {
+			size = limit - idx;
+			if (size > ENTRIES_PER_PAGE)
+				size = ENTRIES_PER_PAGE;
+#if 0
+printk("freeing swap entries offset: %d  size: %d (%d %d)\n", offset, size, idx, limit);
+#endif
+			freed = shmem_map_and_free_swp(subdir,
+						offset, size, &dir);
+			if (!dir)
+				dir = shmem_dir_map(middir);
+			nr_swaps_freed += freed;
+			if (offset)
+				spin_lock(&info->lock);
+			subdir->nr_swapped -= freed;
+			if (offset)
+				spin_unlock(&info->lock);
+#if 0
+			BUG_ON(subdir->nr_swapped > offset);
+printk("subdir swapped %d\n", subdir->nr_swapped);
+#endif
+		}
+		if (offset)
+			offset = 0;
+		else if (subdir && !subdir->nr_swapped) {
+			dir[diroff] = NULL;
+			nr_pages_to_free++;
+#if 0
+printk("added dir page to free list\n");
+#endif
+			list_add(&subdir->lru, &pages_to_free);
+		}
+	}
+done1:
+	shmem_dir_unmap(dir);
+done2:
+	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
+		truncate_inode_pages_range(inode->i_mapping, start, end);
+	}
+
+	spin_lock(&info->lock);
+	info->flags &= ~SHMEM_TRUNCATE;
+	info->swapped -= nr_swaps_freed;
+	if (nr_pages_to_free)
+		shmem_free_blocks(inode, nr_pages_to_free);
+printk("swap entries free %d pages freed %d\n", nr_swaps_freed, nr_pages_to_free);
+	shmem_recalc_inode(inode);
+	spin_unlock(&info->lock);
+
+	/*
+	 * Empty swap vector directory pages to be freed?
+	 */
+	if (!list_empty(&pages_to_free)) {
+		pages_to_free.prev->next = NULL;
+		shmem_free_pages(pages_to_free.next);
+	}
+}
+
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2083,6 +2261,7 @@ static struct file_operations shmem_file
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
+	.truncate_range	= shmem_truncate_range,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
diff -Naurp -X dontdiff linux-2.6.14-rc5/mm/truncate.c linux-2.6.14-rc5.madv/mm/truncate.c
--- linux-2.6.14-rc5/mm/truncate.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5.madv/mm/truncate.c	2005-10-31 06:43:10.000000000 -0800
@@ -91,12 +91,15 @@ invalidate_complete_page(struct address_
 }
 
 /**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages_range - truncate range of pages specified by start and
+ * end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
  *
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -110,12 +113,12 @@ invalidate_complete_page(struct address_
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
  */
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+				loff_t lstart, loff_t lend)
 {
 	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end;
 	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t next;
@@ -124,13 +127,22 @@ void truncate_inode_pages(struct address
 	if (mapping->nrpages == 0)
 		return;
 
+	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	end = (lend  >> PAGE_CACHE_SHIFT);
+
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
 
+			if (page_index > end) {
+				next = page_index;
+				break;
+			}
+
 			if (page_index > next)
 				next = page_index;
 			next++;
@@ -166,9 +178,15 @@ void truncate_inode_pages(struct address
 			next = start;
 			continue;
 		}
+		if (pvec.pages[0]->index > end) {
+			pagevec_release(&pvec);
+			break;
+		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
+			if (page->index > end)
+				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
 			if (page->index > next)
@@ -180,7 +198,19 @@ void truncate_inode_pages(struct address
 		pagevec_release(&pvec);
 	}
 }
+EXPORT_SYMBOL(truncate_inode_pages_range);
 
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
 EXPORT_SYMBOL(truncate_inode_pages);
 
 /**

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [RFC][PATCH] madvise(MADV_TRUNCATE)
  2005-10-31 19:49           ` [RFC][PATCH] madvise(MADV_TRUNCATE) Badari Pulavarty
@ 2005-11-01  0:05             ` Jeff Dike
  2005-11-02  1:15               ` [PATCH] 2.6.14 patch for supporting madvise(MADV_FREE) Badari Pulavarty
  0 siblings, 1 reply; 86+ messages in thread
From: Jeff Dike @ 2005-11-01  0:05 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade

On Mon, Oct 31, 2005 at 11:49:36AM -0800, Badari Pulavarty wrote:
> Here is the latest patch. Still not cleaned up - but I thought I would
> get more feedback & testing while I finish cleanups (since they are all
> cosmetic).

This one looks a lot better.  I've been playing with it some, and no
unexpected behavior.

				Jeff

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* [PATCH] 2.6.14 patch for supporting madvise(MADV_FREE)
  2005-11-01  0:05             ` Jeff Dike
@ 2005-11-02  1:15               ` Badari Pulavarty
  2005-11-02  1:43                 ` Andrea Arcangeli
  0 siblings, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-11-02  1:15 UTC (permalink / raw)
  To: lkml; +Cc: Hugh Dickins, akpm, andrea, dvhltc, linux-mm, Blaisorblade,
	Jeff Dike

[-- Attachment #1: Type: text/plain, Size: 1124 bytes --]

Hi All,

Here is the patch to support madvise(MADV_FREE) - which frees 
up the given range of pages and truncates the underlying backing 
store. This basically provides "punch hole into file" functionality.
Currently it supports ONLY shmfs/tmpfs - where we have a short-term
need. Other filesystems return -ENOSYS.

Yes. This is a *crazy* interface to do it. But this is exactly
what we need for now. Here is the discussion on linux-mm
(for all the fun discussion and the naming):

http://marc.theaimsgroup.com/?l=linux-mm&m=113078625426989&w=2

Andrew, could you include this in your next -mm release ?
BTW, for completeness - this patch includes reiser4-truncate-inode-
pages-range patch from  your -mm series. If you want me to re-work 
my patch without that, please let me know.

http://www.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.14-
rc5/2.6.14-rc5-mm1/broken-out/reiser4-truncate_inode_pages_range.patch

I tested with my test cases and Jeff Dike was kind enough to provide
a test case with UML - which found more bugs. I thank Andrea & Hugh
for helping me out heavily :)

Comments ?

Thanks,
Badari



[-- Attachment #2: madvise-free.patch --]
[-- Type: text/x-patch, Size: 22594 bytes --]

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
diff -Naurp -X dontdiff linux-2.6.14/include/asm-alpha/mman.h linux-2.6.14.madv/include/asm-alpha/mman.h
--- linux-2.6.14/include/asm-alpha/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-alpha/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -42,6 +42,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define	MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_FREE	7		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-arm/mman.h linux-2.6.14.madv/include/asm-arm/mman.h
--- linux-2.6.14/include/asm-arm/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-arm/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-arm26/mman.h linux-2.6.14.madv/include/asm-arm26/mman.h
--- linux-2.6.14/include/asm-arm26/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-arm26/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-cris/mman.h linux-2.6.14.madv/include/asm-cris/mman.h
--- linux-2.6.14/include/asm-cris/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-cris/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-frv/mman.h linux-2.6.14.madv/include/asm-frv/mman.h
--- linux-2.6.14/include/asm-frv/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-frv/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-h8300/mman.h linux-2.6.14.madv/include/asm-h8300/mman.h
--- linux-2.6.14/include/asm-h8300/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-h8300/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-i386/mman.h linux-2.6.14.madv/include/asm-i386/mman.h
--- linux-2.6.14/include/asm-i386/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-i386/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-ia64/mman.h linux-2.6.14.madv/include/asm-ia64/mman.h
--- linux-2.6.14/include/asm-ia64/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-ia64/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-m32r/mman.h linux-2.6.14.madv/include/asm-m32r/mman.h
--- linux-2.6.14/include/asm-m32r/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-m32r/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-m68k/mman.h linux-2.6.14.madv/include/asm-m68k/mman.h
--- linux-2.6.14/include/asm-m68k/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-m68k/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-mips/mman.h linux-2.6.14.madv/include/asm-mips/mman.h
--- linux-2.6.14/include/asm-mips/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-mips/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -65,6 +65,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-parisc/mman.h linux-2.6.14.madv/include/asm-parisc/mman.h
--- linux-2.6.14/include/asm-parisc/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-parisc/mman.h	2005-11-01 11:47:06.000000000 -0800
@@ -38,6 +38,7 @@
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_FREE       8		/* free up these pages */
 
 /* The range 12-64 is reserved for page size specification. */
 #define MADV_4K_PAGES   12              /* Use 4K pages  */
diff -Naurp -X dontdiff linux-2.6.14/include/asm-powerpc/mman.h linux-2.6.14.madv/include/asm-powerpc/mman.h
--- linux-2.6.14/include/asm-powerpc/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-powerpc/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -44,6 +44,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-s390/mman.h linux-2.6.14.madv/include/asm-s390/mman.h
--- linux-2.6.14/include/asm-s390/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-s390/mman.h	2005-11-01 11:46:37.000000000 -0800
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL        0x2             /* read-ahead aggressively */
 #define MADV_WILLNEED  0x3              /* pre-fault pages */
 #define MADV_DONTNEED  0x4              /* discard these pages */
+#define MADV_FREE      0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-sh/mman.h linux-2.6.14.madv/include/asm-sh/mman.h
--- linux-2.6.14/include/asm-sh/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-sh/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-sparc/mman.h linux-2.6.14.madv/include/asm-sparc/mman.h
--- linux-2.6.14/include/asm-sparc/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-sparc/mman.h	2005-11-01 11:45:42.000000000 -0800
@@ -53,7 +53,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
-#define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-sparc64/mman.h linux-2.6.14.madv/include/asm-sparc64/mman.h
--- linux-2.6.14/include/asm-sparc64/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-sparc64/mman.h	2005-11-01 11:46:08.000000000 -0800
@@ -53,7 +53,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
-#define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-v850/mman.h linux-2.6.14.madv/include/asm-v850/mman.h
--- linux-2.6.14/include/asm-v850/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-v850/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -32,6 +32,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-x86_64/mman.h linux-2.6.14.madv/include/asm-x86_64/mman.h
--- linux-2.6.14/include/asm-x86_64/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-x86_64/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -36,6 +36,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-xtensa/mman.h linux-2.6.14.madv/include/asm-xtensa/mman.h
--- linux-2.6.14/include/asm-xtensa/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-xtensa/mman.h	2005-11-01 11:45:09.000000000 -0800
@@ -72,6 +72,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_FREE	0x5		/* free up these pages */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/linux/fs.h linux-2.6.14.madv/include/linux/fs.h
--- linux-2.6.14/include/linux/fs.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/linux/fs.h	2005-11-01 11:45:09.000000000 -0800
@@ -995,6 +995,7 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
+	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 struct seq_file;
diff -Naurp -X dontdiff linux-2.6.14/include/linux/mm.h linux-2.6.14.madv/include/linux/mm.h
--- linux-2.6.14/include/linux/mm.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/linux/mm.h	2005-11-01 11:45:09.000000000 -0800
@@ -704,6 +704,7 @@ static inline void unmap_shared_mapping_
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
+extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
@@ -865,6 +866,7 @@ extern unsigned long do_brk(unsigned lon
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
+extern void truncate_inode_pages_range(struct address_space *, loff_t, loff_t);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
diff -Naurp -X dontdiff linux-2.6.14/mm/madvise.c linux-2.6.14.madv/mm/madvise.c
--- linux-2.6.14/mm/madvise.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/madvise.c	2005-11-01 12:01:27.000000000 -0800
@@ -140,6 +140,39 @@ static long madvise_dontneed(struct vm_a
 	return 0;
 }
 
+/*
+ * Application wants to free up the pages and associated backing store. 
+ * This is effectively punching a hole into the middle of a file.
+ *
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_free(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	struct address_space *mapping;
+        loff_t offset, endoff;
+
+	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 
+		return -EINVAL;
+
+	if (!vma->vm_file || !vma->vm_file->f_mapping 
+		|| !vma->vm_file->f_mapping->host) {
+			return -EINVAL;
+	}
+
+	mapping = vma->vm_file->f_mapping;
+	if (mapping == &swapper_space) {
+		return -EINVAL;
+	}
+
+	offset = (loff_t)(start - vma->vm_start) 
+			+ (vma->vm_pgoff << PAGE_SHIFT);
+	endoff = (loff_t)(end - vma->vm_start - 1) 
+			+ (vma->vm_pgoff << PAGE_SHIFT);
+	return  vmtruncate_range(mapping->host, offset, endoff);
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -152,6 +185,9 @@ madvise_vma(struct vm_area_struct *vma, 
 	case MADV_RANDOM:
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
+	case MADV_FREE:
+		error = madvise_free(vma, start, end);
+		break;
 
 	case MADV_WILLNEED:
 		error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +226,8 @@ madvise_vma(struct vm_area_struct *vma, 
  *		some pages ahead.
  *  MADV_DONTNEED - the application is finished with the given range,
  *		so the kernel can free resources associated with it.
+ *  MADV_FREE - the application wants to free up the given range of
+ *		pages and associated backing store.
  *
  * return values:
  *  zero    - success
diff -Naurp -X dontdiff linux-2.6.14/mm/memory.c linux-2.6.14.madv/mm/memory.c
--- linux-2.6.14/mm/memory.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/memory.c	2005-11-01 11:45:09.000000000 -0800
@@ -1597,6 +1597,32 @@ out_busy:
 
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * If the underlying filesystem is not going to provide 
+	 * a way to truncate a range of blocks (punch a hole) - 
+	 * we should return failure right now.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+		
+	/* XXX - Do we need both i_sem and i_alloc_sem all the way ? */
+	down(&inode->i_sem);
+	down_write(&inode->i_alloc_sem);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+	up_write(&inode->i_alloc_sem);
+	up(&inode->i_sem);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(vmtruncate_range);
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
diff -Naurp -X dontdiff linux-2.6.14/mm/shmem.c linux-2.6.14.madv/mm/shmem.c
--- linux-2.6.14/mm/shmem.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/shmem.c	2005-11-01 11:45:09.000000000 -0800
@@ -459,7 +459,7 @@ static void shmem_free_pages(struct list
 	} while (next);
 }
 
-static void shmem_truncate(struct inode *inode)
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -477,18 +477,27 @@ static void shmem_truncate(struct inode 
 	long nr_swaps_freed = 0;
 	int offset;
 	int freed;
+	int punch_hole = 0;
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
 		return;
 
 	spin_lock(&info->lock);
 	info->flags |= SHMEM_TRUNCATE;
-	limit = info->next_index;
-	info->next_index = idx;
+	if (likely(end == (loff_t) -1)) {
+		limit = info->next_index;
+		info->next_index = idx;
+	} else {
+		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		if (limit > info->next_index)
+			limit = info->next_index;
+		punch_hole = 1;
+	}
+
 	topdir = info->i_indirect;
-	if (topdir && idx <= SHMEM_NR_DIRECT) {
+	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 		info->i_indirect = NULL;
 		nr_pages_to_free++;
 		list_add(&topdir->lru, &pages_to_free);
@@ -575,11 +584,12 @@ static void shmem_truncate(struct inode 
 			subdir->nr_swapped -= freed;
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(subdir->nr_swapped > offset);
+			if (!punch_hole)
+				BUG_ON(subdir->nr_swapped > offset);
 		}
 		if (offset)
 			offset = 0;
-		else if (subdir) {
+		else if (subdir && !subdir->nr_swapped) {
 			dir[diroff] = NULL;
 			nr_pages_to_free++;
 			list_add(&subdir->lru, &pages_to_free);
@@ -596,7 +606,7 @@ done2:
 		 * Also, though shmem_getpage checks i_size before adding to
 		 * cache, no recheck after: so fix the narrow window there too.
 		 */
-		truncate_inode_pages(inode->i_mapping, inode->i_size);
+		truncate_inode_pages_range(inode->i_mapping, start, end);
 	}
 
 	spin_lock(&info->lock);
@@ -616,6 +626,11 @@ done2:
 	}
 }
 
+static void shmem_truncate(struct inode *inode)
+{
+	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+}
+
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
+	.truncate_range	= shmem_truncate_range,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
diff -Naurp -X dontdiff linux-2.6.14/mm/truncate.c linux-2.6.14.madv/mm/truncate.c
--- linux-2.6.14/mm/truncate.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/truncate.c	2005-11-01 11:45:09.000000000 -0800
@@ -91,12 +91,15 @@ invalidate_complete_page(struct address_
 }
 
 /**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages_range - truncate range of pages specified by start and
+ * end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
  *
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -110,12 +113,12 @@ invalidate_complete_page(struct address_
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
  */
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+				loff_t lstart, loff_t lend)
 {
 	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end;
 	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t next;
@@ -124,13 +127,22 @@ void truncate_inode_pages(struct address
 	if (mapping->nrpages == 0)
 		return;
 
+	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	end = (lend  >> PAGE_CACHE_SHIFT);
+
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
 
+			if (page_index > end) {
+				next = page_index;
+				break;
+			}
+
 			if (page_index > next)
 				next = page_index;
 			next++;
@@ -166,9 +178,15 @@ void truncate_inode_pages(struct address
 			next = start;
 			continue;
 		}
+		if (pvec.pages[0]->index > end) {
+			pagevec_release(&pvec);
+			break;
+		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
+			if (page->index > end)
+				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
 			if (page->index > next)
@@ -180,7 +198,19 @@ void truncate_inode_pages(struct address
 		pagevec_release(&pvec);
 	}
 }
+EXPORT_SYMBOL(truncate_inode_pages_range);
 
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
 EXPORT_SYMBOL(truncate_inode_pages);
 
 /**

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_FREE)
  2005-11-02  1:15               ` [PATCH] 2.6.14 patch for supporting madvise(MADV_FREE) Badari Pulavarty
@ 2005-11-02  1:43                 ` Andrea Arcangeli
  2005-11-02 15:49                   ` Badari Pulavarty
  2005-11-02 16:12                   ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Badari Pulavarty
  0 siblings, 2 replies; 86+ messages in thread
From: Andrea Arcangeli @ 2005-11-02  1:43 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: lkml, Hugh Dickins, akpm, dvhltc, linux-mm, Blaisorblade,
	Jeff Dike

On Tue, Nov 01, 2005 at 05:15:01PM -0800, Badari Pulavarty wrote:
> Here is the patch to support madvise(MADV_FREE) - which frees 
> up the given range of pages and truncates the underlying backing 
> store. This basically provides "punch hole into file" functionality.
> Currently it supports ONLY shmfs/tmpfs - where we have short term 
> need. Other filesystems return -ENOSYS.

MADV_FREE as a name isn't right if we return -ENOSYS for anonymous
memory.

MADV_FREE in other OS works _only_ on anonymous memory and returns
-EINVAL if used on filebacked vmas. In fact we probably should rename our
MADV_DONTNEED to MADV_FREE.

http://docs.sun.com/app/docs/doc/816-5168/6mbb3hrde?a=view

	"This value cannot be used on mappings that have underlying file objects."

Our MADV_DONTNEED exactly matches the MADV_FREE semantics, and it seems
the MADV_DONTNEED of other OS isn't destructive like ours. Except our
MADV_DONTNEED also works on filebacked mappings but it's destructive
only on anonymous memory.


I thought Andrew suggested MADV_REMOVE for the new feature.

This feature didn't exist in other OS yet AFAIK, so a new MADV_name for
it makes sense. I'm not completely against extending MADV_FREE but then we
shouldn't return -ENOSYS on anonymous memory and we should do the same
thing MADV_DONTNEED does on anonymous memory. Probably a new name is
safer to avoid confusion (think an application running MADV_FREE and
expecting -EINVAL when used on filebacked mappings).

Thanks!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_FREE)
  2005-11-02  1:43                 ` Andrea Arcangeli
@ 2005-11-02 15:49                   ` Badari Pulavarty
  2005-11-02 16:12                   ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-11-02 15:49 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: lkml, Hugh Dickins, akpm, dvhltc, linux-mm, Blaisorblade,
	Jeff Dike

On Wed, 2005-11-02 at 02:43 +0100, Andrea Arcangeli wrote:
> On Tue, Nov 01, 2005 at 05:15:01PM -0800, Badari Pulavarty wrote:
> > Here is the patch to support madvise(MADV_FREE) - which frees 
> > up the given range of pages and truncates the underlying backing 
> > store. This basically provides "punch hole into file" functionality.
> > Currently it supports ONLY shmfs/tmpfs - where we have short term 
> > need. Other filesystems return -ENOSYS.
> 
> MADV_FREE as a name isn't right if we return -ENOSYS for anonymoys
> memory.
> 
> MADV_FREE in other OS works _only_ on anonymous memory and returns
> -EINVAL if used on filebacked vmas. Infact we probably should rename our
> MADV_DONTNEED to MADV_FREE.
> 
> http://docs.sun.com/app/docs/doc/816-5168/6mbb3hrde?a=view
> 
> 	"This value cannot be used on mappings that have underlying file objects."
> 
> Our MADV_DONTNEED exactly matches the MADV_FREE semantics, and it seems
> the MADV_DONTNEED of other OS isn't destructive like ours. Except our
> MADV_DONTNEED also works on filebacked mappings but it's destructive
> only on anonymous memory.
> 
> 
> I thought Andrew suggested MADV_REMOVE for the new feature.

Yep. My bad. Andrew did suggest MADV_REMOVE. Let me rename it and
regenerate the patch once again. Thanks for pointing that out.

> 
> This feature didn't exist in other OS yet AFIK, so a new MADV_name for
> it makes sense. I'm not completely against extending MADV_FREE but then we
> shouldn't return -ENOSYS on anonymous memory and we should do the same
> thing MADV_DONTNEED does on anonymous memory. Probably a new name is
> safer to avoid confusion (think an application running MADV_FREE and
> expecting -EINVAL when used on filebacked mappings).
> 
> Thanks!
> 

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)
  2005-11-02  1:43                 ` Andrea Arcangeli
  2005-11-02 15:49                   ` Badari Pulavarty
@ 2005-11-02 16:12                   ` Badari Pulavarty
  2005-11-02 19:54                     ` New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)) Blaisorblade
                                       ` (2 more replies)
  1 sibling, 3 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-11-02 16:12 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: lkml, Hugh Dickins, akpm, dvhltc, linux-mm, Blaisorblade,
	Jeff Dike

[-- Attachment #1: Type: text/plain, Size: 650 bytes --]

Hi Andrew & Andrea,

Here is the updated patch with name change again :(
Hopefully this would be final. (MADV_REMOVE).

BTW, I am not sure if we need to hold i_sem and i_alloc_sem
all the way? I wanted to be safe - but this may be overkill?


+       /* XXX - Do we need both i_sem and i_allocsem all the way ? */
+       down(&inode->i_sem);
+       down_write(&inode->i_alloc_sem);
+       unmap_mapping_range(mapping, offset, (end - offset), 1);
+       truncate_inode_pages_range(mapping, offset, end);
+       inode->i_op->truncate_range(inode, offset, end);
+       up_write(&inode->i_alloc_sem);
+       up(&inode->i_sem);


Thanks,
Badari



[-- Attachment #2: madvise-remove.patch --]
[-- Type: text/x-patch, Size: 22685 bytes --]

diff -Naurp -X dontdiff linux-2.6.14/include/asm-alpha/mman.h linux-2.6.14.madv/include/asm-alpha/mman.h
--- linux-2.6.14/include/asm-alpha/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-alpha/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -42,6 +42,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define	MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_REMOVE	7		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-arm/mman.h linux-2.6.14.madv/include/asm-arm/mman.h
--- linux-2.6.14/include/asm-arm/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-arm/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-arm26/mman.h linux-2.6.14.madv/include/asm-arm26/mman.h
--- linux-2.6.14/include/asm-arm26/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-arm26/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-cris/mman.h linux-2.6.14.madv/include/asm-cris/mman.h
--- linux-2.6.14/include/asm-cris/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-cris/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-frv/mman.h linux-2.6.14.madv/include/asm-frv/mman.h
--- linux-2.6.14/include/asm-frv/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-frv/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-h8300/mman.h linux-2.6.14.madv/include/asm-h8300/mman.h
--- linux-2.6.14/include/asm-h8300/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-h8300/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-i386/mman.h linux-2.6.14.madv/include/asm-i386/mman.h
--- linux-2.6.14/include/asm-i386/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-i386/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-ia64/mman.h linux-2.6.14.madv/include/asm-ia64/mman.h
--- linux-2.6.14/include/asm-ia64/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-ia64/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-m32r/mman.h linux-2.6.14.madv/include/asm-m32r/mman.h
--- linux-2.6.14/include/asm-m32r/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-m32r/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-m68k/mman.h linux-2.6.14.madv/include/asm-m68k/mman.h
--- linux-2.6.14/include/asm-m68k/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-m68k/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-mips/mman.h linux-2.6.14.madv/include/asm-mips/mman.h
--- linux-2.6.14/include/asm-mips/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-mips/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -65,6 +65,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-parisc/mman.h linux-2.6.14.madv/include/asm-parisc/mman.h
--- linux-2.6.14/include/asm-parisc/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-parisc/mman.h	2005-11-02 03:12:02.000000000 -0800
@@ -38,6 +38,7 @@
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_REMOVE     8		/* remove these pages & resources */
 
 /* The range 12-64 is reserved for page size specification. */
 #define MADV_4K_PAGES   12              /* Use 4K pages  */
diff -Naurp -X dontdiff linux-2.6.14/include/asm-powerpc/mman.h linux-2.6.14.madv/include/asm-powerpc/mman.h
--- linux-2.6.14/include/asm-powerpc/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-powerpc/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -44,6 +44,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-s390/mman.h linux-2.6.14.madv/include/asm-s390/mman.h
--- linux-2.6.14/include/asm-s390/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-s390/mman.h	2005-11-02 03:12:13.000000000 -0800
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL        0x2             /* read-ahead aggressively */
 #define MADV_WILLNEED  0x3              /* pre-fault pages */
 #define MADV_DONTNEED  0x4              /* discard these pages */
+#define MADV_REMOVE    0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-sh/mman.h linux-2.6.14.madv/include/asm-sh/mman.h
--- linux-2.6.14/include/asm-sh/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-sh/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-sparc/mman.h linux-2.6.14.madv/include/asm-sparc/mman.h
--- linux-2.6.14/include/asm-sparc/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-sparc/mman.h	2005-11-02 03:04:57.000000000 -0800
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_REMOVE	0x6		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-sparc64/mman.h linux-2.6.14.madv/include/asm-sparc64/mman.h
--- linux-2.6.14/include/asm-sparc64/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-sparc64/mman.h	2005-11-02 03:04:35.000000000 -0800
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_REMOVE	0x6		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-v850/mman.h linux-2.6.14.madv/include/asm-v850/mman.h
--- linux-2.6.14/include/asm-v850/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-v850/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -32,6 +32,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-x86_64/mman.h linux-2.6.14.madv/include/asm-x86_64/mman.h
--- linux-2.6.14/include/asm-x86_64/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-x86_64/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -36,6 +36,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/asm-xtensa/mman.h linux-2.6.14.madv/include/asm-xtensa/mman.h
--- linux-2.6.14/include/asm-xtensa/mman.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/asm-xtensa/mman.h	2005-11-02 03:03:55.000000000 -0800
@@ -72,6 +72,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff -Naurp -X dontdiff linux-2.6.14/include/linux/fs.h linux-2.6.14.madv/include/linux/fs.h
--- linux-2.6.14/include/linux/fs.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/linux/fs.h	2005-11-02 03:03:55.000000000 -0800
@@ -995,6 +995,7 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
+	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 struct seq_file;
diff -Naurp -X dontdiff linux-2.6.14/include/linux/mm.h linux-2.6.14.madv/include/linux/mm.h
--- linux-2.6.14/include/linux/mm.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/include/linux/mm.h	2005-11-02 03:03:55.000000000 -0800
@@ -704,6 +704,7 @@ static inline void unmap_shared_mapping_
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
+extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
@@ -865,6 +866,7 @@ extern unsigned long do_brk(unsigned lon
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
+extern void truncate_inode_pages_range(struct address_space *, loff_t, loff_t);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
diff -Naurp -X dontdiff linux-2.6.14/mm/madvise.c linux-2.6.14.madv/mm/madvise.c
--- linux-2.6.14/mm/madvise.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/madvise.c	2005-11-02 03:03:55.000000000 -0800
@@ -140,6 +140,39 @@ static long madvise_dontneed(struct vm_a
 	return 0;
 }
 
+/*
+ * Application wants to free up the pages and associated backing store,
+ * effectively punching a hole into the middle of a file.  Returns the
+ * result of vmtruncate_range(), or -EINVAL if the VMA cannot be punched.
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_remove(struct vm_area_struct * vma,
+			     unsigned long start, unsigned long end)
+{
+	struct address_space *mapping;
+	loff_t offset, endoff;
+
+	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+		return -EINVAL;
+
+	/* Only a file-backed mapping with a host inode can be punched. */
+	if (!vma->vm_file || !vma->vm_file->f_mapping
+		|| !vma->vm_file->f_mapping->host)
+		return -EINVAL;
+
+	mapping = vma->vm_file->f_mapping;
+	if (mapping == &swapper_space)
+		return -EINVAL;
+
+	/* Widen vm_pgoff first: the shift would wrap in unsigned long on 32-bit. */
+	offset = (loff_t)(start - vma->vm_start)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	endoff = (loff_t)(end - vma->vm_start - 1)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	return vmtruncate_range(mapping->host, offset, endoff);
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -152,6 +185,9 @@ madvise_vma(struct vm_area_struct *vma, 
 	case MADV_RANDOM:
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
+	case MADV_REMOVE:
+		error = madvise_remove(vma, start, end);
+		break;
 
 	case MADV_WILLNEED:
 		error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +226,8 @@ madvise_vma(struct vm_area_struct *vma, 
  *		some pages ahead.
  *  MADV_DONTNEED - the application is finished with the given range,
  *		so the kernel can free resources associated with it.
+ *  MADV_REMOVE - the application wants to free up the given range of
+ *		pages and associated backing store.
  *
  * return values:
  *  zero    - success
diff -Naurp -X dontdiff linux-2.6.14/mm/memory.c linux-2.6.14.madv/mm/memory.c
--- linux-2.6.14/mm/memory.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/memory.c	2005-11-02 03:03:55.000000000 -0800
@@ -1597,6 +1597,32 @@ out_busy:
 
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * Punching a hole needs the filesystem's ->truncate_range method.
+	 * If the inode does not provide one, return -ENOSYS right now,
+	 * before taking any locks or unmapping anything.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+		
+	/* XXX - Do we need both i_sem and i_alloc_sem all the way ? */
+	down(&inode->i_sem);
+	down_write(&inode->i_alloc_sem);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+	up_write(&inode->i_alloc_sem);
+	up(&inode->i_sem);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(vmtruncate_range);
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
diff -Naurp -X dontdiff linux-2.6.14/mm/shmem.c linux-2.6.14.madv/mm/shmem.c
--- linux-2.6.14/mm/shmem.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/shmem.c	2005-11-02 03:03:55.000000000 -0800
@@ -459,7 +459,7 @@ static void shmem_free_pages(struct list
 	} while (next);
 }
 
-static void shmem_truncate(struct inode *inode)
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -477,18 +477,27 @@ static void shmem_truncate(struct inode 
 	long nr_swaps_freed = 0;
 	int offset;
 	int freed;
+	int punch_hole = 0;
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
 		return;
 
 	spin_lock(&info->lock);
 	info->flags |= SHMEM_TRUNCATE;
-	limit = info->next_index;
-	info->next_index = idx;
+	if (likely(end == (loff_t) -1)) {
+		limit = info->next_index;
+		info->next_index = idx;
+	} else {
+		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		if (limit > info->next_index)
+			limit = info->next_index;
+		punch_hole = 1;
+	}
+
 	topdir = info->i_indirect;
-	if (topdir && idx <= SHMEM_NR_DIRECT) {
+	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 		info->i_indirect = NULL;
 		nr_pages_to_free++;
 		list_add(&topdir->lru, &pages_to_free);
@@ -575,11 +584,12 @@ static void shmem_truncate(struct inode 
 			subdir->nr_swapped -= freed;
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(subdir->nr_swapped > offset);
+			if (!punch_hole)
+				BUG_ON(subdir->nr_swapped > offset);
 		}
 		if (offset)
 			offset = 0;
-		else if (subdir) {
+		else if (subdir && !subdir->nr_swapped) {
 			dir[diroff] = NULL;
 			nr_pages_to_free++;
 			list_add(&subdir->lru, &pages_to_free);
@@ -596,7 +606,7 @@ done2:
 		 * Also, though shmem_getpage checks i_size before adding to
 		 * cache, no recheck after: so fix the narrow window there too.
 		 */
-		truncate_inode_pages(inode->i_mapping, inode->i_size);
+		truncate_inode_pages_range(inode->i_mapping, start, end);
 	}
 
 	spin_lock(&info->lock);
@@ -616,6 +626,11 @@ done2:
 	}
 }
 
+static void shmem_truncate(struct inode *inode)
+{
+	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+}
+
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
+	.truncate_range	= shmem_truncate_range,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
diff -Naurp -X dontdiff linux-2.6.14/mm/truncate.c linux-2.6.14.madv/mm/truncate.c
--- linux-2.6.14/mm/truncate.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14.madv/mm/truncate.c	2005-11-02 03:03:55.000000000 -0800
@@ -91,12 +91,15 @@ invalidate_complete_page(struct address_
 }
 
 /**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages_range - truncate range of pages specified by start and
+ * end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
  *
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -110,12 +113,12 @@ invalidate_complete_page(struct address_
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
  */
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+				loff_t lstart, loff_t lend)
 {
 	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end;
 	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t next;
@@ -124,13 +127,22 @@ void truncate_inode_pages(struct address
 	if (mapping->nrpages == 0)
 		return;
 
+	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	end = (lend  >> PAGE_CACHE_SHIFT);
+
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
 
+			if (page_index > end) {
+				next = page_index;
+				break;
+			}
+
 			if (page_index > next)
 				next = page_index;
 			next++;
@@ -166,9 +178,15 @@ void truncate_inode_pages(struct address
 			next = start;
 			continue;
 		}
+		if (pvec.pages[0]->index > end) {
+			pagevec_release(&pvec);
+			break;
+		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
+			if (page->index > end)
+				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
 			if (page->index > next)
@@ -180,7 +198,19 @@ void truncate_inode_pages(struct address
 		pagevec_release(&pvec);
 	}
 }
+EXPORT_SYMBOL(truncate_inode_pages_range);
 
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
 EXPORT_SYMBOL(truncate_inode_pages);
 
 /**

^ permalink raw reply	[flat|nested] 86+ messages in thread

* New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE))
  2005-11-02 16:12                   ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Badari Pulavarty
@ 2005-11-02 19:54                     ` Blaisorblade
  2005-11-02 20:12                       ` Hugh Dickins
  2005-11-02 21:36                       ` Badari Pulavarty
  2005-11-12  0:25                     ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Andrew Morton
  2005-11-12  0:34                     ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Andrew Morton
  2 siblings, 2 replies; 86+ messages in thread
From: Blaisorblade @ 2005-11-02 19:54 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Andrea Arcangeli, lkml, Hugh Dickins, akpm, dvhltc, linux-mm,
	Jeff Dike

On Wednesday 02 November 2005 17:12, Badari Pulavarty wrote:
> Hi Andrew & Andrea,
>
> Here is the updated patch with name change again :(
> Hopefully this would be final. (MADV_REMOVE).
>
> BTW, I am not sure if we need to hold i_sem and i_allocsem
> all the way ? I wanted to be safe - but this may be overkill ?
While looking into this, I probably found another problem, a race with 
install_page(), which doesn't use the seqlock-style check we use for 
everything else (aka do_no_page) but simply assumes a page is valid if its 
index is below the current file size.

This is clearly "truncate" specific, and is already racy. Suppose I truncate a 
file and reduce its size, and then re-extend it, the page which I previously 
fetched from the cache is invalid. The current install_page code generates 
corruption.

In fact the page is fetched from the caller of install_page and passed to it.

This affects anybody using MAP_POPULATE or using remap_file_pages.

> +       /* XXX - Do we need both i_sem and i_allocsem all the way ? */
> +       down(&inode->i_sem);
> +       down_write(&inode->i_alloc_sem);
> +       unmap_mapping_range(mapping, offset, (end - offset), 1);
In my opinion, as already said, unmap_mapping_range can be called without 
these two locks, as it operates only on mappings for the file.

However currently it's called with these locks held in vmtruncate, but I think 
the locks are held in that case only because we need to truncate the file, 
and are held in excess across this call as well.

Instead, we need to protect against concurrent faults on the mapping (not 
against concurrent mmaps)...and that is done through (struct address_space*) 
mapping->truncate_count.

=====
Finally, there is MAP_POPULATE and other pre-faulting, i.e. install_page (no, 
this is not peculiar to VM_NONLINEAR, even if some code is shared), but 
install_page checks explicitly for truncation; the problem is that the check 
is rather bogus, compared to the rest of checks:

        /*
         * This page may have been truncated. Tell the
         * caller about it.
         */
        err = -EINVAL;
        inode = vma->vm_file->f_mapping->host;
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (!page->mapping || page->index >= size)
                goto err_unlock;

I remember there being a BUG_ON and Linus fixing it up.

It should be converted to the normal checks used for the rest 
(->truncate_count based - see do_no_page()).

To do so, the caller (*_populate) needs to call again *_getpage, if 
install_page detects a race, but it must also save and pass the 
truncate_count.

So, we probably need to clean up and merge {filemap,shmem}_populate,
because the only real difference between them is the function
called to look up the page from disk ({shmem,filemap}_getpage).

So, we should replace struct vm_operations_struct "populate" method with a 
"getpage" method, by using the shmem_getpage prototype, which is better 
engineered, see my comment:

        page = filemap_getpage(file, pgoff, nonblock);

        /* XXX: This is wrong, a filesystem I/O error may have happened. Fix
         * that as done in shmem_populate calling shmem_getpage */
        if (!page && !nonblock)
                return -ENOMEM;

> +       truncate_inode_pages_range(mapping, offset, end);
> +       inode->i_op->truncate_range(inode, offset, end);
> +       up_write(&inode->i_alloc_sem);
> +       up(&inode->i_sem);

-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE))
  2005-11-02 19:54                     ` New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)) Blaisorblade
@ 2005-11-02 20:12                       ` Hugh Dickins
  2005-11-02 20:45                         ` Hugh Dickins
  2005-11-02 21:36                       ` Badari Pulavarty
  1 sibling, 1 reply; 86+ messages in thread
From: Hugh Dickins @ 2005-11-02 20:12 UTC (permalink / raw)
  To: Blaisorblade
  Cc: Badari Pulavarty, Andrea Arcangeli, lkml, akpm, dvhltc, linux-mm,
	Jeff Dike

On Wed, 2 Nov 2005, Blaisorblade wrote:
> While looking into this, I probably found another problem, a race with 
> install_page(), which doesn't use the seqlock-style check we use for 
> everything else (aka do_no_page) but simply assumes a page is valid if its 
> index is below the current file size.
> 
> This is clearly "truncate" specific, and is already racy. Suppose I truncate a 
> file and reduce its size, and then re-extend it, the page which I previously 
> fetched from the cache is invalid. The current install_page code generates 
> corruption.

No, it should be fine as is (unless perhaps some barrier is needed).

The check
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (!page->mapping || page->index >= size)
		goto err_unlock;
handles the case that worries you: page->mapping will be NULL.

do_no_page has to do the more complicated truncate_count business because
it deals with all kinds of ->nopage, some of which leave page->mapping NULL:
so it's unable to distinguish one where the driver left it NULL from one
where truncation has suddenly made it NULL.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE))
  2005-11-02 20:12                       ` Hugh Dickins
@ 2005-11-02 20:45                         ` Hugh Dickins
  0 siblings, 0 replies; 86+ messages in thread
From: Hugh Dickins @ 2005-11-02 20:45 UTC (permalink / raw)
  To: Blaisorblade
  Cc: Badari Pulavarty, Andrea Arcangeli, lkml, akpm, dvhltc, linux-mm,
	Jeff Dike

On Wed, 2 Nov 2005, Hugh Dickins wrote:
> On Wed, 2 Nov 2005, Blaisorblade wrote:
> 
> No, it should be fine as is (unless perhaps some barrier is needed).

We already have the barrier needed: we're holding page_table_lock (pte lock).

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE))
  2005-11-02 19:54                     ` New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)) Blaisorblade
  2005-11-02 20:12                       ` Hugh Dickins
@ 2005-11-02 21:36                       ` Badari Pulavarty
  2005-11-02 21:55                         ` Hugh Dickins
  1 sibling, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-11-02 21:36 UTC (permalink / raw)
  To: Blaisorblade
  Cc: Andrea Arcangeli, lkml, Hugh Dickins, akpm, dvhltc, linux-mm,
	Jeff Dike

On Wed, 2005-11-02 at 20:54 +0100, Blaisorblade wrote:
> On Wednesday 02 November 2005 17:12, Badari Pulavarty wrote:
> > Hi Andrew & Andrea,
> >
> > Here is the updated patch with name change again :(
> > Hopefully this would be final. (MADV_REMOVE).
> >
> > BTW, I am not sure if we need to hold i_sem and i_allocsem
> > all the way ? I wanted to be safe - but this may be overkill ?
> While looking into this, I probably found another problem, a race with 
> install_page(), which doesn't use the seqlock-style check we use for 
> everything else (aka do_no_page) but simply assumes a page is valid if its 
> index is below the current file size.
> 
> This is clearly "truncate" specific, and is already racy. Suppose I truncate a 
> file and reduce its size, and then re-extend it, the page which I previously 
> fetched from the cache is invalid. The current install_page code generates 
> corruption.
> 
> In fact the page is fetched from the caller of install_page and passed to it.
> 
> This affects anybody using MAP_POPULATE or using remap_file_pages.
> 
> > +       /* XXX - Do we need both i_sem and i_allocsem all the way ? */
> > +       down(&inode->i_sem);
> > +       down_write(&inode->i_alloc_sem);
> > +       unmap_mapping_range(mapping, offset, (end - offset), 1);
> In my opinion, as already said, unmap_mapping_range can be called without 
> these two locks, as it operates only on mappings for the file.
> 
> However currently it's called with these locks held in vmtruncate, but I think 
> the locks are held in that case only because we need to truncate the file, 
> and are hold in excess also across this call.

I agree, I can push the locking down to cover only ->truncate_range - if
no one has objections. (But again, it is such a special case - does anyone
really care about the performance of this interface?)

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE))
  2005-11-02 21:36                       ` Badari Pulavarty
@ 2005-11-02 21:55                         ` Hugh Dickins
  2005-11-02 22:02                           ` Badari Pulavarty
  0 siblings, 1 reply; 86+ messages in thread
From: Hugh Dickins @ 2005-11-02 21:55 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Blaisorblade, Andrea Arcangeli, lkml, akpm, dvhltc, linux-mm,
	Jeff Dike

On Wed, 2 Nov 2005, Badari Pulavarty wrote:
> On Wed, 2005-11-02 at 20:54 +0100, Blaisorblade wrote:
> > > +       /* XXX - Do we need both i_sem and i_allocsem all the way ? */
> > > +       down(&inode->i_sem);
> > > +       down_write(&inode->i_alloc_sem);
> > > +       unmap_mapping_range(mapping, offset, (end - offset), 1);
> > In my opinion, as already said, unmap_mapping_range can be called without 
> > these two locks, as it operates only on mappings for the file.
> > 
> > However currently it's called with these locks held in vmtruncate, but I think 
> > the locks are held in that case only because we need to truncate the file, 
> > and are hold in excess also across this call.
> 
> I agree, I can push down the locking only for ->truncate_range - if
> no one has objections. (But again, it so special case - no one really
> cares about the performance of this interface ?).

I can't remember why i_alloc_sem got introduced, and don't have time to
work it out: something to do with direct I/O races, perhaps?  Someone
else must advise, perhaps you will be able to drop that one.

But I think you'd be very unwise to drop i_sem too.  i_mmap_lock gets
dropped whenever preemption demands here, i_sem is what's preventing
someone else coming along and doing a concurrent truncate or remove.
You don't want that.

Sorry, I've not yet had time to study your patch: I do intend to,
but cannot promise when.  I fear it won't be as easy as making
these occasional responses.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE))
  2005-11-02 21:55                         ` Hugh Dickins
@ 2005-11-02 22:02                           ` Badari Pulavarty
  0 siblings, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2005-11-02 22:02 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Blaisorblade, Andrea Arcangeli, lkml, akpm, dvhltc, linux-mm,
	Jeff Dike

On Wed, 2005-11-02 at 21:55 +0000, Hugh Dickins wrote:
> On Wed, 2 Nov 2005, Badari Pulavarty wrote:
> > On Wed, 2005-11-02 at 20:54 +0100, Blaisorblade wrote:
> > > > +       /* XXX - Do we need both i_sem and i_allocsem all the way ? */
> > > > +       down(&inode->i_sem);
> > > > +       down_write(&inode->i_alloc_sem);
> > > > +       unmap_mapping_range(mapping, offset, (end - offset), 1);
> > > In my opinion, as already said, unmap_mapping_range can be called without 
> > > these two locks, as it operates only on mappings for the file.
> > > 
> > > However currently it's called with these locks held in vmtruncate, but I think 
> > > the locks are held in that case only because we need to truncate the file, 
> > > and are hold in excess also across this call.
> > 
> > I agree, I can push down the locking only for ->truncate_range - if
> > no one has objections. (But again, it so special case - no one really
> > cares about the performance of this interface ?).
> 
> I can't remember why i_alloc_sem got introduced, and don't have time to
> work it out: something to do with direct I/O races, perhaps?  Someone
> else must advise, perhaps you will be able to drop that one.

Yep. i_alloc_sem is supposed to protect DIO races with truncate.

> But I think you'd be very unwise to drop i_sem too.  i_mmap_lock gets
> dropped whenever preemption demands here, i_sem is what's preventing
> someone else coming along and doing a concurrent truncate or remove.
> You don't want that.
> 
> Sorry, I've not yet had time to study your patch: I do intend to,
> but cannot promise when.  I fear it won't be as easy as making
> these occasional responses.

Thanks Hugh. For now, I will leave those locks alone. We can re-visit
later, if we really care about the performance of this interface.
Better be safe than sorry.

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)
  2005-11-02 16:12                   ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Badari Pulavarty
  2005-11-02 19:54                     ` New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)) Blaisorblade
@ 2005-11-12  0:25                     ` Andrew Morton
  2005-11-12  0:34                       ` Badari Pulavarty
  2005-11-12  0:34                     ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Andrew Morton
  2 siblings, 1 reply; 86+ messages in thread
From: Andrew Morton @ 2005-11-12  0:25 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: andrea, linux-kernel, hugh, dvhltc, linux-mm, blaisorblade, jdike

Badari Pulavarty <pbadari@us.ibm.com> wrote:
>
> +/*
>  + * Application wants to free up the pages and associated backing store. 
>  + * This is effectively punching a hole into the middle of a file.
>  + *
>  + * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
>  + * Other filesystems return -ENOSYS.
>  + */
>  +static long madvise_remove(struct vm_area_struct * vma,
>  +			     unsigned long start, unsigned long end)
>  +{
>  +	struct address_space *mapping;
>  +        loff_t offset, endoff;
>  +
>  +	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 
>  +		return -EINVAL;
>  +
>  +	if (!vma->vm_file || !vma->vm_file->f_mapping 
>  +		|| !vma->vm_file->f_mapping->host) {
>  +			return -EINVAL;
>  +	}
>  +
>  +	mapping = vma->vm_file->f_mapping;
>  +	if (mapping == &swapper_space) {
>  +		return -EINVAL;
>  +	}
>  +
>  +	offset = (loff_t)(start - vma->vm_start) 
>  +			+ (vma->vm_pgoff << PAGE_SHIFT);
>  +	endoff = (loff_t)(end - vma->vm_start - 1) 
>  +			+ (vma->vm_pgoff << PAGE_SHIFT);
>  +	return  vmtruncate_range(mapping->host, offset, endoff);
>  +}
>  +

I'm suspecting you tested this on a 64-bit machine, yes?  On 32-bit that
vm_pgoff shift is going to overflow.  

Fixes-thus-far below.   Please rerun all tests on x86?

Why does madvise_remove() have an explicit check for swapper_space?

In your testing, how are you determining that the code is successfully
removing the correct number of pages, from the correct file offset?


diff -puN mm/madvise.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy mm/madvise.c
--- devel/mm/madvise.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy	2005-11-11 16:12:43.000000000 -0800
+++ devel-akpm/mm/madvise.c	2005-11-11 16:16:50.000000000 -0800
@@ -147,8 +147,8 @@ static long madvise_dontneed(struct vm_a
  * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
  * Other filesystems return -ENOSYS.
  */
-static long madvise_remove(struct vm_area_struct * vma,
-			     unsigned long start, unsigned long end)
+static long madvise_remove(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
 {
 	struct address_space *mapping;
         loff_t offset, endoff;
@@ -162,14 +162,13 @@ static long madvise_remove(struct vm_are
 	}
 
 	mapping = vma->vm_file->f_mapping;
-	if (mapping == &swapper_space) {
+	if (mapping == &swapper_space)
 		return -EINVAL;
-	}
 
 	offset = (loff_t)(start - vma->vm_start)
-			+ (vma->vm_pgoff << PAGE_SHIFT);
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 	endoff = (loff_t)(end - vma->vm_start - 1)
-			+ (vma->vm_pgoff << PAGE_SHIFT);
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 	return  vmtruncate_range(mapping->host, offset, endoff);
 }
 
diff -puN mm/memory.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy mm/memory.c
--- devel/mm/memory.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy	2005-11-11 16:16:54.000000000 -0800
+++ devel-akpm/mm/memory.c	2005-11-11 16:17:59.000000000 -0800
@@ -1608,10 +1608,9 @@ out_big:
 out_busy:
 	return -ETXTBSY;
 }
-
 EXPORT_SYMBOL(vmtruncate);
 
-int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end)
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 {
 	struct address_space *mapping = inode->i_mapping;
 
@@ -1634,7 +1633,6 @@ int vmtruncate_range(struct inode * inod
 
 	return 0;
 }
-
 EXPORT_SYMBOL(vmtruncate_range);
 
 /* 
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)
  2005-11-02 16:12                   ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Badari Pulavarty
  2005-11-02 19:54                     ` New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)) Blaisorblade
  2005-11-12  0:25                     ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Andrew Morton
@ 2005-11-12  0:34                     ` Andrew Morton
  2 siblings, 0 replies; 86+ messages in thread
From: Andrew Morton @ 2005-11-12  0:34 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: andrea, linux-kernel, hugh, dvhltc, linux-mm, blaisorblade, jdike

Badari Pulavarty <pbadari@us.ibm.com> wrote:
>
> +int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end)
>  +{
>  +	struct address_space *mapping = inode->i_mapping;
>  +
>  +	/*
>  +	 * If the underlying filesystem is not going to provide 
>  +	 * a way to truncate a range of blocks (punch a hole) - 
>  +	 * we should return failure right now.
>  +	 */
>  +	if (!inode->i_op || !inode->i_op->truncate_range)
>  +		return -ENOSYS;
>  +		
>  +	/* XXX - Do we need both i_sem and i_allocsem all the way ? */
>  +	down(&inode->i_sem);
>  +	down_write(&inode->i_alloc_sem);
>  +	unmap_mapping_range(mapping, offset, (end - offset), 1);
>  +	truncate_inode_pages_range(mapping, offset, end);
>  +	inode->i_op->truncate_range(inode, offset, end);
>  +	up_write(&inode->i_alloc_sem);
>  +	up(&inode->i_sem);
>  +
>  +	return 0;
>  +}

Yes, we need to take i_alloc_sem for writing.  To prevent concurrent
direct-io reads from coming in and instantiated by unwritten blocks.

tmpfs doesn't implement direct-io though.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)
  2005-11-12  0:25                     ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Andrew Morton
@ 2005-11-12  0:34                       ` Badari Pulavarty
  2005-11-12  1:43                         ` Andrew Morton
  0 siblings, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-11-12  0:34 UTC (permalink / raw)
  To: Andrew Morton; +Cc: andrea, lkml, hugh, dvhltc, linux-mm, blaisorblade, jdike

On Fri, 2005-11-11 at 16:25 -0800, Andrew Morton wrote:
> Badari Pulavarty <pbadari@us.ibm.com> wrote:
> >
> > +/*
> >  + * Application wants to free up the pages and associated backing store. 
> >  + * This is effectively punching a hole into the middle of a file.
> >  + *
> >  + * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
> >  + * Other filesystems return -ENOSYS.
> >  + */
> >  +static long madvise_remove(struct vm_area_struct * vma,
> >  +			     unsigned long start, unsigned long end)
> >  +{
> >  +	struct address_space *mapping;
> >  +        loff_t offset, endoff;
> >  +
> >  +	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 
> >  +		return -EINVAL;
> >  +
> >  +	if (!vma->vm_file || !vma->vm_file->f_mapping 
> >  +		|| !vma->vm_file->f_mapping->host) {
> >  +			return -EINVAL;
> >  +	}
> >  +
> >  +	mapping = vma->vm_file->f_mapping;
> >  +	if (mapping == &swapper_space) {
> >  +		return -EINVAL;
> >  +	}
> >  +
> >  +	offset = (loff_t)(start - vma->vm_start) 
> >  +			+ (vma->vm_pgoff << PAGE_SHIFT);
> >  +	endoff = (loff_t)(end - vma->vm_start - 1) 
> >  +			+ (vma->vm_pgoff << PAGE_SHIFT);
> >  +	return  vmtruncate_range(mapping->host, offset, endoff);
> >  +}
> >  +
> 
> I'm suspecting you tested this on a 64-bit machine, yes?  On 32-bit that
> vm_pgoff shift is going to overflow.  

Yes. I have moved to all 64-bit (amd64, em64t, ppc64) machines. My bad.

> 
> Fixes-thus-far below.   Please rerun all tests on x86?
> 

I will verify. Thanks.

> Why does madvise_remove() have an explicit check for swapper_space?

I really don't remember (I yanked the code from another kernel routine,
vmtruncate()). If you think it's unnecessary, I can take it out.

> In your testing, how are you determining that the code is successfully
> removing the correct number of pages, from the correct file offset?

I verified with test programs, added debug printk + looked through live
"crash" session + verified with UML testcases.

> 
> diff -puN mm/madvise.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy mm/madvise.c
> --- devel/mm/madvise.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy	2005-11-11 16:12:43.000000000 -0800
> +++ devel-akpm/mm/madvise.c	2005-11-11 16:16:50.000000000 -0800
> @@ -147,8 +147,8 @@ static long madvise_dontneed(struct vm_a
>   * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
>   * Other filesystems return -ENOSYS.
>   */
> -static long madvise_remove(struct vm_area_struct * vma,
> -			     unsigned long start, unsigned long end)
> +static long madvise_remove(struct vm_area_struct *vma,
> +				unsigned long start, unsigned long end)
>  {
>  	struct address_space *mapping;
>          loff_t offset, endoff;
> @@ -162,14 +162,13 @@ static long madvise_remove(struct vm_are
>  	}
>  
>  	mapping = vma->vm_file->f_mapping;
> -	if (mapping == &swapper_space) {
> +	if (mapping == &swapper_space)
>  		return -EINVAL;
> -	}
>  
>  	offset = (loff_t)(start - vma->vm_start)
> -			+ (vma->vm_pgoff << PAGE_SHIFT);
> +			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
>  	endoff = (loff_t)(end - vma->vm_start - 1)
> -			+ (vma->vm_pgoff << PAGE_SHIFT);
> +			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
>  	return  vmtruncate_range(mapping->host, offset, endoff);
>  }
>  
> diff -puN mm/memory.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy mm/memory.c
> --- devel/mm/memory.c~madvise-remove-remove-pages-from-tmpfs-shm-backing-store-tidy	2005-11-11 16:16:54.000000000 -0800
> +++ devel-akpm/mm/memory.c	2005-11-11 16:17:59.000000000 -0800
> @@ -1608,10 +1608,9 @@ out_big:
>  out_busy:
>  	return -ETXTBSY;
>  }
> -
>  EXPORT_SYMBOL(vmtruncate);
>  
> -int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end)
> +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
>  {
>  	struct address_space *mapping = inode->i_mapping;
>  
> @@ -1634,7 +1633,6 @@ int vmtruncate_range(struct inode * inod
>  
>  	return 0;
>  }
> -
>  EXPORT_SYMBOL(vmtruncate_range);
>  
>  /* 
> _
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)
  2005-11-12  0:34                       ` Badari Pulavarty
@ 2005-11-12  1:43                         ` Andrew Morton
  2005-11-12  4:41                           ` Badari Pulavarty
  0 siblings, 1 reply; 86+ messages in thread
From: Andrew Morton @ 2005-11-12  1:43 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: andrea, linux-kernel, hugh, dvhltc, linux-mm, blaisorblade, jdike

Badari Pulavarty <pbadari@us.ibm.com> wrote:
>
> > Why does madvise_remove() have an explicit check for swapper_space?
> 
> I really don't remember (I yanked code from some other kernel routine
> vmtruncate()).

I don't see such a thing anywhere.  vmtruncate() has the IS_SWAPFILE()
test, which I guess vmtruncate_range() ought to have too, for
future-safety.

Logically, vmtruncate() should just be a special case of vmtruncate_range().
But it's not - ugly, but hard to do anything about (need to implement
->truncate_range in all filesystems, but "know" which ones only support
->truncate_range() at eof).

> 
> > In your testing, how are you determining that the code is successfully
> > removing the correct number of pages, from the correct file offset?
> 
> I verified with test programs, added debug printk + looked through live
> "crash" session + verified with UML testcases.

OK, well please be sure to test it on 32-bit and 64-bit, operating in three
ranges of the file: <2G, 2G-4G and >4G.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)
  2005-11-12  1:43                         ` Andrew Morton
@ 2005-11-12  4:41                           ` Badari Pulavarty
  2006-01-16 13:06                             ` differences between MADV_FREE and MADV_DONTNEED Andrea Arcangeli
  0 siblings, 1 reply; 86+ messages in thread
From: Badari Pulavarty @ 2005-11-12  4:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: andrea, linux-kernel, hugh, dvhltc, linux-mm, blaisorblade, jdike

Andrew Morton wrote:
> Badari Pulavarty <pbadari@us.ibm.com> wrote:
> 
>>>Why does madvise_remove() have an explicit check for swapper_space?
>>
>>I really don't remember (I yanked code from some other kernel routine
>>vmtruncate()).
> 
> 
> I don't see such a thing anywhere.  vmtruncate() has the IS_SWAPFILE()
> test, which I guess vmtruncate_range() ought to have too, for
> future-safety.

Yep. That was the check. Since I don't have inode and have mapping
handy anyway, check was made using that. I could change it, if you wish.

> 
> Logically, vmtruncate() should just be a special case of vmtruncate_range().
> But it's not - ugly, but hard to do anything about (need to implement
> ->truncate_range in all filesystems, but "know" which ones only support
> ->truncate_range() at eof).
> 
> 
>>>In your testing, how are you determining that the code is successfully
>>>removing the correct number of pages, from the correct file offset?
>>
>>I verified with test programs, added debug printk + looked through live
>>"crash" session + verified with UML testcases.
> 
> 
> OK, well please be sure to test it on 32-bit and 64-bit, operating in three
> ranges of the file: <2G, 2G-4G amd >4G.
> 
Will do.

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* differences between MADV_FREE and MADV_DONTNEED
  2005-11-12  4:41                           ` Badari Pulavarty
@ 2006-01-16 13:06                             ` Andrea Arcangeli
  2006-01-16 16:02                               ` Suleiman Souhlal
  2006-01-17  1:06                               ` Blaisorblade
  0 siblings, 2 replies; 86+ messages in thread
From: Andrea Arcangeli @ 2006-01-16 13:06 UTC (permalink / raw)
  To: Badari Pulavarty
  Cc: Andrew Morton, linux-kernel, hugh, dvhltc, linux-mm, blaisorblade,
	jdike

Now that MADV_REMOVE is in, should we discuss MADV_FREE?

MADV_FREE in Solaris is destructive and only works on anonymous memory,
while MADV_DONTNEED seems to never be destructive (which I assume it
means it's a noop on anonymous memory).

Our MADV_DONTNEED is destructive on anonymous memory, while it's
non-destructive on file mappings.

Perhaps we could move the destructive anonymous part of MADV_DONTNEED to
MADV_FREE?

Or we could as well go relaxed and define MADV_FREE and MADV_DONTNEED
the same way (that still leaves the question if we risk to break apps
ported from solaris where MADV_DONTNEED is apparently always not
destructive).

I only read the docs, I don't know in practice what MADV_DONTNEED does
on solaris (does it return -EINVAL if run on anonymous memory or not?).

http://docs.sun.com/app/docs/doc/816-5168/6mbb3hrgk?a=view

BTW, I don't know how other specifications define MADV_FREE, but besides
MADV_REMOVE I've also got the request to provide MADV_FREE in linux,
this is why I'm asking. (right now I'm telling them to use #ifdef
__linux__ #define MADV_FREE MADV_DONTNEED but that's quite a hack since
it could break if we make MADV_DONTNEED non-destructive in the future)

Thanks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-16 13:06                             ` differences between MADV_FREE and MADV_DONTNEED Andrea Arcangeli
@ 2006-01-16 16:02                               ` Suleiman Souhlal
  2006-01-16 16:28                                 ` Andrea Arcangeli
  2006-01-17  1:06                               ` Blaisorblade
  1 sibling, 1 reply; 86+ messages in thread
From: Suleiman Souhlal @ 2006-01-16 16:02 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Badari Pulavarty, Andrew Morton, linux-kernel, hugh, dvhltc,
	linux-mm, blaisorblade, jdike

Andrea Arcangeli wrote:
> Now that MADV_REMOVE is in, should we discuss MADV_FREE?
> 
> MADV_FREE in Solaris is destructive and only works on anonymous memory,
> while MADV_DONTNEED seems to never be destructive (which I assume it
> means it's a noop on anonymous memory).

FWIW, in FreeBSD, MADV_DONTNEED is not destructive, and just makes pages 
(including anonymous ones) more likely to get swapped out.

> Our MADV_DONTNEED is destructive on anonymous memory, while it's
> non-destructive on file mappings.
> 
> Perhaps we could move the destructive anonymous part of MADV_DONTNEED to
> MADV_FREE?

This would seem like the best way to go, since it would bring Linux's 
behavior more in line with what other systems do.

> Or we could as well go relaxed and define MADV_FREE and MADV_DONTNEED
> the same way (that still leaves the question if we risk to break apps
> ported from solaris where MADV_DONTNEED is apparently always not
> destructive).
> 
> I only read the docs, I don't know in practice what MADV_DONTNEED does
> on solaris (does it return -EINVAL if run on anonymous memory or not?).
> 
> http://docs.sun.com/app/docs/doc/816-5168/6mbb3hrgk?a=view
> 
> BTW, I don't know how other specifications define MADV_FREE, but besides
> MADV_REMOVE I've also got the request to provide MADV_FREE in linux,
> this is why I'm asking. (right now I'm telling them to use #ifdef
> __linux__ #define MADV_FREE MADV_DONTNEED but that's quite an hack since
> it could break if we make MADV_DONTNEED non-destructive in the future)

FreeBSD's MADV_FREE only works on anonymous memory (it's a noop for 
vnode-backed memory), and marks the pages clean before moving them to 
the inactive queue, so that they can be freed or reused quickly, without 
causing a pagefault.

-- Suleiman

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-16 16:02                               ` Suleiman Souhlal
@ 2006-01-16 16:28                                 ` Andrea Arcangeli
  2006-01-16 17:03                                   ` Suleiman Souhlal
  0 siblings, 1 reply; 86+ messages in thread
From: Andrea Arcangeli @ 2006-01-16 16:28 UTC (permalink / raw)
  To: Suleiman Souhlal
  Cc: Badari Pulavarty, Andrew Morton, linux-kernel, hugh, dvhltc,
	linux-mm, blaisorblade, jdike

On Mon, Jan 16, 2006 at 08:02:07AM -0800, Suleiman Souhlal wrote:
> FWIW, in FreeBSD, MADV_DONTNEED is not destructive, and just makes pages 
> (including anonymous ones) more likely to get swapped out.

We can also use it for the same purpose, we could add the pages to
swapcache mark them dirty and zap the ptes _after_ that.

> This would seem like the best way to go, since it would bring Linux's 
> behavior more in line with what other systems do.

Agreed.

> FreeBSD's MADV_FREE only works on anonymous memory (it's a noop for 
> vnode-backed memory), and marks the pages clean before moving them to 
> the inactive queue, so that they can be freed or reused quickly, without 
> causing a pagefault.

Well, perhaps solaris is also a noop and not necessarily a -EINVAL, all
I know from the docs is "This value cannot be used on mappings that have
underlying file objects.", so I expected -EINVAL but it may be a noop.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-16 16:28                                 ` Andrea Arcangeli
@ 2006-01-16 17:03                                   ` Suleiman Souhlal
  2006-01-16 17:24                                     ` Andrea Arcangeli
  0 siblings, 1 reply; 86+ messages in thread
From: Suleiman Souhlal @ 2006-01-16 17:03 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Badari Pulavarty, Andrew Morton, linux-kernel, hugh, dvhltc,
	linux-mm, blaisorblade, jdike

Andrea Arcangeli wrote:

> We can also use it for the same purpose, we could add the pages to
> swapcache mark them dirty and zap the ptes _after_ that.

Wouldn't that cause the pages to get swapped out immediately?
If so, I don't think this would be the best approach: It would be better 
  to just move the pages to the inactive list, if they aren't there 
already, so that they get swapped out only when they really need to be.

-- Suleiman

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-16 17:03                                   ` Suleiman Souhlal
@ 2006-01-16 17:24                                     ` Andrea Arcangeli
  2006-01-16 21:43                                       ` Eric W. Biederman
  0 siblings, 1 reply; 86+ messages in thread
From: Andrea Arcangeli @ 2006-01-16 17:24 UTC (permalink / raw)
  To: Suleiman Souhlal
  Cc: Badari Pulavarty, Andrew Morton, linux-kernel, hugh, dvhltc,
	linux-mm, blaisorblade, jdike

On Mon, Jan 16, 2006 at 09:03:00AM -0800, Suleiman Souhlal wrote:
> Andrea Arcangeli wrote:
> 
> >We can also use it for the same purpose, we could add the pages to
> >swapcache mark them dirty and zap the ptes _after_ that.
> 
> Wouldn't that cause the pages to get swapped out immediately?

Not really, it would be a non blocking operation. But they could be
swapped out shortly later (that's the whole point of DONTNEED, right?),
once there is more memory pressure. Otherwise if they're used again, a
minor fault will happen and it will find the swapcache uptodate in ram.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-16 17:24                                     ` Andrea Arcangeli
@ 2006-01-16 21:43                                       ` Eric W. Biederman
  2006-01-17  0:24                                         ` Suleiman Souhlal
  0 siblings, 1 reply; 86+ messages in thread
From: Eric W. Biederman @ 2006-01-16 21:43 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Suleiman Souhlal, Badari Pulavarty, Andrew Morton, linux-kernel,
	hugh, dvhltc, linux-mm, blaisorblade, jdike

Andrea Arcangeli <andrea@suse.de> writes:

> On Mon, Jan 16, 2006 at 09:03:00AM -0800, Suleiman Souhlal wrote:
>> Andrea Arcangeli wrote:
>> 
>> >We can also use it for the same purpose, we could add the pages to
>> >swapcache mark them dirty and zap the ptes _after_ that.
>> 
>> Wouldn't that cause the pages to get swapped out immediately?
>
> Not really, it would be a non blocking operation. But they could be
> swapped out shortly later (that's the whole point of DONTNEED, right?),
> once there is more memory pressure. Otherwise if they're used again, a
> minor fault will happen and it will find the swapcache uptodate in ram.

As I recall the logic with DONTNEED was to mark the mapping of
the page clean so the page didn't need to be swapped out, it could
just be dropped.

That is why the anonymous and the file backed cases differ.

Part of the point is to avoid the case of swapping the pages out if
the application doesn't care what is on them anymore.

Eric

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-16 21:43                                       ` Eric W. Biederman
@ 2006-01-17  0:24                                         ` Suleiman Souhlal
  2006-01-17  1:04                                           ` Nicholas Miell
  0 siblings, 1 reply; 86+ messages in thread
From: Suleiman Souhlal @ 2006-01-17  0:24 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrea Arcangeli, Badari Pulavarty, Andrew Morton, linux-kernel,
	hugh, dvhltc, linux-mm, blaisorblade, jdike

Eric W. Biederman wrote:
> As I recall the logic with DONTNEED was to mark the mapping of
> the page clean so the page didn't need to be swapped out, it could
> just be dropped.
> 
> That is why they anonymous and the file backed cases differ.
> 
> Part of the point is to avoid the case of swapping the pages out if
> the application doesn't care what is on them anymore.

Well, imho, MADV_DONTNEED should mean "I won't need this anytime soon", 
and MADV_FREE "I will never need this again".

-- Suleiman

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-17  0:24                                         ` Suleiman Souhlal
@ 2006-01-17  1:04                                           ` Nicholas Miell
  2006-01-17 12:43                                             ` Christoph Hellwig
  0 siblings, 1 reply; 86+ messages in thread
From: Nicholas Miell @ 2006-01-17  1:04 UTC (permalink / raw)
  To: Suleiman Souhlal
  Cc: Ulrich Drepper, Eric W. Biederman, Andrea Arcangeli,
	Badari Pulavarty, Andrew Morton, linux-kernel, hugh, dvhltc,
	linux-mm, blaisorblade, jdike

On Mon, 2006-01-16 at 16:24 -0800, Suleiman Souhlal wrote:
> Eric W. Biederman wrote:
> > As I recall the logic with DONTNEED was to mark the mapping of
> > the page clean so the page didn't need to be swapped out, it could
> > just be dropped.
> > 
> > That is why they anonymous and the file backed cases differ.
> > 
> > Part of the point is to avoid the case of swapping the pages out if
> > the application doesn't care what is on them anymore.
> 
> Well, imho, MADV_DONTNEED should mean "I won't need this anytime soon", 
> and MADV_FREE "I will never need this again".
> 

POSIX doesn't have a madvise(), but it does have a posix_madvise(), with
flags defined as follows:

POSIX_MADV_NORMAL
   Specifies that the application has no advice to give on its behavior
with respect to the specified range. It is the default characteristic if
no advice is given for a range of memory.
POSIX_MADV_SEQUENTIAL
   Specifies that the application expects to access the specified range
sequentially from lower addresses to higher addresses.
POSIX_MADV_RANDOM
   Specifies that the application expects to access the specified range
in a random order.
POSIX_MADV_WILLNEED
   Specifies that the application expects to access the specified range
in the near future.
POSIX_MADV_DONTNEED
   Specifies that the application expects that it will not access the
specified range in the near future.

Note that glibc forwards posix_madvise() directly to madvise(2), which
means that right now, POSIX conformant apps which use
posix_madvise(addr, len, POSIX_MADV_DONTNEED) are silently corrupting
data on Linux systems.

-- 
Nicholas Miell <nmiell@comcast.net>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-16 13:06                             ` differences between MADV_FREE and MADV_DONTNEED Andrea Arcangeli
  2006-01-16 16:02                               ` Suleiman Souhlal
@ 2006-01-17  1:06                               ` Blaisorblade
  2006-01-17  1:33                                 ` Andrea Arcangeli
  1 sibling, 1 reply; 86+ messages in thread
From: Blaisorblade @ 2006-01-17  1:06 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Badari Pulavarty, Andrew Morton, linux-kernel, hugh, dvhltc,
	linux-mm, jdike

On Monday 16 January 2006 14:06, Andrea Arcangeli wrote:
> Now that MADV_REMOVE is in, should we discuss MADV_FREE?

> MADV_FREE in Solaris is destructive and only works on anonymous memory,

I.e. it's a restriction of MADV_REMOVE. Is there anything conceivable relying 
on errors or no behaviour on file-backed memory? If relying on errors we 
could need an API, but if relying only on the NO-OP thing the correctness 
semantics are already implemented. I.e. data are retained on both Solaris 
MADV_FREE and Linux MADV_REMOVE for file-backed case, they get a different 
semantics for caching.

> while MADV_DONTNEED seems to never be destructive (which I assume it
> means it's a noop on anonymous memory).

It could be a "swap it out", as mentioned in Linux comments on our madvise 
semantics about "other Unices".

> Our MADV_DONTNEED is destructive on anonymous memory, while it's
> non-destructive on file mappings.

Indeed, not even that. See our madvise_dontneed() comment - dirty data are 
discarded in both cases, and the comment suggests msync(MS_INVALIDATE). It 
also speaks of "other implementation", which could also refer to Solaris.

> Perhaps we could move the destructive anonymous part of MADV_DONTNEED to
> MADV_FREE?

Why changing existing apps behaviour? That's nonsense, unless you have a 
standard. Indeed, however, posix_madvise exists, and its DONTNEED semantics 
are the Solaris ones. Don't know past behaviour about "breaking existing to 
comply to standards" (new syscall slot?).

> Or we could as well go relaxed and define MADV_FREE and MADV_DONTNEED
> the same way (that still leaves the question if we risk to break apps
> ported from solaris where MADV_DONTNEED is apparently always not
> destructive).

Provide our fine-grained semantics with new, not misunderstandable identifiers 
(MADV_FREE_DISCARD, MADV_FREE_CACHE, for instance).

For current names, libc could provide a "let user choose the meaning of 
things", like it does for signals with _BSD_SOURCE, _POSIX_SOURCE and so on.

> I only read the docs, I don't know in practice what MADV_DONTNEED does
> on solaris (does it return -EINVAL if run on anonymous memory or not?).

> http://docs.sun.com/app/docs/doc/816-5168/6mbb3hrgk?a=view

> BTW, I don't know how other specifications define MADV_FREE, but besides
> MADV_REMOVE I've also got the request to provide MADV_FREE in linux,
> this is why I'm asking. (right now I'm telling them to use #ifdef
> __linux__ #define MADV_FREE MADV_DONTNEED but that's quite an hack since
> it could break if we make MADV_DONTNEED non-destructive in the future)

Making their apps work by causing the same breakage to Linux apps is a better 
idea?

> Thanks.

-- 
Inform me of my mistakes, so I can keep imitating Homer Simpson's "Doh!".
Paolo Giarrusso, aka Blaisorblade (Skype ID "PaoloGiarrusso", ICQ 215621894)
http://www.user-mode-linux.org/~blaisorblade

	

	
		
___________________________________ 
Yahoo! Mail: gratis 1GB per i messaggi e allegati da 10MB 
http://mail.yahoo.it

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-17  1:06                               ` Blaisorblade
@ 2006-01-17  1:33                                 ` Andrea Arcangeli
  0 siblings, 0 replies; 86+ messages in thread
From: Andrea Arcangeli @ 2006-01-17  1:33 UTC (permalink / raw)
  To: Blaisorblade
  Cc: Badari Pulavarty, Andrew Morton, linux-kernel, hugh, dvhltc,
	linux-mm, jdike

On Tue, Jan 17, 2006 at 02:06:09AM +0100, Blaisorblade wrote:
> I.e. it's a restriction of MADV_REMOVE. Is there anything conceivable
> relying on errors or no behaviour on file-backed memory? If relying on
> errors we could need an API, but if relying only on the NO-OP thing the
> correctness semantics are already implemented. I.e. data are retained on both
> Solaris MADV_FREE and Linux MADV_REMOVE for file-backed case, they get a
> different semantics for caching.

Not sure to understand but merging MADV_REMOVE into MADV_FREE apparently
would break freebsd apps that might expect a noop instead. And it could
break Solaris apps if they expect a -EINVAL (though the latter is more
dubious, but I doubt making differences is worth it and if freebsd makes
it a noop I'd stick with the noop and leave MADV_REMOVE alone).

> are the Solaris ones. Don't know past behaviour about "breaking existing to 
> comply to standards" (new syscall slot?).

The change I suggested would be backwards compatible because it can only
affect performance.

The only thing that can break right now, is running a non-linux (and
apparently posix too) app on a linux system that will corrupt memory
with potential data loss.

> Provide our fine-grained semantics with new, not misunderstandable identifiers 
> (MADV_FREE_DISCARD, MADV_FREE_CACHE, for instance).

Why should we deviate for the sake of porting pain, when we can comply
at no tangible risk for us?

> Making their apps work by causing the same breakage to Linux apps is a better 
> idea?

Again: if an app breaks it means it's working by pure luck because it's
depending on fragile timings in the first place.

Call it a potential lower performance or less efficient memory
utilization, a breakage not.

If we were to make MADV_DONTNEED more aggressive, then we'd be risking a
breakage, but we're going to relax it instead.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-17  1:04                                           ` Nicholas Miell
@ 2006-01-17 12:43                                             ` Christoph Hellwig
  2006-01-17 18:23                                               ` Eric W. Biederman
  2006-01-17 19:06                                               ` Badari Pulavarty
  0 siblings, 2 replies; 86+ messages in thread
From: Christoph Hellwig @ 2006-01-17 12:43 UTC (permalink / raw)
  To: Nicholas Miell
  Cc: Suleiman Souhlal, Ulrich Drepper, Eric W. Biederman,
	Andrea Arcangeli, Badari Pulavarty, Andrew Morton, linux-kernel,
	hugh, dvhltc, linux-mm, blaisorblade, jdike

On Mon, Jan 16, 2006 at 05:04:07PM -0800, Nicholas Miell wrote:
> On Mon, 2006-01-16 at 16:24 -0800, Suleiman Souhlal wrote:
> > Eric W. Biederman wrote:
> > > As I recall the logic with DONTNEED was to mark the mapping of
> > > the page clean so the page didn't need to be swapped out, it could
> > > just be dropped.
> > > 
> > > That is why they anonymous and the file backed cases differ.
> > > 
> > > Part of the point is to avoid the case of swapping the pages out if
> > > the application doesn't care what is on them anymore.
> > 
> > Well, imho, MADV_DONTNEED should mean "I won't need this anytime soon", 
> > and MADV_FREE "I will never need this again".
> > 
> 
> POSIX doesn't have a madvise(), but it does have a posix_madvise(), with
> flags defined as follows:
> 
> POSIX_MADV_NORMAL
>    Specifies that the application has no advice to give on its behavior
> with respect to the specified range. It is the default characteristic if
> no advice is given for a range of memory.
> POSIX_MADV_SEQUENTIAL
>    Specifies that the application expects to access the specified range
> sequentially from lower addresses to higher addresses.
> POSIX_MADV_RANDOM
>    Specifies that the application expects to access the specified range
> in a random order.
> POSIX_MADV_WILLNEED
>    Specifies that the application expects to access the specified range
> in the near future.
> POSIX_MADV_DONTNEED
>    Specifies that the application expects that it will not access the
> specified range in the near future.
> 
> Note that glibc forwards posix_madvise() directly to madvise(2), which
> means that right now, POSIX conformant apps which use
> posix_madvise(addr, len, POSIX_MADV_DONTNEED) are silently corrupting
> data on Linux systems.

Does our MADV_DONTNEED numerical value match glibc's POSIX_MADV_DONTNEED?

In either case I'd say we should backout this patch for now.  We should
implement a real MADV_DONTNEED and rename the current one to MADV_FREE,
but that's 2.6.17 material.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-17 12:43                                             ` Christoph Hellwig
@ 2006-01-17 18:23                                               ` Eric W. Biederman
  2006-01-17 22:55                                                 ` Nicholas Miell
  2007-03-01 18:11                                                 ` Samuel Thibault
  2006-01-17 19:06                                               ` Badari Pulavarty
  1 sibling, 2 replies; 86+ messages in thread
From: Eric W. Biederman @ 2006-01-17 18:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Nicholas Miell, Suleiman Souhlal, Ulrich Drepper,
	Andrea Arcangeli, Badari Pulavarty, Andrew Morton, linux-kernel,
	hugh, dvhltc, linux-mm, blaisorblade, jdike

Christoph Hellwig <hch@infradead.org> writes:

> On Mon, Jan 16, 2006 at 05:04:07PM -0800, Nicholas Miell wrote:
>> On Mon, 2006-01-16 at 16:24 -0800, Suleiman Souhlal wrote:
>> > Eric W. Biederman wrote:
>> > > As I recall the logic with DONTNEED was to mark the mapping of
>> > > the page clean so the page didn't need to be swapped out, it could
>> > > just be dropped.
>> > > 
>> > > That is why they anonymous and the file backed cases differ.
>> > > 
>> > > Part of the point is to avoid the case of swapping the pages out if
>> > > the application doesn't care what is on them anymore.
>> > 
>> > Well, imho, MADV_DONTNEED should mean "I won't need this anytime soon", 
>> > and MADV_FREE "I will never need this again".
>> > 
>> 
>> POSIX doesn't have a madvise(), but it does have a posix_madvise(), with
>> flags defined as follows:
>> 
>> POSIX_MADV_NORMAL
>>    Specifies that the application has no advice to give on its behavior
>> with respect to the specified range. It is the default characteristic if
>> no advice is given for a range of memory.
>> POSIX_MADV_SEQUENTIAL
>>    Specifies that the application expects to access the specified range
>> sequentially from lower addresses to higher addresses.
>> POSIX_MADV_RANDOM
>>    Specifies that the application expects to access the specified range
>> in a random order.
>> POSIX_MADV_WILLNEED
>>    Specifies that the application expects to access the specified range
>> in the near future.
>> POSIX_MADV_DONTNEED
>>    Specifies that the application expects that it will not access the
>> specified range in the near future.
>> 
>> Note that glibc forwards posix_madvise() directly to madvise(2), which
>> means that right now, POSIX conformant apps which use
>> posix_madvise(addr, len, POSIX_MADV_DONTNEED) are silently corrupting
>> data on Linux systems.
>
> Does our MAD_DONTNEED numerical value match glibc's POSIX_MADV_DONTNEED?
>
> In either case I'd say we should backout this patch for now.  We should
> implement a real MADV_DONTNEED and rename the current one to MADV_FREE,
> but that's 2.6.17 material.

We definitely need to check this.  I am fairly certain I have seen this conversation
before.

Eric


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-17 12:43                                             ` Christoph Hellwig
  2006-01-17 18:23                                               ` Eric W. Biederman
@ 2006-01-17 19:06                                               ` Badari Pulavarty
  1 sibling, 0 replies; 86+ messages in thread
From: Badari Pulavarty @ 2006-01-17 19:06 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Nicholas Miell, Suleiman Souhlal, Ulrich Drepper,
	Eric W. Biederman, Andrea Arcangeli, Andrew Morton, lkml, hugh,
	dvhltc, linux-mm, blaisorblade, jdike

On Tue, 2006-01-17 at 12:43 +0000, Christoph Hellwig wrote:
> On Mon, Jan 16, 2006 at 05:04:07PM -0800, Nicholas Miell wrote:
> > On Mon, 2006-01-16 at 16:24 -0800, Suleiman Souhlal wrote:
> > > Eric W. Biederman wrote:
> > > > As I recall the logic with DONTNEED was to mark the mapping of
> > > > the page clean so the page didn't need to be swapped out, it could
> > > > just be dropped.
> > > > 
> > > > That is why they anonymous and the file backed cases differ.
> > > > 
> > > > Part of the point is to avoid the case of swapping the pages out if
> > > > the application doesn't care what is on them anymore.
> > > 
> > > Well, imho, MADV_DONTNEED should mean "I won't need this anytime soon", 
> > > and MADV_FREE "I will never need this again".
> > > 
> > 
> > POSIX doesn't have a madvise(), but it does have a posix_madvise(), with
> > flags defined as follows:
> > 
> > POSIX_MADV_NORMAL
> >    Specifies that the application has no advice to give on its behavior
> > with respect to the specified range. It is the default characteristic if
> > no advice is given for a range of memory.
> > POSIX_MADV_SEQUENTIAL
> >    Specifies that the application expects to access the specified range
> > sequentially from lower addresses to higher addresses.
> > POSIX_MADV_RANDOM
> >    Specifies that the application expects to access the specified range
> > in a random order.
> > POSIX_MADV_WILLNEED
> >    Specifies that the application expects to access the specified range
> > in the near future.
> > POSIX_MADV_DONTNEED
> >    Specifies that the application expects that it will not access the
> > specified range in the near future.
> > 
> > Note that glibc forwards posix_madvise() directly to madvise(2), which
> > means that right now, POSIX conformant apps which use
> > posix_madvise(addr, len, POSIX_MADV_DONTNEED) are silently corrupting
> > data on Linux systems.
> 
> Does our MAD_DONTNEED numerical value match glibc's POSIX_MADV_DONTNEED?
> 
> In either case I'd say we should backout this patch for now.  We should
> implement a real MADV_DONTNEED and rename the current one to MADV_FREE,
> but that's 2.6.17 material.

Christoph,

What patch are you recommending backing out ? 

Thanks,
Badari

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-17 18:23                                               ` Eric W. Biederman
@ 2006-01-17 22:55                                                 ` Nicholas Miell
  2007-03-01 18:11                                                 ` Samuel Thibault
  1 sibling, 0 replies; 86+ messages in thread
From: Nicholas Miell @ 2006-01-17 22:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Christoph Hellwig, Suleiman Souhlal, Ulrich Drepper,
	Andrea Arcangeli, Badari Pulavarty, Andrew Morton, linux-kernel,
	hugh, dvhltc, linux-mm, blaisorblade, jdike

On Tue, 2006-01-17 at 11:23 -0700, Eric W. Biederman wrote:
> Christoph Hellwig <hch@infradead.org> writes:
> > On Mon, Jan 16, 2006 at 05:04:07PM -0800, Nicholas Miell wrote:
> >> On Mon, 2006-01-16 at 16:24 -0800, Suleiman Souhlal wrote:
> >> > Well, imho, MADV_DONTNEED should mean "I won't need this anytime soon", 
> >> > and MADV_FREE "I will never need this again".
> >> > 
> >> 
> >> POSIX doesn't have a madvise(), but it does have a posix_madvise(), with
> >> flags defined as follows:
> >> 
> >> POSIX_MADV_NORMAL
> >>    Specifies that the application has no advice to give on its behavior
> >> with respect to the specified range. It is the default characteristic if
> >> no advice is given for a range of memory.
> >> POSIX_MADV_SEQUENTIAL
> >>    Specifies that the application expects to access the specified range
> >> sequentially from lower addresses to higher addresses.
> >> POSIX_MADV_RANDOM
> >>    Specifies that the application expects to access the specified range
> >> in a random order.
> >> POSIX_MADV_WILLNEED
> >>    Specifies that the application expects to access the specified range
> >> in the near future.
> >> POSIX_MADV_DONTNEED
> >>    Specifies that the application expects that it will not access the
> >> specified range in the near future.
> >> 
> >> Note that glibc forwards posix_madvise() directly to madvise(2), which
> >> means that right now, POSIX conformant apps which use
> >> posix_madvise(addr, len, POSIX_MADV_DONTNEED) are silently corrupting
> >> data on Linux systems.
> >
> > Does our MADV_DONTNEED numerical value match glibc's POSIX_MADV_DONTNEED?
> >
> > In either case I'd say we should backout this patch for now.  We should
> > implement a real MADV_DONTNEED and rename the current one to MADV_FREE,
> > but that's 2.6.17 material.
> 
> We definitely need to check this.  I am fairly certain I have seen this conversation
> before.

Yes, POSIX_MADV_* have the same values as MADV_*. And if you're trying
to find the actual implementation of posix_madvise() to verify its
behavior, it is generated by script from a line in
libc/sysdeps/unix/sysv/linux/syscalls.list.

-- 
Nicholas Miell <nmiell@comcast.net>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

* Re: differences between MADV_FREE and MADV_DONTNEED
  2006-01-17 18:23                                               ` Eric W. Biederman
  2006-01-17 22:55                                                 ` Nicholas Miell
@ 2007-03-01 18:11                                                 ` Samuel Thibault
  1 sibling, 0 replies; 86+ messages in thread
From: Samuel Thibault @ 2007-03-01 18:11 UTC (permalink / raw)
  To: linux-kernel, linux-mm

Hi,

Eric wrote:
> > We should implement a real MADV_DONTNEED and rename the current one
> > to MADV_FREE, but that's 2.6.17 material.
> 
> We definitely need to check this.  I am fairly certain I have seen
> this conversation before.

Yes, it was back in 2005:
http://marc.theaimsgroup.com/?l=linux-kernel&m=111996850004771&w=2

Nobody took the time to fix it, so I filed bug #6282 on bugzilla.kernel.org
some time ago.

Samuel

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 86+ messages in thread

end of thread, other threads:[~2007-03-01 18:48 UTC | newest]

Thread overview: 86+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-10-26 22:49 [RFC] madvise(MADV_TRUNCATE) Badari Pulavarty
2005-10-27  8:38 ` Andi Kleen
2005-10-27 13:17   ` Andrea Arcangeli
2005-10-27 15:00     ` Badari Pulavarty
2005-10-27 15:11       ` Andrea Arcangeli
2005-10-27 18:20         ` Andrew Morton
2005-10-27 18:35           ` Badari Pulavarty
2005-10-27 18:50             ` Andrew Morton
2005-10-27 19:40               ` Gerrit Huizenga
2005-10-27 19:56                 ` Andi Kleen
2005-10-27 23:21                   ` Darren Hart
2005-10-27 20:05               ` Theodore Ts'o
2005-10-27 20:16                 ` Andrea Arcangeli
2005-10-28  1:42                 ` Badari Pulavarty
2005-10-28 16:33                   ` Theodore Ts'o
2005-10-27 20:22               ` Jeff Dike
2005-10-27 20:04           ` Andrea Arcangeli
2005-10-27 20:50             ` Andrew Morton
2005-10-27 21:37               ` Andrea Arcangeli
2005-10-27 22:23                 ` Andrew Morton
2005-10-27 23:05                   ` Badari Pulavarty
2005-10-27 23:16                     ` Andrew Morton
2005-10-27 23:33                       ` Peter Chubb
2005-10-28  0:22                   ` Andrea Arcangeli
2005-10-28  0:32                     ` Andrew Morton
2005-10-28  1:10                       ` Andrea Arcangeli
2005-10-28  1:27                       ` Badari Pulavarty
2005-10-28  2:00                         ` Andrew Morton
2005-10-27 22:32               ` Badari Pulavarty
2005-10-27 23:28             ` Peter Chubb
2005-10-27 23:49               ` Andrew Morton
2005-10-27 23:56                 ` Nathan Scott
2005-10-28  0:15                   ` Andrea Arcangeli
2005-10-27 23:59                 ` Peter Chubb
2005-10-28  3:46 ` Jeff Dike
2005-10-28 11:03   ` Blaisorblade
2005-10-28 13:29     ` Andrea Arcangeli
2005-10-28 16:56       ` Blaisorblade
2005-10-28 16:16     ` Badari Pulavarty
2005-10-28 18:40       ` Blaisorblade
2005-10-28 18:56         ` Badari Pulavarty
2005-10-29  0:35         ` Badari Pulavarty
2005-10-28 16:19   ` Badari Pulavarty
2005-10-28 17:10     ` Blaisorblade
2005-10-28 18:28       ` Jeff Dike
2005-10-28 18:44         ` Blaisorblade
2005-10-28 18:42     ` Jeff Dike
2005-10-28 18:54       ` Badari Pulavarty
2005-10-29  0:03       ` Badari Pulavarty
2005-10-29  2:51         ` Jeff Dike
2005-10-31 16:34           ` Badari Pulavarty
2005-10-31 19:15           ` Badari Pulavarty
2005-10-31 19:49           ` [RFC][PATCH] madvise(MADV_TRUNCATE) Badari Pulavarty
2005-11-01  0:05             ` Jeff Dike
2005-11-02  1:15               ` [PATCH] 2.6.14 patch for supporting madvise(MADV_FREE) Badari Pulavarty
2005-11-02  1:43                 ` Andrea Arcangeli
2005-11-02 15:49                   ` Badari Pulavarty
2005-11-02 16:12                   ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Badari Pulavarty
2005-11-02 19:54                     ` New bug in patch and existing Linux code - race with install_page() (was: Re: [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE)) Blaisorblade
2005-11-02 20:12                       ` Hugh Dickins
2005-11-02 20:45                         ` Hugh Dickins
2005-11-02 21:36                       ` Badari Pulavarty
2005-11-02 21:55                         ` Hugh Dickins
2005-11-02 22:02                           ` Badari Pulavarty
2005-11-12  0:25                     ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Andrew Morton
2005-11-12  0:34                       ` Badari Pulavarty
2005-11-12  1:43                         ` Andrew Morton
2005-11-12  4:41                           ` Badari Pulavarty
2006-01-16 13:06                             ` differences between MADV_FREE and MADV_DONTNEED Andrea Arcangeli
2006-01-16 16:02                               ` Suleiman Souhlal
2006-01-16 16:28                                 ` Andrea Arcangeli
2006-01-16 17:03                                   ` Suleiman Souhlal
2006-01-16 17:24                                     ` Andrea Arcangeli
2006-01-16 21:43                                       ` Eric W. Biederman
2006-01-17  0:24                                         ` Suleiman Souhlal
2006-01-17  1:04                                           ` Nicholas Miell
2006-01-17 12:43                                             ` Christoph Hellwig
2006-01-17 18:23                                               ` Eric W. Biederman
2006-01-17 22:55                                                 ` Nicholas Miell
2007-03-01 18:11                                                 ` Samuel Thibault
2006-01-17 19:06                                               ` Badari Pulavarty
2006-01-17  1:06                               ` Blaisorblade
2006-01-17  1:33                                 ` Andrea Arcangeli
2005-11-12  0:34                     ` [PATCH] 2.6.14 patch for supporting madvise(MADV_REMOVE) Andrew Morton
2005-10-28 17:55   ` [RFC] madvise(MADV_TRUNCATE) Blaisorblade
2005-10-28 21:23     ` Theodore Ts'o

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).