linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [Patch] tmpfs: add fallocate support
@ 2011-11-15  8:42 Amerigo Wang
  2011-11-15 10:23 ` Cong Wang
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Amerigo Wang @ 2011-11-15  8:42 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, WANG Cong, Hugh Dickins, linux-mm

This patch adds fallocate support to tmpfs. I tested this patch
with the following test case,

	% sudo mount -t tmpfs -o size=100 tmpfs /mnt
	% touch /mnt/foobar
	% echo hi > /mnt/foobar
	% fallocate -o 3 -l 5000 /mnt/foobar          
	fallocate: /mnt/foobar: fallocate failed: No space left on device
	% fallocate -o 3 -l 3000 /mnt/foobar
	% ls -l /mnt/foobar
	-rw-rw-r-- 1 wangcong wangcong 3003 Nov 15 16:10 /mnt/foobar
	% dd if=/dev/zero of=/mnt/foobar seek=3 bs=1 count=3000
	3000+0 records in
	3000+0 records out
	3000 bytes (3.0 kB) copied, 0.0153224 s, 196 kB/s
	% hexdump -C /mnt/foobar                                   
	00000000  68 69 0a 00 00 00 00 00  00 00 00 00 00 00 00 00  |hi..............|
	00000010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
	*
	00000bb0  00 00 00 00 00 00 00 00  00 00 00                 |...........|
	00000bbb
	% cat /mnt/foobar
	hi

Signed-off-by: WANG Cong <amwang@redhat.com>

---
diff --git a/mm/shmem.c b/mm/shmem.c
index d672250..438b7b8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -30,6 +30,7 @@
 #include <linux/mm.h>
 #include <linux/export.h>
 #include <linux/swap.h>
+#include <linux/falloc.h>
 
 static struct vfsmount *shm_mnt;
 
@@ -1431,6 +1432,102 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
+static long shmem_fallocate(struct file *file, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	pgoff_t start = DIV_ROUND_UP(offset, PAGE_CACHE_SIZE);
+	pgoff_t end = DIV_ROUND_UP((offset + len), PAGE_CACHE_SIZE);
+	pgoff_t index = start;
+	gfp_t gfp = mapping_gfp_mask(mapping);
+	loff_t i_size = i_size_read(inode);
+	struct page *page = NULL;
+	int ret;
+
+	if ((offset + len) <= i_size)
+		return 0;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		ret = inode_newsize_ok(inode, (offset + len));
+		if (ret)
+			return ret;
+	}
+
+	if (start == end) {
+		if (!(mode & FALLOC_FL_KEEP_SIZE))
+			i_size_write(inode, offset + len);
+		return 0;
+	}
+
+	if (shmem_acct_block(info->flags))
+		return -ENOSPC;
+
+	if (sbinfo->max_blocks) {
+		unsigned long blocks = (end - index) * BLOCKS_PER_PAGE;
+		if (blocks + percpu_counter_sum(&sbinfo->used_blocks)
+				>= sbinfo->max_blocks) {
+			ret = -ENOSPC;
+			goto unacct;
+		}
+	}
+
+	while (index < end) {
+		if (sbinfo->max_blocks)
+			percpu_counter_add(&sbinfo->used_blocks, BLOCKS_PER_PAGE);
+
+		page = shmem_alloc_page(gfp, info, index);
+		if (!page) {
+			ret = -ENOMEM;
+			goto decused;
+		}
+
+		SetPageSwapBacked(page);
+		__set_page_locked(page);
+		ret = mem_cgroup_cache_charge(page, current->mm,
+						gfp & GFP_RECLAIM_MASK);
+		if (!ret)
+			ret = shmem_add_to_page_cache(page, mapping, index,
+						gfp, NULL);
+		if (ret)
+			goto unlock;
+		lru_cache_add_anon(page);
+
+		spin_lock(&info->lock);
+		info->alloced++;
+		inode->i_blocks += BLOCKS_PER_PAGE;
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		shmem_recalc_inode(inode);
+		spin_unlock(&info->lock);
+
+		clear_highpage(page);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+		cond_resched();
+		index++;
+		if (!(mode & FALLOC_FL_KEEP_SIZE))
+			i_size_write(inode, index << PAGE_CACHE_SHIFT);
+	}
+
+	goto unacct;
+
+unlock:
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+decused:
+	if (sbinfo->max_blocks)
+		percpu_counter_sub(&sbinfo->used_blocks, BLOCKS_PER_PAGE);
+unacct:
+	shmem_unacct_blocks(info->flags, 1);
+	return ret;
+}
+
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2286,6 +2383,7 @@ static const struct file_operations shmem_file_operations = {
 	.fsync		= noop_fsync,
 	.splice_read	= shmem_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.fallocate	= shmem_fallocate,
 #endif
 };
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-15  8:42 [Patch] tmpfs: add fallocate support Amerigo Wang
@ 2011-11-15 10:23 ` Cong Wang
  2011-11-15 17:43   ` Dave Hansen
  2011-11-15 11:23 ` Pekka Enberg
  2011-11-16  1:18 ` KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 13+ messages in thread
From: Cong Wang @ 2011-11-15 10:23 UTC (permalink / raw)
  To: Amerigo Wang
  Cc: linux-kernel, akpm, Hugh Dickins, linux-mm, Lennart Poettering,
	KOSAKI Motohiro

CC: Lennart Poettering <lennart@poettering.net>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>

On 2011/11/15 16:42, Amerigo Wang wrote:
> This patch adds fallocate support to tmpfs. I tested this patch
> with the following test case,
>
> 	% sudo mount -t tmpfs -o size=100 tmpfs /mnt
> 	% touch /mnt/foobar
> 	% echo hi>  /mnt/foobar
> 	% fallocate -o 3 -l 5000 /mnt/foobar
> 	fallocate: /mnt/foobar: fallocate failed: No space left on device
> 	% fallocate -o 3 -l 3000 /mnt/foobar
> 	% ls -l /mnt/foobar
> 	-rw-rw-r-- 1 wangcong wangcong 3003 Nov 15 16:10 /mnt/foobar
> 	% dd if=/dev/zero of=/mnt/foobar seek=3 bs=1 count=3000
> 	3000+0 records in
> 	3000+0 records out
> 	3000 bytes (3.0 kB) copied, 0.0153224 s, 196 kB/s
> 	% hexdump -C /mnt/foobar
> 	00000000  68 69 0a 00 00 00 00 00  00 00 00 00 00 00 00 00  |hi..............|
> 	00000010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
> 	*
> 	00000bb0  00 00 00 00 00 00 00 00  00 00 00                 |...........|
> 	00000bbb
> 	% cat /mnt/foobar
> 	hi
>
> Signed-off-by: WANG Cong<amwang@redhat.com>
>
> ---
> diff --git a/mm/shmem.c b/mm/shmem.c
> index d672250..438b7b8 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -30,6 +30,7 @@
>   #include<linux/mm.h>
>   #include<linux/export.h>
>   #include<linux/swap.h>
> +#include<linux/falloc.h>
>
>   static struct vfsmount *shm_mnt;
>
> @@ -1431,6 +1432,102 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
>   	return error;
>   }
>
> +static long shmem_fallocate(struct file *file, int mode,
> +			    loff_t offset, loff_t len)
> +{
> +	struct inode *inode = file->f_path.dentry->d_inode;
> +	struct address_space *mapping = inode->i_mapping;
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
> +	pgoff_t start = DIV_ROUND_UP(offset, PAGE_CACHE_SIZE);
> +	pgoff_t end = DIV_ROUND_UP((offset + len), PAGE_CACHE_SIZE);
> +	pgoff_t index = start;
> +	gfp_t gfp = mapping_gfp_mask(mapping);
> +	loff_t i_size = i_size_read(inode);
> +	struct page *page = NULL;
> +	int ret;
> +
> +	if ((offset + len)<= i_size)
> +		return 0;
> +
> +	if (!(mode&  FALLOC_FL_KEEP_SIZE)) {
> +		ret = inode_newsize_ok(inode, (offset + len));
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (start == end) {
> +		if (!(mode&  FALLOC_FL_KEEP_SIZE))
> +			i_size_write(inode, offset + len);
> +		return 0;
> +	}
> +
> +	if (shmem_acct_block(info->flags))
> +		return -ENOSPC;
> +
> +	if (sbinfo->max_blocks) {
> +		unsigned long blocks = (end - index) * BLOCKS_PER_PAGE;
> +		if (blocks + percpu_counter_sum(&sbinfo->used_blocks)
> +				>= sbinfo->max_blocks) {
> +			ret = -ENOSPC;
> +			goto unacct;
> +		}
> +	}
> +
> +	while (index<  end) {
> +		if (sbinfo->max_blocks)
> +			percpu_counter_add(&sbinfo->used_blocks, BLOCKS_PER_PAGE);
> +
> +		page = shmem_alloc_page(gfp, info, index);
> +		if (!page) {
> +			ret = -ENOMEM;
> +			goto decused;
> +		}
> +
> +		SetPageSwapBacked(page);
> +		__set_page_locked(page);
> +		ret = mem_cgroup_cache_charge(page, current->mm,
> +						gfp&  GFP_RECLAIM_MASK);
> +		if (!ret)
> +			ret = shmem_add_to_page_cache(page, mapping, index,
> +						gfp, NULL);
> +		if (ret)
> +			goto unlock;
> +		lru_cache_add_anon(page);
> +
> +		spin_lock(&info->lock);
> +		info->alloced++;
> +		inode->i_blocks += BLOCKS_PER_PAGE;
> +		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
> +		shmem_recalc_inode(inode);
> +		spin_unlock(&info->lock);
> +
> +		clear_highpage(page);
> +		flush_dcache_page(page);
> +		SetPageUptodate(page);
> +		unlock_page(page);
> +		page_cache_release(page);
> +		cond_resched();
> +		index++;
> +		if (!(mode&  FALLOC_FL_KEEP_SIZE))
> +			i_size_write(inode, index<<  PAGE_CACHE_SHIFT);
> +	}
> +
> +	goto unacct;
> +
> +unlock:
> +	if (page) {
> +		unlock_page(page);
> +		page_cache_release(page);
> +	}
> +decused:
> +	if (sbinfo->max_blocks)
> +		percpu_counter_sub(&sbinfo->used_blocks, BLOCKS_PER_PAGE);
> +unacct:
> +	shmem_unacct_blocks(info->flags, 1);
> +	return ret;
> +}
> +
>   static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
>   {
>   	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
> @@ -2286,6 +2383,7 @@ static const struct file_operations shmem_file_operations = {
>   	.fsync		= noop_fsync,
>   	.splice_read	= shmem_file_splice_read,
>   	.splice_write	= generic_file_splice_write,
> +	.fallocate	= shmem_fallocate,
>   #endif
>   };
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-15  8:42 [Patch] tmpfs: add fallocate support Amerigo Wang
  2011-11-15 10:23 ` Cong Wang
@ 2011-11-15 11:23 ` Pekka Enberg
  2011-11-16  7:09   ` Cong Wang
  2011-11-16  1:18 ` KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 13+ messages in thread
From: Pekka Enberg @ 2011-11-15 11:23 UTC (permalink / raw)
  To: Amerigo Wang; +Cc: linux-kernel, akpm, Hugh Dickins, linux-mm

Hello,

On Tue, Nov 15, 2011 at 10:42 AM, Amerigo Wang <amwang@redhat.com> wrote:
> This patch adds fallocate support to tmpfs. I tested this patch
> with the following test case,
>
>        % sudo mount -t tmpfs -o size=100 tmpfs /mnt
>        % touch /mnt/foobar
>        % echo hi > /mnt/foobar
>        % fallocate -o 3 -l 5000 /mnt/foobar
>        fallocate: /mnt/foobar: fallocate failed: No space left on device
>        % fallocate -o 3 -l 3000 /mnt/foobar
>        % ls -l /mnt/foobar
>        -rw-rw-r-- 1 wangcong wangcong 3003 Nov 15 16:10 /mnt/foobar
>        % dd if=/dev/zero of=/mnt/foobar seek=3 bs=1 count=3000
>        3000+0 records in
>        3000+0 records out
>        3000 bytes (3.0 kB) copied, 0.0153224 s, 196 kB/s
>        % hexdump -C /mnt/foobar
>        00000000  68 69 0a 00 00 00 00 00  00 00 00 00 00 00 00 00  |hi..............|
>        00000010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
>        *
>        00000bb0  00 00 00 00 00 00 00 00  00 00 00                 |...........|
>        00000bbb
>        % cat /mnt/foobar
>        hi
>
> Signed-off-by: WANG Cong <amwang@redhat.com>

What's the use case for this?

                        Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-15 10:23 ` Cong Wang
@ 2011-11-15 17:43   ` Dave Hansen
  2011-11-16  7:21     ` Cong Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Dave Hansen @ 2011-11-15 17:43 UTC (permalink / raw)
  To: Cong Wang
  Cc: linux-kernel, akpm, Hugh Dickins, linux-mm, Lennart Poettering,
	KOSAKI Motohiro

On Tue, 2011-11-15 at 18:23 +0800, Cong Wang wrote:
> > +static long shmem_fallocate(struct file *file, int mode,
> > +			    loff_t offset, loff_t len)
> > +{
> > +	struct inode *inode = file->f_path.dentry->d_inode;
> > +	struct address_space *mapping = inode->i_mapping;
> > +	struct shmem_inode_info *info = SHMEM_I(inode);
> > +	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
> > +	pgoff_t start = DIV_ROUND_UP(offset, PAGE_CACHE_SIZE);
> > +	pgoff_t end = DIV_ROUND_UP((offset + len), PAGE_CACHE_SIZE);
> > +	pgoff_t index = start;
> > +	gfp_t gfp = mapping_gfp_mask(mapping);
> > +	loff_t i_size = i_size_read(inode);
> > +	struct page *page = NULL;
> > +	int ret;
> > +
> > +	if ((offset + len)<= i_size)
> > +		return 0;

This seems to say that if the fallocate() call ends before the end of
the file we should ignore the call.  In other words, this can only be
used to extend the file, but could not be used to fill in the holes of a
sparse file.  

Is there a reason it was done that way?

> > +	if (!(mode&  FALLOC_FL_KEEP_SIZE)) {
> > +		ret = inode_newsize_ok(inode, (offset + len));
> > +		if (ret)
> > +			return ret;
> > +	}

inode_newsize_ok()'s comments say:

 * inode_newsize_ok must be called with i_mutex held.

But I don't see any trace of it.

> > +	if (start == end) {
> > +		if (!(mode&  FALLOC_FL_KEEP_SIZE))
> > +			i_size_write(inode, offset + len);
> > +		return 0;
> > +	}

There's a whitespace borkage like that 'mode&' all over this patch.
Probably needs a little love.

> +       if (shmem_acct_block(info->flags))
> +               return -ENOSPC;
> +
> +       if (sbinfo->max_blocks) {
> +               unsigned long blocks = (end - index) * BLOCKS_PER_PAGE;
> +               if (blocks + percpu_counter_sum(&sbinfo->used_blocks)
> +                               >= sbinfo->max_blocks) {
> +                       ret = -ENOSPC;
> +                       goto unacct;
> +               }
> +       }
...
> > +	while (index<  end) {
> > +		if (sbinfo->max_blocks)
> > +			percpu_counter_add(&sbinfo->used_blocks, BLOCKS_PER_PAGE);
> > +
> > +		page = shmem_alloc_page(gfp, info, index);
> > +		if (!page) {
> > +			ret = -ENOMEM;
> > +			goto decused;
> > +		}
> > +
> > +		SetPageSwapBacked(page);
> > +		__set_page_locked(page);
> > +		ret = mem_cgroup_cache_charge(page, current->mm,
> > +						gfp&  GFP_RECLAIM_MASK);
> > +		if (!ret)
> > +			ret = shmem_add_to_page_cache(page, mapping, index,
> > +						gfp, NULL);
> > +		if (ret)
> > +			goto unlock;
> > +		lru_cache_add_anon(page);
> > +
> > +		spin_lock(&info->lock);
> > +		info->alloced++;
> > +		inode->i_blocks += BLOCKS_PER_PAGE;
> > +		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
> > +		shmem_recalc_inode(inode);
> > +		spin_unlock(&info->lock);
> > +
> > +		clear_highpage(page);
> > +		flush_dcache_page(page);
> > +		SetPageUptodate(page);
> > +		unlock_page(page);
> > +		page_cache_release(page);
> > +		cond_resched();
> > +		index++;
> > +		if (!(mode&  FALLOC_FL_KEEP_SIZE))
> > +			i_size_write(inode, index<<  PAGE_CACHE_SHIFT);
> > +	

This seems to have borrowed quite generously from shmem_getpage_gfp().
Seems like some code consolidation is in order before this level of
copy-n-paste.

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-15  8:42 [Patch] tmpfs: add fallocate support Amerigo Wang
  2011-11-15 10:23 ` Cong Wang
  2011-11-15 11:23 ` Pekka Enberg
@ 2011-11-16  1:18 ` KAMEZAWA Hiroyuki
  2011-11-16  7:12   ` Cong Wang
  2 siblings, 1 reply; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-11-16  1:18 UTC (permalink / raw)
  To: Amerigo Wang; +Cc: linux-kernel, akpm, Hugh Dickins, linux-mm

On Tue, 15 Nov 2011 16:42:05 +0800
Amerigo Wang <amwang@redhat.com> wrote:

> This patch adds fallocate support to tmpfs. I tested this patch
> with the following test case,
> 
> 	% sudo mount -t tmpfs -o size=100 tmpfs /mnt
> 	% touch /mnt/foobar
> 	% echo hi > /mnt/foobar
> 	% fallocate -o 3 -l 5000 /mnt/foobar          
> 	fallocate: /mnt/foobar: fallocate failed: No space left on device
> 	% fallocate -o 3 -l 3000 /mnt/foobar
> 	% ls -l /mnt/foobar
> 	-rw-rw-r-- 1 wangcong wangcong 3003 Nov 15 16:10 /mnt/foobar
> 	% dd if=/dev/zero of=/mnt/foobar seek=3 bs=1 count=3000
> 	3000+0 records in
> 	3000+0 records out
> 	3000 bytes (3.0 kB) copied, 0.0153224 s, 196 kB/s
> 	% hexdump -C /mnt/foobar                                   
> 	00000000  68 69 0a 00 00 00 00 00  00 00 00 00 00 00 00 00  |hi..............|
> 	00000010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
> 	*
> 	00000bb0  00 00 00 00 00 00 00 00  00 00 00                 |...........|
> 	00000bbb
> 	% cat /mnt/foobar
> 	hi
> 
> Signed-off-by: WANG Cong <amwang@redhat.com>
> 
> ---
> diff --git a/mm/shmem.c b/mm/shmem.c
> index d672250..438b7b8 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -30,6 +30,7 @@
>  #include <linux/mm.h>
>  #include <linux/export.h>
>  #include <linux/swap.h>
> +#include <linux/falloc.h>
>  
>  static struct vfsmount *shm_mnt;
>  
> @@ -1431,6 +1432,102 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
>  	return error;
>  }
>  
> +static long shmem_fallocate(struct file *file, int mode,
> +			    loff_t offset, loff_t len)
> +{
> +	struct inode *inode = file->f_path.dentry->d_inode;
> +	struct address_space *mapping = inode->i_mapping;
> +	struct shmem_inode_info *info = SHMEM_I(inode);
> +	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
> +	pgoff_t start = DIV_ROUND_UP(offset, PAGE_CACHE_SIZE);
> +	pgoff_t end = DIV_ROUND_UP((offset + len), PAGE_CACHE_SIZE);
> +	pgoff_t index = start;
> +	gfp_t gfp = mapping_gfp_mask(mapping);
> +	loff_t i_size = i_size_read(inode);
> +	struct page *page = NULL;
> +	int ret;
> +
> +	if ((offset + len) <= i_size)
> +		return 0;
> +
> +	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
> +		ret = inode_newsize_ok(inode, (offset + len));
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (start == end) {
> +		if (!(mode & FALLOC_FL_KEEP_SIZE))
> +			i_size_write(inode, offset + len);
> +		return 0;
> +	}
> +
> +	if (shmem_acct_block(info->flags))
> +		return -ENOSPC;
> +
> +	if (sbinfo->max_blocks) {
> +		unsigned long blocks = (end - index) * BLOCKS_PER_PAGE;
> +		if (blocks + percpu_counter_sum(&sbinfo->used_blocks)
> +				>= sbinfo->max_blocks) {
> +			ret = -ENOSPC;
> +			goto unacct;
> +		}
> +	}
> +
> +	while (index < end) {
> +		if (sbinfo->max_blocks)
> +			percpu_counter_add(&sbinfo->used_blocks, BLOCKS_PER_PAGE);
> +
> +		page = shmem_alloc_page(gfp, info, index);
> +		if (!page) {
> +			ret = -ENOMEM;
> +			goto decused;
> +		}
> +
> +		SetPageSwapBacked(page);
> +		__set_page_locked(page);
> +		ret = mem_cgroup_cache_charge(page, current->mm,
> +						gfp & GFP_RECLAIM_MASK);
> +		if (!ret)
> +			ret = shmem_add_to_page_cache(page, mapping, index,
> +						gfp, NULL);
> +		if (ret)
> +			goto unlock;

The charges for memcg seems leaked here.
Please cancel 'charge' in error path. as

		if (ret) {
			mem_cgroup_uncharge_cache_page(page);
			goto unlock;
		}
> +		lru_cache_add_anon(page);
> +
> +		spin_lock(&info->lock);
> +		info->alloced++;
> +		inode->i_blocks += BLOCKS_PER_PAGE;
> +		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
> +		shmem_recalc_inode(inode);
> +		spin_unlock(&info->lock);
> +
> +		clear_highpage(page);
> +		flush_dcache_page(page);
> +		SetPageUptodate(page);
> +		unlock_page(page);
> +		page_cache_release(page);
> +		cond_resched();
> +		index++;
> +		if (!(mode & FALLOC_FL_KEEP_SIZE))
> +			i_size_write(inode, index << PAGE_CACHE_SHIFT);
> +	}
> +

Hmm.. Doesn't this duplicate shmem_getpage_gfp() ? Can't you split/share codes ?

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-15 11:23 ` Pekka Enberg
@ 2011-11-16  7:09   ` Cong Wang
  2011-11-16  7:12     ` Pekka Enberg
  0 siblings, 1 reply; 13+ messages in thread
From: Cong Wang @ 2011-11-16  7:09 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: linux-kernel, akpm, Hugh Dickins, linux-mm, kay.sievers

ao? 2011a1'11ae??15ae?JPY 19:23, Pekka Enberg a??e??:
> Hello,
>
> On Tue, Nov 15, 2011 at 10:42 AM, Amerigo Wang<amwang@redhat.com>  wrote:
>> This patch adds fallocate support to tmpfs. I tested this patch
>> with the following test case,
>>
>>         % sudo mount -t tmpfs -o size=100 tmpfs /mnt
>>         % touch /mnt/foobar
>>         % echo hi>  /mnt/foobar
>>         % fallocate -o 3 -l 5000 /mnt/foobar
>>         fallocate: /mnt/foobar: fallocate failed: No space left on device
>>         % fallocate -o 3 -l 3000 /mnt/foobar
>>         % ls -l /mnt/foobar
>>         -rw-rw-r-- 1 wangcong wangcong 3003 Nov 15 16:10 /mnt/foobar
>>         % dd if=/dev/zero of=/mnt/foobar seek=3 bs=1 count=3000
>>         3000+0 records in
>>         3000+0 records out
>>         3000 bytes (3.0 kB) copied, 0.0153224 s, 196 kB/s
>>         % hexdump -C /mnt/foobar
>>         00000000  68 69 0a 00 00 00 00 00  00 00 00 00 00 00 00 00  |hi..............|
>>         00000010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
>>         *
>>         00000bb0  00 00 00 00 00 00 00 00  00 00 00                 |...........|
>>         00000bbb
>>         % cat /mnt/foobar
>>         hi
>>
>> Signed-off-by: WANG Cong<amwang@redhat.com>
>
> What's the use case for this?
>

Hi, Pekka,

Systemd needs it, see http://lkml.org/lkml/2011/10/20/275.
I am adding Kay into Cc.

Thanks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-16  7:09   ` Cong Wang
@ 2011-11-16  7:12     ` Pekka Enberg
  2011-11-16  7:16       ` Cong Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Pekka Enberg @ 2011-11-16  7:12 UTC (permalink / raw)
  To: Cong Wang; +Cc: linux-kernel, akpm, Hugh Dickins, linux-mm, kay.sievers

On Wed, 16 Nov 2011, Cong Wang wrote:
>> What's the use case for this?
>
> Systemd needs it, see http://lkml.org/lkml/2011/10/20/275.
> I am adding Kay into Cc.

The post doesn't mention why it needs it, though.

 			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-16  1:18 ` KAMEZAWA Hiroyuki
@ 2011-11-16  7:12   ` Cong Wang
  0 siblings, 0 replies; 13+ messages in thread
From: Cong Wang @ 2011-11-16  7:12 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, akpm, Hugh Dickins, linux-mm

ao? 2011a1'11ae??16ae?JPY 09:18, KAMEZAWA Hiroyuki a??e??:
>
> Hmm.. Doesn't this duplicate shmem_getpage_gfp() ? Can't you split/share codes ?
>

Yeah, you are right... I will split the code.

Thanks for review.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-16  7:12     ` Pekka Enberg
@ 2011-11-16  7:16       ` Cong Wang
  2011-11-17  1:31         ` Hugh Dickins
  0 siblings, 1 reply; 13+ messages in thread
From: Cong Wang @ 2011-11-16  7:16 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: linux-kernel, akpm, Hugh Dickins, linux-mm, kay.sievers

ao? 2011a1'11ae??16ae?JPY 15:12, Pekka Enberg a??e??:
> On Wed, 16 Nov 2011, Cong Wang wrote:
>>> What's the use case for this?
>>
>> Systemd needs it, see http://lkml.org/lkml/2011/10/20/275.
>> I am adding Kay into Cc.
>
> The post doesn't mention why it needs it, though.
>

Right, I should mention this in the changelog. :-/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-15 17:43   ` Dave Hansen
@ 2011-11-16  7:21     ` Cong Wang
  2011-11-16 23:21       ` Dave Hansen
  0 siblings, 1 reply; 13+ messages in thread
From: Cong Wang @ 2011-11-16  7:21 UTC (permalink / raw)
  To: Dave Hansen
  Cc: linux-kernel, akpm, Hugh Dickins, linux-mm, Lennart Poettering,
	KOSAKI Motohiro

ao? 2011a1'11ae??16ae?JPY 01:43, Dave Hansen a??e??:
> On Tue, 2011-11-15 at 18:23 +0800, Cong Wang wrote:
>>> +static long shmem_fallocate(struct file *file, int mode,
>>> +			    loff_t offset, loff_t len)
>>> +{
>>> +	struct inode *inode = file->f_path.dentry->d_inode;
>>> +	struct address_space *mapping = inode->i_mapping;
>>> +	struct shmem_inode_info *info = SHMEM_I(inode);
>>> +	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
>>> +	pgoff_t start = DIV_ROUND_UP(offset, PAGE_CACHE_SIZE);
>>> +	pgoff_t end = DIV_ROUND_UP((offset + len), PAGE_CACHE_SIZE);
>>> +	pgoff_t index = start;
>>> +	gfp_t gfp = mapping_gfp_mask(mapping);
>>> +	loff_t i_size = i_size_read(inode);
>>> +	struct page *page = NULL;
>>> +	int ret;
>>> +
>>> +	if ((offset + len)<= i_size)
>>> +		return 0;
>
> This seems to say that if the fallocate() call ends before the end of
> the file we should ignore the call.  In other words, this can only be
> used to extend the file, but could not be used to fill in the holes of a
> sparse file.
>
> Is there a reason it was done that way?

Hmm, I missed the case of sparse file, so you are right, I need to fix it.

>
>>> +	if (!(mode&   FALLOC_FL_KEEP_SIZE)) {
>>> +		ret = inode_newsize_ok(inode, (offset + len));
>>> +		if (ret)
>>> +			return ret;
>>> +	}
>
> inode_newsize_ok()'s comments say:
>
>   * inode_newsize_ok must be called with i_mutex held.
>
> But I don't see any trace of it.
>

Hmm, even for tmpfs? I see none of the tmpfs code takes
i_mutex lock though...

>>> +	if (start == end) {
>>> +		if (!(mode&   FALLOC_FL_KEEP_SIZE))
>>> +			i_size_write(inode, offset + len);
>>> +		return 0;
>>> +	}
>
> There's a whitespace borkage like that 'mode&' all over this patch.
> Probably needs a little love.

Odd, it is fine in my local patch, and it passed checkpatch.pl
check before I sent it.

<...>	
>
> This seems to have borrowed quite generously from shmem_getpage_gfp().
> Seems like some code consolidation is in order before this level of
> copy-n-paste.

Yes, I will separate the similar code.

Thanks for review!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-16  7:21     ` Cong Wang
@ 2011-11-16 23:21       ` Dave Hansen
  0 siblings, 0 replies; 13+ messages in thread
From: Dave Hansen @ 2011-11-16 23:21 UTC (permalink / raw)
  To: Cong Wang
  Cc: linux-kernel, akpm, Hugh Dickins, linux-mm, Lennart Poettering,
	KOSAKI Motohiro

On Wed, 2011-11-16 at 15:21 +0800, Cong Wang wrote:
> ao? 2011a1'11ae??16ae?JPY 01:43, Dave Hansen a??e??:
> > On Tue, 2011-11-15 at 18:23 +0800, Cong Wang wrote:
> >>> +	if (!(mode&   FALLOC_FL_KEEP_SIZE)) {
> >>> +		ret = inode_newsize_ok(inode, (offset + len));
> >>> +		if (ret)
> >>> +			return ret;
> >>> +	}
> >
> > inode_newsize_ok()'s comments say:
> >
> >   * inode_newsize_ok must be called with i_mutex held.
> >
> > But I don't see any trace of it.
> 
> Hmm, even for tmpfs? I see none of the tmpfs code takes
> i_mutex lock though...

Look harder. :)

ramfs/tmpfs for a large part just used the generic VFS functions to do
their work since they're page-cache based.  For instance:

static const struct file_operations shmem_file_operations = {
...
        .aio_write      = generic_file_aio_write,

IOW, you need to check beyond mm/shmem.c.

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-16  7:16       ` Cong Wang
@ 2011-11-17  1:31         ` Hugh Dickins
  2011-11-18 10:27           ` Cong Wang
  0 siblings, 1 reply; 13+ messages in thread
From: Hugh Dickins @ 2011-11-17  1:31 UTC (permalink / raw)
  To: Cong Wang
  Cc: Pekka Enberg, Lennart Poettering, KOSAKI Motohiro,
	KAMEZAWA Hiroyuki, Dave Hansen, linux-kernel, akpm, linux-mm,
	kay.sievers

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2197 bytes --]

On Wed, 16 Nov 2011, Cong Wang wrote:
> 于 2011年11月16日 15:12, Pekka Enberg 写道:
> > On Wed, 16 Nov 2011, Cong Wang wrote:
> > > > What's the use case for this?
> > > 
> > > Systemd needs it, see http://lkml.org/lkml/2011/10/20/275.
> > > I am adding Kay into Cc.
> > 
> > The post doesn't mention why it needs it, though.
> > 
> 
> Right, I should mention this in the changelog. :-/

Yes, but I think Pekka's point is that the page which you link to does not
explain why Plumbers would want tmpfs to support fallocate() properly.

What good is it going to do for them?  Why not just do it in userspace,
either by dd if=/dev/zero of=tmpfsfile, or by mmap() and touch if very
anxious to avoid the triple memset/memcpy (once reading from /dev/zero,
once allocating tmpfs pages, once copying to tmpfs pages)?  Or splice().

I don't want to stand in the way of progress, but there's a lot of
things tmpfs does not support (a persistent filesystem would be top
of the list; but readahead, direct I/O, AIO, ....), and it may be
better to continue not to support them unless there's good reason.
tmpfs does not have a disk layout that we need to optimize.

I did not study your implementation in detail, but agree with Dave
and Kame that (if it needs to be in kernel at all) you should reuse
the existing code rather than repeating extracts: shmem_getpage_gfp()
is the one place which looks after all of shmem page allocation, so
I'd prefer you just make a loop of calls to that (with a new sgp_type
if there's particular reason to do something differently in there).

I've not yet looked up the specification of fallocate(), but it
looked surprising to be allocating pages up to the point where a
page already exists (when shmem_add_to_page_cache will fail) and
then giving up with -EEXIST.

Seeing your Subject, I imagined at first that you would be implementing
FALLOC_FL_PUNCH_HOLE support. That is on my list to do: tmpfs has its
own peculiar madvise(MADV_REMOVE) support (and yes, you may question
whether we were right to add that in) - we should be converting
MADV_REMOVE to use FALLOC_FL_PUNCH_HOLE, and tmpfs to support that.

Thanks,
Hugh

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Patch] tmpfs: add fallocate support
  2011-11-17  1:31         ` Hugh Dickins
@ 2011-11-18 10:27           ` Cong Wang
  0 siblings, 0 replies; 13+ messages in thread
From: Cong Wang @ 2011-11-18 10:27 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Pekka Enberg, Lennart Poettering, KOSAKI Motohiro,
	KAMEZAWA Hiroyuki, Dave Hansen, linux-kernel, akpm, linux-mm,
	kay.sievers

ao? 2011a1'11ae??17ae?JPY 09:31, Hugh Dickins a??e??:
> On Wed, 16 Nov 2011, Cong Wang wrote:
>> ao? 2011a1'11ae??16ae?JPY 15:12, Pekka Enberg a??e??:
>>> On Wed, 16 Nov 2011, Cong Wang wrote:
>>>>> What's the use case for this?
>>>>
>>>> Systemd needs it, see http://lkml.org/lkml/2011/10/20/275.
>>>> I am adding Kay into Cc.
>>>
>>> The post doesn't mention why it needs it, though.
>>>
>>
>> Right, I should mention this in the changelog. :-/
>
> Yes, but I think Pekka's point is that the page which you link to does not
> explain why Plumbers would want tmpfs to support fallocate() properly.
>
> What good is it going to do for them?  Why not just do it in userspace,
> either by dd if=/dev/zero of=tmpfsfile, or by mmap() and touch if very
> anxious to avoid the triple memset/memcpy (once reading from /dev/zero,
> once allocating tmpfs pages, once copying to tmpfs pages)?  Or splice().


It is not hard at all to implement this in kernel space, so this
will make systemd a little happier.

>
> I don't want to stand in the way of progress, but there's a lot of
> things tmpfs does not support (a persistent filesystem would be top
> of the list; but readahead, direct I/O, AIO, ....), and it may be
> better to continue not to support them unless there's good reason.
> tmpfs does not have a disk layout that we need to optimize.


True, and no one requests for these features so far? As systemd
developers need this light feature, fallocate, and it is not hard
to implement it, so why not? ;)

>
> I did not study your implementation in detail, but agree with Dave
> and Kame that (if it needs to be in kernel at all) you should reuse
> the existing code rather than repeating extracts: shmem_getpage_gfp()
> is the one place which looks after all of shmem page allocation, so
> I'd prefer you just make a loop of calls to that (with a new sgp_type
> if there's particular reason to do something differently in there).


Yes, while reworking on this patch, I did exactly what you said.

>
> I've not yet looked up the specification of fallocate(), but it
> looked surprising to be allocating pages up to the point where a
> page already exists (when shmem_add_to_page_cache will fail) and
> then giving up with -EEXIST.

You are right, I need to fix this.

>
> Seeing your Subject, I imagined at first that you would be implementing
> FALLOC_FL_PUNCH_HOLE support. That is on my list to do: tmpfs has its
> own peculiar madvise(MADV_REMOVE) support (and yes, you may question
> whether we were right to add that in) - we should be converting
> MADV_REMOVE to use FALLOC_FL_PUNCH_HOLE, and tmpfs to support that.
>

I will add this in my V2 patch.

Thanks!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2011-11-18 10:27 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-11-15  8:42 [Patch] tmpfs: add fallocate support Amerigo Wang
2011-11-15 10:23 ` Cong Wang
2011-11-15 17:43   ` Dave Hansen
2011-11-16  7:21     ` Cong Wang
2011-11-16 23:21       ` Dave Hansen
2011-11-15 11:23 ` Pekka Enberg
2011-11-16  7:09   ` Cong Wang
2011-11-16  7:12     ` Pekka Enberg
2011-11-16  7:16       ` Cong Wang
2011-11-17  1:31         ` Hugh Dickins
2011-11-18 10:27           ` Cong Wang
2011-11-16  1:18 ` KAMEZAWA Hiroyuki
2011-11-16  7:12   ` Cong Wang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).