From: "J. R. Okajima" <hooanon05g@gmail.com>
To: Christoph Hellwig <hch@lst.de>
Cc: dchinner@redhat.com, viro@zeniv.linux.org.uk,
	linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH] vfs: get_next_ino(), never inum=0
Date: Wed, 30 Apr 2014 13:08:54 +0900	[thread overview]
Message-ID: <7467.1398830934@jrobl> (raw)
In-Reply-To: <20140429175342.GA26109@lst.de>


Christoph Hellwig:
> If you care about really unique inode numbers you shouldn't use get_next_ino
> but something like an idr allocator.  The default i_ino assigned in
> new_inode() from which get_next_ino was factored out was mostly intended
> for small synthetic filesystems with few enough inodes that it wouldn't
> wrap around.

Grepping for get_next_ino, I found 30 callers in mainline.
For how many of them is get_next_ino() inappropriate? I don't know. But
at least tmpfs should manage its inums by itself, since tmpfs must be
one of the biggest consumers of inums and it is NFS-exportable.

Do you think we need a common function in the VFS to manage inums per
sb, or is it entirely up to each filesystem, making a common function
unnecessary?
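
For what it is worth, such a per-sb helper built on the generic ida API
might look roughly like the sketch below. It is untested and not part
of the patch; the struct and function names (sb_ino_alloc, sb_ino_get,
sb_ino_put) are made up just for this mail.

#include <linux/idr.h>
#include <linux/gfp.h>

struct sb_ino_alloc {
	struct ida	ida;		/* per-sb allocator state */
};

static int sb_ino_get(struct sb_ino_alloc *a, unsigned long *ino)
{
	int id;

	/* inums 0 and 1 are reserved, start at 2; end == 0 means no limit */
	id = ida_simple_get(&a->ida, 2, 0, GFP_KERNEL);
	if (id < 0)
		return id;
	*ino = id;
	return 0;
}

static void sb_ino_put(struct sb_ino_alloc *a, unsigned long ino)
{
	ida_simple_remove(&a->ida, ino);
}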

Instead of an idr, I was thinking about a simple bitmap in tmpfs, like
the patch below. It introduces a new mount option "ino" which makes
tmpfs assign the lowest unused number to every new inode within the
mounted tmpfs. Without "ino", or with "noino", the behaviour is
unchanged (use vfs:get_next_ino()).
But it may not scale well, since the single spinlock is taken on every
allocation.
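
For example, with the patch applied:

  # mount -t tmpfs -o ino,nr_inodes=1024 tmpfs /mnt
  # mount -o remount,noino /mnt

The second command frees the bitmap again, and new inodes go back to
vfs:get_next_ino().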


J. R. Okajima


commit 214d38e8c34fb341fd0f37cc92614b5e93e0803b
Author: J. R. Okajima <hooanon05@yahoo.co.jp>
Date:   Mon Sep 2 10:45:42 2013 +0900

    shmem: management for inum
    
    Signed-off-by: J. R. Okajima <hooanon05g@gmail.com>

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 4d1771c..39762e1 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -29,6 +29,8 @@ struct shmem_sb_info {
 	unsigned long max_inodes;   /* How many inodes are allowed */
 	unsigned long free_inodes;  /* How many are left for allocation */
 	spinlock_t stat_lock;	    /* Serialize shmem_sb_info changes */
+	spinlock_t ino_lock;
+	unsigned long *ino_bitmap;
 	kuid_t uid;		    /* Mount uid for root directory */
 	kgid_t gid;		    /* Mount gid for root directory */
 	umode_t mode;		    /* Mount mode for root directory */
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f70e02..bc2c5e4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -197,14 +197,61 @@ static int shmem_reserve_inode(struct super_block *sb)
 	return 0;
 }
 
-static void shmem_free_inode(struct super_block *sb)
+static void shmem_free_inode(struct inode *inode)
 {
+	struct super_block *sb = inode->i_sb;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
 	if (sbinfo->max_inodes) {
 		spin_lock(&sbinfo->stat_lock);
 		sbinfo->free_inodes++;
 		spin_unlock(&sbinfo->stat_lock);
 	}
+
+	if (!inode->i_nlink) {
+		spin_lock(&sbinfo->ino_lock);
+		if (sbinfo->ino_bitmap)
+			clear_bit(inode->i_ino - 2, sbinfo->ino_bitmap);
+		spin_unlock(&sbinfo->ino_lock);
+	}
+}
+
+/*
+ * This is unsigned int instead of unsigned long.
+ * For details, see fs/inode.c:get_next_ino().
+ */
+unsigned int shmem_next_ino(struct super_block *sb)
+{
+	unsigned long ino;
+	struct shmem_sb_info *sbinfo;
+
+	ino = 0;
+	sbinfo = SHMEM_SB(sb);
+	if (sbinfo->ino_bitmap) {
+		spin_lock(&sbinfo->ino_lock);
+		/*
+		 * someone else may remount,
+		 * and ino_bitmap might be reset.
+		 */
+		if (sbinfo->ino_bitmap
+		    && !bitmap_full(sbinfo->ino_bitmap, sbinfo->max_inodes)) {
+			ino = find_first_zero_bit(sbinfo->ino_bitmap,
+						  sbinfo->max_inodes);
+			set_bit(ino, sbinfo->ino_bitmap);
+			ino += 2; /* ino 0 and 1 are reserved */
+		}
+		spin_unlock(&sbinfo->ino_lock);
+	}
+
+	/*
+	 * someone else did a remount,
+	 * or ino_bitmap was never enabled,
+	 * or ino_bitmap is full.
+	 */
+	if (!ino)
+		ino = get_next_ino();
+
+	return ino;
 }
 
 /**
@@ -578,7 +625,7 @@ static void shmem_evict_inode(struct inode *inode)
 
 	simple_xattrs_free(&info->xattrs);
 	WARN_ON(inode->i_blocks);
-	shmem_free_inode(inode->i_sb);
+	shmem_free_inode(inode);
 	clear_inode(inode);
 }
 
@@ -1306,7 +1353,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 	inode = new_inode(sb);
 	if (inode) {
-		inode->i_ino = get_next_ino();
+		inode->i_ino = shmem_next_ino(sb);
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1348,7 +1395,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 			break;
 		}
 	} else
-		shmem_free_inode(sb);
+		shmem_free_inode(inode);
 	return inode;
 }
 
@@ -1945,7 +1992,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 
 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
-		shmem_free_inode(inode->i_sb);
+		shmem_free_inode(inode);
 
 	dir->i_size -= BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -2315,6 +2362,54 @@ static const struct export_operations shmem_export_ops = {
 	.fh_to_dentry	= shmem_fh_to_dentry,
 };
 
+static void shmem_ino_bitmap(struct shmem_sb_info *sbinfo,
+			     unsigned long prev_max)
+{
+	unsigned long *p;
+	unsigned long n, d;
+	int do_msg;
+
+	n = sbinfo->max_inodes / BITS_PER_BYTE;
+	if (sbinfo->max_inodes % BITS_PER_BYTE)
+		n++;
+
+	do_msg = 0;
+	if (sbinfo->ino_bitmap) {
+		/*
+		 * when the bitmap shrinks, inode numbers beyond the new
+		 * size may be left in use, but that is harmless.
+		 */
+		d = 0;
+		if (sbinfo->max_inodes > prev_max) {
+			d = sbinfo->max_inodes - prev_max;
+			d /= BITS_PER_BYTE;
+		}
+		spin_lock(&sbinfo->ino_lock);
+		p = krealloc(sbinfo->ino_bitmap, n, GFP_NOWAIT);
+		if (p) {
+			memset((char *)p + n - d, 0, d);
+			sbinfo->ino_bitmap = p;
+			spin_unlock(&sbinfo->ino_lock);
+		} else {
+			p = sbinfo->ino_bitmap;
+			sbinfo->ino_bitmap = NULL;
+			spin_unlock(&sbinfo->ino_lock);
+			kfree(p);
+			do_msg = 1;
+		}
+	} else {
+		p = kzalloc(n, GFP_NOFS);
+		spin_lock(&sbinfo->ino_lock);
+		sbinfo->ino_bitmap = p;
+		spin_unlock(&sbinfo->ino_lock);
+		do_msg = !p;
+	}
+
+	if (unlikely(do_msg))
+		pr_err("%s: ino failed (%lu bytes). Ignored.\n",
+		       __func__, n);
+}
+
 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			       bool remount)
 {
@@ -2322,7 +2417,10 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 	struct mempolicy *mpol = NULL;
 	uid_t uid;
 	gid_t gid;
+	bool do_ino;
+	unsigned long old_val = sbinfo->max_inodes;
 
+	do_ino = 0;
 	while (options != NULL) {
 		this_char = options;
 		for (;;) {
@@ -2342,6 +2440,14 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		}
 		if (!*this_char)
 			continue;
+		if (!strcmp(this_char, "ino")) {
+			do_ino = 1;
+			continue;
+		} else if (!strcmp(this_char, "noino")) {
+			do_ino = 0;
+			continue;
+		}
+
 		if ((value = strchr(this_char,'=')) != NULL) {
 			*value++ = 0;
 		} else {
@@ -2370,7 +2476,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 				goto bad_val;
 		} else if (!strcmp(this_char,"nr_inodes")) {
 			sbinfo->max_inodes = memparse(value, &rest);
-			if (*rest)
+			if (*rest || !sbinfo->max_inodes)
 				goto bad_val;
 		} else if (!strcmp(this_char,"mode")) {
 			if (remount)
@@ -2408,6 +2514,16 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		}
 	}
 	sbinfo->mpol = mpol;
+
+	if (do_ino)
+		shmem_ino_bitmap(sbinfo, old_val);
+	else if (sbinfo->ino_bitmap) {
+		void *p = sbinfo->ino_bitmap;
+		spin_lock(&sbinfo->ino_lock);
+		sbinfo->ino_bitmap = NULL;
+		spin_unlock(&sbinfo->ino_lock);
+		kfree(p);
+	}
 	return 0;
 
 bad_val:
@@ -2472,6 +2588,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
 	if (sbinfo->max_inodes != shmem_default_max_inodes())
 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+	if (sbinfo->ino_bitmap)
+		seq_printf(seq, ",ino");
 	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
 	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
@@ -2491,6 +2609,7 @@ static void shmem_put_super(struct super_block *sb)
 
 	percpu_counter_destroy(&sbinfo->used_blocks);
 	mpol_put(sbinfo->mpol);
+	kfree(sbinfo->ino_bitmap);
 	kfree(sbinfo);
 	sb->s_fs_info = NULL;
 }
@@ -2510,6 +2629,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	sbinfo->mode = S_IRWXUGO | S_ISVTX;
 	sbinfo->uid = current_fsuid();
 	sbinfo->gid = current_fsgid();
+	spin_lock_init(&sbinfo->ino_lock);
 	sb->s_fs_info = sbinfo;
 
 #ifdef CONFIG_TMPFS
