All of lore.kernel.org
 help / color / mirror / Atom feed
From: "J. R. Okajima" <hooanon05g@gmail.com>
To: Christoph Hellwig <hch@lst.de>
Cc: dchinner@redhat.com, viro@zeniv.linux.org.uk,
	linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH] vfs: get_next_ino(), never inum=0
Date: Wed, 30 Apr 2014 13:08:54 +0900	[thread overview]
Message-ID: <7467.1398830934@jrobl> (raw)
In-Reply-To: <20140429175342.GA26109@lst.de>


Christoph Hellwig:
> If you care about really unique inode numbers you shouldn't use get_next_ino
> but something like an idr allocator.  The default i_ino assigned in
> new_inode() from which get_next_ino was factored out was mostly intended
> for small synthetic filesystems with few enough inodes that it wouldn't
> wrap around.

Grep-ping get_next_ino, I got 30 calls in mainline.
How many of them are get_next_ino() inappropriate? I don't know. But at
least for tmpfs, it is better to manage the inums by itself since tmpfs
must be one of the biggest consumer of inums and it is NFS-exportable.

Do you think we need a common function in VFS to manage inums per sb, or
it is totally up to filesystem and the common function is unnecessary?

Instead of idr, I was thinking about a simple bitmap in tmpfs such like
this. It introduces a new mount option "ino" which forces tmpfs to
assign the lowest unused number for a new inode within the mounted
tmpfs. Without "ino" or specifying "noino", the behaviour is unchanged
(use vfs:get_next_ino()).
But it may not scale well due to the single spinlock every time.


J. R. Okajima


commit 214d38e8c34fb341fd0f37cc92614b5e93e0803b
Author: J. R. Okajima <hooanon05@yahoo.co.jp>
Date:   Mon Sep 2 10:45:42 2013 +0900

    shmem: management for inum
    
    Signed-off-by: J. R. Okajima <hooanon05g@gmail.com>

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 4d1771c..39762e1 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -29,6 +29,8 @@ struct shmem_sb_info {
 	unsigned long max_inodes;   /* How many inodes are allowed */
 	unsigned long free_inodes;  /* How many are left for allocation */
 	spinlock_t stat_lock;	    /* Serialize shmem_sb_info changes */
+	spinlock_t ino_lock;
+	unsigned long *ino_bitmap;
 	kuid_t uid;		    /* Mount uid for root directory */
 	kgid_t gid;		    /* Mount gid for root directory */
 	umode_t mode;		    /* Mount mode for root directory */
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f70e02..bc2c5e4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -197,14 +197,61 @@ static int shmem_reserve_inode(struct super_block *sb)
 	return 0;
 }
 
-static void shmem_free_inode(struct super_block *sb)
+static void shmem_free_inode(struct inode *inode)
 {
+	struct super_block *sb = inode->i_sb;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
 	if (sbinfo->max_inodes) {
 		spin_lock(&sbinfo->stat_lock);
 		sbinfo->free_inodes++;
 		spin_unlock(&sbinfo->stat_lock);
 	}
+
+	if (!inode->i_nlink) {
+		spin_lock(&sbinfo->ino_lock);
+		if (sbinfo->ino_bitmap)
+			clear_bit(inode->i_ino - 2, sbinfo->ino_bitmap);
+		spin_unlock(&sbinfo->ino_lock);
+	}
+}
+
+/*
+ * This is unsigned int instead of unsigned long.
+ * For details, see fs/inode.c:get_next_ino().
+ */
+unsigned int shmem_next_ino(struct super_block *sb)
+{
+	unsigned long ino;
+	struct shmem_sb_info *sbinfo;
+
+	ino = 0;
+	sbinfo = SHMEM_SB(sb);
+	if (sbinfo->ino_bitmap) {
+		spin_lock(&sbinfo->ino_lock);
+		/*
+		 * someone else may remount,
+		 * and ino_bitmap might be reset.
+		 */
+		if (sbinfo->ino_bitmap
+		    && !bitmap_full(sbinfo->ino_bitmap, sbinfo->max_inodes)) {
+			ino = find_first_zero_bit(sbinfo->ino_bitmap,
+						  sbinfo->max_inodes);
+			set_bit(ino, sbinfo->ino_bitmap);
+			ino += 2; /* ino 0 and 1 are reserved */
+		}
+		spin_unlock(&sbinfo->ino_lock);
+	}
+
+	/*
+	 * someone else did remount,
+	 * or ino_bitmap is unused originally,
+	 * or ino_bimapt is full.
+	 */
+	if (!ino)
+		ino = get_next_ino();
+
+	return ino;
 }
 
 /**
@@ -578,7 +625,7 @@ static void shmem_evict_inode(struct inode *inode)
 
 	simple_xattrs_free(&info->xattrs);
 	WARN_ON(inode->i_blocks);
-	shmem_free_inode(inode->i_sb);
+	shmem_free_inode(inode);
 	clear_inode(inode);
 }
 
@@ -1306,7 +1353,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 	inode = new_inode(sb);
 	if (inode) {
-		inode->i_ino = get_next_ino();
+		inode->i_ino = shmem_next_ino(sb);
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1348,7 +1395,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 			break;
 		}
 	} else
-		shmem_free_inode(sb);
+		shmem_free_inode(inode);
 	return inode;
 }
 
@@ -1945,7 +1992,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 
 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
-		shmem_free_inode(inode->i_sb);
+		shmem_free_inode(inode);
 
 	dir->i_size -= BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -2315,6 +2362,54 @@ static const struct export_operations shmem_export_ops = {
 	.fh_to_dentry	= shmem_fh_to_dentry,
 };
 
+static void shmem_ino_bitmap(struct shmem_sb_info *sbinfo,
+			     unsigned long prev_max)
+{
+	unsigned long *p;
+	unsigned long n, d;
+	int do_msg;
+
+	n = sbinfo->max_inodes / BITS_PER_BYTE;
+	if (sbinfo->max_inodes % BITS_PER_BYTE)
+		n++;
+
+	do_msg = 0;
+	if (sbinfo->ino_bitmap) {
+		/*
+		 * by shrinking the bitmap, the large inode number in use
+		 * may be left. but it is harmless.
+		 */
+		d = 0;
+		if (sbinfo->max_inodes > prev_max) {
+			d = sbinfo->max_inodes - prev_max;
+			d /= BITS_PER_BYTE;
+		}
+		spin_lock(&sbinfo->ino_lock);
+		p = krealloc(sbinfo->ino_bitmap, n, GFP_NOWAIT);
+		if (p) {
+			memset(p + n - d, 0, d);
+			sbinfo->ino_bitmap = p;
+			spin_unlock(&sbinfo->ino_lock);
+		} else {
+			p = sbinfo->ino_bitmap;
+			sbinfo->ino_bitmap = NULL;
+			spin_unlock(&sbinfo->ino_lock);
+			kfree(p);
+			do_msg = 1;
+		}
+	} else {
+		p = kzalloc(n, GFP_NOFS);
+		spin_lock(&sbinfo->ino_lock);
+		sbinfo->ino_bitmap = p;
+		spin_unlock(&sbinfo->ino_lock);
+		do_msg = !p;
+	}
+
+	if (unlikely(do_msg))
+		pr_err("%s: ino failed (%lu bytes). Ignored.\n",
+		       __func__, n);
+}
+
 static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			       bool remount)
 {
@@ -2322,7 +2417,10 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 	struct mempolicy *mpol = NULL;
 	uid_t uid;
 	gid_t gid;
+	bool do_ino;
+	unsigned long old_val = sbinfo->max_inodes;
 
+	do_ino = 0;
 	while (options != NULL) {
 		this_char = options;
 		for (;;) {
@@ -2342,6 +2440,14 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		}
 		if (!*this_char)
 			continue;
+		if (!strcmp(this_char, "ino")) {
+			do_ino = 1;
+			continue;
+		} else if (!strcmp(this_char, "noino")) {
+			do_ino = 0;
+			continue;
+		}
+
 		if ((value = strchr(this_char,'=')) != NULL) {
 			*value++ = 0;
 		} else {
@@ -2370,7 +2476,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 				goto bad_val;
 		} else if (!strcmp(this_char,"nr_inodes")) {
 			sbinfo->max_inodes = memparse(value, &rest);
-			if (*rest)
+			if (*rest || !sbinfo->max_inodes)
 				goto bad_val;
 		} else if (!strcmp(this_char,"mode")) {
 			if (remount)
@@ -2408,6 +2514,16 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		}
 	}
 	sbinfo->mpol = mpol;
+
+	if (do_ino)
+		shmem_ino_bitmap(sbinfo, old_val);
+	else if (sbinfo->ino_bitmap) {
+		void *p = sbinfo->ino_bitmap;
+		spin_lock(&sbinfo->ino_lock);
+		sbinfo->ino_bitmap = NULL;
+		spin_unlock(&sbinfo->ino_lock);
+		kfree(p);
+	}
 	return 0;
 
 bad_val:
@@ -2472,6 +2588,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
 	if (sbinfo->max_inodes != shmem_default_max_inodes())
 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+	if (sbinfo->ino_bitmap)
+		seq_printf(seq, ",ino");
 	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
 	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
@@ -2491,6 +2609,7 @@ static void shmem_put_super(struct super_block *sb)
 
 	percpu_counter_destroy(&sbinfo->used_blocks);
 	mpol_put(sbinfo->mpol);
+	kfree(sbinfo->ino_bitmap);
 	kfree(sbinfo);
 	sb->s_fs_info = NULL;
 }
@@ -2510,6 +2629,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	sbinfo->mode = S_IRWXUGO | S_ISVTX;
 	sbinfo->uid = current_fsuid();
 	sbinfo->gid = current_fsgid();
+	spin_lock_init(&sbinfo->ino_lock);
 	sb->s_fs_info = sbinfo;
 
 #ifdef CONFIG_TMPFS

  reply	other threads:[~2014-04-30  4:09 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-04-29 15:45 [PATCH] vfs: get_next_ino(), never inum=0 hooanon05g
2014-04-29 17:42 ` J. R. Okajima
2014-04-29 17:53   ` Christoph Hellwig
2014-04-30  4:08     ` J. R. Okajima [this message]
2014-04-30 22:56       ` Andreas Dilger
2014-05-10  3:18         ` J. R. Okajima
2014-08-18 18:21 ` [PATCH v2] " Carlos Maiolino
2014-08-19  0:58   ` J. R. Okajima

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7467.1398830934@jrobl \
    --to=hooanon05g@gmail.com \
    --cc=dchinner@redhat.com \
    --cc=hch@lst.de \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.