public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Andrew Morton <akpm@zip.com.au>
To: Linus Torvalds <torvalds@transmeta.com>
Cc: lkml <linux-kernel@vger.kernel.org>
Subject: [patch 13/13] lseek speedup
Date: Tue, 16 Jul 2002 22:31:23 -0700	[thread overview]
Message-ID: <3D35012B.EE9B1ABB@zip.com.au> (raw)



This is a fairly dopey patch to fix up the i_sem contention in lseek.
Better ideas are welcome, but I'm offline until Monday so don't think
I'm ignoring them...

We need to decide what we really want to lock in there.  If multiple
threads are updating f_pos and/or i_size at the same time then they are
going to see strange results whatever the kernel does.  I'd maintain
that the only real obligation which the kernel has in this situation is
to not oops, to not munch data and to not return completely outlandish
results.

And the only way we can return outlandish results is on 32-bit SMP if
one CPU reads i_size or f_pos while another CPU is in the middle of
modifying it.

- We only need to take i_sem for reading i_size on 32-bit machines
  with SMP or preempt, so add the entertaining down32() and up32()
  functions for that.

- Only take i_sem in the case where we actually need to read i_size:
  SEEK_END.

  So this patch speeds up SEEK_SET and SEEK_CUR everywhere, but
  SEEK_END only on 64-bit.  SEEK_END on 32-bit will continue to bounce
  i_sem around.

- Introduce a new spinlock in struct file.  Its scope is currently
  limited to providing atomicity for f_pos updates.

  This lock could also be used for guarding consistency of the
  readahead state.  But after a close walkthrough of the readahead code I
  don't think readahead needs any locking (apart from the 32-bit reader
  of i_size!).  Any errors in there will be +/- 1 differences in the
  numbers, and readahead just fixes things like that up anyway.

- Fiddled with the layout of struct file in an attempt to make it
  more cache-friendly.



 fs/file_table.c    |    1 
 fs/read_write.c    |   86 ++++++++++++++++++++++++++++-------------------------
 include/linux/fs.h |   15 ++++++---
 3 files changed, 57 insertions(+), 45 deletions(-)

--- 2.5.26/fs/read_write.c~lseek-speedup	Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/fs/read_write.c	Tue Jul 16 21:47:22 2002
@@ -20,53 +20,59 @@ struct file_operations generic_ro_fops =
 	mmap:		generic_file_mmap,
 };
 
+/*
+ * Assume reads and writes of long longs are atomic on 64-bit machines
+ */
+static inline void down32(struct semaphore *sem)
+{
+#if (BITS_PER_LONG != 64) && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT))
+	down(sem);
+#endif
+}
+
+static inline void up32(struct semaphore *sem)
+{
+#if (BITS_PER_LONG != 64) && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT))
+	up(sem);
+#endif
+}
+
 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 {
-	long long retval;
+	loff_t retval;
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 
-	down(&inode->i_sem);
-	switch (origin) {
-		case 2:
-			offset += inode->i_size;
-			break;
-		case 1:
-			offset += file->f_pos;
+	if (origin == 2) {
+		down32(&inode->i_sem);
+		offset += inode->i_size;
+		up32(&inode->i_sem);
+		spin_lock(&file->f_lock);
+	} else if (origin == 1) {
+		spin_lock(&file->f_lock);
+		offset += file->f_pos;
+	} else {
+		spin_lock(&file->f_lock);
 	}
 	retval = -EINVAL;
-	if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
 		if (offset != file->f_pos) {
 			file->f_pos = offset;
 			file->f_version = ++event;
 		}
 		retval = offset;
 	}
-	up(&inode->i_sem);
+	spin_unlock(&file->f_lock);
 	return retval;
 }
 
 loff_t remote_llseek(struct file *file, loff_t offset, int origin)
 {
-	long long retval;
+	loff_t ret;
 
 	lock_kernel();
-	switch (origin) {
-		case 2:
-			offset += file->f_dentry->d_inode->i_size;
-			break;
-		case 1:
-			offset += file->f_pos;
-	}
-	retval = -EINVAL;
-	if (offset>=0 && offset<=file->f_dentry->d_inode->i_sb->s_maxbytes) {
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-			file->f_version = ++event;
-		}
-		retval = offset;
-	}
+	ret = generic_file_llseek(file, offset, origin);
 	unlock_kernel();
-	return retval;
+	return ret;
 }
 
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
@@ -76,15 +82,18 @@ loff_t no_llseek(struct file *file, loff
 
 loff_t default_llseek(struct file *file, loff_t offset, int origin)
 {
+	struct inode *inode = file->f_dentry->d_inode;
 	long long retval;
 
-	lock_kernel();
-	switch (origin) {
-		case 2:
-			offset += file->f_dentry->d_inode->i_size;
-			break;
-		case 1:
-			offset += file->f_pos;
+	if (origin == 2) {
+		down32(&inode->i_sem);
+		offset += inode->i_size;
+		up32(&inode->i_sem);
+	} else if (origin == 1) {
+		spin_lock(&file->f_lock);
+		offset += file->f_pos;
+	} else {
+		spin_lock(&file->f_lock);
 	}
 	retval = -EINVAL;
 	if (offset >= 0) {
@@ -94,18 +103,15 @@ loff_t default_llseek(struct file *file,
 		}
 		retval = offset;
 	}
-	unlock_kernel();
+	spin_unlock(&file->f_lock);
 	return retval;
 }
 
 static inline loff_t llseek(struct file *file, loff_t offset, int origin)
 {
-	loff_t (*fn)(struct file *, loff_t, int);
-
-	fn = default_llseek;
 	if (file->f_op && file->f_op->llseek)
-		fn = file->f_op->llseek;
-	return fn(file, offset, origin);
+		return (*file->f_op->llseek)(file, offset, origin);
+	return default_llseek(file, offset, origin);
 }
 
 asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
--- 2.5.26/include/linux/fs.h~lseek-speedup	Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/include/linux/fs.h	Tue Jul 16 21:47:22 2002
@@ -475,20 +475,25 @@ struct file_ra_state {
 };
 
 struct file {
-	struct list_head	f_list;
 	struct dentry		*f_dentry;
+	struct list_head	f_list;
 	struct vfsmount         *f_vfsmnt;
-	struct file_operations	*f_op;
-	atomic_t		f_count;
+
 	unsigned int 		f_flags;
 	mode_t			f_mode;
+	struct file_operations	*f_op;
+	unsigned long		f_version;
+
+	atomic_t		f_count;
+	spinlock_t		f_lock;	/* for f_pos atomicity */
 	loff_t			f_pos;
+
 	struct fown_struct	f_owner;
-	unsigned int		f_uid, f_gid;
+	unsigned int		f_uid;
+	unsigned int		f_gid;
 	int			f_error;
 	struct file_ra_state	f_ra;
 
-	unsigned long		f_version;
 
 	/* needed for tty driver, and maybe others */
 	void			*private_data;
--- 2.5.26/fs/file_table.c~lseek-speedup	Tue Jul 16 21:47:22 2002
+++ 2.5.26-akpm/fs/file_table.c	Tue Jul 16 21:47:22 2002
@@ -44,6 +44,7 @@ struct file * get_empty_filp(void)
 	new_one:
 		memset(f, 0, sizeof(*f));
 		atomic_set(&f->f_count,1);
+		spin_lock_init(&f->f_lock);
 		f->f_version = ++event;
 		f->f_uid = current->fsuid;
 		f->f_gid = current->fsgid;

.

             reply	other threads:[~2002-07-17  5:23 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-07-17  5:31 Andrew Morton [this message]
2002-07-17  9:45 ` [patch 13/13] lseek speedup Anton Altaparmakov
2002-07-17 16:18   ` Andrew Morton
2002-07-17 16:20     ` Anton Altaparmakov
2002-07-17 16:49       ` Andrew Morton
2002-07-17 18:07         ` Anton Altaparmakov
2002-07-22  7:16           ` Andrew Morton
2002-07-22  7:43             ` Anton Altaparmakov
2002-07-17 18:27         ` Benjamin LaHaise
2002-07-17 10:54 ` Anton Altaparmakov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3D35012B.EE9B1ABB@zip.com.au \
    --to=akpm@zip.com.au \
    --cc=linux-kernel@vger.kernel.org \
    --cc=torvalds@transmeta.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox