From mboxrd@z Thu Jan  1 00:00:00 1970
From: Matthew Wilcox <willy@debian.org>
Subject: CFT: new posix file locking code
Date: Tue, 18 Jun 2002 22:38:04 +0100
Sender: linux-fsdevel-owner@vger.kernel.org
Message-ID: <20020618223804.I9435@parcelfarce.linux.theplanet.co.uk>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Return-path: <linux-fsdevel-owner@vger.kernel.org>
Received: from willy by www.linux.org.uk with local (Exim 3.33 #5)
	id 17KQfs-0003Et-00
	for linux-fsdevel@vger.kernel.org; Tue, 18 Jun 2002 22:38:04 +0100
To: linux-fsdevel@vger.kernel.org
Content-Disposition: inline
List-Id: linux-fsdevel.vger.kernel.org


This is a fairly extensive rewrite of the POSIX file locking code, so
I'd appreciate some eyes (and, better, testers!).  If you were paying
attention earlier, this basically corresponds to flock-A3, -B0 and half
of -B1 (which are now in a flock-2.5.15 directory), plus some bugfixes.

The posix_lock_file function was 176 lines.  I split it into 75 lines
in __posix_lock_file and 56 in posix_juggle_locks.  I realise that
functions are ideally a mere 20-50 lines long, but this significantly
improves the nosebleed quotient.

We have a new flag -- FL_SLEEP.  This indicates that we're interested
in a sleeping lock rather than returning -EAGAIN to userspace.  NFS is
never interested in sleeping so never specifies this flag.  Callers which
do use the FL_SLEEP flag need to use a wait_event_interruptible loop.

diff -urNX dontdiff linux-2.5.22/fs/lockd/svclock.c linux-2.5.22-flock/fs/lockd/svclock.c
--- linux-2.5.22/fs/lockd/svclock.c	Sun Jun  2 18:44:45 2002
+++ linux-2.5.22-flock/fs/lockd/svclock.c	Fri Jun 14 06:19:17 2002
@@ -242,7 +242,7 @@
 	if (unlock && block->b_granted) {
 		dprintk("lockd: deleting granted lock\n");
 		fl->fl_type = F_UNLCK;
-		posix_lock_file(&block->b_file->f_file, fl, 0);
+		posix_lock_file(&block->b_file->f_file, fl);
 		block->b_granted = 0;
 	} else {
 		dprintk("lockd: unblocking blocked lock\n");
@@ -324,7 +324,7 @@
 
 again:
 	if (!(conflock = posix_test_lock(&file->f_file, &lock->fl))) {
-		error = posix_lock_file(&file->f_file, &lock->fl, 0);
+		error = posix_lock_file(&file->f_file, &lock->fl);
 
 		if (block)
 			nlmsvc_delete_block(block, 0);
@@ -428,7 +428,7 @@
 	nlmsvc_cancel_blocked(file, lock);
 
 	lock->fl.fl_type = F_UNLCK;
-	error = posix_lock_file(&file->f_file, &lock->fl, 0);
+	error = posix_lock_file(&file->f_file, &lock->fl);
 
 	return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
 }
@@ -533,7 +533,7 @@
 	 * following yields an error, this is most probably due to low
 	 * memory. Retry the lock in a few seconds.
 	 */
-	if ((error = posix_lock_file(&file->f_file, &lock->fl, 0)) < 0) {
+	if ((error = posix_lock_file(&file->f_file, &lock->fl)) < 0) {
 		printk(KERN_WARNING "lockd: unexpected error %d in %s!\n",
 				-error, __FUNCTION__);
 		nlmsvc_insert_block(block, 10 * HZ);
diff -urNX dontdiff linux-2.5.22/fs/lockd/svcsubs.c linux-2.5.22-flock/fs/lockd/svcsubs.c
--- linux-2.5.22/fs/lockd/svcsubs.c	Sun Jun  2 18:44:41 2002
+++ linux-2.5.22-flock/fs/lockd/svcsubs.c	Fri Jun 14 06:19:17 2002
@@ -176,7 +176,7 @@
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
-			if (posix_lock_file(&file->f_file, &lock, 0) < 0) {
+			if (posix_lock_file(&file->f_file, &lock) < 0) {
 				printk("lockd: unlock failure in %s:%d\n",
 						__FILE__, __LINE__);
 				return 1;
diff -urNX dontdiff linux-2.5.22/fs/locks.c linux-2.5.22-flock/fs/locks.c
--- linux-2.5.22/fs/locks.c	Mon Jun 17 04:49:08 2002
+++ linux-2.5.22-flock/fs/locks.c	Tue Jun 18 12:55:44 2002
@@ -184,7 +184,6 @@
 	fl->fl_file = NULL;
 	fl->fl_flags = 0;
 	fl->fl_type = 0;
-	fl->fl_start = fl->fl_end = 0;
 	fl->fl_notify = NULL;
 	fl->fl_insert = NULL;
 	fl->fl_remove = NULL;
@@ -254,8 +253,12 @@
 	fl->fl_pid = current->pid;
 	fl->fl_flags = FL_FLOCK;
 	fl->fl_type = type;
+	fl->fl_start = 0;
 	fl->fl_end = OFFSET_MAX;
-	
+
+	if ((cmd & LOCK_NB) == 0)
+		fl->fl_flags |= FL_SLEEP;
+
 	*lock = fl;
 	return 0;
 }
@@ -465,6 +468,8 @@
  */
 static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
 {
+	if (fl->fl_type == F_UNLCK)
+		return;
 	list_add(&fl->fl_link, &file_lock_list);
 
 	/* insert into file's list */
@@ -767,8 +772,7 @@
  * at the head of the list, but that's secret knowledge known only to
  * flock_lock_file and posix_lock_file.
  */
-static int flock_lock_file(struct file *filp, struct file_lock *new_fl,
-			   unsigned int wait)
+static int flock_lock_file(struct file *filp, struct file_lock *new_fl)
 {
 	struct file_lock **before;
 	struct inode * inode = filp->f_dentry->d_inode;
@@ -799,7 +803,6 @@
 		return 0;
 
 	lock_kernel();
-repeat:
 	for_each_lock(inode, before) {
 		struct file_lock *fl = *before;
 		if (IS_POSIX(fl))
@@ -809,12 +812,10 @@
 		if (!flock_locks_conflict(new_fl, fl))
 			continue;
 		error = -EAGAIN;
-		if (!wait)
-			goto out;
-		error = locks_block_on(fl, new_fl);
-		if (error != 0)
-			goto out;
-		goto repeat;
+		if (new_fl->fl_flags & FL_SLEEP) {
+			locks_insert_block(fl, new_fl);
+		}
+		goto out;
 	}
 	locks_insert_lock(&inode->i_flock, new_fl);
 	error = 0;
@@ -824,199 +825,174 @@
 	return error;
 }
 
+/*
+ * posix_juggle_locks - Attempt to merge POSIX locks
+ * @caller: new lock
+ * @fl: existing lock
+ * @returns 0 to continue looking, 1 to stop looking and -1 to indicate
+ * that the current lock should be deleted.
+ *
+ * Attempt to merge these two locks.  Due to the heinous POSIX locking
+ * semantics, we may end up having to split an existing lock into three
+ * pieces, so we need an extra lock.
+ */
+static int posix_juggle_locks(struct file_lock *caller,
+		struct file_lock *fl, struct file_lock *extra)
+{
+	if (caller->fl_type == fl->fl_type) {
+		if (fl->fl_end < caller->fl_start - 1)
+			return 0;
+		if (fl->fl_start > caller->fl_end + 1)
+			return 1;
+
+		/* The new and old lock are of the same type and should be
+		 * merged.  Extend the new lock to be the union of the
+		 * two locks and delete the existing lock.
+		 */
+		if (fl->fl_start < caller->fl_start) {
+			caller->fl_start = fl->fl_start;
+		}
+		if (fl->fl_end > caller->fl_end) {
+			caller->fl_end = fl->fl_end;
+		}
+		return -1;
+	} else {
+		if (fl->fl_end < caller->fl_start)
+			return 0;
+		if (fl->fl_start > caller->fl_end)
+			return 1;
+
+		printk(KERN_DEBUG "locks: caller %d %Lx-%Lx, existing %d %Lx-%Lx\n",
+				caller->fl_type, caller->fl_start, caller->fl_end,
+				fl->fl_type, fl->fl_start, fl->fl_end);
+		/* We have an overlap of some type. */
+		if (caller->fl_start <= fl->fl_start) {
+			if (fl->fl_end <= caller->fl_end) {
+				/* We completely replace this lock.  Just delete it. */
+				return -1;
+			}
+			fl->fl_start = caller->fl_end + 1;
+			/* Maybe the changed size & type allows others to progress */
+			locks_wake_up_blocks(fl);
+			return 1;
+		}
+
+		if (fl->fl_end <= caller->fl_end) {
+			fl->fl_end = caller->fl_start - 1;
+			/* Maybe the changed size & type allows others to progress */
+			locks_wake_up_blocks(fl);
+			return 1;
+		}
+		
+		/* The new lock splits the old lock.  POSIX, we hatesss it */
+		locks_copy_lock(extra, fl);
+		fl->fl_end = caller->fl_start - 1;
+		extra->fl_start = caller->fl_end + 1;
+		locks_insert_lock(&fl->fl_next, extra);
+		locks_wake_up_blocks(fl);
+		return 1;
+	}
+}
+
 /**
- *	posix_lock_file:
- *	@filp: The file to apply the lock to
- *	@caller: The lock to be applied
- *	@wait: 1 to retry automatically, 0 to return -EAGAIN
+ * __posix_lock_file:
+ * @filp: The file to apply the lock to
+ * @caller: The lock to be applied
  *
  * Add a POSIX style lock to a file.
- * We merge adjacent locks whenever possible. POSIX locks are sorted by owner
- * task, then by starting address
- *
- * Kai Petzke writes:
- * To make freeing a lock much faster, we keep a pointer to the lock before the
- * actual one. But the real gain of the new coding was, that lock_it() and
- * unlock_it() became one function.
- *
- * To all purists: Yes, I use a few goto's. Just pass on to the next function.
+ * Locks are merged and split as appropriate.
+ * POSIX locks are sorted by owner task, then by starting address
  */
 
-int posix_lock_file(struct file *filp, struct file_lock *caller,
-			   unsigned int wait)
+int __posix_lock_file(struct file *filp, struct file_lock *caller)
 {
-	struct file_lock *fl;
-	struct file_lock *new_fl, *new_fl2;
-	struct file_lock *left = NULL;
-	struct file_lock *right = NULL;
-	struct file_lock **before;
-	struct inode * inode = filp->f_dentry->d_inode;
-	int error, added = 0;
+	struct file_lock *extra, **before;
+	struct inode *inode = filp->f_dentry->d_inode;
+	int error, found = 0;
 
 	/*
-	 * We may need two file_lock structures for this operation,
-	 * so we get them in advance to avoid races.
+	 * We may need an extra file_lock structure for this operation,
+	 * so get it in advance to avoid atomic allocation.
 	 */
-	new_fl = locks_alloc_lock(0);
-	new_fl2 = locks_alloc_lock(0);
 	error = -ENOLCK; /* "no luck" */
-	if (!(new_fl && new_fl2))
+	extra = locks_alloc_lock(0);
+	if (!extra)
 		goto out_nolock;
 
 	lock_kernel();
 	if (caller->fl_type != F_UNLCK) {
-  repeat:
-		for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+		for_each_lock(inode, before) {
+			struct file_lock *fl = *before;
 			if (!IS_POSIX(fl))
 				continue;
 			if (!posix_locks_conflict(caller, fl))
 				continue;
 			error = -EAGAIN;
-			if (!wait)
+			if (!(caller->fl_flags & FL_SLEEP))
 				goto out;
 			error = -EDEADLK;
 			if (posix_locks_deadlock(caller, fl))
 				goto out;
-
-			error = locks_block_on(fl, caller);
-			if (error != 0)
-				goto out;
-			goto repeat;
+			error = -EAGAIN;
+			locks_insert_block(fl, caller);
+			goto out;
   		}
   	}
 
-	/*
-	 * We've allocated the new locks in advance, so there are no
-	 * errors possible (and no blocking operations) from here on.
-	 * 
-	 * Find the first old lock with the same owner as the new lock.
-	 */
-	
-	before = &inode->i_flock;
-
-	/* First skip locks owned by other processes.
-	 */
-	while ((fl = *before) && (!IS_POSIX(fl) ||
-				  !locks_same_owner(caller, fl))) {
-		before = &fl->fl_next;
-	}
-
-	/* Process locks with this owner.
-	 */
-	while ((fl = *before) && locks_same_owner(caller, fl)) {
-		/* Detect adjacent or overlapping regions (if same lock type)
-		 */
-		if (caller->fl_type == fl->fl_type) {
-			if (fl->fl_end < caller->fl_start - 1)
-				goto next_lock;
-			/* If the next lock in the list has entirely bigger
-			 * addresses than the new one, insert the lock here.
-			 */
-			if (fl->fl_start > caller->fl_end + 1)
+	for_each_lock(inode, before) {
+		int result;
+		struct file_lock *fl;
+ deleted:
+		fl = *before;
+		if (!IS_POSIX(fl))
+			continue;
+		if (!locks_same_owner(caller, fl)) {
+			if (found)
 				break;
-
-			/* If we come here, the new and old lock are of the
-			 * same type and adjacent or overlapping. Make one
-			 * lock yielding from the lower start address of both
-			 * locks to the higher end address.
-			 */
-			if (fl->fl_start > caller->fl_start)
-				fl->fl_start = caller->fl_start;
 			else
-				caller->fl_start = fl->fl_start;
-			if (fl->fl_end < caller->fl_end)
-				fl->fl_end = caller->fl_end;
-			else
-				caller->fl_end = fl->fl_end;
-			if (added) {
-				locks_delete_lock(before);
 				continue;
-			}
-			caller = fl;
-			added = 1;
 		}
-		else {
-			/* Processing for different lock types is a bit
-			 * more complex.
-			 */
-			if (fl->fl_end < caller->fl_start)
-				goto next_lock;
-			if (fl->fl_start > caller->fl_end)
-				break;
-			if (caller->fl_type == F_UNLCK)
-				added = 1;
-			if (fl->fl_start < caller->fl_start)
-				left = fl;
-			/* If the next lock in the list has a higher end
-			 * address than the new one, insert the new one here.
-			 */
-			if (fl->fl_end > caller->fl_end) {
-				right = fl;
-				break;
-			}
-			if (fl->fl_start >= caller->fl_start) {
-				/* The new lock completely replaces an old
-				 * one (This may happen several times).
-				 */
-				if (added) {
-					locks_delete_lock(before);
-					continue;
-				}
-				/* Replace the old lock with the new one.
-				 * Wake up anybody waiting for the old one,
-				 * as the change in lock type might satisfy
-				 * their needs.
-				 */
-				locks_wake_up_blocks(fl);
-				fl->fl_start = caller->fl_start;
-				fl->fl_end = caller->fl_end;
-				fl->fl_type = caller->fl_type;
-				fl->fl_u = caller->fl_u;
-				caller = fl;
-				added = 1;
-			}
+		found = 1;
+		result = posix_juggle_locks(caller, fl, extra);
+		if (result == 0)
+			continue;
+		if (result == -1) {
+			locks_delete_lock(before);
+			if (*before)
+				goto deleted;
 		}
-		/* Go on to next lock.
-		 */
-	next_lock:
-		before = &fl->fl_next;
+		break;
 	}
 
+	/* If we had to use the extra lock, we shouldn't free it */
+	if (!list_empty(&extra->fl_link))
+		extra = NULL;
+	locks_insert_lock(before, caller);
 	error = 0;
-	if (!added) {
-		if (caller->fl_type == F_UNLCK)
-			goto out;
-		locks_copy_lock(new_fl, caller);
-		locks_insert_lock(before, new_fl);
-		new_fl = NULL;
-	}
-	if (right) {
-		if (left == right) {
-			/* The new lock breaks the old one in two pieces,
-			 * so we have to use the second new lock.
-			 */
-			left = new_fl2;
-			new_fl2 = NULL;
-			locks_copy_lock(left, right);
-			locks_insert_lock(before, left);
-		}
-		right->fl_start = caller->fl_end + 1;
-		locks_wake_up_blocks(right);
-	}
-	if (left) {
-		left->fl_end = caller->fl_start - 1;
-		locks_wake_up_blocks(left);
-	}
+
 out:
 	unlock_kernel();
 out_nolock:
 	/*
 	 * Free any unused locks.
 	 */
-	if (new_fl)
-		locks_free_lock(new_fl);
-	if (new_fl2)
-		locks_free_lock(new_fl2);
+	if (extra)
+		locks_free_lock(extra);
+	return error;
+}
+
+/* Called from nfsd/lockd.  They embed the struct file_lock in their
+ * own structs, so if we use it directly we'll try and free it incorrectly
+ */
+int posix_lock_file(struct file *filp, struct file_lock *caller)
+{
+	int error;
+	struct file_lock *copy = locks_alloc_lock(0);
+	locks_copy_lock(copy, caller);
+	error = __posix_lock_file(filp, copy);
+	if (error) {
+		locks_free_lock(copy);
+	}
 	return error;
 }
 
@@ -1303,11 +1279,23 @@
 	if (error < 0)
 		goto out_putf;
 
-	error = flock_lock_file(filp, lock,
-				(cmd & (LOCK_UN | LOCK_NB)) ? 0 : 1);
+	for (;;) {
+		error = flock_lock_file(filp, lock);
+		if ((error != -EAGAIN) || (cmd & LOCK_NB))
+			break;
+		error = wait_event_interruptible(lock->fl_wait, !lock->fl_next);
+		if (!error)
+			continue;
 
-	if (error)
+		lock_kernel();
+		locks_delete_block(lock);
+		unlock_kernel();
+		break;
+	}
+
+	if (error) {
 		locks_free_lock(lock);
+	}
 
 out_putf:
 	fput(filp);
@@ -1415,6 +1403,9 @@
 	error = flock_to_posix_lock(filp, file_lock, &flock);
 	if (error)
 		goto out;
+	if (cmd == F_SETLKW) {
+		file_lock->fl_flags |= FL_SLEEP;
+	}
 	
 	error = -EBADF;
 	switch (flock.l_type) {
@@ -1438,10 +1429,26 @@
 		if (error < 0)
 			goto out;
 	}
-	error = posix_lock_file(filp, file_lock, cmd == F_SETLKW);
+
+	for (;;) {
+		error = __posix_lock_file(filp, file_lock);
+		if ((error != -EAGAIN) || (cmd == F_SETLK))
+			break;
+		error = wait_event_interruptible(file_lock->fl_wait,
+				!file_lock->fl_next);
+		if (!error)
+			continue;
+
+		lock_kernel();
+		locks_delete_block(file_lock);
+		unlock_kernel();
+		break;
+	}
 
 out:
-	locks_free_lock(file_lock);
+	if (error) {
+		locks_free_lock(file_lock);
+	}
 	return error;
 }
 
@@ -1534,6 +1541,9 @@
 	error = flock64_to_posix_lock(filp, file_lock, &flock);
 	if (error)
 		goto out;
+	if (cmd == F_SETLKW64) {
+		file_lock->fl_flags |= FL_SLEEP;
+	}
 	
 	error = -EBADF;
 	switch (flock.l_type) {
@@ -1557,10 +1567,27 @@
 		if (error < 0)
 			goto out;
 	}
-	error = posix_lock_file(filp, file_lock, cmd == F_SETLKW64);
+
+	for (;;) {
+		error = __posix_lock_file(filp, file_lock);
+		if ((error != -EAGAIN) || (cmd == F_SETLK))
+			break;
+		error = wait_event_interruptible(file_lock->fl_wait,
+				!file_lock->fl_next);
+		if (!error)
+			continue;
+
+		lock_kernel();
+		locks_delete_block(file_lock);
+		unlock_kernel();
+		break;
+	}
+
 
 out:
-	locks_free_lock(file_lock);
+	if (error) {
+		locks_free_lock(file_lock);
+	}
 	return error;
 }
 #endif /* BITS_PER_LONG == 32 */
diff -urNX dontdiff linux-2.5.22/include/linux/fs.h linux-2.5.22-flock/include/linux/fs.h
--- linux-2.5.22/include/linux/fs.h	Mon Jun 17 04:49:09 2002
+++ linux-2.5.22-flock/include/linux/fs.h	Mon Jun 17 04:55:10 2002
@@ -519,10 +519,10 @@
 
 #define FL_POSIX	1
 #define FL_FLOCK	2
-#define FL_BROKEN	4	/* broken flock() emulation */
-#define FL_ACCESS	8	/* for processes suspended by mandatory locking */
+#define FL_ACCESS	8	/* not trying to lock, just looking */
 #define FL_LOCKD	16	/* lock held by rpc.lockd */
 #define FL_LEASE	32	/* lease held on this file */
+#define FL_SLEEP	128	/* A blocking lock */
 
 /*
  * The POSIX file lock owner is determined by
@@ -583,7 +583,7 @@
 extern void locks_remove_posix(struct file *, fl_owner_t);
 extern void locks_remove_flock(struct file *);
 extern struct file_lock *posix_test_lock(struct file *, struct file_lock *);
-extern int posix_lock_file(struct file *, struct file_lock *, unsigned int);
+extern int posix_lock_file(struct file *, struct file_lock *);
 extern void posix_block_lock(struct file_lock *, struct file_lock *);
 extern void posix_unblock_lock(struct file_lock *);
 extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);

-- 
Revolutions do not require corporate support.