public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch 7/8] fdmap v2 - implement sys_socket2
@ 2007-06-06 22:30 Davide Libenzi
  2007-06-06 22:44 ` David Miller
                   ` (2 more replies)
  0 siblings, 3 replies; 129+ messages in thread
From: Davide Libenzi @ 2007-06-06 22:30 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: Linus Torvalds, Andrew Morton, Ulrich Drepper, Ingo Molnar,
	Eric Dumazet

This patch implements a new syscall, sys_socket2(), that accepts an
extra "flags" parameter:

int socket2(int domain, int type, int protocol, int flags);

The flags parameter is used to pass extra flags to the kernel, and is
at the moment used to select the file descriptor allocations inside
the non-sequential area (O_NONSEQFD). The remaining parameters are
exactly the same as the ones of sys_socket().
The sys_accept() system call has been modified to return a file
descriptor inside the non-sequential area, if the listening fd is.
The sys_socketcall() system call has also been changed to support
a new SYS_SOCKET2 identifier.



Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide



Index: linux-2.6.mod/fs/9p/trans_fd.c
===================================================================
--- linux-2.6.mod.orig/fs/9p/trans_fd.c	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/9p/trans_fd.c	2007-06-06 12:48:41.000000000 -0700
@@ -181,7 +181,7 @@
 	int fd, ret;
 
 	csocket->sk->sk_allocation = GFP_NOIO;
-	if ((fd = sock_map_fd(csocket)) < 0) {
+	if ((fd = sock_map_fd(csocket, 0)) < 0) {
 		eprintk(KERN_ERR, "v9fs_socket_open: failed to map fd\n");
 		ret = fd;
 	      release_csocket:
Index: linux-2.6.mod/include/linux/net.h
===================================================================
--- linux-2.6.mod.orig/include/linux/net.h	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/include/linux/net.h	2007-06-06 13:09:20.000000000 -0700
@@ -43,6 +43,7 @@
 #define SYS_GETSOCKOPT	15		/* sys_getsockopt(2)		*/
 #define SYS_SENDMSG	16		/* sys_sendmsg(2)		*/
 #define SYS_RECVMSG	17		/* sys_recvmsg(2)		*/
+#define SYS_SOCKET2	18		/* sys_socket2(2)		*/
 
 typedef enum {
 	SS_FREE = 0,			/* not allocated		*/
@@ -190,7 +191,7 @@
 				  size_t len);
 extern int	     sock_recvmsg(struct socket *sock, struct msghdr *msg,
 				  size_t size, int flags);
-extern int 	     sock_map_fd(struct socket *sock);
+extern int 	     sock_map_fd(struct socket *sock, int flags);
 extern struct socket *sockfd_lookup(int fd, int *err);
 #define		     sockfd_put(sock) fput(sock->file)
 extern int	     net_ratelimit(void);
Index: linux-2.6.mod/net/sctp/socket.c
===================================================================
--- linux-2.6.mod.orig/net/sctp/socket.c	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/net/sctp/socket.c	2007-06-06 12:48:41.000000000 -0700
@@ -3605,7 +3605,7 @@
 		goto out;
 
 	/* Map the socket to an unused fd that can be returned to the user.  */
-	retval = sock_map_fd(newsock);
+	retval = sock_map_fd(newsock, 0);
 	if (retval < 0) {
 		sock_release(newsock);
 		goto out;
Index: linux-2.6.mod/net/socket.c
===================================================================
--- linux-2.6.mod.orig/net/socket.c	2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/net/socket.c	2007-06-06 13:14:08.000000000 -0700
@@ -344,11 +344,11 @@
  *	but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
 	int fd;
 
-	fd = get_unused_fd();
+	fd = allocate_fd(flags);
 	if (likely(fd >= 0)) {
 		struct file *file = get_empty_filp();
 
@@ -391,10 +391,10 @@
 	return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+int sock_map_fd(struct socket *sock, int flags)
 {
 	struct file *newfile;
-	int fd = sock_alloc_fd(&newfile);
+	int fd = sock_alloc_fd(&newfile, flags);
 
 	if (likely(fd >= 0)) {
 		int err = sock_attach_fd(sock, newfile);
@@ -1198,7 +1198,7 @@
 	return __sock_create(family, type, protocol, res, 1);
 }
 
-asmlinkage long sys_socket(int family, int type, int protocol)
+asmlinkage long sys_socket2(int family, int type, int protocol, int flags)
 {
 	int retval;
 	struct socket *sock;
@@ -1207,7 +1207,7 @@
 	if (retval < 0)
 		goto out;
 
-	retval = sock_map_fd(sock);
+	retval = sock_map_fd(sock, flags);
 	if (retval < 0)
 		goto out_release;
 
@@ -1220,6 +1220,11 @@
 	return retval;
 }
 
+asmlinkage long sys_socket(int family, int type, int protocol)
+{
+	return sys_socket2(family, type, protocol, 0);
+}
+
 /*
  *	Create a pair of connected sockets.
  */
@@ -1248,11 +1253,11 @@
 	if (err < 0)
 		goto out_release_both;
 
-	fd1 = sock_alloc_fd(&newfile1);
+	fd1 = sock_alloc_fd(&newfile1, 0);
 	if (unlikely(fd1 < 0))
 		goto out_release_both;
 
-	fd2 = sock_alloc_fd(&newfile2);
+	fd2 = sock_alloc_fd(&newfile2, 0);
 	if (unlikely(fd2 < 0)) {
 		put_filp(newfile1);
 		put_unused_fd(fd1);
@@ -1407,7 +1412,8 @@
 	 */
 	__module_get(newsock->ops->owner);
 
-	newfd = sock_alloc_fd(&newfile);
+	newfd = sock_alloc_fd(&newfile,
+	      fd > current->signal->rlim[RLIMIT_NOFILE].rlim_cur ? O_NONSEQFD: 0);
 	if (unlikely(newfd < 0)) {
 		err = newfd;
 		sock_release(newsock);
@@ -1983,10 +1989,11 @@
 
 /* Argument list sizes for sys_socketcall */
 #define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[18]={
+static const unsigned char nargs[]={
 	AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 	AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
-	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)
+	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
+	AL(4)
 };
 
 #undef AL
@@ -2005,7 +2012,7 @@
 	unsigned long a0, a1;
 	int err;
 
-	if (call < 1 || call > SYS_RECVMSG)
+	if (call < 1 || call >= ARRAY_SIZE(nargs))
 		return -EINVAL;
 
 	/* copy_from_user should be SMP safe. */
@@ -2082,6 +2089,9 @@
 	case SYS_RECVMSG:
 		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
+	case SYS_SOCKET2:
+		err = sys_socket2(a0, a1, a[2], a[3]);
+		break;
 	default:
 		err = -EINVAL;
 		break;


^ permalink raw reply	[flat|nested] 129+ messages in thread
* [patch 1/8] fdmap v2 - fdmap core
@ 2007-06-06 22:30 Davide Libenzi
  2007-06-07  6:54 ` Eric Dumazet
  0 siblings, 1 reply; 129+ messages in thread
From: Davide Libenzi @ 2007-06-06 22:30 UTC (permalink / raw)
  To: Linux Kernel Mailing List
  Cc: Linus Torvalds, Andrew Morton, Ulrich Drepper, Ingo Molnar,
	Eric Dumazet

Core code for the fdmap implementation. Random allocation, exact allocation,
de-allocation and lookup are all O(1) operations. It also supports the "legacy"
sequential (compact) file descriptor allocation, which is O(N) like the old
fdtable implementation.
Like the old "struct fdtable", fdmap is RCU friendly too.



Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide


Index: linux-2.6.mod/fs/fdmap.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.mod/fs/fdmap.c	2007-06-06 12:47:31.000000000 -0700
@@ -0,0 +1,549 @@
+/*
+ *  fs/fdmap.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/fdmap.h>
+
+#define FDMAP_BMPSIZE(s)	(FDMAP_BMP_LONGS(s) * sizeof(long))
+
+#define FDMAP_KMALLOC_LIMIT	PAGE_SIZE
+
+struct fdmap_defer {
+	spinlock_t lock;
+	struct work_struct wq;
+	struct fd_map *next;
+};
+
+static DEFINE_PER_CPU(struct fdmap_defer, fdmap_defer_list);
+
+static inline void fdmap_insert(struct list_head *new,
+				struct list_head *prev, struct list_head *next)
+{
+	/*
+	 * The insert function is used to re-insert the slot inside
+	 * the list of free slots, so basically during fd release time.
+	 * The ->next field is used by fdmap_busy_slot() to test if a
+	 * slot is allocated or not. We need to make sure the ->next
+	 * fields are properly set, before the updates to the ->prev
+	 * fields are visible. The list is not *walked* in RCU fashion
+	 * (simply looked up by entry), so we are fine with the code below.
+	 */
+	new->next = next;
+	smp_wmb();
+	new->prev = prev;
+	next->prev = new;
+	prev->next = new;
+}
+
+static void fdmap_remove(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+}
+
+static inline void fdmap_add_slot(struct list_head *head, struct list_head *new)
+{
+	fdmap_insert(new, head->prev, head);
+}
+
+static void *fdmap_alloc_mem(unsigned long size)
+{
+	if (size <= FDMAP_KMALLOC_LIMIT)
+		return kmalloc(size, GFP_KERNEL);
+	else
+		return vmalloc(size);
+}
+
+static void fdmap_free_mem(void *data, unsigned long size)
+{
+	if (size <= FDMAP_KMALLOC_LIMIT)
+		kfree(data);
+	else
+		vfree(data);
+}
+
+/**
+ * fdmap_install - Installs a file pointer onto a previously allocated
+ *                 file descriptor
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @fd:     [in] Previously allocated file descriptor
+ * @file:   [in] File pointer
+ *
+ */
+void fdmap_install(struct fd_map *fmap, unsigned int fd, struct file *file)
+{
+	smp_wmb();
+	fmap->slots[fd - fmap->base].prev = (struct list_head *) file;
+}
+
+static int fdmap_alloc_tail(struct fd_map *fmap, int fd, unsigned long flags)
+{
+	struct list_head *ptr = fmap->slots + fd;
+
+	fdmap_remove(ptr);
+	__set_bit(fd, fmap->map);
+	/*
+	 * We need to make sure that at the time ->next is marked as allocated,
+	 * ->prev is properly initialized to NULL. This way the RCU-aware
+	 * fdmap_file_get() can return the "correct" invalid NULL value, instead
+	 * of garbage.
+	 */
+	ptr->prev = NULL;
+	smp_wmb();
+	FDMAP_SETFLAGS(ptr, FDMAP_F_BUSYSLOT | flags);
+
+	return fmap->base + fd;
+}
+
+/**
+ * fdmap_newfd - Allocates a new random file descriptor
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @fd:     [in] File descriptor to allocate, or -1 to allocate a random one
+ * @flags:  [in] Flags to be associated with the file descriptor
+ *
+ * Return the newly allocated file descriptor, or a negative value in case
+ * of error.
+ */
+int fdmap_newfd(struct fd_map *fmap, int fd, unsigned long flags)
+{
+	if (likely(fd < 0)) {
+		if (unlikely(list_empty(&fmap->slist)))
+			return -ENOSPC;
+		fd = (int) (fmap->slist.next - fmap->slots);
+	} else {
+		fd = fd - fmap->base;
+		if (unlikely(fdmap_busy_slot(&fmap->slots[fd])))
+			return -EBUSY;
+	}
+
+	return fdmap_alloc_tail(fmap, fd, flags);
+}
+
+/**
+ * fdmap_newfd_seq - Allocates a new sequential file descriptor
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @start:  [in] Start position from where to allocate the new file
+ *               descriptor. It must be inside the area of allocation of @fmap,
+ *               or it can be zero (in which case the lower file descriptor
+ *               will be returned)
+ * @limit:  [in] Maximum (not included) value for the returned fd
+ * @flags:  [in] Flags to be associated with the file descriptor
+ *
+ * Return the newly allocated file descriptor, or a negative value in case
+ * of error.
+ */
+int fdmap_newfd_seq(struct fd_map *fmap, unsigned int start,
+		    unsigned int limit, unsigned long flags)
+{
+	int fd;
+
+	if (unlikely(start))
+		start = start - fmap->base;
+	if (likely(start < fmap->fdnext))
+		start = fmap->fdnext;
+	fd = find_next_zero_bit(fmap->map, fmap->size, start);
+	if (unlikely(fd >= limit))
+		return -EMFILE;
+	if (unlikely(fd >= fmap->size))
+		return -ENOSPC;
+	fmap->fdnext = fd + 1;
+
+	return fdmap_alloc_tail(fmap, fd, flags);
+}
+
+/**
+ * fdmap_putfd - Releases a previously allocated file descriptor
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @fd:     [in] Previously allocated file descriptor
+ *
+ */
+void fdmap_putfd(struct fd_map *fmap, unsigned int fd)
+{
+	fd = fd - fmap->base;
+
+	/*
+	 * The smp_wmb() inside fdmap_insert() takes care of making
+	 * the transaction RCU friendly.
+	 */
+	fdmap_add_slot(&fmap->slist, &fmap->slots[fd]);
+	__clear_bit(fd, fmap->map);
+	if (fd < fmap->fdnext)
+		fmap->fdnext = fd;
+}
+
+/**
+ * fdmap_get_fdflags - Retrieves an allocated file descriptor flags
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @fd:     [in] Previously allocated file descriptor
+ *
+ * Returns the file descriptor flags, if the descriptor is allocated,
+ * or zero if not.
+ */
+unsigned long fdmap_get_fdflags(struct fd_map *fmap, unsigned int fd)
+{
+	struct list_head *ptr;
+
+	ptr = fmap->slots + fd - fmap->base;
+	if (unlikely(!fdmap_busy_slot(ptr)))
+		return 0;
+
+	return FDMAP_GETFLAGS(ptr) & ~FDMAP_F_BUSYSLOT;
+}
+
+/**
+ * fdmap_set_fdflags - Changes an allocated file descriptor flags. It allows
+ *                     to specify a set of flags to be cleared, together with
+ *                     a set of flags to be set
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @fd:     [in] Previously allocated file descriptor
+ * @fclear: [in] Set of flags to be cleared
+ * @fadd:   [in] Set of flags to be set
+ *
+ * Returns zero if the flags were updated, or -EBADF if the descriptor
+ * is not allocated.
+ */
+int fdmap_set_fdflags(struct fd_map *fmap, unsigned int fd, unsigned long fclear,
+		      unsigned long fadd)
+{
+	struct list_head *ptr;
+
+	ptr = fmap->slots + fd - fmap->base;
+	if (unlikely(!fdmap_busy_slot(ptr)))
+		return -EBADF;
+	fclear &= ~FDMAP_F_BUSYSLOT;
+	/*
+	 * There's no race here WRT the FDMAP_F_BUSYSLOT flag. The flag
+	 * is there before (otherwise the fdmap_busy_slot() check would
+	 * fail), and is never cleared. So an external viewer either sees
+	 * the old value, or the new one, and both have FDMAP_F_BUSYSLOT set.
+	 */
+	ptr->next = (void *) ((FDMAP_GETFLAGS(ptr) & ~fclear) | fadd);
+
+	return 0;
+}
+
+/**
+ * fdmap_for_each_file - Enumerates all the file pointers inside the allocated
+ *                       file descriptors. Only if the file pointer is not NULL
+ *                       (although the file descriptor may be allocated), the
+ *                       callback function is invoked
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @reset:  [in] Should the file pointer be atomically reset to NULL?
+ * @proc:   [in] Callback to be invoked for every file pointer
+ * @priv:   [in] Private data for the callback (passed in the first parameter)
+ *
+ */
+void fdmap_for_each_file(struct fd_map *fmap, int reset,
+			 int (*proc)(void *, struct file *, int), void *priv)
+{
+	unsigned int i;
+	struct list_head *ptr;
+	struct file *file;
+
+	for (i = 0, ptr = fmap->slots; i < fmap->size; i++, ptr++) {
+		if (fdmap_busy_slot(ptr)) {
+			if (reset)
+				file = (struct file *) xchg(&ptr->prev, NULL);
+			else
+				file = (struct file *) ptr->prev;
+			if (file && (*proc)(priv, file, i + fmap->base))
+				break;
+		}
+	}
+}
+
+/**
+ * fdmap_next_flag_set - Retrieves the next non-empty set of allocated file
+ *                       descriptors having the bit @bit set in their flags
+ *
+ * @fmap:   [in]     Pointer to the file descriptor map
+ * @bit:    [in]     Bit number to test for
+ * @clear:  [in]     Should the flag bit be cleared?
+ * @start:  [in/out] Next position to scan from. Must be set to zero to start
+ *                   a new scan, and it will be updated at every call to this
+ *                   function
+ * @base:   [out]    File descriptor base value for the returned set
+ * @fset:   [out]    Bit set of file descriptors having the bit @bit set in
+ *                   their flags. Bit #0 of @fset refers to the file descriptor
+ *                   @base, bit #1 to @base+1, etc...
+ *
+ * Returns a non zero value if the next set is available, or zero if no more
+ * file descriptors with the bit @bit set are available.
+ */
+int fdmap_next_flag_set(struct fd_map *fmap, int bit, int clear,
+			unsigned int *start, unsigned int *base,
+			unsigned long *fset)
+{
+	unsigned int i, j;
+	unsigned long f, mask, v;
+	struct list_head *ptr;
+
+	mask = 1UL << bit;
+	i = *start;
+	ptr = fmap->slots + i;
+	f = 0;
+	do {
+		if (i >= fmap->size)
+			return 0;
+		*base = i + fmap->base;
+		for (j = 0; i < fmap->size && j < BITS_PER_LONG;
+		     i++, j++, ptr++) {
+			if (!fdmap_busy_slot(ptr))
+				continue;
+			v = FDMAP_GETFLAGS(ptr);
+			if (v & mask) {
+				f |= 1UL << j;
+				if (clear)
+					FDMAP_SETFLAGS(ptr, v & ~mask);
+			}
+		}
+	} while (!f);
+	*start = i;
+	*fset = f;
+
+	return 1;
+}
+
+/**
+ * fdmap_top_open_fd - Finds the top file descriptor allocated
+ *
+ * @fmap:   [in]     Pointer to the file descriptor map
+ *
+ * Returns the top allocated file descriptor, or (base - 1) if no open
+ * file descriptors are found.
+ */
+int fdmap_top_open_fd(const struct fd_map *fmap)
+{
+	int i, j;
+	unsigned long tset, mask;
+	const unsigned long *map;
+
+	i = (int) (FDMAP_BMPSIZE(fmap->size) / sizeof(long)) - 1;
+	for (map = fmap->map + i; i >= 0 && !*map; i--, map--);
+	if (i >= 0) {
+		/*
+		 * Unfortunately, __fls is not everywhere.
+		 */
+		tset = *map;
+		/* Set "mask" to top BITS_PER_BYTE set */
+		mask = ~((1UL << (BITS_PER_LONG - BITS_PER_BYTE)) - 1);
+		for (j = BITS_PER_LONG; j && !(tset & mask);
+		     j -= BITS_PER_BYTE, mask >>= BITS_PER_BYTE);
+		j--;
+		mask = 1UL << j;
+		for (; j && !(tset & mask); j--, mask >>= 1);
+		i = i * BITS_PER_LONG + j;
+	}
+	return (int) fmap->base + i;
+}
+
+/**
+ * fdmap_copy - Copies the content of a file descriptor map to another
+ *
+ * @dfmap:  [in/out] Pointer to the destination file descriptor map
+ * @sfmap:  [in]     Pointer to the source file descriptor map
+ * @count:  [out]    Pointer to the number of allocated file descriptor
+ *                   transferred
+ * @cpflags [in]     Flags to control how files are copied
+ *
+ * Copies the content of one file descriptor map to another. The size
+ * of the destination map must be greater than the maximum allocated
+ * file descriptor in the source map.
+ */
+void fdmap_copy(struct fd_map *dfmap, const struct fd_map *sfmap,
+		unsigned int *count, unsigned long cpflags)
+{
+	unsigned int i, bcount, size;
+	struct list_head *dptr;
+	const struct list_head *sptr;
+
+	INIT_LIST_HEAD(&dfmap->slist);
+	memset(dfmap->map, 0, FDMAP_BMPSIZE(dfmap->size));
+	dptr = dfmap->slots;
+	sptr = sfmap->slots;
+	size = min(sfmap->size, dfmap->size);
+	for (i = 0, bcount = 0; i < size; i++, dptr++, sptr++) {
+		if (fdmap_busy_slot(sptr) && sptr->prev) {
+			if (cpflags & FDMAP_CPF_FORKMODE) {
+				if (FDMAP_GETFLAGS(sptr) & FDMAP_F_CLOFORK)
+					goto add_free_list;
+				get_file((struct file *) sptr->prev);
+			}
+			*dptr = *sptr;
+			__set_bit(i, dfmap->map);
+			bcount++;
+			continue;
+		}
+add_free_list:
+		fdmap_add_slot(&dfmap->slist, dptr);
+	}
+	/*
+	 * Source map can be greater in size than destination map,
+	 * but no open file descriptors must be present in the higher
+	 * part of the source map.
+	 */
+	for (; i < sfmap->size; i++, sptr++)
+		BUG_ON(fdmap_busy_slot(sptr) && sptr->prev);
+	for (i = size; i < dfmap->size; i++, dptr++)
+		fdmap_add_slot(&dfmap->slist, dptr);
+	if (count)
+		*count = bcount;
+}
+
+/**
+ * fdmap_init_map - Initialize a pre-allocated map
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @base:   [in] Starting value for the file descriptors allocated inside this map
+ * @size:   [in] Size of the map
+ * @init:   [in] Boolean indicating if the free-map initialization should be done
+ *
+ */
+void fdmap_init_map(struct fd_map *fmap, unsigned int base, unsigned int size,
+		    int init)
+{
+	int i;
+
+	fmap->next = NULL;
+	fmap->base = base;
+	fmap->size = size;
+	fmap->fdnext = 0;
+	fmap->freecb = NULL;
+	fmap->freecb_priv = NULL;
+	INIT_RCU_HEAD(&fmap->rcu);
+	INIT_LIST_HEAD(&fmap->slist);
+	if (init) {
+		for (i = 0; i < size; i++)
+			fdmap_add_slot(&fmap->slist, &fmap->slots[i]);
+		memset(fmap->map, 0, FDMAP_BMPSIZE(size));
+	}
+}
+
+/**
+ * fdmap_alloc - Allocates a new file descriptor map
+ *
+ * @base:  [in] Starting value for the file descriptors allocated inside this map
+ * @size:  [in] Size of the map
+ * @init:  [in] Boolean indicating if the free-map initialization should be done.
+ *              When allocating a new map just to be copied over by fdmap_copy(),
+ *              it saves time to avoid to go through the whole map memory to
+ *              initialize it, when it will be overwritten soon after
+ *
+ */
+struct fd_map *fdmap_alloc(unsigned int base, unsigned int size, int init)
+{
+	struct fd_map *fmap;
+
+	if ((long) base + (long) size >= INT_MAX ||
+	    UINT_MAX / sizeof(struct list_head) < size)
+		return NULL;
+	fmap = kzalloc(sizeof(struct fd_map), GFP_KERNEL);
+	if (!fmap)
+		return NULL;
+	fmap->slots = fdmap_alloc_mem(size * sizeof(struct list_head));
+	if (!fmap->slots)
+		goto out_free;
+	fmap->map = fdmap_alloc_mem(FDMAP_BMPSIZE(size));
+	if (!fmap->map)
+		goto out_free;
+	fdmap_init_map(fmap, base, size, init);
+
+	return fmap;
+
+out_free:
+	fdmap_free_mem(fmap->slots,
+		       size * sizeof(struct list_head));
+	kfree(fmap);
+	return NULL;
+}
+
+static void fdmap_free_rcu(struct rcu_head *rcu)
+{
+	struct fd_map *fmap = container_of(rcu, struct fd_map, rcu);
+	struct fdmap_defer *fddef;
+
+	BUG_ON(!fmap);
+
+	fddef = &get_cpu_var(fdmap_defer_list);
+	spin_lock(&fddef->lock);
+	fmap->next = fddef->next;
+	fddef->next = fmap;
+	schedule_work(&fddef->wq);
+	spin_unlock(&fddef->lock);
+	put_cpu_var(fdmap_defer_list);
+}
+
+/**
+ * fdmap_free - Frees a file descriptor map. File descriptor map deallocation
+ *              is done in an RCU way, since file descriptor maps must be RCU
+ *              friendly
+ *
+ * @fmap:   [in] Pointer to the file descriptor map to be freed
+ *
+ */
+void fdmap_free(struct fd_map *fmap)
+{
+	call_rcu(&fmap->rcu, fdmap_free_rcu);
+}
+
+static void free_fdmap_work(struct work_struct *work)
+{
+	struct fdmap_defer *fddef = container_of(work, struct fdmap_defer, wq);
+	struct fd_map *fmap, *next;
+
+	spin_lock_bh(&fddef->lock);
+	fmap = fddef->next;
+	fddef->next = NULL;
+	spin_unlock_bh(&fddef->lock);
+	while (fmap) {
+		next = fmap->next;
+		/*
+		 * The struct fd_map may be embedded inside other structures,
+		 * and we give the ability to set custom RCU free functions.
+		 */
+		if (fmap->freecb)
+			(*fmap->freecb)(fmap->freecb_priv, fmap);
+		else {
+			fdmap_free_mem(fmap->map, FDMAP_BMPSIZE(fmap->size));
+			fdmap_free_mem(fmap->slots,
+				       fmap->size * sizeof(struct list_head));
+			kfree(fmap);
+		}
+		fmap = next;
+	}
+}
+
+/**
+ * fdmap_module_init - Early initialization function for the file descriptors
+ *                     allocator module
+ *
+ */
+int fdmap_module_init(void)
+{
+	int i;
+	struct fdmap_defer *fddef;
+
+	for_each_possible_cpu(i) {
+		fddef = &per_cpu(fdmap_defer_list, i);
+		spin_lock_init(&fddef->lock);
+		INIT_WORK(&fddef->wq, free_fdmap_work);
+		fddef->next = NULL;
+	}
+	return 0;
+}
+
Index: linux-2.6.mod/include/linux/fdmap.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.mod/include/linux/fdmap.h	2007-06-06 12:47:54.000000000 -0700
@@ -0,0 +1,185 @@
+/*
+ *  include/linux/fdmap.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_FDMAP_H
+#define _LINUX_FDMAP_H
+
+#include <linux/rcupdate.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+
+/*
+ * We use bit zero for the special FDMAP_F_BUSYSLOT flag. This
+ * will be an indicator that the slot is busy, and we take advantage
+ * that pointers to "slots" will always have alignment greater than one.
+ */
+#define FDMAP_BIT_BUSYSLOT	0
+#define FDMAP_F_BUSYSLOT	(1UL << FDMAP_BIT_BUSYSLOT)
+
+#define FDMAP_BIT_CLOEXEC	1
+#define FDMAP_F_CLOEXEC		(1UL << FDMAP_BIT_CLOEXEC)
+
+#define FDMAP_BIT_CLOFORK	2
+#define FDMAP_F_CLOFORK		(1UL << FDMAP_BIT_CLOFORK)
+
+#define FDMAP_CPF_FORKMODE	(1UL << 0)
+
+#define FDMAP_GETFLAGS(p)	((unsigned long) (p)->next)
+#define FDMAP_SETFLAGS(p, f)	(p)->next = (void *) (f)
+#define FDMAP_BMP_LONGS(s)	DIV_ROUND_UP((s) + 1, BITS_PER_LONG)
+
+struct fd_map {
+	struct fd_map *next;
+	struct rcu_head rcu;
+	unsigned int base;
+	unsigned int size;
+	struct list_head slist;
+	struct list_head *slots;
+	unsigned int fdnext;
+	unsigned long *map;
+	void (*freecb)(void *, struct fd_map *);
+	void *freecb_priv;
+};
+
+/**
+ * fdmap_busy_slot - Returns the BUSY status of an allocation slot
+ *
+ * @ptr:   [in] Pointer to the allocation slot
+ *
+ * Returns a non-zero value if the slot pointed by @ptr is allocated, zero
+ * otherwise
+ */
+static inline int fdmap_busy_slot(const struct list_head *ptr)
+{
+	smp_rmb();
+	return !!(FDMAP_GETFLAGS(ptr) & FDMAP_F_BUSYSLOT);
+}
+
+/**
+ * fdmap_file_get - Gets the file pointer associated with a file descriptor
+ *
+ * @fmap: [in] Pointer to the file descriptor map
+ * @fd:   [in] File descriptor
+ *
+ * Returns the file pointer associated with the file @fd, or NULL
+ * if no file pointer is still associated with @fd.
+ */
+static inline struct file *fdmap_file_get(struct fd_map *fmap, unsigned int fd)
+{
+	struct list_head *ptr;
+
+	ptr = fmap->slots + fd - fmap->base;
+	if (unlikely(!fdmap_busy_slot(ptr)))
+		return NULL;
+	return (struct file *) ptr->prev;
+}
+
+/**
+ * fdmap_fdof - Tells if a file descriptor value falls inside the range
+ *              allowed by @fmap
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ * @fd:     [in] Previously allocated file descriptor
+ *
+ * Return non-zero if the file descriptor value falls inside the range
+ * allowed by @fmap, or zero otherwise
+ */
+static inline int fdmap_fdof(struct fd_map *fmap, unsigned int fd)
+{
+	return fd >= fmap->base && fd < fmap->base + fmap->size;
+}
+
+/**
+ * fdmap_basefd - Returns the first file descriptor value that can be
+ *                allocated inside this map
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ *
+ */
+static inline unsigned int fdmap_basefd(const struct fd_map *fmap)
+{
+	return fmap->base;
+}
+
+/**
+ * fdmap_size - Returns the size of this allocation map
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ *
+ */
+static inline unsigned int fdmap_size(const struct fd_map *fmap)
+{
+	return fmap->size;
+}
+
+/**
+ * fdmap_topfd - Returns the file descriptor value after the last
+ *               that can be allocated inside this map
+ *
+ * @fmap:   [in] Pointer to the file descriptor map
+ *
+ */
+static inline unsigned int fdmap_topfd(const struct fd_map *fmap)
+{
+	return fmap->base + fmap->size;
+}
+
+/**
+ * fdmap_set_freecb - Sets the RCU-free parameters to allow custom free code
+ *
+ * @fmap:       [in] Pointer to the file descriptor map
+ * @freecb:     [in] Pointer to the free callback
+ * @freecb_priv [in] Pointer to the free callback data
+ *
+ */
+static inline void fdmap_set_freecb(struct fd_map *fmap,
+				    void (*freecb)(void *, struct fd_map *),
+				    void *freecb_priv)
+{
+	fmap->freecb = freecb;
+	fmap->freecb_priv = freecb_priv;
+}
+
+/**
+ * fdmap_get_allocmap - Returns the allocation map for this file descriptor map
+ *
+ * @fmap:       [in] Pointer to the file descriptor map
+ *
+ * The first bit of the map is '1', if the file descriptor at fdmap_basefd() is
+ * allocated. The second bit of the map is '1', if the file descriptor at
+ * fdmap_basefd()+1 is allocated. And so on.
+ */
+static inline const unsigned long *fdmap_get_allocmap(const struct fd_map *fmap)
+{
+	return fmap->map;
+}
+
+struct file *fdmap_file_get(struct fd_map *fmap, unsigned int fd);
+void fdmap_install(struct fd_map *fmap, unsigned int fd, struct file *file);
+int fdmap_newfd(struct fd_map *fmap, int fd, unsigned long flags);
+int fdmap_newfd_seq(struct fd_map *fmap, unsigned int start,
+		    unsigned int limit, unsigned long flags);
+void fdmap_putfd(struct fd_map *fmap, unsigned int fd);
+unsigned long fdmap_get_fdflags(struct fd_map *fmap, unsigned int fd);
+int fdmap_set_fdflags(struct fd_map *fmap, unsigned int fd, unsigned long fclear,
+		      unsigned long fadd);
+void fdmap_for_each_file(struct fd_map *fmap, int reset,
+			 int (*proc)(void *, struct file *, int), void *priv);
+int fdmap_next_flag_set(struct fd_map *fmap, int bit, int clear,
+			unsigned int *start, unsigned int *base,
+			unsigned long *fset);
+int fdmap_top_open_fd(const struct fd_map *fmap);
+void fdmap_copy(struct fd_map *dfmap, const struct fd_map *sfmap,
+		unsigned int *count, unsigned long cpflags);
+void fdmap_init_map(struct fd_map *fmap, unsigned int base, unsigned int size,
+		    int init);
+struct fd_map *fdmap_alloc(unsigned int base, unsigned int size, int init);
+void fdmap_free(struct fd_map *fmap);
+int fdmap_module_init(void);
+
+#endif /* _LINUX_FDMAP_H */
+
Index: linux-2.6.mod/fs/Makefile
===================================================================
--- linux-2.6.mod.orig/fs/Makefile	2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/fs/Makefile	2007-06-06 12:38:29.000000000 -0700
@@ -5,7 +5,7 @@
 # Rewritten to use lists instead of if-statements.
 # 
 
-obj-y :=	open.o read_write.o file_table.o super.o \
+obj-y :=	open.o read_write.o file_table.o super.o fdmap.o \
 		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \


^ permalink raw reply	[flat|nested] 129+ messages in thread

end of thread, other threads:[~2007-06-11  8:25 UTC | newest]

Thread overview: 129+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-06-06 22:30 [patch 7/8] fdmap v2 - implement sys_socket2 Davide Libenzi
2007-06-06 22:44 ` David Miller
2007-06-06 22:52   ` Davide Libenzi
2007-06-06 22:57     ` David Miller
2007-06-06 22:57   ` Ulrich Drepper
2007-06-06 23:02     ` David Miller
2007-06-06 22:59 ` Alan Cox
2007-06-06 22:58   ` Ulrich Drepper
2007-06-06 23:04   ` Davide Libenzi
2007-06-06 23:08     ` David Miller
2007-06-06 23:19     ` Alan Cox
2007-06-06 23:22       ` Ulrich Drepper
2007-06-07 10:04         ` Alan Cox
2007-06-07 11:59           ` Kyle Moffett
2007-06-07 13:12             ` Eric Dumazet
2007-06-07 15:51               ` Davide Libenzi
2007-06-07 19:49               ` Davide Libenzi
2007-06-07 20:02                 ` Ulrich Drepper
2007-06-07 20:05                 ` Eric Dumazet
2007-06-07 20:18                   ` Ulrich Drepper
2007-06-07 21:44                     ` Davide Libenzi
2007-06-07 22:03                       ` Ulrich Drepper
2007-06-07 22:40                         ` Davide Libenzi
2007-06-08 12:07                           ` Theodore Tso
2007-06-08 13:01                             ` Alan Cox
2007-06-08 18:11                               ` Davide Libenzi
2007-06-08 18:26                                 ` Alan Cox
2007-06-08 18:43                                   ` Ulrich Drepper
2007-06-08 18:46                                     ` Al Viro
2007-06-08 18:56                                       ` Ulrich Drepper
2007-06-08 19:07                                         ` Linus Torvalds
2007-06-08 19:21                                           ` Davide Libenzi
2007-06-09  0:03                                             ` Linus Torvalds
2007-06-09  0:13                                               ` Davide Libenzi
2007-06-09  0:36                                               ` Al Viro
2007-06-09  1:19                                                 ` Ulrich Drepper
2007-06-09  1:41                                                   ` Al Viro
2007-06-09  2:10                                                     ` Ulrich Drepper
2007-06-09 15:15                                                       ` Al Viro
2007-06-09 16:26                                                         ` Ulrich Drepper
2007-06-09 16:54                                                           ` Al Viro
2007-06-09 17:04                                                             ` Davide Libenzi
2007-06-09 17:08                                                               ` Davide Libenzi
2007-06-09 17:08                                                             ` Ulrich Drepper
2007-06-09 17:24                                                               ` Al Viro
2007-06-09 19:27                                                                 ` Kyle Moffett
2007-06-09 20:06                                                                   ` Al Viro
2007-06-09 20:21                                                                     ` Linus Torvalds
2007-06-09 20:31                                                                       ` Davide Libenzi
2007-06-09 21:41                                                                         ` Matt Mackall
2007-06-09 22:12                                                                           ` Davide Libenzi
2007-06-09 20:49                                                                       ` Al Viro
2007-06-09 21:55                                                                         ` Matt Mackall
2007-06-09 23:33                                                                         ` Linus Torvalds
2007-06-10  3:35                                                                           ` Davide Libenzi
2007-06-10  3:49                                                                             ` Davide Libenzi
2007-06-10  3:19                                                                       ` Al Viro
2007-06-10  3:48                                                                         ` Linus Torvalds
2007-06-10  4:00                                                                           ` Al Viro
2007-06-10  4:03                                                                             ` Linus Torvalds
2007-06-10  4:06                                                                               ` Al Viro
2007-06-10  4:45                                                                           ` dean gaudet
2007-06-10  5:06                                                                             ` Linus Torvalds
2007-06-10  5:46                                                                               ` Al Viro
2007-06-10 17:23                                                                                 ` Linus Torvalds
2007-06-10  6:35                                                                           ` Kari Hurtta
2007-06-10 15:21                                                                             ` Alan Cox
2007-06-10  9:14                                                                       ` Eric Dumazet
2007-06-10 15:16                                                                         ` Alan Cox
2007-06-10 18:19                                                                         ` Linus Torvalds
2007-06-10  2:40                                                                   ` Al Viro
2007-06-08 19:34                                         ` Alan Cox
2007-06-08 19:30                                     ` Alan Cox
2007-06-08 19:37                                       ` Davide Libenzi
2007-06-08 19:48                                         ` Alan Cox
2007-06-08 19:51                                           ` Davide Libenzi
2007-06-08 21:24                                             ` Alan Cox
2007-06-08 21:59                                               ` Davide Libenzi
2007-06-08 22:28                                                 ` Alan Cox
2007-06-08 22:38                                                   ` Davide Libenzi
2007-06-11  8:24                                       ` Xavier Bestel
2007-06-08 19:22                                   ` Davide Libenzi
2007-06-09  5:41                                 ` Paul Mackerras
2007-06-09 14:38                                   ` Kyle Moffett
2007-06-10  6:48                                     ` Paul Mackerras
2007-06-10 15:56                                       ` Davide Libenzi
2007-06-10 19:16                                       ` Davide Libenzi
2007-06-09 17:00                                   ` Davide Libenzi
2007-06-10  6:26                                     ` Paul Mackerras
2007-06-10  7:10                                       ` William Lee Irwin III
2007-06-10 15:52                                       ` Davide Libenzi
2007-06-08 18:07                             ` Davide Libenzi
2007-06-08 18:35                             ` Linus Torvalds
2007-06-07 21:57                   ` Davide Libenzi
2007-06-08  4:38                     ` Eric Dumazet
2007-06-08  5:20                       ` Davide Libenzi
2007-06-07 14:25           ` Ulrich Drepper
2007-06-07 17:56             ` Eric Dumazet
2007-06-07 18:03               ` Davide Libenzi
2007-06-07 18:57                 ` Eric Dumazet
2007-06-07 18:26               ` Ulrich Drepper
2007-06-07 18:39                 ` Davide Libenzi
2007-06-07 18:56                   ` Ulrich Drepper
2007-06-07 19:12                     ` Davide Libenzi
2007-06-07 20:03                   ` Andrew Morton
2007-06-08  2:55                     ` Ulrich Drepper
2007-06-08  5:16                       ` Davide Libenzi
2007-06-06 23:29       ` Davide Libenzi
2007-06-07 10:06         ` Alan Cox
2007-06-07 10:45           ` Eric Dumazet
2007-06-07 11:27             ` Alan Cox
2007-06-07 15:41           ` Davide Libenzi
2007-06-07 20:10   ` Linus Torvalds
2007-06-07 20:47     ` Eric Dumazet
2007-06-07 21:08       ` Linus Torvalds
2007-06-07 21:41         ` Davide Libenzi
2007-06-07 20:59     ` Guillaume Chazarain
2007-06-07 21:06       ` Guillaume Chazarain
2007-06-07 21:31     ` Ulrich Drepper
2007-06-07 22:22     ` Davide Libenzi
2007-06-07 23:42       ` Linus Torvalds
2007-06-08  0:04         ` Davide Libenzi
2007-06-08  0:59     ` Matt Mackall
2007-06-08  2:25       ` Linus Torvalds
2007-06-08 15:56     ` Jeff Dike
2007-06-07  0:29 ` Arnd Bergmann
2007-06-07  0:33   ` Davide Libenzi
  -- strict thread matches above, loose matches on Subject: below --
2007-06-06 22:30 [patch 1/8] fdmap v2 - fdmap core Davide Libenzi
2007-06-07  6:54 ` Eric Dumazet
2007-06-07  7:10   ` Davide Libenzi
2007-06-07 10:39     ` [patch 7/8] fdmap v2 - implement sys_socket2 Eric Dumazet
2007-06-07 15:42       ` Davide Libenzi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox