This patch makes the kernel use the fdmap allocator for sequential (legacy)
and non-sequential file descriptor allocations. It consolidates all scattered
fdtable internal members accesses into fdmap.{c,h}.
The semantics of sequential file descriptor allocations remain unchanged. They
are still performed in the same way the current fdtable bitmap does.
As far as non-sequential file descriptor allocations go, we do not allow
F_DUPFD to allocate a file descriptor inside the non-sequential area.
The sys_dup2() system call has been changed to allow over-writing an already
allocated file descriptor inside the non-sequential area. It cannot be used
(sys_dup2()) to allocate a new file descriptor in the non-sequential area.
The base for the non-sequential area is randomly chosen. On top of that,
being FIFO, it's not even easy to predict after the application has run for a while.
A simple test program for sys_nonseqfd(), sys_socket2() and O_NONSEQFD is
available here (verified and tested on a P4 HT and a dual Opteron):
http://www.xmailserver.org/extfd-test.c
Signed-off-by: Davide Libenzi <[email protected]>
- Davide
Index: linux-2.6.mod/include/linux/file.h
===================================================================
--- linux-2.6.mod.orig/include/linux/file.h 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/include/linux/file.h 2007-06-06 12:48:28.000000000 -0700
@@ -11,52 +11,53 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
+#include <linux/fdmap.h>
/*
- * The default fd array needs to be at least BITS_PER_LONG,
- * as this is the granularity returned by copy_fdset().
+ * Initial size for the non-sequential file descriptor arena
*/
-#define NR_OPEN_DEFAULT BITS_PER_LONG
+#define FDMAP_NONSEQ_SIZE 64U
/*
- * The embedded_fd_set is a small fd_set,
- * suitable for most tasks (which open <= BITS_PER_LONG files)
+ * Base for non-sequential file descriptors
*/
-struct embedded_fd_set {
- unsigned long fds_bits[1];
-};
+#define FDMAP_NONSEQ_BASE (1U << 28)
-struct fdtable {
- unsigned int max_fds;
- struct file ** fd; /* current fd array */
- fd_set *close_on_exec;
- fd_set *open_fds;
- struct rcu_head rcu;
- struct fdtable *next;
-};
+#define FDMAP_RANDOM_BITS 20
+
+/*
+ * The default fd array needs to be at least BITS_PER_LONG,
+ * as this is the granularity returned by copy_fdset().
+ */
+#define NR_OPEN_DEFAULT BITS_PER_LONG
/*
* Open file table structure
*/
struct files_struct {
- /*
- * read mostly part
- */
+ /*
+ * read mostly part
+ */
atomic_t count;
- struct fdtable *fdt;
- struct fdtable fdtab;
- /*
- * written part on a separate cache line in SMP
- */
+ /*
+ * Used for legacy (sequential) file descriptor allocations.
+ */
+ struct fd_map *lfmap;
+ /*
+ * Used for non-sequential file descriptor allocations.
+ */
+ struct fd_map *fmap;
+
+ struct fd_map lfmap_embed;
+ /*
+ * written part on a separate cache line in SMP
+ */
spinlock_t file_lock ____cacheline_aligned_in_smp;
- int next_fd;
- struct embedded_fd_set close_on_exec_init;
- struct embedded_fd_set open_fds_init;
- struct file * fd_array[NR_OPEN_DEFAULT];
+ int fd_count;
+ struct list_head fdmap_slots[NR_OPEN_DEFAULT];
+ unsigned long fdmap_map[FDMAP_BMP_LONGS(NR_OPEN_DEFAULT)];
};
-#define files_fdtable(files) (rcu_dereference((files)->fdt))
-
extern struct kmem_cache *filp_cachep;
extern void FASTCALL(__fput(struct file *));
@@ -73,25 +74,63 @@
extern void FASTCALL(set_close_on_exec(unsigned int fd, int flag));
extern void put_filp(struct file *);
extern int get_unused_fd(void);
+extern void __put_unused_fd(unsigned int fd);
extern void FASTCALL(put_unused_fd(unsigned int fd));
struct kmem_cache;
-extern int expand_files(struct files_struct *, int nr);
-extern void free_fdtable_rcu(struct rcu_head *rcu);
-extern void __init files_defer_init(void);
-
-static inline void free_fdtable(struct fdtable *fdt)
+extern void init_files_struct(struct files_struct *newf);
+extern struct fd_map *files_fdmap_alloc(struct files_struct *files,
+ struct fd_map **pfmap,
+ unsigned int size);
+extern int __alloc_nonseq_fd(struct files_struct *files, unsigned long flags);
+extern int alloc_nonseq_fd(struct files_struct *files, unsigned long flags);
+extern unsigned int gen_nonseqfd_base(void);
+
+/**
+ * nonseq_files_fdmap - Must be called with @files->file_lock held. It forces
+ * the creation the the non-sequential file descriptor map,
+ * if not present.
+ *
+ * @files: [in] Pointer to the struct files_struct from where the map has to
+ * be retrieved
+ *
+ */
+static inline struct fd_map *nonseq_files_fdmap(struct files_struct *files)
{
- call_rcu(&fdt->rcu, free_fdtable_rcu);
+ struct fd_map *fmap = files->fmap;
+ if (unlikely(!fmap))
+ fmap = files_fdmap_alloc(files, &files->fmap, 0);
+ return fmap;
}
-static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
+/**
+ * files_free_fdmap - Frees the non-sequential file descriptor map, if it is
+ * not the files-embedded one (files->lfmap_embed)
+ *
+ * @files: [in] Pointer to the struct files_struct
+ * @fmap: [in] File descriptor map to be freed
+ *
+ */
+static inline void files_free_fdmap(struct files_struct *files,
+ struct fd_map *fmap)
{
- struct file * file = NULL;
- struct fdtable *fdt = files_fdtable(files);
+ if (fmap != &files->lfmap_embed)
+ fdmap_free(fmap);
+}
- if (fd < fdt->max_fds)
- file = rcu_dereference(fdt->fd[fd]);
+static inline struct file *fcheck_files(struct files_struct *files,
+ unsigned int fd)
+{
+ struct file *file = NULL;
+ struct fd_map *fmap;
+ fmap = rcu_dereference(files->lfmap);
+ if (fdmap_fdof(fmap, fd))
+ file = fdmap_file_get(fmap, fd);
+ else {
+ fmap = rcu_dereference(files->fmap);
+ if (fmap && fdmap_fdof(fmap, fd))
+ file = fdmap_file_get(fmap, fd);
+ }
return file;
}
@@ -105,6 +144,7 @@
struct task_struct;
struct files_struct *get_files_struct(struct task_struct *);
+void free_files_struct(struct files_struct *files);
void FASTCALL(put_files_struct(struct files_struct *fs));
void reset_files_struct(struct task_struct *, struct files_struct *);
Index: linux-2.6.mod/fs/fcntl.c
===================================================================
--- linux-2.6.mod.orig/fs/fcntl.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/fcntl.c 2007-06-06 12:48:03.000000000 -0700
@@ -26,24 +26,28 @@
void fastcall set_close_on_exec(unsigned int fd, int flag)
{
struct files_struct *files = current->files;
- struct fdtable *fdt;
+ struct fd_map *fmap;
+
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (flag)
- FD_SET(fd, fdt->close_on_exec);
- else
- FD_CLR(fd, fdt->close_on_exec);
+ fmap = files->lfmap;
+ if (!fdmap_fdof(fmap, fd))
+ fmap = files->fmap;
+ fdmap_set_fdflags(fmap, fd, flag ? 0: FDMAP_F_CLOEXEC,
+ flag ? FDMAP_F_CLOEXEC: 0);
spin_unlock(&files->file_lock);
}
static int get_close_on_exec(unsigned int fd)
{
struct files_struct *files = current->files;
- struct fdtable *fdt;
+ struct fd_map *fmap;
int res;
+
rcu_read_lock();
- fdt = files_fdtable(files);
- res = FD_ISSET(fd, fdt->close_on_exec);
+ fmap = rcu_dereference(files->lfmap);
+ if (!fdmap_fdof(fmap, fd))
+ fmap = rcu_dereference(files->fmap);
+ res = (fdmap_get_fdflags(fmap, fd) & FDMAP_F_CLOEXEC) != 0;
rcu_read_unlock();
return res;
}
@@ -54,81 +58,41 @@
* file_lock held for write.
*/
-static int locate_fd(struct files_struct *files,
- struct file *file, unsigned int orig_start)
+static int locate_fd(struct files_struct *files, unsigned int start)
{
- unsigned int newfd;
- unsigned int start;
- int error;
- struct fdtable *fdt;
-
- error = -EINVAL;
- if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
-
-repeat:
- fdt = files_fdtable(files);
- /*
- * Someone might have closed fd's in the range
- * orig_start..fdt->next_fd
- */
- start = orig_start;
- if (start < files->next_fd)
- start = files->next_fd;
-
- newfd = start;
- if (start < fdt->max_fds)
- newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
- fdt->max_fds, start);
-
- error = -EMFILE;
- if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
-
- error = expand_files(files, newfd);
- if (error < 0)
- goto out;
-
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- if (error)
- goto repeat;
+ int fd;
+ unsigned int size;
- /*
- * We reacquired files_lock, so we are safe as long as
- * we reacquire the fdtable pointer and use it while holding
- * the lock, no one can free it during that time.
- */
- if (start <= files->next_fd)
- files->next_fd = newfd + 1;
+ if (start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ return -EINVAL;
+ if (files->fd_count >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ return -EMFILE;
+ if (unlikely(start >= fdmap_topfd(files->lfmap))) {
+ size = start - fdmap_basefd(files->lfmap);
+ size = 2 * min(size, (unsigned int) NR_OPEN / 2);
+ if (!files_fdmap_alloc(files, &files->lfmap, size))
+ return -ENOMEM;
+ }
+ fd = fdmap_newfd_seq(files->lfmap, start,
+ current->signal->rlim[RLIMIT_NOFILE].rlim_cur, 0);
+ if (likely(fd >= 0))
+ files->fd_count++;
- error = newfd;
-
-out:
- return error;
+ return fd;
}
static int dupfd(struct file *file, unsigned int start)
{
- struct files_struct * files = current->files;
- struct fdtable *fdt;
+ struct files_struct *files = current->files;
int fd;
spin_lock(&files->file_lock);
- fd = locate_fd(files, file, start);
- if (fd >= 0) {
- /* locate_fd() may have expanded fdtable, load the ptr */
- fdt = files_fdtable(files);
- FD_SET(fd, fdt->open_fds);
- FD_CLR(fd, fdt->close_on_exec);
- spin_unlock(&files->file_lock);
+ fd = locate_fd(files, start);
+ spin_unlock(&files->file_lock);
+ if (fd >= 0)
fd_install(fd, file);
- } else {
- spin_unlock(&files->file_lock);
+ else
fput(file);
- }
return fd;
}
@@ -136,9 +100,9 @@
asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
{
int err = -EBADF;
- struct file * file, *tofree;
- struct files_struct * files = current->files;
- struct fdtable *fdt;
+ unsigned int size;
+ struct file *file, *tofree = NULL;
+ struct files_struct *files = current->files;
spin_lock(&files->file_lock);
if (!(file = fcheck(oldfd)))
@@ -147,31 +111,40 @@
if (newfd == oldfd)
goto out_unlock;
err = -EBADF;
- if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ if (newfd < current->signal->rlim[RLIMIT_NOFILE].rlim_cur) {
+ /*
+ * We need to get the file* here, since files_fdmap_alloc()
+ * may temporarily release the lock.
+ */
+ get_file(file);
+ if (unlikely(newfd >= fdmap_topfd(files->lfmap))) {
+ size = 2 * min(newfd - fdmap_basefd(files->lfmap),
+ (unsigned int) NR_OPEN / 2);
+ err = -ENOMEM;
+ if (!files_fdmap_alloc(files, &files->lfmap, size))
+ goto out_fput;
+ }
+ tofree = fdmap_file_get(files->lfmap, newfd);
+ err = fdmap_newfd(files->lfmap, newfd, 0);
+ if (err != (int) newfd) {
+ /*
+ * There's a window inside which a file descriptor can
+ * result as allocated, but the stored file* is NULL.
+ * We return -EBUSY in such case.
+ */
+ if (err != -EBUSY || !tofree)
+ goto out_fput;
+ } else
+ files->fd_count++;
+ fdmap_install(files->lfmap, newfd, file);
+ } else if (files->fmap && fdmap_fdof(files->fmap, newfd)) {
+ tofree = fdmap_file_get(files->fmap, newfd);
+ if (!tofree)
+ goto out_unlock;
+ get_file(file);
+ fdmap_install(files->fmap, newfd, file);
+ } else
goto out_unlock;
- get_file(file); /* We are now finished with oldfd */
-
- err = expand_files(files, newfd);
- if (err < 0)
- goto out_fput;
-
- /* To avoid races with open() and dup(), we will mark the fd as
- * in-use in the open-file bitmap throughout the entire dup2()
- * process. This is quite safe: do_close() uses the fd array
- * entry, not the bitmap, to decide what work needs to be
- * done. --sct */
- /* Doesn't work. open() might be there first. --AV */
-
- /* Yes. It's a race. In user space. Nothing sane to do */
- err = -EBUSY;
- fdt = files_fdtable(files);
- tofree = fdt->fd[newfd];
- if (!tofree && FD_ISSET(newfd, fdt->open_fds))
- goto out_fput;
-
- rcu_assign_pointer(fdt->fd[newfd], file);
- FD_SET(newfd, fdt->open_fds);
- FD_CLR(newfd, fdt->close_on_exec);
spin_unlock(&files->file_lock);
if (tofree)
Index: linux-2.6.mod/fs/exec.c
===================================================================
--- linux-2.6.mod.orig/fs/exec.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/exec.c 2007-06-06 12:48:03.000000000 -0700
@@ -780,36 +780,37 @@
* so that a new one can be started
*/
-static void flush_old_files(struct files_struct * files)
+static void flush_fdmap_files(struct files_struct *files,
+ struct fd_map **pfmap)
{
- long j = -1;
- struct fdtable *fdt;
+ unsigned int start, base;
+ unsigned long fset;
+ struct fd_map *fmap;
spin_lock(&files->file_lock);
- for (;;) {
- unsigned long set, i;
-
- j++;
- i = j * __NFDBITS;
- fdt = files_fdtable(files);
- if (i >= fdt->max_fds)
+ for (start = 0;;) {
+ fmap = *pfmap;
+ if (!fmap)
+ break;
+ if (!fdmap_next_flag_set(fmap, FDMAP_BIT_CLOEXEC, 1,
+ &start, &base, &fset))
break;
- set = fdt->close_on_exec->fds_bits[j];
- if (!set)
- continue;
- fdt->close_on_exec->fds_bits[j] = 0;
spin_unlock(&files->file_lock);
- for ( ; set ; i++,set >>= 1) {
- if (set & 1) {
- sys_close(i);
- }
- }
+ for (; fset; base++, fset >>= 1)
+ if (fset & 1)
+ sys_close(base);
spin_lock(&files->file_lock);
-
}
spin_unlock(&files->file_lock);
}
+static void flush_old_files(struct files_struct * files)
+{
+ flush_fdmap_files(files, &files->lfmap);
+ if (files->fmap)
+ flush_fdmap_files(files, &files->fmap);
+}
+
void get_task_comm(char *buf, struct task_struct *tsk)
{
/* buf must be at least sizeof(tsk->comm) in size */
Index: linux-2.6.mod/kernel/exit.c
===================================================================
--- linux-2.6.mod.orig/kernel/exit.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/kernel/exit.c 2007-06-06 12:48:03.000000000 -0700
@@ -417,37 +417,18 @@
EXPORT_SYMBOL(daemonize);
-static void close_files(struct files_struct * files)
+static int files_fdmap_close(void *priv, struct file *file, int fd)
{
- int i, j;
- struct fdtable *fdt;
-
- j = 0;
+ filp_close(file, (struct files_struct *) priv);
+ cond_resched();
+ return 0;
+}
- /*
- * It is safe to dereference the fd table without RCU or
- * ->file_lock because this is the last reference to the
- * files structure.
- */
- fdt = files_fdtable(files);
- for (;;) {
- unsigned long set;
- i = j * __NFDBITS;
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds->fds_bits[j++];
- while (set) {
- if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
- if (file) {
- filp_close(file, files);
- cond_resched();
- }
- }
- i++;
- set >>= 1;
- }
- }
+static void close_files(struct files_struct * files)
+{
+ fdmap_for_each_file(files->lfmap, 1, files_fdmap_close, files);
+ if (files->fmap)
+ fdmap_for_each_file(files->fmap, 1, files_fdmap_close, files);
}
struct files_struct *get_files_struct(struct task_struct *task)
@@ -463,23 +444,28 @@
return files;
}
-void fastcall put_files_struct(struct files_struct *files)
+void free_files_struct(struct files_struct *files)
{
- struct fdtable *fdt;
+ struct fd_map *fmap;
- if (atomic_dec_and_test(&files->count)) {
- close_files(files);
- /*
- * Free the fd and fdset arrays if we expanded them.
- * If the fdtable was embedded, pass files for freeing
- * at the end of the RCU grace period. Otherwise,
- * you can free files immediately.
- */
- fdt = files_fdtable(files);
- if (fdt != &files->fdtab)
- kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
- }
+ close_files(files);
+ if (files->fmap)
+ fdmap_free(files->fmap);
+ fmap = files->lfmap;
+ /*
+ * If this is not the embedded fdmap, we can free it
+ * immediately. Otherwise it will be freed by the fdmap
+ * RCU cleanup code.
+ */
+ if (fmap != &files->lfmap_embed)
+ kmem_cache_free(files_cachep, files);
+ fdmap_free(fmap);
+}
+
+void fastcall put_files_struct(struct files_struct *files)
+{
+ if (atomic_dec_and_test(&files->count))
+ free_files_struct(files);
}
EXPORT_SYMBOL(put_files_struct);
Index: linux-2.6.mod/fs/open.c
===================================================================
--- linux-2.6.mod.orig/fs/open.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/open.c 2007-06-06 12:48:03.000000000 -0700
@@ -26,6 +26,7 @@
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
+#include <linux/random.h>
int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
@@ -857,51 +858,26 @@
*/
int get_unused_fd(void)
{
- struct files_struct * files = current->files;
+ struct files_struct *files = current->files;
int fd, error;
- struct fdtable *fdt;
- error = -EMFILE;
spin_lock(&files->file_lock);
-
repeat:
- fdt = files_fdtable(files);
- fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
- files->next_fd);
-
- /*
- * N.B. For clone tasks sharing a files structure, this test
- * will limit the total number of files that can be opened.
- */
- if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
-
- /* Do we need to expand the fd array or fd set? */
- error = expand_files(files, fd);
- if (error < 0)
- goto out;
-
- if (error) {
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- error = -EMFILE;
+ error = -EMFILE;
+ if (unlikely(files->fd_count >=
+ current->signal->rlim[RLIMIT_NOFILE].rlim_cur))
+ goto out;
+ fd = fdmap_newfd_seq(files->lfmap, 0,
+ current->signal->rlim[RLIMIT_NOFILE].rlim_cur, 0);
+ if (unlikely(fd == -ENOSPC)) {
+ error = -ENOMEM;
+ if (!files_fdmap_alloc(files, &files->lfmap, 0))
+ goto out;
goto repeat;
}
-
- FD_SET(fd, fdt->open_fds);
- FD_CLR(fd, fdt->close_on_exec);
- files->next_fd = fd + 1;
-#if 1
- /* Sanity check */
- if (fdt->fd[fd] != NULL) {
- printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd);
- fdt->fd[fd] = NULL;
- }
-#endif
+ if (likely(fd >= 0))
+ files->fd_count++;
error = fd;
-
out:
spin_unlock(&files->file_lock);
return error;
@@ -909,24 +885,99 @@
EXPORT_SYMBOL(get_unused_fd);
-static void __put_unused_fd(struct files_struct *files, unsigned int fd)
+void __put_unused_fd(unsigned int fd)
{
- struct fdtable *fdt = files_fdtable(files);
- __FD_CLR(fd, fdt->open_fds);
- if (fd < files->next_fd)
- files->next_fd = fd;
+ struct files_struct *files = current->files;
+ if (fdmap_fdof(files->lfmap, fd))
+ fdmap_putfd(files->lfmap, fd);
+ else if (files->fmap)
+ fdmap_putfd(files->fmap, fd);
+ files->fd_count--;
}
void fastcall put_unused_fd(unsigned int fd)
{
struct files_struct *files = current->files;
spin_lock(&files->file_lock);
- __put_unused_fd(files, fd);
+ __put_unused_fd(fd);
spin_unlock(&files->file_lock);
}
EXPORT_SYMBOL(put_unused_fd);
+/**
+ * __alloc_nonseq_fd - Allocates a file descriptor inside the non-sequential
+ * file descriptor map (locked)
+ *
+ * @files: [in] Pointer the files_struct that hosts the non-sequential file
+ * descriptor map
+ * @flags: [in] Flags to be associated with the file descriptor
+ *
+ * Returns the allocated file descriptor, or a negative value in case of error.
+ * This function must be called while holding @files->file_lock. In case the file
+ * descriptor map should be resized, the held lock will be temporarily released
+ * (and re-acquired).
+ */
+int __alloc_nonseq_fd(struct files_struct *files, unsigned long flags)
+{
+ int fd;
+ unsigned long mflags = 0;
+ struct fd_map *fmap;
+
+ /*
+ * Map special open flags parameters to fdmap flags. TODO!!
+ */
+
+repeat:
+ if (unlikely(files->fd_count >=
+ current->signal->rlim[RLIMIT_NOFILE].rlim_cur))
+ return -EMFILE;
+ fmap = nonseq_files_fdmap(files);
+ if (!fmap)
+ return -ENOMEM;
+ fd = fdmap_newfd(fmap, -1, mflags);
+ if (unlikely(fd == -ENOSPC)) {
+ if (!files_fdmap_alloc(files, &files->fmap, 0))
+ return -ENOMEM;
+ goto repeat;
+ }
+ if (likely(fd >= 0))
+ files->fd_count++;
+ return fd;
+}
+
+/**
+ * alloc_nonseq_fd - Allocates a file descriptor inside the non-sequential
+ * file descriptor map (unlocked)
+ *
+ * This function is the unlocked counterpart of the __alloc_nonseq_fd()
+ * function.
+ */
+int alloc_nonseq_fd(struct files_struct *files, unsigned long flags)
+{
+ int fd;
+
+ spin_lock(&files->file_lock);
+ fd = __alloc_nonseq_fd(files, flags);
+ spin_unlock(&files->file_lock);
+ return fd;
+}
+
+/**
+ * gen_nonseqfd_base - Allocates a random base for non-sequential file
+ * descriptors
+ *
+ */
+unsigned int gen_nonseqfd_base(void)
+{
+ unsigned int rndb = get_random_int();
+
+ /*
+ * I'm getting the lower bits here. Should we use upper ones?
+ */
+ return FDMAP_NONSEQ_BASE + (rndb & ((1U << FDMAP_RANDOM_BITS) - 1));
+}
+
/*
* Install a file pointer in the fd array.
*
@@ -943,11 +994,12 @@
void fastcall fd_install(unsigned int fd, struct file * file)
{
struct files_struct *files = current->files;
- struct fdtable *fdt;
+
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
- rcu_assign_pointer(fdt->fd[fd], file);
+ if (fdmap_fdof(files->lfmap, fd))
+ fdmap_install(files->lfmap, fd, file);
+ else if (files->fmap)
+ fdmap_install(files->fmap, fd, file);
spin_unlock(&files->file_lock);
}
@@ -1047,21 +1099,18 @@
*/
asmlinkage long sys_close(unsigned int fd)
{
- struct file * filp;
+ struct file *filp = NULL;
struct files_struct *files = current->files;
- struct fdtable *fdt;
int retval;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds)
- goto out_unlock;
- filp = fdt->fd[fd];
+ if (fdmap_fdof(files->lfmap, fd))
+ filp = fdmap_file_get(files->lfmap, fd);
+ else if (files->fmap && fdmap_fdof(files->fmap, fd))
+ filp = fdmap_file_get(files->fmap, fd);
if (!filp)
goto out_unlock;
- rcu_assign_pointer(fdt->fd[fd], NULL);
- FD_CLR(fd, fdt->close_on_exec);
- __put_unused_fd(files, fd);
+ __put_unused_fd(fd);
spin_unlock(&files->file_lock);
retval = filp_close(filp, files);
Index: linux-2.6.mod/kernel/fork.c
===================================================================
--- linux-2.6.mod.orig/kernel/fork.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/kernel/fork.c 2007-06-06 12:48:03.000000000 -0700
@@ -614,43 +614,53 @@
return 0;
}
-static int count_open_files(struct fdtable *fdt)
-{
- int size = fdt->max_fds;
- int i;
-
- /* Find the last open fd */
- for (i = size/(8*sizeof(long)); i > 0; ) {
- if (fdt->open_fds->fds_bits[--i])
- break;
- }
- i = (i+1) * 8 * sizeof(long);
- return i;
-}
-
static struct files_struct *alloc_files(void)
{
struct files_struct *newf;
- struct fdtable *fdt;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
- if (!newf)
- goto out;
+ if (newf)
+ init_files_struct(newf);
- atomic_set(&newf->count, 1);
+ return newf;
+}
- spin_lock_init(&newf->file_lock);
- newf->next_fd = 0;
- fdt = &newf->fdtab;
- fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
- fdt->open_fds = (fd_set *)&newf->open_fds_init;
- fdt->fd = &newf->fd_array[0];
- INIT_RCU_HEAD(&fdt->rcu);
- fdt->next = NULL;
- rcu_assign_pointer(newf->fdt, fdt);
+static int dup_fdmap(spinlock_t *lock, struct fd_map **psfmap,
+ unsigned int topfd, int ranbase, struct fd_map **pdfmap,
+ unsigned int *fcount)
+ __releases(*lock)
+ __acquires(*lock)
+{
+ unsigned int size, base;
+ struct fd_map *ofmap, *fmap = NULL;
+
+repeat:
+ *fcount = 0;
+ ofmap = *psfmap;
+ if (!ofmap)
+ goto out;
+ if (!topfd)
+ topfd = fdmap_top_open_fd(ofmap) + 1;
+ size = topfd - fdmap_basefd(ofmap);
+ if (ranbase)
+ base = gen_nonseqfd_base();
+ else
+ base = fdmap_basefd(ofmap);
+ spin_unlock(lock);
+ fmap = fdmap_alloc(base, size, 0);
+ spin_lock(lock);
+ if (!fmap)
+ goto out;
+ if (unlikely(*psfmap != ofmap)) {
+ fdmap_free(fmap);
+ topfd = 0;
+ fmap = NULL;
+ goto repeat;
+ }
+ fdmap_copy(fmap, ofmap, fcount, FDMAP_CPF_FORKMODE);
+ rcu_assign_pointer(*pdfmap, fmap);
out:
- return newf;
+ return fmap || !ofmap ? 0: -ENOMEM;
}
/*
@@ -660,86 +670,39 @@
*/
static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
+ unsigned int topfd, fcount;
struct files_struct *newf;
- struct file **old_fds, **new_fds;
- int open_files, size, i;
- struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
newf = alloc_files();
if (!newf)
goto out;
-
spin_lock(&oldf->file_lock);
- old_fdt = files_fdtable(oldf);
- new_fdt = files_fdtable(newf);
- open_files = count_open_files(old_fdt);
-
- /*
- * Check whether we need to allocate a larger fd array and fd set.
- * Note: we're not a clone task, so the open count won't change.
- */
- if (open_files > new_fdt->max_fds) {
- new_fdt->max_fds = 0;
- spin_unlock(&oldf->file_lock);
- spin_lock(&newf->file_lock);
- *errorp = expand_files(newf, open_files-1);
- spin_unlock(&newf->file_lock);
- if (*errorp < 0)
+ topfd = fdmap_top_open_fd(oldf->lfmap) + 1;
+ if (topfd <= fdmap_topfd(newf->lfmap))
+ fdmap_copy(newf->lfmap, oldf->lfmap, &fcount,
+ FDMAP_CPF_FORKMODE);
+ else {
+ *errorp = dup_fdmap(&oldf->file_lock, &oldf->lfmap, topfd, 0,
+ &newf->lfmap, &fcount);
+ if (*errorp)
goto out_release;
- new_fdt = files_fdtable(newf);
- /*
- * Reacquire the oldf lock and a pointer to its fd table
- * who knows it may have a new bigger fd table. We need
- * the latest pointer.
- */
- spin_lock(&oldf->file_lock);
- old_fdt = files_fdtable(oldf);
}
-
- old_fds = old_fdt->fd;
- new_fds = new_fdt->fd;
-
- memcpy(new_fdt->open_fds->fds_bits,
- old_fdt->open_fds->fds_bits, open_files/8);
- memcpy(new_fdt->close_on_exec->fds_bits,
- old_fdt->close_on_exec->fds_bits, open_files/8);
-
- for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
- if (f) {
- get_file(f);
- } else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
- FD_CLR(open_files - i, new_fdt->open_fds);
- }
- rcu_assign_pointer(*new_fds++, f);
+ newf->fd_count = fcount;
+ if (oldf->fmap) {
+ *errorp = dup_fdmap(&oldf->file_lock, &oldf->fmap, 0, 1,
+ &newf->fmap, &fcount);
+ if (*errorp)
+ goto out_release;
+ newf->fd_count += fcount;
}
spin_unlock(&oldf->file_lock);
- /* compute the remainder to be cleared */
- size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
-
- /* This is long word aligned thus could use a optimized version */
- memset(new_fds, 0, size);
-
- if (new_fdt->max_fds > open_files) {
- int left = (new_fdt->max_fds-open_files)/8;
- int start = open_files / (8 * sizeof(unsigned long));
-
- memset(&new_fdt->open_fds->fds_bits[start], 0, left);
- memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
- }
-
return newf;
out_release:
- kmem_cache_free(files_cachep, newf);
+ spin_unlock(&oldf->file_lock);
+ free_files_struct(newf);
out:
return NULL;
}
Index: linux-2.6.mod/include/linux/init_task.h
===================================================================
--- linux-2.6.mod.orig/include/linux/init_task.h 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/include/linux/init_task.h 2007-06-06 12:48:03.000000000 -0700
@@ -9,27 +9,10 @@
#include <linux/ipc.h>
#include <linux/pid_namespace.h>
-#define INIT_FDTABLE \
-{ \
- .max_fds = NR_OPEN_DEFAULT, \
- .fd = &init_files.fd_array[0], \
- .close_on_exec = (fd_set *)&init_files.close_on_exec_init, \
- .open_fds = (fd_set *)&init_files.open_fds_init, \
- .rcu = RCU_HEAD_INIT, \
- .next = NULL, \
-}
-
-#define INIT_FILES \
-{ \
- .count = ATOMIC_INIT(1), \
- .fdt = &init_files.fdtab, \
- .fdtab = INIT_FDTABLE, \
- .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \
- .next_fd = 0, \
- .close_on_exec_init = { { 0, } }, \
- .open_fds_init = { { 0, } }, \
- .fd_array = { NULL, } \
-}
+/*
+ * We do the real "init_files" initialization inside fs/file_table.c:files_init()
+ */
+#define INIT_FILES { }
#define INIT_KIOCTX(name, which_mm) \
{ \
Index: linux-2.6.mod/kernel/kmod.c
===================================================================
--- linux-2.6.mod.orig/kernel/kmod.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/kernel/kmod.c 2007-06-06 12:48:03.000000000 -0700
@@ -146,17 +146,18 @@
/* Install input pipe when needed */
if (sub_info->stdin) {
+ int fd;
struct files_struct *f = current->files;
- struct fdtable *fdt;
/* no races because files should be private here */
sys_close(0);
- fd_install(0, sub_info->stdin);
spin_lock(&f->file_lock);
- fdt = files_fdtable(f);
- FD_SET(0, fdt->open_fds);
- FD_CLR(0, fdt->close_on_exec);
+ fd = fdmap_newfd(f->lfmap, 0, 0);
+ BUG_ON(fd < 0);
+ f->fd_count++;
spin_unlock(&f->file_lock);
+ fd_install(0, sub_info->stdin);
+
/* and disallow core files too */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
}
Index: linux-2.6.mod/fs/proc/base.c
===================================================================
--- linux-2.6.mod.orig/fs/proc/base.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/proc/base.c 2007-06-06 12:48:03.000000000 -0700
@@ -1384,10 +1384,10 @@
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct task_struct *p = get_proc_task(inode);
- unsigned int fd, tid, ino;
+ unsigned int fd, tid, ino, topfd;
int retval;
struct files_struct * files;
- struct fdtable *fdt;
+ struct fd_map *lfmap, *fmap;
retval = -ENOENT;
if (!p)
@@ -1411,10 +1411,15 @@
if (!files)
goto out;
rcu_read_lock();
- fdt = files_fdtable(files);
- for (fd = filp->f_pos-2;
- fd < fdt->max_fds;
- fd++, filp->f_pos++) {
+ lfmap = rcu_dereference(files->lfmap);
+ fmap = rcu_dereference(files->fmap);
+ fd = filp->f_pos - 2;
+ if (fd < fdmap_topfd(lfmap) || !fmap)
+ topfd = fdmap_topfd(lfmap);
+ else
+ topfd = fdmap_topfd(fmap);
+rescan:
+ for (; fd < topfd; fd++, filp->f_pos++) {
char name[PROC_NUMBUF];
int len;
@@ -1425,13 +1430,19 @@
len = snprintf(name, sizeof(name), "%d", fd);
if (proc_fill_cache(filp, dirent, filldir,
name, len, instantiate,
- p, &fd) < 0) {
- rcu_read_lock();
- break;
- }
+ p, &fd) < 0)
+ goto out_put_files;
rcu_read_lock();
}
+ fmap = rcu_dereference(files->fmap);
+ if (fmap && fd < fdmap_basefd(fmap)) {
+ fd = fdmap_basefd(fmap);
+ filp->f_pos = fd + 2;
+ topfd = fdmap_topfd(fmap);
+ goto rescan;
+ }
rcu_read_unlock();
+out_put_files:
put_files_struct(files);
}
out:
Index: linux-2.6.mod/fs/file.c
===================================================================
--- linux-2.6.mod.orig/fs/file.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/file.c 2007-06-06 12:48:03.000000000 -0700
@@ -18,239 +18,67 @@
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
-struct fdtable_defer {
- spinlock_t lock;
- struct work_struct wq;
- struct fdtable *next;
-};
-
-/*
- * We use this list to defer free fdtables that have vmalloced
- * sets/arrays. By keeping a per-cpu list, we avoid having to embed
- * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
- * this per-task structure.
- */
-static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-
-static inline void * alloc_fdmem(unsigned int size)
-{
- if (size <= PAGE_SIZE)
- return kmalloc(size, GFP_KERNEL);
- else
- return vmalloc(size);
-}
-
-static inline void free_fdarr(struct fdtable *fdt)
-{
- if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
- kfree(fdt->fd);
- else
- vfree(fdt->fd);
-}
-
-static inline void free_fdset(struct fdtable *fdt)
-{
- if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
- kfree(fdt->open_fds);
- else
- vfree(fdt->open_fds);
-}
-
-static void free_fdtable_work(struct work_struct *work)
-{
- struct fdtable_defer *f =
- container_of(work, struct fdtable_defer, wq);
- struct fdtable *fdt;
-
- spin_lock_bh(&f->lock);
- fdt = f->next;
- f->next = NULL;
- spin_unlock_bh(&f->lock);
- while(fdt) {
- struct fdtable *next = fdt->next;
- vfree(fdt->fd);
- free_fdset(fdt);
- kfree(fdt);
- fdt = next;
- }
-}
-
-void free_fdtable_rcu(struct rcu_head *rcu)
+static void files_embedd_fdmap_free(void *priv, struct fd_map *fmap)
{
- struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
- struct fdtable_defer *fddef;
-
- BUG_ON(!fdt);
-
- if (fdt->max_fds <= NR_OPEN_DEFAULT) {
- /*
- * This fdtable is embedded in the files structure and that
- * structure itself is getting destroyed.
- */
- kmem_cache_free(files_cachep,
- container_of(fdt, struct files_struct, fdtab));
- return;
- }
- if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
- kfree(fdt->fd);
- kfree(fdt->open_fds);
- kfree(fdt);
- } else {
- fddef = &get_cpu_var(fdtable_defer_list);
- spin_lock(&fddef->lock);
- fdt->next = fddef->next;
- fddef->next = fdt;
- /* vmallocs are handled from the workqueue context */
- schedule_work(&fddef->wq);
- spin_unlock(&fddef->lock);
- put_cpu_var(fdtable_defer_list);
- }
+ kmem_cache_free(files_cachep, priv);
}
-/*
- * Expand the fdset in the files_struct. Called with the files spinlock
- * held for write.
- */
-static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
+void init_files_struct(struct files_struct *newf)
{
- unsigned int cpy, set;
-
- BUG_ON(nfdt->max_fds < ofdt->max_fds);
- if (ofdt->max_fds == 0)
- return;
-
- cpy = ofdt->max_fds * sizeof(struct file *);
- set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
- memcpy(nfdt->fd, ofdt->fd, cpy);
- memset((char *)(nfdt->fd) + cpy, 0, set);
-
- cpy = ofdt->max_fds / BITS_PER_BYTE;
- set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)(nfdt->open_fds) + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
-}
+ struct fd_map *fmap;
-static struct fdtable * alloc_fdtable(unsigned int nr)
-{
- struct fdtable *fdt;
- char *data;
+ atomic_set(&newf->count, 1);
- /*
- * Figure out how many fds we actually want to support in this fdtable.
- * Allocation steps are keyed to the size of the fdarray, since it
- * grows far faster than any of the other dynamic data. We try to fit
- * the fdarray into comfortable page-tuned chunks: starting at 1024B
- * and growing in powers of two from there on.
- */
- nr /= (1024 / sizeof(struct file *));
- nr = roundup_pow_of_two(nr + 1);
- nr *= (1024 / sizeof(struct file *));
- if (nr > NR_OPEN)
- nr = NR_OPEN;
-
- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
- if (!fdt)
- goto out;
- fdt->max_fds = nr;
- data = alloc_fdmem(nr * sizeof(struct file *));
- if (!data)
- goto out_fdt;
- fdt->fd = (struct file **)data;
- data = alloc_fdmem(max_t(unsigned int,
- 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
- if (!data)
- goto out_arr;
- fdt->open_fds = (fd_set *)data;
- data += nr / BITS_PER_BYTE;
- fdt->close_on_exec = (fd_set *)data;
- INIT_RCU_HEAD(&fdt->rcu);
- fdt->next = NULL;
-
- return fdt;
-
-out_arr:
- free_fdarr(fdt);
-out_fdt:
- kfree(fdt);
-out:
- return NULL;
+ spin_lock_init(&newf->file_lock);
+ newf->fd_count = 0;
+ newf->fmap = NULL;
+ fmap = &newf->lfmap_embed;
+ fmap->slots = newf->fdmap_slots;
+ fmap->map = newf->fdmap_map;
+ fdmap_init_map(fmap, 0, NR_OPEN_DEFAULT, 1);
+ fdmap_set_freecb(fmap, files_embedd_fdmap_free, newf);
+ rcu_assign_pointer(newf->lfmap, fmap);
}
-/*
- * Expand the file descriptor table.
- * This function will allocate a new fdtable and both fd array and fdset, of
- * the given size.
- * Return <0 error code on error; 1 on successful completion.
- * The files->file_lock should be held on entry, and will be held on exit.
- */
-static int expand_fdtable(struct files_struct *files, int nr)
+struct fd_map *files_fdmap_alloc(struct files_struct *files,
+ struct fd_map **pfmap, unsigned int size)
__releases(files->file_lock)
__acquires(files->file_lock)
{
- struct fdtable *new_fdt, *cur_fdt;
+ unsigned int base, msize, nsize;
+ struct fd_map *fmap, *ofmap, *nfmap;
+
+repeat:
+ assert_spin_locked(&files->file_lock);
+ msize = max(size, FDMAP_NONSEQ_SIZE);
+ ofmap = *pfmap;
+ if (ofmap) {
+ nsize = 2 * min(ofmap->size, (unsigned int) NR_OPEN / 2);
+ msize = max(msize, nsize);
+ base = fdmap_basefd(ofmap);
+ } else
+ base = gen_nonseqfd_base();
+ msize = min(msize, (unsigned int) NR_OPEN);
spin_unlock(&files->file_lock);
- new_fdt = alloc_fdtable(nr);
+ fmap = fdmap_alloc(base, msize, !ofmap);
spin_lock(&files->file_lock);
- if (!new_fdt)
- return -ENOMEM;
- /*
- * Check again since another task may have expanded the fd table while
- * we dropped the lock
- */
- cur_fdt = files_fdtable(files);
- if (nr >= cur_fdt->max_fds) {
- /* Continue as planned */
- copy_fdtable(new_fdt, cur_fdt);
- rcu_assign_pointer(files->fdt, new_fdt);
- if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
- free_fdtable(cur_fdt);
- } else {
- /* Somebody else expanded, so undo our attempt */
- free_fdarr(new_fdt);
- free_fdset(new_fdt);
- kfree(new_fdt);
+ if (fmap) {
+ nfmap = *pfmap;
+ if (nfmap) {
+ if (ofmap == nfmap) {
+ fdmap_copy(fmap, nfmap, NULL, 0);
+ rcu_assign_pointer(*pfmap, fmap);
+ files_free_fdmap(files, nfmap);
+ } else {
+ fdmap_free(fmap);
+ if (fdmap_size(nfmap) < msize)
+ goto repeat;
+ fmap = nfmap;
+ }
+ } else
+ rcu_assign_pointer(*pfmap, fmap);
}
- return 1;
-}
-
-/*
- * Expand files.
- * This function will expand the file structures, if the requested size exceeds
- * the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
- * The files->file_lock should be held on entry, and will be held on exit.
- */
-int expand_files(struct files_struct *files, int nr)
-{
- struct fdtable *fdt;
-
- fdt = files_fdtable(files);
- /* Do we need to expand? */
- if (nr < fdt->max_fds)
- return 0;
- /* Can we expand? */
- if (nr >= NR_OPEN)
- return -EMFILE;
-
- /* All good, so we try */
- return expand_fdtable(files, nr);
-}
-
-static void __devinit fdtable_defer_list_init(int cpu)
-{
- struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
- spin_lock_init(&fddef->lock);
- INIT_WORK(&fddef->wq, free_fdtable_work);
- fddef->next = NULL;
+ return fmap;
}
-void __init files_defer_init(void)
-{
- int i;
- for_each_possible_cpu(i)
- fdtable_defer_list_init(i);
-}
Index: linux-2.6.mod/fs/file_table.c
===================================================================
--- linux-2.6.mod.orig/fs/file_table.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/file_table.c 2007-06-06 12:48:03.000000000 -0700
@@ -298,6 +298,7 @@
files_stat.max_files = n;
if (files_stat.max_files < NR_FILE)
files_stat.max_files = NR_FILE;
- files_defer_init();
+ fdmap_module_init();
+ init_files_struct(init_task.files);
percpu_counter_init(&nr_files, 0);
}
Index: linux-2.6.mod/fs/proc/array.c
===================================================================
--- linux-2.6.mod.orig/fs/proc/array.c 2007-06-06 12:38:27.000000000 -0700
+++ linux-2.6.mod/fs/proc/array.c 2007-06-06 12:48:03.000000000 -0700
@@ -160,7 +160,7 @@
{
struct group_info *group_info;
int g;
- struct fdtable *fdt = NULL;
+ struct fd_map *fmap = NULL;
rcu_read_lock();
buffer += sprintf(buffer,
@@ -182,11 +182,11 @@
task_lock(p);
if (p->files)
- fdt = files_fdtable(p->files);
+ fmap = rcu_dereference(p->files->lfmap);
buffer += sprintf(buffer,
"FDSize:\t%d\n"
"Groups:\t",
- fdt ? fdt->max_fds : 0);
+ fmap ? fdmap_topfd(fmap) : 0);
rcu_read_unlock();
group_info = p->group_info;
Index: linux-2.6.mod/security/selinux/hooks.c
===================================================================
--- linux-2.6.mod.orig/security/selinux/hooks.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/security/selinux/hooks.c 2007-06-06 12:48:03.000000000 -0700
@@ -1729,14 +1729,55 @@
extern struct vfsmount *selinuxfs_mount;
extern struct dentry *selinux_null;
+static void flush_unauthorized_fdmap(spinlock_t *lock, struct fd_map **pfmap,
+ struct file *devnull)
+{
+ int perm;
+ unsigned int start, base;
+ unsigned long fset;
+ struct file *file, *nfile;
+
+ spin_lock(lock);
+ for (start = 0;;) {
+ if (!*pfmap)
+ break;
+ if (!fdmap_next_flag_set(*pfmap, FDMAP_BIT_BUSYSLOT, 0,
+ &start, &base, &fset))
+ break;
+ for (; fset; base++, fset >>= 1) {
+ if (!(fset & 1))
+ continue;
+ file = fcheck(base);
+ if (!file)
+ continue;
+ spin_unlock(lock);
+ perm = file_has_perm(current,
+ file,
+ file_to_av(file));
+ spin_lock(lock);
+ nfile = fcheck(base);
+ if (!devnull || !perm || file != nfile) {
+ __put_unused_fd(base);
+ spin_unlock(lock);
+ } else {
+ spin_unlock(lock);
+ get_file(devnull);
+ fd_install(base, devnull);
+ }
+ fput(nfile);
+ fput(file);
+ spin_lock(lock);
+ }
+ }
+ spin_unlock(lock);
+}
+
/* Derived from fs/exec.c:flush_old_files. */
static inline void flush_unauthorized_files(struct files_struct * files)
{
struct avc_audit_data ad;
- struct file *file, *devnull = NULL;
+ struct file *file, *devnull;
struct tty_struct *tty;
- struct fdtable *fdt;
- long j = -1;
int drop_tty = 0;
mutex_lock(&tty_mutex);
@@ -1767,56 +1808,17 @@
AVC_AUDIT_DATA_INIT(&ad,FS);
- spin_lock(&files->file_lock);
- for (;;) {
- unsigned long set, i;
- int fd;
-
- j++;
- i = j * __NFDBITS;
- fdt = files_fdtable(files);
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds->fds_bits[j];
- if (!set)
- continue;
- spin_unlock(&files->file_lock);
- for ( ; set ; i++,set >>= 1) {
- if (set & 1) {
- file = fget(i);
- if (!file)
- continue;
- if (file_has_perm(current,
- file,
- file_to_av(file))) {
- sys_close(i);
- fd = get_unused_fd();
- if (fd != i) {
- if (fd >= 0)
- put_unused_fd(fd);
- fput(file);
- continue;
- }
- if (devnull) {
- get_file(devnull);
- } else {
- devnull = dentry_open(dget(selinux_null), mntget(selinuxfs_mount), O_RDWR);
- if (IS_ERR(devnull)) {
- devnull = NULL;
- put_unused_fd(fd);
- fput(file);
- continue;
- }
- }
- fd_install(fd, devnull);
- }
- fput(file);
- }
- }
- spin_lock(&files->file_lock);
-
- }
- spin_unlock(&files->file_lock);
+ devnull = dentry_open(dget(selinux_null),
+ mntget(selinuxfs_mount),
+ O_RDWR);
+ if (IS_ERR(devnull))
+ devnull = NULL;
+ flush_unauthorized_fdmap(&files->file_lock, &files->lfmap,
+ devnull);
+ flush_unauthorized_fdmap(&files->file_lock, &files->fmap,
+ devnull);
+ if (devnull)
+ fput(devnull);
}
static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
Index: linux-2.6.mod/fs/select.c
===================================================================
--- linux-2.6.mod.orig/fs/select.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/fs/select.c 2007-06-06 12:48:03.000000000 -0700
@@ -139,16 +139,15 @@
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
- unsigned long *open_fds;
+ const unsigned long *open_fds;
unsigned long set;
int max;
- struct fdtable *fdt;
/* handle last in-complete long-word first */
set = ~(~0UL << (n & (__NFDBITS-1)));
n /= __NFDBITS;
- fdt = files_fdtable(current->files);
- open_fds = fdt->open_fds->fds_bits+n;
+ open_fds = fdmap_get_allocmap(current->files->lfmap);
+ open_fds += n;
max = 0;
if (set) {
set &= BITS(fds, n);
@@ -312,7 +311,6 @@
void *bits;
int ret, max_fds;
unsigned int size;
- struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
@@ -322,8 +320,7 @@
/* max_fds can increase, so grab it once to avoid race */
rcu_read_lock();
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
+ max_fds = fdmap_topfd(current->files->lfmap);
rcu_read_unlock();
if (n > max_fds)
n = max_fds;
Index: linux-2.6.mod/fs/compat.c
===================================================================
--- linux-2.6.mod.orig/fs/compat.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/fs/compat.c 2007-06-06 12:48:03.000000000 -0700
@@ -1546,7 +1546,6 @@
fd_set_bits fds;
void *bits;
int size, max_fds, ret = -EINVAL;
- struct fdtable *fdt;
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
if (n < 0)
@@ -1554,8 +1553,7 @@
/* max_fds can increase, so grab it once to avoid race */
rcu_read_lock();
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
+ max_fds = fdmap_topfd(current->files->lfmap);
rcu_read_unlock();
if (n > max_fds)
n = max_fds;
Index: linux-2.6.mod/drivers/char/tty_io.c
===================================================================
--- linux-2.6.mod.orig/drivers/char/tty_io.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/drivers/char/tty_io.c 2007-06-06 12:48:03.000000000 -0700
@@ -3428,6 +3428,26 @@
}
#endif
+struct tty_fdmap_SAK {
+ struct tty_struct *tty;
+ struct task_struct *p;
+};
+
+static int tty_fdmap_SAK_helper(void *priv, struct file *filp, int fd)
+{
+ struct tty_fdmap_SAK *shlp = priv;
+
+ if (filp->f_op->read == tty_read &&
+ filp->private_data == shlp->tty) {
+ printk(KERN_NOTICE "SAK: killed process %d"
+ " (%s): fd#%d opened to the tty\n",
+ shlp->p->pid, shlp->p->comm, fd);
+ force_sig(SIGKILL, shlp->p);
+ return 1;
+ }
+ return 0;
+}
+
/*
* This implements the "Secure Attention Key" --- the idea is to
* prevent trojan horses by killing all processes associated with this
@@ -3454,9 +3474,6 @@
#else
struct task_struct *g, *p;
struct pid *session;
- int i;
- struct file *filp;
- struct fdtable *fdt;
if (!tty)
return;
@@ -3488,25 +3505,18 @@
}
task_lock(p);
if (p->files) {
+ struct tty_fdmap_SAK shlp;
+
+ shlp.tty = tty;
+ shlp.p = p;
+
/*
* We don't take a ref to the file, so we must
* hold ->file_lock instead.
*/
spin_lock(&p->files->file_lock);
- fdt = files_fdtable(p->files);
- for (i=0; i < fdt->max_fds; i++) {
- filp = fcheck_files(p->files, i);
- if (!filp)
- continue;
- if (filp->f_op->read == tty_read &&
- filp->private_data == tty) {
- printk(KERN_NOTICE "SAK: killed process %d"
- " (%s): fd#%d opened to the tty\n",
- p->pid, p->comm, i);
- force_sig(SIGKILL, p);
- break;
- }
- }
+ fdmap_for_each_file(p->files->lfmap, 0,
+ tty_fdmap_SAK_helper, &shlp);
spin_unlock(&p->files->file_lock);
}
task_unlock(p);
Index: linux-2.6.mod/arch/alpha/kernel/osf_sys.c
===================================================================
--- linux-2.6.mod.orig/arch/alpha/kernel/osf_sys.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/arch/alpha/kernel/osf_sys.c 2007-06-06 12:48:03.000000000 -0700
@@ -986,7 +986,6 @@
size_t size;
long timeout;
int ret = -EINVAL;
- struct fdtable *fdt;
int max_fds;
timeout = MAX_SCHEDULE_TIMEOUT;
@@ -1010,8 +1009,7 @@
}
rcu_read_lock();
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
+ max_fds = fdmap_topfd(current->files->lfmap);
rcu_read_unlock();
if (n < 0 || n > max_fds)
goto out_nofds;
Index: linux-2.6.mod/arch/ia64/kernel/perfmon.c
===================================================================
--- linux-2.6.mod.orig/arch/ia64/kernel/perfmon.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/arch/ia64/kernel/perfmon.c 2007-06-06 12:48:03.000000000 -0700
@@ -2259,20 +2259,9 @@
static void
pfm_free_fd(int fd, struct file *file)
{
- struct files_struct *files = current->files;
- struct fdtable *fdt;
-
- /*
- * there ie no fd_uninstall(), so we do it here
- */
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- rcu_assign_pointer(fdt->fd[fd], NULL);
- spin_unlock(&files->file_lock);
-
+ put_unused_fd(current->files, fd);
if (file)
put_filp(file);
- put_unused_fd(fd);
}
static int
Index: linux-2.6.mod/arch/mips/kernel/kspd.c
===================================================================
--- linux-2.6.mod.orig/arch/mips/kernel/kspd.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/arch/mips/kernel/kspd.c 2007-06-06 12:48:03.000000000 -0700
@@ -294,35 +294,19 @@
printk("KSPD: sp_work_handle_request failed to send to SP\n");
}
+static int sp_files_fdmap_close(void *priv, struct file *file, int fd)
+{
+ filp_close(file, (struct files_struct *) priv);
+ return 0;
+}
+
static void sp_cleanup(void)
{
struct files_struct *files = current->files;
- int i, j;
- struct fdtable *fdt;
- j = 0;
-
- /*
- * It is safe to dereference the fd table without RCU or
- * ->file_lock
- */
- fdt = files_fdtable(files);
- for (;;) {
- unsigned long set;
- i = j * __NFDBITS;
- if (i >= fdt->max_fds)
- break;
- set = fdt->open_fds->fds_bits[j++];
- while (set) {
- if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
- if (file)
- filp_close(file, files);
- }
- i++;
- set >>= 1;
- }
- }
+ fdmap_for_each_file(files->lfmap, 1, sp_files_fdmap_close, files);
+ if (files->fmap)
+ fdmap_for_each_file(files->fmap, 1, sp_files_fdmap_close, files);
}
static int channel_open = 0;
Index: linux-2.6.mod/arch/powerpc/platforms/cell/spufs/coredump.c
===================================================================
--- linux-2.6.mod.orig/arch/powerpc/platforms/cell/spufs/coredump.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/arch/powerpc/platforms/cell/spufs/coredump.c 2007-06-06 12:48:03.000000000 -0700
@@ -133,6 +133,20 @@
return size;
}
+static int spufs_fdmap_add_one_context(void *priv, struct file *file, int fd)
+{
+ int *size = priv;
+
+ if (file->f_op == &spufs_context_fops) {
+ int rval = spufs_add_one_context(file, fd);
+ if (rval < 0)
+ return 1;
+ *size += rval;
+ }
+
+ return 0;
+}
+
/*
* The additional architecture-specific notes for Cell are various
* context files in the spu context.
@@ -144,21 +158,13 @@
*/
static int spufs_arch_notes_size(void)
{
- struct fdtable *fdt = files_fdtable(current->files);
- int size = 0, fd;
+ struct files_struct *files = current->files;
+ int size = 0;
- for (fd = 0; fd < fdt->max_fds; fd++) {
- if (FD_ISSET(fd, fdt->open_fds)) {
- struct file *file = fcheck(fd);
-
- if (file && file->f_op == &spufs_context_fops) {
- int rval = spufs_add_one_context(file, fd);
- if (rval < 0)
- break;
- size += rval;
- }
- }
- }
+ fdmap_for_each_file(files->lfmap, 0, spufs_fdmap_add_one_context, &size);
+ if (files->fmap)
+ fdmap_for_each_file(files->fmap, 0, spufs_fdmap_add_one_context,
+ &size);
return size;
}
Index: linux-2.6.mod/arch/sparc64/solaris/ioctl.c
===================================================================
--- linux-2.6.mod.orig/arch/sparc64/solaris/ioctl.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/arch/sparc64/solaris/ioctl.c 2007-06-06 12:48:03.000000000 -0700
@@ -294,13 +294,13 @@
static inline int solaris_sockmod(unsigned int fd, unsigned int cmd, u32 arg)
{
struct inode *ino;
- struct fdtable *fdt;
+ struct file *file;
/* I wonder which of these tests are superfluous... --patrik */
rcu_read_lock();
- fdt = files_fdtable(current->files);
- if (! fdt->fd[fd] ||
- ! fdt->fd[fd]->f_path.dentry ||
- ! (ino = fdt->fd[fd]->f_path.dentry->d_inode) ||
+ file = fcheck_files(current->files, fd);
+ if (! file ||
+ ! file->f_path.dentry ||
+ ! (ino = file->f_path.dentry->d_inode) ||
! S_ISSOCK(ino->i_mode)) {
rcu_read_unlock();
return TBADF;
Index: linux-2.6.mod/arch/sparc64/solaris/timod.c
===================================================================
--- linux-2.6.mod.orig/arch/sparc64/solaris/timod.c 2007-06-06 12:38:28.000000000 -0700
+++ linux-2.6.mod/arch/sparc64/solaris/timod.c 2007-06-06 12:48:03.000000000 -0700
@@ -143,11 +143,11 @@
static void timod_wake_socket(unsigned int fd)
{
struct socket *sock;
- struct fdtable *fdt;
+ struct file *file;
SOLD("wakeing socket");
- fdt = files_fdtable(current->files);
- sock = SOCKET_I(fdt->fd[fd]->f_path.dentry->d_inode);
+ file = fcheck_files(current->files, fd);
+ sock = SOCKET_I(file->f_path.dentry->d_inode);
wake_up_interruptible(&sock->wait);
read_lock(&sock->sk->sk_callback_lock);
if (sock->fasync_list && !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
@@ -159,11 +159,11 @@
static void timod_queue(unsigned int fd, struct T_primsg *it)
{
struct sol_socket_struct *sock;
- struct fdtable *fdt;
+ struct file *file;
SOLD("queuing primsg");
- fdt = files_fdtable(current->files);
- sock = (struct sol_socket_struct *)fdt->fd[fd]->private_data;
+ file = fcheck_files(current->files, fd);
+ sock = (struct sol_socket_struct *)file->private_data;
it->next = sock->pfirst;
sock->pfirst = it;
if (!sock->plast)
@@ -175,11 +175,11 @@
static void timod_queue_end(unsigned int fd, struct T_primsg *it)
{
struct sol_socket_struct *sock;
- struct fdtable *fdt;
+ struct file *file;
SOLD("queuing primsg at end");
- fdt = files_fdtable(current->files);
- sock = (struct sol_socket_struct *)fdt->fd[fd]->private_data;
+ file = fcheck_files(current->files, fd);
+ sock = (struct sol_socket_struct *)file->private_data;
it->next = NULL;
if (sock->plast)
sock->plast->next = it;
@@ -350,7 +350,6 @@
char *buf;
struct file *filp;
struct inode *ino;
- struct fdtable *fdt;
struct sol_socket_struct *sock;
mm_segment_t old_fs = get_fs();
long args[6];
@@ -359,8 +358,7 @@
int (*sys_sendto)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int) =
(int (*)(int, void __user *, size_t, unsigned, struct sockaddr __user *, int))SYS(sendto);
- fdt = files_fdtable(current->files);
- filp = fdt->fd[fd];
+ filp = fcheck_files(current->files, fd);
ino = filp->f_path.dentry->d_inode;
sock = (struct sol_socket_struct *)filp->private_data;
SOLD("entry");
@@ -629,7 +627,6 @@
int oldflags;
struct file *filp;
struct inode *ino;
- struct fdtable *fdt;
struct sol_socket_struct *sock;
struct T_unitdata_ind udi;
mm_segment_t old_fs = get_fs();
@@ -642,8 +639,7 @@
SOLD("entry");
SOLDD(("%u %p %d %p %p %d %p %d\n", fd, ctl_buf, ctl_maxlen, ctl_len, data_buf, data_maxlen, data_len, *flags_p));
- fdt = files_fdtable(current->files);
- filp = fdt->fd[fd];
+ filp = fcheck_files(current->files, fd);
ino = filp->f_path.dentry->d_inode;
sock = (struct sol_socket_struct *)filp->private_data;
SOLDD(("%p %p\n", sock->pfirst, sock->pfirst ? sock->pfirst->next : NULL));
@@ -855,14 +851,12 @@
int __user *flgptr;
int flags;
int error = -EBADF;
- struct fdtable *fdt;
SOLD("entry");
lock_kernel();
if(fd >= NR_OPEN) goto out;
- fdt = files_fdtable(current->files);
- filp = fdt->fd[fd];
+ filp = fcheck_files(current->files, fd);
if(!filp) goto out;
ino = filp->f_path.dentry->d_inode;
@@ -923,14 +917,12 @@
struct strbuf ctl, dat;
int flags = (int) arg3;
int error = -EBADF;
- struct fdtable *fdt;
SOLD("entry");
lock_kernel();
if(fd >= NR_OPEN) goto out;
- fdt = files_fdtable(current->files);
- filp = fdt->fd[fd];
+ filp = fcheck_files(current->files, fd);
if(!filp) goto out;
ino = filp->f_path.dentry->d_inode;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]