[PATCH 4/7] fs: break the file_list_lock for sb->s_files

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Break the protection of sb->s_files out from under the global file_list_lock.

sb->s_files is converted to a lock_list. furthermore to prevent the 
lock_list_head of getting too contended with concurrent add operations
the add is buffered in per cpu filevecs.

This would ordinarily require a flush before a delete operation - to ensure
the to be deleted entry is indeed added to the list. This is avoided by 
storing a pointer to the filevec location in the not yet used list_head.
This pointer can then be used to clear the filevec entry before its actually
added.

The file_flag mess is a bit unfortunate - this can be removed by also
converting tty->tty_files to a lock_list (TODO).

Signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
 fs/dquot.c                   |   10 +-
 fs/file_table.c              |  170 +++++++++++++++++++++++++++++++++++++++----
 fs/open.c                    |    2 
 fs/proc/generic.c            |    8 --
 fs/super.c                   |    7 -
 include/linux/fs.h           |   23 +++++
 mm/readahead.c               |    2 
 security/selinux/selinuxfs.c |    9 +-
 8 files changed, 194 insertions(+), 37 deletions(-)

Index: linux-2.6/fs/dquot.c
===================================================================
--- linux-2.6.orig/fs/dquot.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/dquot.c	2007-01-27 21:07:44.000000000 +0100
@@ -688,23 +688,21 @@ static int dqinit_needed(struct inode *i
 /* This routine is guarded by dqonoff_mutex mutex */
 static void add_dquot_ref(struct super_block *sb, int type)
 {
-	struct list_head *p;
+	struct file *filp;
 
 restart:
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file *filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct inode *inode = filp->f_path.dentry->d_inode;
 		if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) {
 			struct dentry *dentry = dget(filp->f_path.dentry);
-			file_list_unlock();
+			lock_list_for_each_entry_stop(filp, f_u.fu_llist);
 			sb->dq_op->initialize(inode, type);
 			dput(dentry);
 			/* As we may have blocked we had better restart... */
 			goto restart;
 		}
 	}
-	file_list_unlock();
 }
 
 /* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/file_table.c	2007-01-27 21:07:44.000000000 +0100
@@ -113,7 +113,7 @@ struct file *get_empty_filp(void)
 		goto fail_sec;
 
 	tsk = current;
-	INIT_LIST_HEAD(&f->f_u.fu_list);
+	INIT_LOCK_LIST_HEAD(&f->f_u.fu_llist);
 	atomic_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_uid = tsk->fsuid;
@@ -245,32 +245,175 @@ void put_filp(struct file *file)
 	}
 }
 
-void file_move(struct file *file, struct list_head *list)
+enum {
+	FILEVEC_SIZE = 15
+};
+
+struct filevec {
+	unsigned long nr;
+	struct file *files[FILEVEC_SIZE];
+};
+
+static DEFINE_PER_CPU(struct filevec, sb_fvec);
+
+static inline unsigned int filevec_size(struct filevec *fvec)
 {
-	if (!list)
-		return;
-	file_list_lock();
-	list_move(&file->f_u.fu_list, list);
-	file_list_unlock();
+	return FILEVEC_SIZE - fvec->nr;
+}
+
+static inline unsigned int filevec_count(struct filevec *fvec)
+{
+	return fvec->nr;
+}
+
+static inline void filevec_reinit(struct filevec *fvec)
+{
+	fvec->nr = 0;
+}
+
+static inline unsigned int filevec_add(struct filevec *fvec, struct file *filp)
+{
+	rcu_assign_pointer(fvec->files[fvec->nr], filp);
+
+	/*
+	 * Here we do icky stuff in order to avoid flushing the per cpu filevec
+	 * on list removal.
+	 *
+	 * We store the location on the per cpu filevec in the as of yet unused
+	 * fu_llist.next field and toggle bit 0 to indicate we done so. This
+	 * allows the removal code to set the filevec entry to NULL, thereby
+	 * avoiding the list add.
+	 *
+	 * Abuse the fu_llist.lock for protection.
+	 */
+	spin_lock(&filp->f_u.fu_llist.lock);
+	filp->f_u.fu_llist.next = (void *)&fvec->files[fvec->nr];
+	__set_bit(0, (void *)&filp->f_u.fu_llist.next);
+	spin_unlock(&filp->f_u.fu_llist.lock);
+
+	fvec->nr++;
+	return filevec_size(fvec);
+}
+
+static void __filevec_add(struct filevec *fvec)
+{
+	int i;
+
+	for (i = 0; i < filevec_count(fvec); i++) {
+		struct file *filp;
+
+		/*
+		 * see the comment in filevec_add();
+		 * need RCU because a concurrent remove might have deleted
+		 * the entry from under us.
+		 */
+		rcu_read_lock();
+		filp = rcu_dereference(fvec->files[i]);
+		/*
+		 * the simple case, its gone - NEXT!
+		 */
+		if (!filp) {
+			rcu_read_unlock();
+			continue;
+		}
+
+		spin_lock(&filp->f_u.fu_llist.lock);
+		/*
+		 * If the entry really is still there, add it!
+		 */
+		if (rcu_dereference(fvec->files[i])) {
+			struct super_block *sb =
+				filp->f_mapping->host->i_sb;
+
+			__lock_list_add(&filp->f_u.fu_llist, &sb->s_files);
+		}
+		spin_unlock(&filp->f_u.fu_llist.lock);
+		rcu_read_unlock();
+	}
+	filevec_reinit(fvec);
+}
+
+static void filevec_add_drain(void)
+{
+	struct filevec *fvec = &get_cpu_var(sb_fvec);
+	if (filevec_count(fvec))
+		__filevec_add(fvec);
+	put_cpu_var(sb_fvec);
 }
 
+static void filevec_add_drain_per_cpu(struct work_struct *dummy)
+{
+	filevec_add_drain();
+}
+
+int filevec_add_drain_all(void)
+{
+	return schedule_on_each_cpu(filevec_add_drain_per_cpu);
+}
+EXPORT_SYMBOL_GPL(filevec_add_drain_all);
+
 void file_kill(struct file *file)
 {
-	if (!list_empty(&file->f_u.fu_list)) {
+	if (file_flag(file, F_SUPERBLOCK)) {
+		void **ptr;
+
+		file_flag_clear(file, F_SUPERBLOCK);
+
+		/*
+		 * If bit 0 of the fu_llist.next pointer is set we're still
+		 * enqueued on a per cpu filevec, in that case clear the entry
+		 * and we're done.
+		 */
+		spin_lock(&file->f_u.fu_llist.lock);
+		ptr = (void **)file->f_u.fu_llist.next;
+		if (__test_and_clear_bit(0, (void *)&ptr)) {
+			rcu_assign_pointer(*ptr, NULL);
+			INIT_LIST_HEAD(&file->f_u.fu_llist.head);
+			spin_unlock(&file->f_u.fu_llist.lock);
+			return;
+		}
+		spin_unlock(&file->f_u.fu_llist.lock);
+
+		if (!list_empty(&file->f_u.fu_list))
+			lock_list_del_init(&file->f_u.fu_llist);
+
+	} else if (!list_empty(&file->f_u.fu_list)) {
 		file_list_lock();
 		list_del_init(&file->f_u.fu_list);
 		file_list_unlock();
 	}
 }
 
+void file_move(struct file *file, struct list_head *list)
+{
+	struct super_block *sb;
+
+	if (!list)
+		return;
+
+	file_kill(file);
+
+	sb = file->f_mapping->host->i_sb;
+	if (list == &sb->s_files.head) {
+		struct filevec *fvec = &get_cpu_var(sb_fvec);
+		file_flag_set(file, F_SUPERBLOCK);
+		if (!filevec_add(fvec, file))
+			__filevec_add(fvec);
+		put_cpu_var(sb_fvec);
+	} else {
+		file_list_lock();
+		list_add(&file->f_u.fu_list, list);
+		file_list_unlock();
+	}
+}
+
 int fs_may_remount_ro(struct super_block *sb)
 {
-	struct list_head *p;
+	struct file *file;
 
 	/* Check that no files are currently opened for writing. */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file *file = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
 		/* File with pending delete? */
@@ -281,10 +424,9 @@ int fs_may_remount_ro(struct super_block
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
 	}
-	file_list_unlock();
 	return 1; /* Tis' cool bro. */
 too_bad:
-	file_list_unlock();
+	lock_list_for_each_entry_stop(file, f_u.fu_llist);
 	return 0;
 }
 
Index: linux-2.6/fs/open.c
===================================================================
--- linux-2.6.orig/fs/open.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/open.c	2007-01-27 21:07:44.000000000 +0100
@@ -692,7 +692,7 @@ static struct file *__dentry_open(struct
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_move(f, &inode->i_sb->s_files.head);
 
 	if (!open && f->f_op)
 		open = f->f_op->open;
Index: linux-2.6/fs/proc/generic.c
===================================================================
--- linux-2.6.orig/fs/proc/generic.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/proc/generic.c	2007-01-27 21:07:44.000000000 +0100
@@ -549,15 +549,14 @@ static int proc_register(struct proc_dir
  */
 static void proc_kill_inodes(struct proc_dir_entry *de)
 {
-	struct list_head *p;
+	struct file *filp;
 	struct super_block *sb = proc_mnt->mnt_sb;
 
 	/*
 	 * Actually it's a partial revoke().
 	 */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct dentry * dentry = filp->f_path.dentry;
 		struct inode * inode;
 		const struct file_operations *fops;
@@ -571,7 +570,6 @@ static void proc_kill_inodes(struct proc
 		filp->f_op = NULL;
 		fops_put(fops);
 	}
-	file_list_unlock();
 }
 
 static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c	2007-01-13 21:04:07.000000000 +0100
+++ linux-2.6/fs/super.c	2007-01-27 21:07:44.000000000 +0100
@@ -67,7 +67,7 @@ static struct super_block *alloc_super(s
 		}
 		INIT_LIST_HEAD(&s->s_dirty);
 		INIT_LIST_HEAD(&s->s_io);
-		INIT_LIST_HEAD(&s->s_files);
+		INIT_LOCK_LIST_HEAD(&s->s_files);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -568,12 +568,11 @@ static void mark_files_ro(struct super_b
 {
 	struct file *f;
 
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+	filevec_add_drain_all();
+	lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) {
 		if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
 			f->f_mode &= ~FMODE_WRITE;
 	}
-	file_list_unlock();
 }
 
 /**
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2007-01-22 22:43:28.000000000 +0100
+++ linux-2.6/include/linux/fs.h	2007-01-27 21:07:44.000000000 +0100
@@ -275,6 +275,7 @@ extern int dir_notify_enable;
 #include <linux/cache.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
+#include <linux/lock_list.h>
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
@@ -709,9 +710,13 @@ struct file {
 	/*
 	 * fu_list becomes invalid after file_free is called and queued via
 	 * fu_rcuhead for RCU freeing
+	 * fu_llist is used for the superblock s_files list; its crucial that
+	 * the spinlock contained therein is not clobbered by other uses of
+	 * the union.
 	 */
 	union {
 		struct list_head	fu_list;
+		struct lock_list_head	fu_llist;
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
 	struct path		f_path;
@@ -744,9 +749,25 @@ extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
 #define file_list_unlock() spin_unlock(&files_lock);
 
+/*
+ * steal the upper 8 bits from the read-a-head flags
+ */
+#define F_SHIFT		24
+
+#define F_SUPERBLOCK	0
+
+#define file_flag_set(file, flag)	\
+	__set_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag_clear(file, flag)	\
+	__clear_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+#define file_flag(file, flag)		\
+	test_bit((flag) + F_SHIFT, &(file)->f_ra.flags)
+
 #define get_file(x)	atomic_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
 
+extern int filevec_add_drain_all(void);
+
 #define	MAX_NON_LFS	((1UL<<31) - 1)
 
 /* Page cache limit. The filesystems should put that into their s_maxbytes 
@@ -928,7 +949,7 @@ struct super_block {
 	struct list_head	s_dirty;	/* dirty inodes */
 	struct list_head	s_io;		/* parked for writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
-	struct list_head	s_files;
+	struct lock_list_head	s_files;
 
 	struct block_device	*s_bdev;
 	struct list_head	s_instances;
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c	2007-01-22 22:43:29.000000000 +0100
+++ linux-2.6/mm/readahead.c	2007-01-27 21:07:44.000000000 +0100
@@ -69,7 +69,7 @@ static inline void reset_ahead_window(st
 static inline void ra_off(struct file_ra_state *ra)
 {
 	ra->start = 0;
-	ra->flags = 0;
+	ra->flags &= (~0UL) << F_SHIFT;
 	ra->size = 0;
 	reset_ahead_window(ra);
 	return;
Index: linux-2.6/security/selinux/selinuxfs.c
===================================================================
--- linux-2.6.orig/security/selinux/selinuxfs.c	2007-01-13 21:04:19.000000000 +0100
+++ linux-2.6/security/selinux/selinuxfs.c	2007-01-27 21:10:48.000000000 +0100
@@ -940,7 +940,8 @@ static struct file_operations sel_commit
  * fs/proc/generic.c proc_kill_inodes */
 static void sel_remove_bools(struct dentry *de)
 {
-	struct list_head *p, *node;
+	struct list_head *node;
+	struct file *filp;
 	struct super_block *sb = de->d_sb;
 
 	spin_lock(&dcache_lock);
@@ -962,9 +963,8 @@ static void sel_remove_bools(struct dent
 
 	spin_unlock(&dcache_lock);
 
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_u.fu_list);
+	filevec_add_drain_all();
+	lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) {
 		struct dentry * dentry = filp->f_path.dentry;
 
 		if (dentry->d_parent != de) {
@@ -972,7 +972,6 @@ static void sel_remove_bools(struct dent
 		}
 		filp->f_op = NULL;
 	}
-	file_list_unlock();
 }
 
 #define BOOL_DIR_NAME "booleans"

--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux