Re: PROBLEM: /proc (procfs) task exit race condition causes a kernel crash

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Eric,

I've attached a patch file which rolls up a set of patches to dcache.c along with a few changed I made to locking symantics. So far a 2.6.16 (+ -mm2) with the patch applied survives the harshest testing I can throw at it!

Note that I've also made some minor changes to exit.c [preempt disable/enable during task exit] and truncate.c [BUG_ON check of 'private' page]. The testing I've done is with a kernel built without preemption, and one built with voluntary preemption. A kernel built with forced preemption and with spinlock debugging enabled did NOT work very well!!! Also, the performance of a kernel built with voluntary preemption and with the cond_resched_lock() calls in dcache.c was surprisingly *BAD*, with the system going into a wheel-spin for a long period [high system cpu of 87%+] when I fired up a number of large cpu+memory hungry tasks. This might need to be looked at more closely?!

Eric W. Biederman wrote:

I have tried to reproduce this.  The circumstances weren't the most
controlled but they did overlap with what you described and I haven't seen
anything.

So I am guessing that you are having memory corruption from some source.
Either bad ram or a bad module.

I'm off on vacation for a week, so I won't be able to follow up.


Eric

diff -urpN linux-2.6.16-mm2/fs/dcache.c linux-2.6.16/fs/dcache.c
--- linux-2.6.16-mm2/fs/dcache.c	2006-05-31 16:30:13.000000000 +1000
+++ linux-2.6.16/fs/dcache.c	2006-05-31 16:25:53.000000000 +1000
@@ -36,12 +36,10 @@
 
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
-EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
 static seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
 
-EXPORT_SYMBOL(dcache_lock);
 
 static kmem_cache_t *dentry_cache __read_mostly;
 
@@ -142,21 +140,18 @@ static void dentry_iput(struct dentry * 
  * no dcache lock, please.
  */
 
-void dput(struct dentry *dentry)
+static void dput_locked(struct dentry *dentry, struct list_head *list)
 {
 	if (!dentry)
 		return;
 
-repeat:
-	if (atomic_read(&dentry->d_count) == 1)
-		might_sleep();
-	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
+	if (!atomic_dec_and_test(&dentry->d_count))
 		return;
 
+repeat:
 	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
 		return;
 	}
 
@@ -176,33 +171,59 @@ repeat:
   		dentry_stat.nr_unused++;
   	}
  	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 	return;
 
 unhash_it:
 	__d_drop(dentry);
 
 kill_it: {
-		struct dentry *parent;
-
 		/* If dentry was on d_lru list
 		 * delete it from there
 		 */
   		if (!list_empty(&dentry->d_lru)) {
-  			list_del(&dentry->d_lru);
+  			list_del_init(&dentry->d_lru);
   			dentry_stat.nr_unused--;
   		}
   		list_del(&dentry->d_u.d_child);
 		dentry_stat.nr_dentry--;	/* For d_free, below */
-		/*drops the locks, at that point nobody can reach this dentry */
-		dentry_iput(dentry);
-		parent = dentry->d_parent;
-		d_free(dentry);
-		if (dentry == parent)
+		/* at this point nobody can reach this dentry */
+		list_add(&dentry->d_lru, list);
+		spin_unlock(&dentry->d_lock);
+		if (dentry == dentry->d_parent)
 			return;
-		dentry = parent;
-		goto repeat;
+		dentry = dentry->d_parent;
+		if (atomic_dec_and_test(&dentry->d_count))
+			goto repeat;
+		/* out */
+	}
+}
+
+void dput(struct dentry *dentry)
+{
+	LIST_HEAD(free_list);
+
+	if (!dentry)
+		goto do_return;
+
+	if (atomic_add_unless(&dentry->d_count, -1, 1))
+		goto do_return;
+
+	spin_lock(&dcache_lock);			/* While we hold the dcache_lock */
+	dput_locked(dentry, &free_list);		/* Put ALL free-able dentry's onto 'free_list' */
+
+	if (!list_empty(&free_list)) {			/* Then process as a single batch! */
+		struct dentry *dentry, *p;
+		list_for_each_entry_safe(dentry, p, &free_list, d_lru) {
+			spin_lock(&dentry->d_lock);	/* Lock dentry while also holding dcache_lock! */
+			list_del(&dentry->d_lru);
+			dentry_iput(dentry);		/* Enter with locks held; Exit with no locks! */
+			d_free(dentry);
+			spin_lock(&dcache_lock);	/* Assume we will iterate again so ... */
+		}
 	}
+	spin_unlock(&dcache_lock);			/* There *MUST* be a better way of doing this?! */
+do_return:
+	return;
 }
 
 /**
@@ -219,13 +240,15 @@ kill_it: {
  
 int d_invalidate(struct dentry * dentry)
 {
+	int	ret = 0;
+
 	/*
 	 * If it's already been dropped, return OK.
 	 */
 	spin_lock(&dcache_lock);
 	if (d_unhashed(dentry)) {
 		spin_unlock(&dcache_lock);
-		return 0;
+		goto do_return;
 	}
 	/*
 	 * Check whether to do a partial shrink_dcache
@@ -252,14 +275,16 @@ int d_invalidate(struct dentry * dentry)
 		if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
-			return -EBUSY;
+			ret = -EBUSY;
+			goto do_return;
 		}
 	}
 
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
-	return 0;
+do_return:
+	return ret;
 }
 
 /* This should be called _only_ with dcache_lock held */
@@ -276,7 +301,10 @@ static inline struct dentry * __dget_loc
 
 struct dentry * dget_locked(struct dentry *dentry)
 {
-	return __dget_locked(dentry);
+	struct dentry*	ret;
+
+	ret = __dget_locked(dentry);
+	return ret;
 }
 
 /**
@@ -366,22 +394,39 @@ restart:
  */
 static inline void prune_one_dentry(struct dentry * dentry)
 {
-	struct dentry * parent;
+	LIST_HEAD(free_list);
 
 	__d_drop(dentry);
 	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
-	dentry_iput(dentry);
-	parent = dentry->d_parent;
+
+	/* dput the parent here before we release dcache_lock */
+	if (dentry != dentry->d_parent)
+		dput_locked(dentry->d_parent, &free_list);
+
+	dentry_iput(dentry);		/* drop locks */
 	d_free(dentry);
-	if (parent != dentry)
-		dput(parent);
+
+	if (!list_empty(&free_list)) {
+		struct dentry *tmp, *p;
+
+		list_for_each_entry_safe(tmp, p, &free_list, d_lru) {
+			spin_lock(&dcache_lock);	/* All of this locking/unlocking  */
+			spin_lock(&tmp->d_lock);	/* is so incredibly UGLY!!! */
+			list_del(&tmp->d_lru);
+			dentry_iput(tmp);
+			d_free(tmp);
+		}
+	}
+
 	spin_lock(&dcache_lock);
 }
 
 /**
  * prune_dcache - shrink the dcache
  * @count: number of entries to try and free
+ * @sb: if given, ignore dentries for other superblocks
+ *         which are being unmounted.
  *
  * Shrink the dcache. This is done when we need
  * more memory, or simply when we need to unmount
@@ -392,16 +437,30 @@ static inline void prune_one_dentry(stru
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count)
+static void prune_dcache(int count, struct super_block *sb)
 {
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
 		struct dentry *dentry;
 		struct list_head *tmp;
+		struct rw_semaphore *s_umount;
 
-		cond_resched_lock(&dcache_lock);
+		/*cond_resched_lock(&dcache_lock);	** ?BAD PERFORMANCE? **   */
 
 		tmp = dentry_unused.prev;
+		if (unlikely(sb)) {
+			/* Try to find a dentry for this sb, but don't try
+			 * too hard, if they aren't near the tail they will
+			 * be moved down again soon
+			 */
+			int skip = count;
+			while (skip &&
+			       tmp != &dentry_unused &&
+			       list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
+				skip--;
+				tmp = tmp->prev;
+			}
+		}
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
@@ -427,7 +486,45 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		prune_one_dentry(dentry);
+		/*
+		 * If the dentry is not DCACHED_REFERENCED, it is time
+		 * to remove it from the dcache, provided the super block is
+		 * NULL (which means we are trying to reclaim memory)
+		 * or this dentry belongs to the same super block that
+		 * we want to shrink.
+		 */
+		/*
+		 * If this dentry is for "my" filesystem, then I can prune it
+		 * without taking the s_umount lock (I already hold it).
+		 */
+		if (sb && dentry->d_sb == sb) {
+			prune_one_dentry(dentry);
+			continue;
+		}
+		/*
+		 * ...otherwise we need to be sure this filesystem isn't being
+		 * unmounted, otherwise we could race with
+		 * generic_shutdown_super(), and end up holding a reference to
+		 * an inode while the filesystem is unmounted.
+		 * So we try to get s_umount, and make sure s_root isn't NULL.
+		 * (Take a local copy of s_umount to avoid a use-after-free of
+		 * `dentry').
+		 */
+		s_umount = &dentry->d_sb->s_umount;
+		if (down_read_trylock(s_umount)) {
+			if (dentry->d_sb->s_root != NULL) {
+				prune_one_dentry(dentry);
+				up_read(s_umount);
+				continue;
+			}
+			up_read(s_umount);
+		}
+		spin_unlock(&dentry->d_lock);
+		/* Cannot remove the first dentry, and it isn't appropriate
+		 * to move it to the head of the list, so give up, and try
+		 * later
+		 */
+		break;
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -481,14 +578,14 @@ repeat:
 		if (dentry->d_sb != sb)
 			continue;
 		dentry_stat.nr_unused--;
-		list_del_init(tmp);
 		spin_lock(&dentry->d_lock);
+		list_del_init(tmp);
 		if (atomic_read(&dentry->d_count)) {
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
 		prune_one_dentry(dentry);
-		cond_resched_lock(&dcache_lock);
+		/*cond_resched_lock(&dcache_lock);	** ?BAD PERFORMANCE? **   */
 		goto repeat;
 	}
 	spin_unlock(&dcache_lock);
@@ -512,6 +609,7 @@ int have_submounts(struct dentry *parent
 {
 	struct dentry *this_parent = parent;
 	struct list_head *next;
+	int	ret = 1;
 
 	spin_lock(&dcache_lock);
 	if (d_mountpoint(parent))
@@ -539,11 +637,10 @@ resume:
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
-	spin_unlock(&dcache_lock);
-	return 0; /* No mount points found in tree */
+	ret = 0; /* No mount points found in tree */
 positive:
 	spin_unlock(&dcache_lock);
-	return 1;
+	return ret;
 }
 
 /*
@@ -630,7 +727,7 @@ void shrink_dcache_parent(struct dentry 
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found);
+		prune_dcache(found, parent->d_sb);
 }
 
 /**
@@ -643,9 +740,10 @@ void shrink_dcache_parent(struct dentry 
  * done under dcache_lock.
  *
  */
-void shrink_dcache_anon(struct hlist_head *head)
+void shrink_dcache_anon(struct super_block *sb)
 {
 	struct hlist_node *lp;
+	struct hlist_head *head = &sb->s_anon;
 	int found;
 	do {
 		found = 0;
@@ -668,7 +766,7 @@ void shrink_dcache_anon(struct hlist_hea
 			}
 		}
 		spin_unlock(&dcache_lock);
-		prune_dcache(found);
+		prune_dcache(found, sb);
 	} while(found);
 }
 
@@ -686,12 +784,16 @@ void shrink_dcache_anon(struct hlist_hea
  */
 static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 {
+	int	ret = -1;
+
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
-			return -1;
-		prune_dcache(nr);
+			goto do_return;
+		prune_dcache(nr, NULL);
 	}
-	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+	ret = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+do_return:
+	return ret;
 }
 
 /**
@@ -711,13 +813,14 @@ struct dentry *d_alloc(struct dentry * p
 
 	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); 
 	if (!dentry)
-		return NULL;
+		goto do_return;
 
 	if (name->len > DNAME_INLINE_LEN-1) {
 		dname = kmalloc(name->len + 1, GFP_KERNEL);
 		if (!dname) {
 			kmem_cache_free(dentry_cache, dentry); 
-			return NULL;
+			dentry = NULL;
+			goto do_return;
 		}
 	} else  {
 		dname = dentry->d_iname;
@@ -758,18 +861,20 @@ struct dentry *d_alloc(struct dentry * p
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	dentry_stat.nr_dentry++;
 	spin_unlock(&dcache_lock);
-
+do_return:
 	return dentry;
 }
 
 struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 {
 	struct qstr q;
+	struct dentry * ret;
 
 	q.name = name;
 	q.len = strlen(name);
 	q.hash = full_name_hash(q.name, q.len);
-	return d_alloc(parent, &q);
+	ret = d_alloc(parent, &q);
+	return ret;
 }
 
 /**
@@ -817,7 +922,7 @@ void d_instantiate(struct dentry *entry,
  */
 struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 {
-	struct dentry *alias;
+	struct dentry *alias = NULL;
 	int len = entry->d_name.len;
 	const char *name = entry->d_name.name;
 	unsigned int hash = entry->d_name.hash;
@@ -841,7 +946,7 @@ struct dentry *d_instantiate_unique(stru
 		spin_unlock(&dcache_lock);
 		BUG_ON(!d_unhashed(alias));
 		iput(inode);
-		return alias;
+		goto do_return;
 	}
 	list_add(&entry->d_alias, &inode->i_dentry);
 do_negative:
@@ -849,9 +954,10 @@ do_negative:
 	fsnotify_d_instantiate(entry, inode);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
-	return NULL;
+	alias = NULL;
+do_return:
+	return alias;
 }
-EXPORT_SYMBOL(d_instantiate_unique);
 
 /**
  * d_alloc_root - allocate root dentry
@@ -915,12 +1021,14 @@ struct dentry * d_alloc_anon(struct inod
 
 	if ((res = d_find_alias(inode))) {
 		iput(inode);
-		return res;
+		goto do_return;
 	}
 
 	tmp = d_alloc(NULL, &anonstring);
-	if (!tmp)
-		return NULL;
+	if (!tmp) {
+		res = NULL;
+		goto do_return;
+	}
 
 	tmp->d_parent = tmp; /* make sure dput doesn't croak */
 	
@@ -948,6 +1056,7 @@ struct dentry * d_alloc_anon(struct inod
 		iput(inode);
 	if (tmp)
 		dput(tmp);
+do_return:
 	return res;
 }
 
@@ -1142,6 +1251,7 @@ int d_validate(struct dentry *dentry, st
 {
 	struct hlist_head *base;
 	struct hlist_node *lhp;
+	int		   ret = 0;
 
 	/* Check whether the ptr might be valid at all.. */
 	if (!kmem_ptr_validate(dentry_cache, dentry))
@@ -1159,12 +1269,13 @@ int d_validate(struct dentry *dentry, st
 		if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
 			__dget_locked(dentry);
 			spin_unlock(&dcache_lock);
-			return 1;
+			ret = 1;
+			goto out;
 		}
 	}
 	spin_unlock(&dcache_lock);
 out:
-	return 0;
+	return ret;
 }
 
 /*
@@ -1191,6 +1302,7 @@ out:
 void d_delete(struct dentry * dentry)
 {
 	int isdir = 0;
+
 	/*
 	 * Are we the only user?
 	 */
@@ -1203,7 +1315,7 @@ void d_delete(struct dentry * dentry)
 
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
-		return;
+		goto do_return;
 	}
 
 	if (!d_unhashed(dentry))
@@ -1213,6 +1325,8 @@ void d_delete(struct dentry * dentry)
 	spin_unlock(&dcache_lock);
 
 	fsnotify_nameremove(dentry, isdir);
+do_return:
+	return;
 }
 
 static void __d_rehash(struct dentry * entry, struct hlist_head *list)
@@ -1497,13 +1611,13 @@ char * d_path(struct dentry *dentry, str
  */
 asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
 {
-	int error;
+	int error = -ENOMEM;
 	struct vfsmount *pwdmnt, *rootmnt;
 	struct dentry *pwd, *root;
 	char *page = (char *) __get_free_page(GFP_USER);
 
 	if (!page)
-		return -ENOMEM;
+		goto do_return;
 
 	read_lock(&current->fs->lock);
 	pwdmnt = mntget(current->fs->pwdmnt);
@@ -1542,6 +1656,7 @@ out:
 	dput(root);
 	mntput(rootmnt);
 	free_page((unsigned long) page);
+do_return:
 	return error;
 }
 
@@ -1729,7 +1844,6 @@ kmem_cache_t *names_cachep __read_mostly
 /* SLAB cache for file structures */
 kmem_cache_t *filp_cachep __read_mostly;
 
-EXPORT_SYMBOL(d_genocide);
 
 extern void bdev_cache_init(void);
 extern void chrdev_init(void);
@@ -1764,6 +1878,10 @@ void __init vfs_caches_init(unsigned lon
 	chrdev_init();
 }
 
+EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
+EXPORT_SYMBOL(dcache_lock);
+EXPORT_SYMBOL(d_instantiate_unique);
+EXPORT_SYMBOL(d_genocide);
 EXPORT_SYMBOL(d_alloc);
 EXPORT_SYMBOL(d_alloc_anon);
 EXPORT_SYMBOL(d_alloc_root);
diff -urpN linux-2.6.16-mm2/kernel/exit.c linux-2.6.16/kernel/exit.c
--- linux-2.6.16-mm2/kernel/exit.c	2006-05-31 16:30:14.000000000 +1000
+++ linux-2.6.16/kernel/exit.c	2006-05-30 14:49:35.000000000 +1000
@@ -136,6 +136,7 @@ void release_task(struct task_struct * p
 {
 	int zap_leader;
 	task_t *leader;
+	preempt_disable();	// ** Cleanup as fast as we can! **
 repeat:
 	atomic_dec(&p->user->processes);
 	write_lock_irq(&tasklist_lock);
@@ -173,6 +174,7 @@ repeat:
 	p = leader;
 	if (unlikely(zap_leader))
 		goto repeat;
+	preempt_enable();	// ** OK to give other tasks some cycles now **
 }
 
 /*
diff -urpN linux-2.6.16-mm2/mm/truncate.c linux-2.6.16/mm/truncate.c
--- linux-2.6.16-mm2/mm/truncate.c	2006-05-31 16:30:14.000000000 +1000
+++ linux-2.6.16/mm/truncate.c	2006-05-28 17:22:46.000000000 +1000
@@ -80,7 +80,7 @@ invalidate_complete_page(struct address_
 		return 0;
 	}
 
-	BUG_ON(PagePrivate(page));
+	BUG_ON(PagePrivate(page) && (page_private(page) != 0));
 	__remove_from_page_cache(page);
 	write_unlock_irq(&mapping->tree_lock);
 	ClearPageUptodate(page);

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux