[RFC][PATCH] mm: concurrent page-cache

Hi,

here are my current attempts at a concurrent page-cache. It applies on
top of Nick's latest lockless page-cache patch. It is very much a
work-in-progress. It may eat your data. I'm posting it in the hope of
getting some feedback.

I've run it on an SMP qemu and found that it currently live-locks in
find_get_pages_tag(). It seems that a concurrent
radix_tree_tag_clear()/radix_tree_delete() removed the 'last' entry for
a gang lookup, which makes find_get_pages_tag() loop forever, since it
requires progress.
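
To make that failure mode concrete, here is a hypothetical sketch of
a loop with this behaviour; pin_pages() is a made-up stand-in for the
deref plus speculative-get validation pass, and none of this is the
actual find_get_pages_tag() code:

	for (;;) {
		/* RCU lookup: may return entries that are going away */
		nr_found = radix_tree_gang_lookup_tag(&mapping->page_tree,
				(void **)pages, index, nr_pages, tag);

		/* pin_pages(): hypothetical validation stand-in */
		if (pin_pages(pages, nr_found) > 0)
			break;			/* made progress */
		/*
		 * If a racing radix_tree_tag_clear()/radix_tree_delete()
		 * took out the last tagged entry between the lookup and
		 * the pin, no retry can ever pin anything, yet the loop
		 * insists on progress: live-lock.
		 */
	}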

If this is indeed the case, it seems to me the lockless page-cache patch
might be susceptible to this same problem.

I still have to investigate fully, but I might not have time to look
into it until after Ottawa.

Signed-off-by: Peter Zijlstra <[email protected]>

---
 fs/buffer.c                |    4 
 include/linux/page-flags.h |    2 
 include/linux/pagemap.h    |   21 +--
 include/linux/radix-tree.h |    6 
 lib/radix-tree.c           |  314 +++++++++++++++++++++++++++++++++------------
 mm/filemap.c               |   41 +++--
 mm/migrate.c               |   27 ++-
 mm/page-writeback.c        |   22 +--
 mm/swap_state.c            |   17 +-
 mm/swapfile.c              |    5 
 mm/truncate.c              |    7 -
 mm/vmscan.c                |   10 -
 12 files changed, 323 insertions(+), 153 deletions(-)

Index: linux-2.6.17-crt/include/linux/radix-tree.h
===================================================================
--- linux-2.6.17-crt.orig/include/linux/radix-tree.h
+++ linux-2.6.17-crt/include/linux/radix-tree.h
@@ -25,6 +25,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/rcupdate.h>
+#include <linux/spinlock.h>
 
 /*** radix-tree API starts here ***/
 
@@ -32,12 +33,14 @@ struct radix_tree_root {
 	unsigned int		height;
 	gfp_t			gfp_mask;
 	struct radix_tree_node	*rnode;
+	spinlock_t		lock;
 };
 
 #define RADIX_TREE_INIT(mask)	{					\
 	.height = 0,							\
 	.gfp_mask = (mask),						\
 	.rnode = NULL,							\
+	.lock = SPIN_LOCK_UNLOCKED,					\
 }
 
 #define RADIX_TREE(name, mask) \
@@ -48,6 +51,7 @@ do {									\
 	(root)->height = 0;						\
 	(root)->gfp_mask = (mask);					\
 	(root)->rnode = NULL;						\
+	spin_lock_init(&(root)->lock);					\
 } while (0)
 
 #define RADIX_TREE_MAX_TAGS 2
@@ -116,7 +120,7 @@ static inline void radix_tree_replace_sl
 
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long, spinlock_t **);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
Index: linux-2.6.17-crt/lib/radix-tree.c
===================================================================
--- linux-2.6.17-crt.orig/lib/radix-tree.c
+++ linux-2.6.17-crt/lib/radix-tree.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
+#include <linux/spinlock.h>
 
 
 #ifdef __KERNEL__
@@ -52,11 +53,33 @@ struct radix_tree_node {
 	struct rcu_head	rcu_head;
 	void		*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+	spinlock_t	lock;
 };
 
+/*
+ * radix_node_lock() - return a pointer to a spinlock or NULL.
+ *
+ * This function allows for alternate node lock mappings.
+ *
+ * The trivial case of returning NULL will reduce the locking to the root lock
+ * only, i.e. mutual exclusion for modifying operations.
+ *
+ * Another possible mapping is one lock per tree level, which would give a
+ * sort of pipelined serialisation.
+ *
+ * The current mapping is one lock per node, which gives the best
+ * concurrency but takes the most memory.
+ */
+static inline spinlock_t *radix_node_lock(struct radix_tree_root *root,
+		struct radix_tree_node *node)
+{
+	return &node->lock;
+}
+
 struct radix_tree_path {
 	struct radix_tree_node *node;
 	int offset;
+	spinlock_t *lock;
 };
 
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
@@ -177,6 +200,22 @@ static inline int any_tag_set(struct rad
 	return 0;
 }
 
+static inline int any_tag_set_but(struct radix_tree_node *node,
+		unsigned int tag, int offset)
+{
+	int idx;
+	int offset_idx = offset / BITS_PER_LONG;
+	unsigned long offset_mask = ~(1UL << (offset % BITS_PER_LONG));
+	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+		unsigned long mask = ~0UL;
+		if (idx == offset_idx)
+			mask = offset_mask;
+		if (node->tags[tag][idx] & mask)
+			return 1;
+	}
+	return 0;
+}
+
 /*
  *	Return the maximum key which can be store into a
  *	radix tree with height HEIGHT.
@@ -254,14 +293,19 @@ int radix_tree_insert(struct radix_tree_
 	struct radix_tree_node *node = NULL, *slot;
 	unsigned int height, shift;
 	int offset;
-	int error;
+	int error = 0;
+	spinlock_t *hlock, *llock = &root->lock;
+	int tag;
+	unsigned long flags;
+
+	spin_lock_irqsave(llock, flags);
 
 	/* Make sure the tree is high enough.  */
 	if ((!index && !root->rnode) ||
 			index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
 		if (error)
-			return error;
+			goto out;
 	}
 
 	slot = root->rnode;
@@ -272,8 +316,10 @@ int radix_tree_insert(struct radix_tree_
 	do {
 		if (slot == NULL) {
 			/* Have to add a child node.  */
-			if (!(slot = radix_tree_node_alloc(root)))
-				return -ENOMEM;
+			if (!(slot = radix_tree_node_alloc(root))) {
+				error = -ENOMEM;
+				goto out;
+			}
 			slot->height = height;
 			if (node) {
 				rcu_assign_pointer(node->slots[offset], slot);
@@ -285,21 +331,36 @@ int radix_tree_insert(struct radix_tree_
 		/* Go a level down */
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		node = slot;
+
+		hlock = llock;
+		llock = radix_node_lock(root, node);
+		if (llock) {
+			spin_lock(llock);
+			spin_unlock(hlock);
+		}
+
 		slot = node->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	} while (height > 0);
 
-	if (slot != NULL)
-		return -EEXIST;
+	if (slot != NULL) {
+		error = -EEXIST;
+		goto out;
+	}
 
 	BUG_ON(!node);
 	node->count++;
 	rcu_assign_pointer(node->slots[offset], item);
-	BUG_ON(tag_get(node, 0, offset));
-	BUG_ON(tag_get(node, 1, offset));
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		BUG_ON(tag_get(node, tag, offset));
 
-	return 0;
+out:
+	if (!llock)
+		llock = &root->lock;
+
+	spin_unlock_irqrestore(llock, flags);
+	return error;
 }
 EXPORT_SYMBOL(radix_tree_insert);
 
@@ -314,21 +375,31 @@ EXPORT_SYMBOL(radix_tree_insert);
  *	This function cannot be called under rcu_read_lock, it must be
  *	excluded from writers, as must the returned slot.
  */
-void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index, spinlock_t **lock)
 {
 	unsigned int height, shift;
-	struct radix_tree_node **slot;
+	struct radix_tree_node **slot = NULL;
+	spinlock_t *hlock, *llock = &root->lock;
+
+	spin_lock(llock);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
-		return NULL;
+		goto out;
 
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 	slot = &root->rnode;
 
 	while (height > 0) {
 		if (*slot == NULL)
-			return NULL;
+			goto out;
+
+		hlock = llock;
+		llock = radix_node_lock(root, *slot);
+		if (llock) {
+			spin_lock(llock);
+			spin_unlock(hlock);
+		}
 
 		slot = (struct radix_tree_node **)
 			((*slot)->slots +
@@ -337,6 +408,17 @@ void **radix_tree_lookup_slot(struct rad
 		height--;
 	}
 
+out:
+	if (!llock)
+		llock = &root->lock;
+
+	if (slot && *slot) {
+		*lock = llock;
+	} else {
+		spin_unlock(llock);
+		*lock = NULL;
+		slot = NULL;
+	}
 	return (void **)slot;
 }
 EXPORT_SYMBOL(radix_tree_lookup_slot);
@@ -400,17 +482,27 @@ void *radix_tree_tag_set(struct radix_tr
 			unsigned long index, unsigned int tag)
 {
 	unsigned int height, shift;
-	struct radix_tree_node *slot;
+	struct radix_tree_node *slot = NULL;
+	unsigned long flags;
+	spinlock_t *hlock, *llock = &root->lock;
+
+	spin_lock_irqsave(llock, flags);
 
 	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		return NULL;
+	BUG_ON(index > radix_tree_maxindex(height));
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	slot = root->rnode;
 
 	while (height > 0) {
 		int offset;
 
+		hlock = llock;
+		llock = radix_node_lock(root, slot);
+		if (llock) {
+			spin_lock(llock);
+			spin_unlock(hlock);
+		}
+
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		if (!tag_get(slot, tag, offset))
 			tag_set(slot, tag, offset);
@@ -420,6 +512,9 @@ void *radix_tree_tag_set(struct radix_tr
 		height--;
 	}
 
+	if (!llock)
+		llock = &root->lock;
+	spin_unlock_irqrestore(llock, flags);
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
@@ -441,48 +536,79 @@ EXPORT_SYMBOL(radix_tree_tag_set);
 void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_node *slot;
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH];
+	struct radix_tree_path *pathp = path, *punlock = path, *piter;
+	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
-	void *ret = NULL;
+	unsigned long flags;
+	spinlock_t *lock = &root->lock;
+
+	spin_lock_irqsave(lock, flags);
+
+	pathp->node = NULL;
+	pathp->lock = lock;
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
 		goto out;
 
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
 	slot = root->rnode;
 
 	while (height > 0) {
 		int offset;
+		int parent_tag = 0;
 
 		if (slot == NULL)
 			goto out;
 
+		if (pathp->node)
+			parent_tag = tag_get(pathp->node, tag, pathp->offset);
+
+		lock = radix_node_lock(root, slot);
+		if (lock)
+			spin_lock(lock);
+
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp[1].offset = offset;
-		pathp[1].node = slot;
-		slot = slot->slots[offset];
 		pathp++;
+		pathp->offset = offset;
+		pathp->node = slot;
+		pathp->lock = lock;
+
+		/*
+		 * If the parent node does not have the tag set or the current
+		 * node has slots with the tag set other than the one we're
+		 * potentially clearing, the change can never propagate upwards
+		 * from here.
+		 */
+		if (lock && (!parent_tag || any_tag_set_but(slot, tag, offset))) {
+			for (; punlock < pathp; punlock++) {
+				BUG_ON(!punlock->lock);
+				spin_unlock(punlock->lock);
+				punlock->lock = NULL;
+			}
+		}
+
+		slot = slot->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	}
 
-	ret = slot;
-	if (ret == NULL)
+	if (slot == NULL)
 		goto out;
 
-	do {
-		if (!tag_get(pathp->node, tag, pathp->offset))
-			goto out;
-		tag_clear(pathp->node, tag, pathp->offset);
-		if (any_tag_set(pathp->node, tag))
-			goto out;
-		pathp--;
-	} while (pathp->node);
+	for (piter = pathp; piter >= punlock; piter--) {
+		BUG_ON(!punlock->lock);
+		if (piter->node)
+			tag_clear(piter->node, tag, piter->offset);
+	}
 out:
-	return ret;
+	for (; punlock <= pathp; punlock++) {
+		if (punlock->lock)
+			spin_unlock(punlock->lock);
+	}
+	local_irq_restore(flags);
+	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
 
@@ -773,6 +899,7 @@ static inline void radix_tree_shrink(str
 			root->rnode->slots[0]) {
 		struct radix_tree_node *to_free = root->rnode;
 		void *newptr;
+		int tag;
 
 		/*
 		 * We don't need rcu_assign_pointer(), since we are simply
@@ -785,11 +912,10 @@ static inline void radix_tree_shrink(str
 		root->rnode = newptr;
 		root->height--;
 		/* must only free zeroed nodes into the slab */
-		tag_clear(to_free, 0, 0);
-		tag_clear(to_free, 1, 0);
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			tag_clear(to_free, tag, 0);
 		to_free->slots[0] = NULL;
 		to_free->count = 0;
-		radix_tree_node_free(to_free);
 	}
 }
 
@@ -804,39 +930,71 @@ static inline void radix_tree_shrink(str
  */
 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH];
+	struct radix_tree_path *pathp = path, *punlock = path, *piter;
 	struct radix_tree_path *orig_pathp;
-	struct radix_tree_node *slot;
-	struct radix_tree_node *to_free;
+	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
-	void *ret = NULL;
-	char tags[RADIX_TREE_MAX_TAGS];
-	int nr_cleared_tags;
 	int tag;
-	int offset;
+	int offset = 0;
+	unsigned long flags;
+	spinlock_t *lock = &root->lock;
+
+	spin_lock_irqsave(lock, flags);
 
+	pathp->lock = lock;
+	pathp->node = NULL;
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
 		goto out;
 
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
 	slot = root->rnode;
 
 	for ( ; height > 0; height--) {
+		int parent_tags = 0;
+		int no_tags = 0;
+
 		if (slot == NULL)
 			goto out;
 
+		lock = radix_node_lock(root, slot);
+		if (lock)
+			spin_lock(lock);
+
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+			if (pathp->node)
+				parent_tags |= tag_get(pathp->node, tag, pathp->offset);
+
+			no_tags |= !any_tag_set_but(slot, tag, offset);
+		}
+
 		pathp++;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		pathp->offset = offset;
 		pathp->node = slot;
+		pathp->lock = lock;
+
+		/*
+		 * If the parent node does not have any tag set or the current
+		 * node has slots with the tags set other than the one we're
+	 * potentially clearing AND there are more children than just us,
+		 * the changes can never propagate upwards from here.
+		 */
+		if (lock && (!parent_tags || !no_tags) &&
+				slot->count > 1+(!pathp->offset)) {
+			for (; punlock < pathp; punlock++) {
+				BUG_ON(!punlock->lock);
+				spin_unlock(punlock->lock);
+				punlock->lock = NULL;
+			}
+		}
+
 		slot = slot->slots[offset];
 		shift -= RADIX_TREE_MAP_SHIFT;
 	}
 
-	ret = slot;
-	if (ret == NULL)
+	if (slot == NULL)
 		goto out;
 
 	orig_pathp = pathp;
@@ -844,54 +1002,45 @@ void *radix_tree_delete(struct radix_tre
 	/*
 	 * Clear all tags associated with the just-deleted item
 	 */
-	nr_cleared_tags = 0;
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		tags[tag] = 1;
-		if (tag_get(pathp->node, tag, pathp->offset)) {
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (!any_tag_set(pathp->node, tag)) {
-				tags[tag] = 0;
-				nr_cleared_tags++;
+		for (piter = pathp; piter >= punlock; piter--) {
+			if (piter->node) {
+				if (!tag_get(piter->node, tag, piter->offset))
+					break;
+				tag_clear(piter->node, tag, piter->offset);
+				if (any_tag_set(piter->node, tag))
+					break;
 			}
 		}
 	}
 
-	for (pathp--; nr_cleared_tags && pathp->node; pathp--) {
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (tags[tag])
-				continue;
-
-			tag_clear(pathp->node, tag, pathp->offset);
-			if (any_tag_set(pathp->node, tag)) {
-				tags[tag] = 1;
-				nr_cleared_tags--;
-			}
-		}
-	}
-
-	to_free = NULL;
-	/* Now free the nodes we do not need anymore */
-	for (pathp = orig_pathp; pathp->node; pathp--) {
-		pathp->node->slots[pathp->offset] = NULL;
-		pathp->node->count--;
-		if (to_free)
-			radix_tree_node_free(to_free);
+	/*
+	 * Now unhook the nodes we do not need any more.
+	 */
+	for (piter = pathp; piter >= punlock && piter->node; piter--) {
+		piter->node->slots[piter->offset] = NULL;
+		piter->node->count--;
 
-		if (pathp->node->count) {
-			if (pathp->node == root->rnode)
+		if (piter->node->count) {
+			if (piter->node == root->rnode)
 				radix_tree_shrink(root);
 			goto out;
 		}
-
-		/* Node with zero slots in use so free it */
-		to_free = pathp->node;
 	}
+
+	BUG_ON(punlock->node || !punlock->lock);
+
 	root->rnode = NULL;
 	root->height = 0;
-	if (to_free)
-		radix_tree_node_free(to_free);
 out:
-	return ret;
+	for (; punlock <= pathp; punlock++) {
+		if (punlock->lock)
+			spin_unlock(punlock->lock);
+		if (punlock->node && punlock->node->count == 0)
+			radix_tree_node_free(punlock->node);
+	}
+	local_irq_restore(flags);
+	return slot;
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
@@ -914,6 +1063,7 @@ static void
 radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
 {
 	memset(node, 0, sizeof(struct radix_tree_node));
+	spin_lock_init(&((struct radix_tree_node *)node)->lock);
 }
 
 static __init unsigned long __maxindex(unsigned int height)
Index: linux-2.6.17-crt/fs/buffer.c
===================================================================
--- linux-2.6.17-crt.orig/fs/buffer.c
+++ linux-2.6.17-crt/fs/buffer.c
@@ -851,7 +851,7 @@ int __set_page_dirty_buffers(struct page
 	spin_unlock(&mapping->private_lock);
 
 	if (!TestSetPageDirty(page)) {
-		spin_lock_irq(&mapping->tree_lock);
+		LockPageNoNewRefs(page);
 		if (page->mapping) {	/* Race with truncate? */
 			if (mapping_cap_account_dirty(mapping))
 				inc_page_state(nr_dirty);
@@ -859,7 +859,7 @@ int __set_page_dirty_buffers(struct page
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 		}
-		spin_unlock_irq(&mapping->tree_lock);
+		UnlockPageNoNewRefs(page);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		return 1;
 	}
Index: linux-2.6.17-crt/include/linux/page-flags.h
===================================================================
--- linux-2.6.17-crt.orig/include/linux/page-flags.h
+++ linux-2.6.17-crt/include/linux/page-flags.h
@@ -363,6 +363,8 @@ extern void __mod_page_state_offset(unsi
 #define SetPageNoNewRefs(page)	set_bit(PG_nonewrefs, &(page)->flags)
 #define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags)
 #define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags)
+#define LockPageNoNewRefs(page)	bit_spin_lock(PG_nonewrefs, &(page)->flags)
+#define UnlockPageNoNewRefs(page) bit_spin_unlock(PG_nonewrefs, &(page)->flags)
 
 struct page;	/* forward declaration */
 
Index: linux-2.6.17-crt/mm/filemap.c
===================================================================
--- linux-2.6.17-crt.orig/mm/filemap.c
+++ linux-2.6.17-crt/mm/filemap.c
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/bit_spinlock.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -119,19 +120,19 @@ void __remove_from_page_cache(struct pag
 
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
+	spin_lock_irq(&mapping->tree_lock);
 	mapping->nrpages--;
 	pagecache_acct(-1);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 void remove_from_page_cache(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-
 	BUG_ON(!PageLocked(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	LockPageNoNewRefs(page);
 	__remove_from_page_cache(page);
-	spin_unlock_irq(&mapping->tree_lock);
+	UnlockPageNoNewRefs(page);
 }
 
 static int sync_page(void *word)
@@ -412,8 +413,10 @@ static int add_to_page_cache_nolock(stru
 	error = radix_tree_insert(&mapping->page_tree, offset, page);
 
 	if (!error) {
+		spin_lock_irq(&mapping->tree_lock);
 		mapping->nrpages++;
 		pagecache_acct(1);
+		spin_unlock_irq(&mapping->tree_lock);
 	} else {
 		page->mapping = NULL;
 		__ClearPageLocked(page);
@@ -436,9 +439,9 @@ int add_to_page_cache(struct page *page,
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
-		spin_lock_irq(&mapping->tree_lock);
+		LockPageNoNewRefs(page);
 		error = add_to_page_cache_nolock(page, mapping, offset);
-		spin_unlock_irq(&mapping->tree_lock);
+		UnlockPageNoNewRefs(page);
 
 		radix_tree_preload_end();
 	}
@@ -456,9 +459,8 @@ int __add_to_page_cache(struct page *pag
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
-		SetPageNoNewRefs(page);
+		LockPageNoNewRefs(page);
 		smp_wmb();
-		spin_lock_irq(&mapping->tree_lock);
 
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
@@ -466,13 +468,15 @@ int __add_to_page_cache(struct page *pag
 			SetPageLocked(page);
 			page->mapping = mapping;
 			page->index = offset;
+
+			spin_lock_irq(&mapping->tree_lock);
 			mapping->nrpages++;
 			pagecache_acct(1);
+			spin_unlock_irq(&mapping->tree_lock);
 		}
 
-		spin_unlock_irq(&mapping->tree_lock);
 		smp_wmb();
-		ClearPageNoNewRefs(page);
+		UnlockPageNoNewRefs(page);
 		radix_tree_preload_end();
 	}
 	return error;
@@ -629,13 +633,20 @@ EXPORT_SYMBOL(find_get_page);
  */
 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
 {
-	struct page *page;
-
-	spin_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	struct page *page = NULL;
+	struct page **radix_pointer;
+	spinlock_t *lock;
+
+	local_irq_disable();
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+			&mapping->page_tree, offset, &lock);
+	if (radix_pointer)
+		page = radix_tree_deref_slot(radix_pointer);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
-	spin_unlock_irq(&mapping->tree_lock);
+	if (lock)
+		spin_unlock(lock);
+	local_irq_enable();
 	return page;
 }
 
Index: linux-2.6.17-crt/mm/migrate.c
===================================================================
--- linux-2.6.17-crt.orig/mm/migrate.c
+++ linux-2.6.17-crt/mm/migrate.c
@@ -25,6 +25,8 @@
 #include <linux/cpuset.h>
 #include <linux/swapops.h>
 
+#include "internal.h"
+
 /* The maximum number of pages to take off the LRU for migration */
 #define MIGRATE_CHUNK_SIZE 256
 
@@ -181,6 +183,7 @@ int migrate_page_remove_references(struc
 {
 	struct address_space *mapping = page_mapping(page);
 	struct page **radix_pointer;
+	spinlock_t *lock;
 
 	/*
 	 * Avoid doing any of the following work if the page count
@@ -196,7 +199,7 @@ int migrate_page_remove_references(struc
 	 *
 	 * In order to reestablish file backed mappings the fault handlers
 	 * will take the radix tree_lock which may then be used to stop
-  	 * processses from accessing this page until the new page is ready.
+  	 * processes from accessing this page until the new page is ready.
 	 *
 	 * A process accessing via a swap pte (an anonymous page) will take a
 	 * page_lock on the old page which will block the process until the
@@ -219,18 +222,20 @@ int migrate_page_remove_references(struc
 	if (page_mapcount(page))
 		return -EAGAIN;
 
-	SetPageNoNewRefs(page);
+	LockPageNoNewRefs(page);
 	smp_wmb();
-	spin_lock_irq(&mapping->tree_lock);
 
+	local_irq_disable();
 	radix_pointer = (struct page **)radix_tree_lookup_slot(
 						&mapping->page_tree,
-						page_index(page));
+						page_index(page), &lock);
 
 	if (!page_mapping(page) || page_count(page) != nr_refs ||
 			radix_tree_deref_slot(radix_pointer) != page) {
-		spin_unlock_irq(&mapping->tree_lock);
-		ClearPageNoNewRefs(page);
+		if (lock)
+			spin_unlock(lock);
+		local_irq_enable();
+		UnlockPageNoNewRefs(page);
 		return -EAGAIN;
 	}
 
@@ -250,15 +255,15 @@ int migrate_page_remove_references(struc
 		set_page_private(newpage, page_private(page));
 	}
 
-	SetPageNoNewRefs(newpage);
+	LockPageNoNewRefs(newpage);
 	radix_tree_replace_slot(radix_pointer, newpage);
 	page->mapping = NULL;
 
-	spin_unlock_irq(&mapping->tree_lock);
-	__put_page(page);
+	spin_unlock_irq(lock);
 	smp_wmb();
-	ClearPageNoNewRefs(page);
-	ClearPageNoNewRefs(newpage);
+	UnlockPageNoNewRefs(page);
+	UnlockPageNoNewRefs(newpage);
+	__put_page(page);
 
 	return 0;
 }
Index: linux-2.6.17-crt/mm/page-writeback.c
===================================================================
--- linux-2.6.17-crt.orig/mm/page-writeback.c
+++ linux-2.6.17-crt/mm/page-writeback.c
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/bit_spinlock.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -632,7 +633,7 @@ int __set_page_dirty_nobuffers(struct pa
 		struct address_space *mapping2;
 
 		if (mapping) {
-			spin_lock_irq(&mapping->tree_lock);
+			LockPageNoNewRefs(page);
 			mapping2 = page_mapping(page);
 			if (mapping2) { /* Race with truncate? */
 				BUG_ON(mapping2 != mapping);
@@ -641,7 +642,7 @@ int __set_page_dirty_nobuffers(struct pa
 				radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
 			}
-			spin_unlock_irq(&mapping->tree_lock);
+			UnlockPageNoNewRefs(page);
 			if (mapping->host) {
 				/* !PageAnon && !swapper_space */
 				__mark_inode_dirty(mapping->host,
@@ -716,20 +717,19 @@ EXPORT_SYMBOL(set_page_dirty_lock);
 int test_clear_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	unsigned long flags;
 
 	if (mapping) {
-		spin_lock_irqsave(&mapping->tree_lock, flags);
+		LockPageNoNewRefs(page);
 		if (TestClearPageDirty(page)) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-			spin_unlock_irqrestore(&mapping->tree_lock, flags);
+			UnlockPageNoNewRefs(page);
 			if (mapping_cap_account_dirty(mapping))
 				dec_page_state(nr_dirty);
 			return 1;
 		}
-		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		UnlockPageNoNewRefs(page);
 		return 0;
 	}
 	return TestClearPageDirty(page);
@@ -771,16 +771,15 @@ int test_clear_page_writeback(struct pag
 	struct address_space *mapping = page_mapping(page);
 
 	if (mapping) {
-		unsigned long flags;
 		int ret;
 
-		spin_lock_irqsave(&mapping->tree_lock, flags);
+		LockPageNoNewRefs(page);
 		ret = TestClearPageWriteback(page);
 		if (ret)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		UnlockPageNoNewRefs(page);
 		return ret;
 	}
 	return TestClearPageWriteback(page);
@@ -791,10 +790,9 @@ int test_set_page_writeback(struct page 
 	struct address_space *mapping = page_mapping(page);
 
 	if (mapping) {
-		unsigned long flags;
 		int ret;
 
-		spin_lock_irqsave(&mapping->tree_lock, flags);
+		LockPageNoNewRefs(page);
 		ret = TestSetPageWriteback(page);
 		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree,
@@ -804,7 +802,7 @@ int test_set_page_writeback(struct page 
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		UnlockPageNoNewRefs(page);
 		return ret;
 	}
 	return TestSetPageWriteback(page);
Index: linux-2.6.17-crt/mm/swap_state.c
===================================================================
--- linux-2.6.17-crt.orig/mm/swap_state.c
+++ linux-2.6.17-crt/mm/swap_state.c
@@ -16,6 +16,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/bit_spinlock.h>
 
 #include <asm/pgtable.h>
 
@@ -78,9 +79,8 @@ static int __add_to_swap_cache(struct pa
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
-		SetPageNoNewRefs(page);
+		LockPageNoNewRefs(page);
 		smp_wmb();
-		spin_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
 		if (!error) {
@@ -88,12 +88,14 @@ static int __add_to_swap_cache(struct pa
 			SetPageLocked(page);
 			SetPageSwapCache(page);
 			set_page_private(page, entry.val);
+
+			spin_lock_irq(&swapper_space.tree_lock);
 			total_swapcache_pages++;
 			pagecache_acct(1);
+			spin_unlock_irq(&swapper_space.tree_lock);
 		}
-		spin_unlock_irq(&swapper_space.tree_lock);
 		smp_wmb();
-		ClearPageNoNewRefs(page);
+		UnlockPageNoNewRefs(page);
 		radix_tree_preload_end();
 	}
 	return error;
@@ -135,9 +137,12 @@ void __delete_from_swap_cache(struct pag
 	radix_tree_delete(&swapper_space.page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
+
+	spin_lock_irq(&swapper_space.tree_lock);
 	total_swapcache_pages--;
 	pagecache_acct(-1);
 	INC_CACHE_INFO(del_total);
+	spin_unlock_irq(&swapper_space.tree_lock);
 }
 
 /**
@@ -204,9 +209,9 @@ void delete_from_swap_cache(struct page 
 
 	entry.val = page_private(page);
 
-	spin_lock_irq(&swapper_space.tree_lock);
+	LockPageNoNewRefs(page);
 	__delete_from_swap_cache(page);
-	spin_unlock_irq(&swapper_space.tree_lock);
+	UnlockPageNoNewRefs(page);
 
 	swap_free(entry);
 	page_cache_release(page);
Index: linux-2.6.17-crt/mm/swapfile.c
===================================================================
--- linux-2.6.17-crt.orig/mm/swapfile.c
+++ linux-2.6.17-crt/mm/swapfile.c
@@ -28,6 +28,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/bit_spinlock.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct pa
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
-		spin_lock_irq(&swapper_space.tree_lock);
+		LockPageNoNewRefs(page);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock_irq(&swapper_space.tree_lock);
+		UnlockPageNoNewRefs(page);
 	}
 	spin_unlock(&swap_lock);
 
Index: linux-2.6.17-crt/mm/truncate.c
===================================================================
--- linux-2.6.17-crt.orig/mm/truncate.c
+++ linux-2.6.17-crt/mm/truncate.c
@@ -14,6 +14,7 @@
 #include <linux/pagevec.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include <linux/bit_spinlock.h>
 
 
 static inline void truncate_partial_page(struct page *page, unsigned partial)
@@ -67,15 +68,15 @@ invalidate_complete_page(struct address_
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	spin_lock_irq(&mapping->tree_lock);
+	LockPageNoNewRefs(page);
 	if (PageDirty(page)) {
-		spin_unlock_irq(&mapping->tree_lock);
+		UnlockPageNoNewRefs(page);
 		return 0;
 	}
 
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
-	spin_unlock_irq(&mapping->tree_lock);
+	UnlockPageNoNewRefs(page);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
Index: linux-2.6.17-crt/mm/vmscan.c
===================================================================
--- linux-2.6.17-crt.orig/mm/vmscan.c
+++ linux-2.6.17-crt/mm/vmscan.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/bit_spinlock.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -365,9 +366,8 @@ int remove_mapping(struct address_space 
 	if (!mapping)
 		return 0;		/* truncate got there first */
 
-	SetPageNoNewRefs(page);
+	LockPageNoNewRefs(page);
 	smp_wmb();
-	spin_lock_irq(&mapping->tree_lock);
 
 	/*
 	 * The non-racy check for busy page.  It is critical to check
@@ -383,13 +383,12 @@ int remove_mapping(struct address_space 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
 		swap_free(swap);
 		goto free_it;
 	}
 
 	__remove_from_page_cache(page);
-	spin_unlock_irq(&mapping->tree_lock);
+	UnlockPageNoNewRefs(page);
 
 free_it:
 	smp_wmb();
@@ -398,8 +397,7 @@ free_it:
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
-	ClearPageNoNewRefs(page);
+	UnlockPageNoNewRefs(page);
 	return 0;
 }
 
Index: linux-2.6.17-crt/include/linux/pagemap.h
===================================================================
--- linux-2.6.17-crt.orig/include/linux/pagemap.h
+++ linux-2.6.17-crt/include/linux/pagemap.h
@@ -13,6 +13,7 @@
 #include <linux/gfp.h>
 #include <linux/page-flags.h>
 #include <linux/hardirq.h> /* for in_interrupt() */
+#include <linux/bit_spinlock.h>
 
 /*
  * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
@@ -74,23 +75,17 @@ static inline struct page *page_cache_ge
 	atomic_inc(&page->_count);
 
 #else
-	if (unlikely(!get_page_unless_zero(page)))
-		return NULL; /* page has been freed */
-
-	/*
-	 * Note that get_page_unless_zero provides a memory barrier.
-	 * This is needed to ensure PageNoNewRefs is evaluated after the
-	 * page refcount has been raised. See below comment.
-	 */
-
 	/*
 	 * PageNoNewRefs is set in order to prevent new references to the
 	 * page (eg. before it gets removed from pagecache). Wait until it
-	 * becomes clear (and checks below will ensure we still have the
-	 * correct one).
+	 * becomes clear.
 	 */
-	while (unlikely(PageNoNewRefs(page)))
-		cpu_relax();
+	LockPageNoNewRefs(page);
+	if (unlikely(!get_page_unless_zero(page))) {
+		UnlockPageNoNewRefs(page);
+		return NULL; /* page has been freed */
+	}
+	UnlockPageNoNewRefs(page);
 
 	/*
 	 * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs())

