[RFC/PATCH] Shared Page Tables [2/2]

Share page tables when possible.
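
For readers skimming the diff: the test for when a vma's page tables become
candidates for sharing is pt_shareable_vma() in mm/ptshare.c below.  As a rough
illustration (a standalone userspace sketch, not kernel code and not part of
the patch; the flag values, PMD_SIZE and the demo_vma struct are stand-ins for
the kernel definitions), the conditions amount to:

#include <stdio.h>

#define PMD_SIZE	(1UL << 21)	/* assumed 2MB, as on x86 PAE / x86_64 */

/* Illustrative flag bits only; the kernel's VM_* values differ. */
#define VM_WRITE	0x0002
#define VM_SHARED	0x0008
#define VM_NONLINEAR	0x0010
#define VM_PFNMAP	0x0020
#define VM_INSERTPAGE	0x0040
#define VM_TRANSITION	0x0080

struct demo_vma {
	unsigned long	vm_start, vm_end, vm_flags;
	int		has_file;	/* stands in for vma->vm_file != NULL */
};

/* Mirrors pt_shareable_vma(): file-backed, not nonlinear/pfn-mapped/
 * insert-page/in-transition, shared (or read-only), and spanning at least
 * one full page-table page (PMD_SIZE) worth of address space. */
static int pt_shareable(const struct demo_vma *vma)
{
	if (!vma->has_file)
		return 0;
	if (vma->vm_flags & (VM_NONLINEAR | VM_TRANSITION | VM_PFNMAP | VM_INSERTPAGE))
		return 0;
	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE)
		return 0;
	if (vma->vm_end - vma->vm_start < PMD_SIZE)
		return 0;
	return 1;
}

int main(void)
{
	struct demo_vma shm  = { 0, 64 * PMD_SIZE, VM_SHARED | VM_WRITE, 1 };
	struct demo_vma priv = { 0, 64 * PMD_SIZE, VM_WRITE, 1 };

	printf("large shared mapping:  %d\n", pt_shareable(&shm));
	printf("large private mapping: %d\n", pt_shareable(&priv));
	return 0;
}

On top of this, pt_shareable_pte() and pt_shareable_pmd() in the patch
additionally require that the vma fully cover the PMD- or PUD-sized region
in question.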

Signed-off-by: Dave McCracken <[email protected]>

----

 arch/i386/Kconfig             |   12 +
 arch/s390/Kconfig             |   22 +
 arch/x86_64/Kconfig           |   22 +
 include/asm-generic/pgtable.h |   31 ++
 include/linux/mm.h            |   12 -
 include/linux/ptshare.h       |  175 +++++++++++++++
 include/linux/rmap.h          |    2 
 include/linux/sched.h         |    2 
 mm/Makefile                   |    1 
 mm/filemap_xip.c              |    3 
 mm/fremap.c                   |    6 
 mm/hugetlb.c                  |   54 +++-
 mm/memory.c                   |   39 ++-
 mm/mmap.c                     |    3 
 mm/mprotect.c                 |   10 
 mm/mremap.c                   |    7 
 mm/ptshare.c                  |  463 ++++++++++++++++++++++++++++++++++++++++++
 mm/rmap.c                     |   21 +
 18 files changed, 846 insertions(+), 39 deletions(-)
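
How sharing is tracked: the patch reuses page->_mapcount of the page-table page
itself as a share count (see pt_is_shared(), pt_increment_share() and
pt_decrement_share() in include/linux/ptshare.h).  A small userspace model of
that lifecycle, assuming page_mapcount() is _mapcount + 1 as elsewhere in the
kernel:

#include <assert.h>
#include <stdio.h>

struct pt_page_model {
	int mapcount;			/* models atomic page->_mapcount */
};

static void pt_page_init(struct pt_page_model *p)        { p->mapcount = -1; }
static void pt_increment_share(struct pt_page_model *p)  { p->mapcount++; }
static void pt_decrement_share(struct pt_page_model *p)  { p->mapcount--; }
/* shared while page_mapcount(), i.e. _mapcount + 1, is positive */
static int  pt_is_shared(const struct pt_page_model *p)  { return p->mapcount + 1 > 0; }

int main(void)
{
	struct pt_page_model pte_page;

	pt_page_init(&pte_page);		/* freshly allocated table	*/
	assert(!pt_is_shared(&pte_page));	/* only the owning mm maps it	*/

	pt_increment_share(&pte_page);		/* a second mm attaches it	*/
	assert(pt_is_shared(&pte_page));

	pt_decrement_share(&pte_page);		/* unshare (mprotect, unmap...)	*/
	assert(!pt_is_shared(&pte_page));

	printf("share-count model ok\n");
	return 0;
}

Unshare paths (mprotect, mremap, remap_file_pages, zap/unmap) drop the count
and clear the local pmd/pud entry with the new pmd_clear_flush() /
pud_clear_flush() helpers, so each mm falls back to private page tables for
that range.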

----

--- 2.6.17-rc1-macro/./arch/i386/Kconfig	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./arch/i386/Kconfig	2006-04-10 08:46:01.000000000 -0500
@@ -512,6 +512,18 @@ config X86_PAE
 	depends on HIGHMEM64G
 	default y
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+config PTSHARE_PTE
+	bool
+	depends on PTSHARE
+	default y
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
--- 2.6.17-rc1-macro/./arch/s390/Kconfig	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./arch/s390/Kconfig	2006-04-10 08:46:01.000000000 -0500
@@ -218,6 +218,28 @@ config WARN_STACK_SIZE
 
 source "mm/Kconfig"
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+menu "Page table levels to share"
+	depends on PTSHARE
+
+config PTSHARE_PTE
+	bool "Bottom level table (PTE)"
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PMD
+	bool "Middle level table (PMD)"
+	depends on PTSHARE && 64BIT
+	default y
+
+endmenu
+
 comment "I/O subsystem configuration"
 
 config MACHCHK_WARNING
--- 2.6.17-rc1-macro/./arch/x86_64/Kconfig	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./arch/x86_64/Kconfig	2006-04-10 08:46:01.000000000 -0500
@@ -302,6 +302,28 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
+config PTSHARE_PTE
+	bool
+	depends on PTSHARE
+	default y
+
+config PTSHARE_PMD
+	bool
+	depends on PTSHARE
+	default y
+
+config PTSHARE_HUGEPAGE
+	bool
+	depends on PTSHARE && PTSHARE_PMD
+	default y
+
 config ARCH_DISCONTIGMEM_ENABLE
        bool
        depends on NUMA
--- 2.6.17-rc1-macro/./include/asm-generic/pgtable.h	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./include/asm-generic/pgtable.h	2006-04-10 08:46:01.000000000 -0500
@@ -127,6 +127,16 @@ do {									\
 })
 #endif
 
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH_ALL
+#define ptep_clear_flush_all(__vma, __address, __ptep)			\
+({									\
+	pte_t __pte;							\
+	__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);	\
+	flush_tlb_all();						\
+	__pte;								\
+})
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
 struct mm_struct;
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
@@ -173,6 +183,27 @@ static inline void ptep_set_wrprotect(st
 #endif
 
 /*
+ * Some architectures may need TLB flushes when higher levels of the page
+ * table are unshared.
+ */
+
+#ifndef __HAVE_ARCH_PMD_CLEAR_FLUSH
+#define pmd_clear_flush(__mm, __addr, __pmd)				\
+({									\
+	pmd_clear(__pmd);						\
+	flush_tlb_all();						\
+})
+#endif
+
+#ifndef __HAVE_ARCH_PUD_CLEAR_FLUSH
+#define pud_clear_flush(__mm, __addr, __pud)				\
+({									\
+	pud_clear(__pud);						\
+	flush_tlb_all();						\
+})
+#endif
+
+/*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier.  Although no
  * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
--- 2.6.17-rc1-macro/./include/linux/mm.h	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./include/linux/mm.h	2006-04-10 08:46:01.000000000 -0500
@@ -166,6 +166,8 @@ extern unsigned int kobjsize(const void 
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
 #define VM_MAPPED_COPY	0x01000000	/* T if mapped copy of data (nommu mmap) */
 #define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
+#define VM_TRANSITION	0x04000000	/* The vma is in transition (mprotect, mremap, etc) */
+#define VM_PTSHARE	0x08000000	/* This vma has shared one or more page tables */
 
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -242,7 +244,7 @@ struct page {
 						 * see PAGE_MAPPING_ANON below.
 						 */
 	    };
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 	    spinlock_t ptl;
 #endif
 	};
@@ -815,19 +817,19 @@ static inline pmd_t *pmd_alloc(struct mm
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 /*
  * We tuck a spinlock to guard each pagetable page into its struct page,
  * at page->private, with BUILD_BUG_ON to make sure that this will not
  * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
  * When freeing, reset page->mapping so free_pages_check won't complain.
  */
-#define __pte_lockptr(page)	&((page)->ptl)
+#define __pt_lockptr(page)	&((page)->ptl)
 #define pte_lock_init(_page)	do {					\
-	spin_lock_init(__pte_lockptr(_page));				\
+	spin_lock_init(__pt_lockptr(_page));				\
 } while (0)
 #define pte_lock_deinit(page)	((page)->mapping = NULL)
-#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pt_lockptr(pmd_page(*(pmd)));})
 #else
 /*
  * We use mm->page_table_lock to guard all pagetable pages of the mm.
--- 2.6.17-rc1-macro/./include/linux/rmap.h	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./include/linux/rmap.h	2006-04-10 08:46:01.000000000 -0500
@@ -98,7 +98,7 @@ void remove_from_swap(struct page *page)
  * Called from mm/filemap_xip.c to unmap empty zero page
  */
 pte_t *page_check_address(struct page *, struct mm_struct *,
-				unsigned long, spinlock_t **);
+				unsigned long, spinlock_t **, int *);
 
 /*
  * Used by swapoff to help locate where page is expected in vma.
--- 2.6.17-rc1-macro/./include/linux/sched.h	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./include/linux/sched.h	2006-04-10 08:46:01.000000000 -0500
@@ -254,7 +254,7 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 /*
  * The mm counters are not protected by its page_table_lock,
  * so must be incremented atomically.
--- 2.6.17-rc1-macro/./include/linux/ptshare.h	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.17-rc1-shpt/./include/linux/ptshare.h	2006-04-10 08:46:01.000000000 -0500
@@ -0,0 +1,175 @@
+#ifndef _LINUX_PTSHARE_H
+#define _LINUX_PTSHARE_H
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <[email protected]>
+ */
+
+#undef	PT_DEBUG
+
+#ifdef CONFIG_PTSHARE
+static inline int pt_is_shared(struct page *page)
+{
+	return (page_mapcount(page) > 0);
+}
+
+static inline void pt_increment_share(struct page *page)
+{
+	atomic_inc(&page->_mapcount);
+}
+
+static inline void pt_decrement_share(struct page *page)
+{
+	atomic_dec(&page->_mapcount);
+}
+
+static inline int pt_vmashared(struct vm_area_struct *vma) {
+	return vma->vm_flags & VM_PTSHARE;
+}
+extern void pt_unshare_range(struct vm_area_struct *vma, unsigned long address,
+			     unsigned long end);
+#else /* CONFIG_PTSHARE */
+#define pt_is_shared(page)	(0)
+#define pt_increment_share(page)
+#define pt_decrement_share(page)
+#define pt_vmashared(vma) 0
+#define pt_unshare_range(vma, address, end)
+#endif /* CONFIG_PTSHARE */
+
+#ifdef CONFIG_PTSHARE_PTE
+static inline int pt_is_shared_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	return pt_is_shared(page);
+}
+
+static inline void pt_increment_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	pt_increment_share(page);
+}
+
+static inline void pt_decrement_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	pt_decrement_share(page);
+}
+
+extern pte_t * pt_share_pte(struct vm_area_struct *vma, unsigned long address,
+			    pmd_t *pmd);
+extern int pt_check_unshare_pte(struct mm_struct *mm, unsigned long address,
+				pmd_t *pmd);
+#else /* CONFIG_PTSHARE_PTE */
+static inline int pt_is_shared_pte(pmd_t pmdval)
+{
+	return 0;
+}
+static inline int pt_check_unshare_pte(struct mm_struct *mm, unsigned long address,
+				       pmd_t *pmd)
+{
+	return 0;
+}
+#define pt_increment_pte(pmdval)
+#define pt_decrement_pte(pmdval)
+#define pt_share_pte(vma, address, pmd) pte_alloc_map(vma->vm_mm, pmd, address)
+#endif /* CONFIG_PTSHARE_PTE */
+
+#ifdef CONFIG_PTSHARE_PMD
+static inline int pt_is_shared_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	return pt_is_shared(page);
+}
+
+static inline void pt_increment_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	pt_increment_share(page);
+}
+
+static inline void pt_decrement_pmd(pud_t pudval)
+{
+	struct page *page;
+
+	page = pud_page(pudval);
+	pt_decrement_share(page);
+}
+extern pmd_t * pt_share_pmd(struct vm_area_struct *vma, unsigned long address,
+			    pud_t *pud);
+extern int pt_check_unshare_pmd(struct mm_struct *mm, unsigned long address,
+				pud_t *pud);
+#else /* CONFIG_PTSHARE_PMD */
+static inline int pt_is_shared_pmd(pud_t pudval)
+{
+	return 0;
+}
+static inline int pt_check_unshare_pmd(struct mm_struct *mm, unsigned long address,
+				       pud_t *pud)
+{
+	return 0;
+}
+#define pt_increment_pmd(pudval)
+#define pt_decrement_pmd(pudval)
+#define pt_share_pmd(vma, address, pud) pmd_alloc(vma->vm_mm, pud, address)
+#endif /* CONFIG_PTSHARE_PMD */
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+extern pte_t *pt_share_hugepage(struct mm_struct *mm, struct vm_area_struct *vma,
+			       unsigned long address);
+extern void pt_unshare_huge_range(struct vm_area_struct *vma, unsigned long address,
+				  unsigned long end);
+#else
+#define pt_share_hugepage(mm, vma, address)	huge_pte_alloc(mm, address)
+#define pt_unshare_huge_range(vma, address, end)
+#endif	/* CONFIG_PTSHARE_HUGEPAGE */
+
+/*
+ * Locking rules:
+ * Page table levels at or above the level(s) being shared are protected by
+ * mm->page_table_lock.  Each level below the share level is protected by the
+ * ptl spinlock (__pt_lockptr) in the struct page of the level above it.
+ */
+
+#ifdef CONFIG_PTSHARE_PMD
+#define pmd_lock_init(pmd)	do {					\
+	spin_lock_init(__pt_lockptr(virt_to_page(pmd)));		\
+} while (0)
+#define pmd_lock_deinit(pmd)	((virt_to_page(pmd))->mapping = NULL)
+#define pmd_lockptr(mm, pmd)	({(void)(mm); __pt_lockptr(virt_to_page(pmd));})
+#else
+#define pmd_lock_init(pmd)	do {} while (0)
+#define pmd_lock_deinit(pmd)	do {} while (0)
+#define pmd_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+#define hugepte_lockptr(mm, pte)	pmd_lockptr(mm, pte)
+#else
+#define hugepte_lockptr(mm, pte)	({(void)(pte); &(mm)->page_table_lock;})
+#endif
+#endif /* _LINUX_PTSHARE_H */
--- 2.6.17-rc1-macro/./mm/Makefile	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/Makefile	2006-04-10 08:47:12.000000000 -0500
@@ -23,4 +23,5 @@ obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_PTSHARE) += ptshare.o
 
--- 2.6.17-rc1-macro/./mm/filemap_xip.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/filemap_xip.c	2006-04-10 08:46:01.000000000 -0500
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapp
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	int shared = 0;
 	spinlock_t *ptl;
 	struct page *page;
 
@@ -184,7 +185,7 @@ __xip_unmap (struct address_space * mapp
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		page = ZERO_PAGE(address);
-		pte = page_check_address(page, mm, address, &ptl);
+		pte = page_check_address(page, mm, address, &ptl, &shared);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
--- 2.6.17-rc1-macro/./mm/fremap.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/fremap.c	2006-04-10 08:46:01.000000000 -0500
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -193,6 +194,10 @@ asmlinkage long sys_remap_file_pages(uns
 				has_write_lock = 1;
 				goto retry;
 			}
+			vma->vm_flags |= VM_TRANSITION;
+			if (pt_vmashared(vma))
+				pt_unshare_range(vma, vma->vm_start, vma->vm_end);
+
 			mapping = vma->vm_file->f_mapping;
 			spin_lock(&mapping->i_mmap_lock);
 			flush_dcache_mmap_lock(mapping);
@@ -201,6 +206,7 @@ asmlinkage long sys_remap_file_pages(uns
 			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 			flush_dcache_mmap_unlock(mapping);
 			spin_unlock(&mapping->i_mmap_lock);
+			vma->vm_flags &= ~VM_TRANSITION;
 		}
 
 		err = vma->vm_ops->populate(vma, start, size,
--- 2.6.17-rc1-macro/./mm/hugetlb.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/hugetlb.c	2006-04-10 09:05:39.000000000 -0500
@@ -11,6 +11,7 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/ptshare.h>
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
@@ -437,6 +438,7 @@ int copy_hugetlb_page_range(struct mm_st
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
+	spinlock_t *src_ptl, *dst_ptl;
 
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
@@ -447,8 +449,10 @@ int copy_hugetlb_page_range(struct mm_st
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
 			goto nomem;
-		spin_lock(&dst->page_table_lock);
-		spin_lock(&src->page_table_lock);
+		dst_ptl = hugepte_lockptr(dst, dst_pte);
+		src_ptl = hugepte_lockptr(src, src_pte);
+		spin_lock(dst_ptl);
+		spin_lock(src_ptl);
 		if (!pte_none(*src_pte)) {
 			if (cow)
 				ptep_set_wrprotect(src, addr, src_pte);
@@ -458,8 +462,8 @@ int copy_hugetlb_page_range(struct mm_st
 			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
+		spin_unlock(src_ptl);
+		spin_unlock(dst_ptl);
 	}
 	return 0;
 
@@ -475,12 +479,14 @@ void unmap_hugepage_range(struct vm_area
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
+	spinlock_t *ptl = NULL, *new_ptl;
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
-	spin_lock(&mm->page_table_lock);
+	if (pt_vmashared(vma))
+		pt_unshare_huge_range(vma, start, end);
 
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
@@ -490,6 +496,13 @@ void unmap_hugepage_range(struct vm_area
 		if (!ptep)
 			continue;
 
+		new_ptl = hugepte_lockptr(mm, ptep);
+		if (new_ptl != ptl) {
+			if (ptl)
+				spin_unlock(ptl);
+			ptl = new_ptl;
+			spin_lock(ptl);
+		}
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
@@ -499,12 +512,15 @@ void unmap_hugepage_range(struct vm_area
 		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	if (ptl)
+		spin_unlock(ptl);
+
 	flush_tlb_range(vma, start, end);
 }
 
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep, pte_t pte)
+			unsigned long address, pte_t *ptep, pte_t pte,
+		       spinlock_t *ptl)
 {
 	struct page *old_page, *new_page;
 	int avoidcopy;
@@ -527,9 +543,9 @@ static int hugetlb_cow(struct mm_struct 
 		return VM_FAULT_OOM;
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	copy_huge_page(new_page, old_page, address);
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
 	if (likely(pte_same(*ptep, pte))) {
@@ -553,6 +569,7 @@ int hugetlb_no_page(struct mm_struct *mm
 	struct page *page;
 	struct address_space *mapping;
 	pte_t new_pte;
+	spinlock_t *ptl;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -590,7 +607,8 @@ retry:
 			lock_page(page);
 	}
 
-	spin_lock(&mm->page_table_lock);
+	ptl = hugepte_lockptr(mm, ptep);
+	spin_lock(ptl);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
 		goto backout;
@@ -606,16 +624,16 @@ retry:
 
 	if (write_access && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, ptl);
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	unlock_page(page);
 out:
 	return ret;
 
 backout:
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	hugetlb_put_quota(mapping);
 	unlock_page(page);
 	put_page(page);
@@ -628,9 +646,10 @@ int hugetlb_fault(struct mm_struct *mm, 
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	spinlock_t *ptl;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
-	ptep = huge_pte_alloc(mm, address);
+	ptep = pt_share_hugepage(mm, vma, address & HPAGE_MASK);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
@@ -649,12 +668,13 @@ int hugetlb_fault(struct mm_struct *mm, 
 
 	ret = VM_FAULT_MINOR;
 
-	spin_lock(&mm->page_table_lock);
+	ptl = hugepte_lockptr(mm, ptep);
+	spin_lock(ptl);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, *ptep)))
 		if (write_access && !pte_write(entry))
-			ret = hugetlb_cow(mm, vma, address, ptep, entry);
-	spin_unlock(&mm->page_table_lock);
+			ret = hugetlb_cow(mm, vma, address, ptep, entry, ptl);
+	spin_unlock(ptl);
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
--- 2.6.17-rc1-macro/./mm/memory.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/memory.c	2006-04-10 08:46:01.000000000 -0500
@@ -48,6 +48,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ptshare.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -144,6 +145,10 @@ static inline void free_pmd_range(struct
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
+		if (pt_is_shared_pte(*pmd)) {
+			if (pt_check_unshare_pte(tlb->mm, addr, pmd))
+				continue;
+		}
 		free_pte_range(tlb, pmd);
 	} while (pmd++, addr = next, addr != end);
 
@@ -160,6 +165,7 @@ static inline void free_pmd_range(struct
 
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
+	pmd_lock_deinit(pmd);
 	pmd_free_tlb(tlb, pmd);
 }
 
@@ -177,6 +183,10 @@ static inline void free_pud_range(struct
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
+		if (pt_is_shared_pmd(*pud)) {
+			if (pt_check_unshare_pmd(tlb->mm, addr, pud))
+				continue;
+		}
 		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 	} while (pud++, addr = next, addr != end);
 
@@ -301,20 +311,21 @@ void free_pgtables(struct mmu_gather **t
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
 	struct page *new = pte_alloc_one(mm, address);
+	spinlock_t *ptl = pmd_lockptr(mm, pmd);
+
 	if (!new)
 		return -ENOMEM;
 
 	pte_lock_init(new);
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	if (pmd_present(*pmd)) {	/* Another has populated it */
 		pte_lock_deinit(new);
 		pte_free(new);
 	} else {
-		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	return 0;
 }
 
@@ -581,7 +592,7 @@ int copy_page_range(struct mm_struct *ds
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+	if (!(vma->vm_flags & (VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -613,6 +624,10 @@ static unsigned long zap_pte_range(struc
 	int file_rss = 0;
 	int anon_rss = 0;
 
+	if (pt_is_shared_pte(*pmd)) {
+		if (pt_check_unshare_pte(mm, addr, pmd))
+			return end;
+	}
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
 		pte_t ptent = *pte;
@@ -694,6 +709,10 @@ static inline unsigned long zap_pmd_rang
 	unsigned long next;
 
 	pmd = pmd_offset(pud, addr);
+	if (pt_is_shared_pmd(*pud)) {
+		if (pt_check_unshare_pmd(tlb->mm, addr, pud))
+			return end;
+	}
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd)) {
@@ -2272,10 +2291,10 @@ int __handle_mm_fault(struct mm_struct *
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
 		return VM_FAULT_OOM;
-	pmd = pmd_alloc(mm, pud, address);
+	pmd = pt_share_pmd(vma, address, pud);
 	if (!pmd)
 		return VM_FAULT_OOM;
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pt_share_pte(vma, address, pmd);
 	if (!pte)
 		return VM_FAULT_OOM;
 
@@ -2326,13 +2345,17 @@ int __pmd_alloc(struct mm_struct *mm, pu
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud))		/* Another has populated it */
 		pmd_free(new);
-	else
+	else {
+		pmd_lock_init(new);
 		pud_populate(mm, pud, new);
+	}
 #else
 	if (pgd_present(*pud))		/* Another has populated it */
 		pmd_free(new);
-	else
+	else {
+		pmd_lock_init(new);
 		pgd_populate(mm, pud, new);
+	}
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
--- 2.6.17-rc1-macro/./mm/mmap.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/mmap.c	2006-04-10 08:46:01.000000000 -0500
@@ -1706,6 +1706,8 @@ int split_vma(struct mm_struct * mm, str
 	/* most fields are the same, copy all, and then fixup */
 	*new = *vma;
 
+	new->vm_flags &= ~VM_TRANSITION;
+
 	if (new_below)
 		new->vm_end = addr;
 	else {
@@ -1941,7 +1943,6 @@ void exit_mmap(struct mm_struct *mm)
 	while (vma)
 		vma = remove_vma(vma);
 
-	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
--- 2.6.17-rc1-macro/./mm/mprotect.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/mprotect.c	2006-04-10 08:46:01.000000000 -0500
@@ -19,6 +19,7 @@
 #include <linux/mempolicy.h>
 #include <linux/personality.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -115,6 +116,10 @@ mprotect_fixup(struct vm_area_struct *vm
 		return 0;
 	}
 
+	vma->vm_flags |= VM_TRANSITION;
+	if (pt_vmashared(vma))
+		pt_unshare_range(vma, start, end);
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
@@ -126,8 +131,10 @@ mprotect_fixup(struct vm_area_struct *vm
 	if (newflags & VM_WRITE) {
 		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
 			charged = nrpages;
-			if (security_vm_enough_memory(charged))
+			if (security_vm_enough_memory(charged)) {
+				vma->vm_flags &= ~VM_TRANSITION;
 				return -ENOMEM;
+			}
 			newflags |= VM_ACCOUNT;
 		}
 	}
@@ -175,6 +182,7 @@ success:
 	return 0;
 
 fail:
+	vma->vm_flags &= ~VM_TRANSITION;
 	vm_unacct_memory(charged);
 	return error;
 }
--- 2.6.17-rc1-macro/./mm/mremap.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/mremap.c	2006-04-10 08:46:01.000000000 -0500
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -178,6 +179,10 @@ static unsigned long move_vma(struct vm_
 	if (!new_vma)
 		return -ENOMEM;
 
+	vma->vm_flags |= VM_TRANSITION;
+	if (pt_vmashared(vma))
+		pt_unshare_range(vma, old_addr, old_addr + old_len);
+
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
 	if (moved_len < old_len) {
 		/*
@@ -221,6 +226,8 @@ static unsigned long move_vma(struct vm_
 	}
 	mm->hiwater_vm = hiwater_vm;
 
+	vma->vm_flags &= ~VM_TRANSITION;
+
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
 	if (excess) {
 		vma->vm_flags |= VM_ACCOUNT;
--- 2.6.17-rc1-macro/./mm/rmap.c	2006-04-02 22:22:10.000000000 -0500
+++ 2.6.17-rc1-shpt/./mm/rmap.c	2006-04-10 08:46:01.000000000 -0500
@@ -53,6 +53,7 @@
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
+#include <linux/ptshare.h>
 
 #include <asm/tlbflush.h>
 
@@ -286,7 +287,8 @@ unsigned long page_address_in_vma(struct
  * On success returns with pte mapped and locked.
  */
 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-			  unsigned long address, spinlock_t **ptlp)
+			  unsigned long address, spinlock_t **ptlp,
+			  int *shared)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -306,6 +308,9 @@ pte_t *page_check_address(struct page *p
 	if (!pmd_present(*pmd))
 		return NULL;
 
+	if (pt_is_shared_pmd(*pud))
+		(*shared)++;
+
 	pte = pte_offset_map(pmd, address);
 	/* Make a quick check before getting the lock */
 	if (!pte_present(*pte)) {
@@ -313,6 +318,9 @@ pte_t *page_check_address(struct page *p
 		return NULL;
 	}
 
+	if (pt_is_shared_pte(*pmd))
+		(*shared)++;
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
@@ -333,6 +341,7 @@ static int page_referenced_one(struct pa
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *pte;
+	int shared = 0;
 	spinlock_t *ptl;
 	int referenced = 0;
 
@@ -340,7 +349,7 @@ static int page_referenced_one(struct pa
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
+	pte = page_check_address(page, mm, address, &ptl, &shared);
 	if (!pte)
 		goto out;
 
@@ -584,6 +593,7 @@ static int try_to_unmap_one(struct page 
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	int shared = 0;
 	spinlock_t *ptl;
 	int ret = SWAP_AGAIN;
 
@@ -591,7 +601,7 @@ static int try_to_unmap_one(struct page 
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
+	pte = page_check_address(page, mm, address, &ptl, &shared);
 	if (!pte)
 		goto out;
 
@@ -609,7 +619,10 @@ static int try_to_unmap_one(struct page 
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	if (shared)
+		pteval = ptep_clear_flush_all(vma, address, pte);
+	else
+		pteval = ptep_clear_flush(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
--- 2.6.17-rc1-macro/./mm/ptshare.c	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.17-rc1-shpt/./mm/ptshare.c	2006-04-10 08:46:01.000000000 -0500
@@ -0,0 +1,463 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <[email protected]>
+ */
+
+#include <linux/kernel.h>
+#include <linux/prio_tree.h>
+#include <linux/mm.h>
+#include <linux/ptshare.h>
+
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+#define VM_PGEND(vma)	(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) -1)
+
+#define	VMFLAG_COMPARE	(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)
+
+static int pt_shareable_vma(struct vm_area_struct *vma)
+{
+	/* We can't share anonymous memory */
+	if (!vma->vm_file)
+		return 0;
+
+	/* No sharing of nonlinear, pfn-mapped, insert-page, or in-transition vmas */
+	if (vma->vm_flags & (VM_NONLINEAR|VM_TRANSITION|VM_PFNMAP|VM_INSERTPAGE))
+		return 0;
+
+	/* Only share shared mappings or read-only mappings */
+	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) == VM_WRITE)
+		return 0;
+
+	/* If it's smaller than the smallest shareable unit, don't bother
+	   calling it shareable */
+	if ((vma->vm_end - vma->vm_start) < PMD_SIZE)
+		return 0;
+
+	return 1;
+}
+
+#ifdef CONFIG_PTSHARE_PTE
+static inline void pt_unshare_pte(struct mm_struct *mm, pmd_t *pmd,
+				  unsigned long address)
+{
+	struct page *page;
+	spinlock_t *ptl;
+
+	if (pmd_present(*pmd)) {
+		page = pmd_page(*pmd);
+		ptl = __pt_lockptr(page);
+		spin_lock(ptl);
+		if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Unsharing pte page at address 0x%lx\n",
+			       address);
+#endif
+			pt_decrement_share(page);
+			pmd_clear_flush(mm, address, pmd);
+		}
+		spin_unlock(ptl);
+	}
+}
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+static void pt_unshare_pmd(struct mm_struct *mm, pud_t *pud, unsigned long address,
+			   unsigned long end, int hugepage)
+{
+	if (!pud_present(*pud))
+		return;
+
+#ifdef CONFIG_PTSHARE_PMD
+	{
+		struct page *page;
+		spinlock_t *ptl;
+
+		page = pud_page(*pud);
+		ptl = __pt_lockptr(page);
+		spin_lock(ptl);
+		if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Unsharing pmd page at address 0x%lx\n",
+			       address);
+#endif
+			pt_decrement_share(page);
+			spin_unlock(ptl);
+			pud_clear_flush(mm, address, pud);
+			return;
+		}
+		spin_unlock(ptl);
+	}
+#endif
+#ifdef CONFIG_PTSHARE_PTE
+	if (hugepage)
+		return;
+
+	{
+		pmd_t *pmd;
+
+		pmd = pmd_offset(pud, address);
+		end = pud_addr_end(address, end);
+		while (address <= end) {
+			pt_unshare_pte(mm, pmd, address);
+			pmd++;
+			address += PMD_SIZE;
+		}
+	}
+#endif
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+static void pt_unshare_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long address,
+			   unsigned long end, int hugepage)
+{
+	pud_t *pud;
+
+	if (!pgd_present(*pgd))
+		return;
+
+	pud = pud_offset(pgd, address);
+	end = pgd_addr_end(address, end);
+	while (address <= end) {
+		pt_unshare_pmd(mm, pud, address, end, hugepage);
+		pud++;
+		address += PUD_SIZE;
+	}
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+static void pt_unshare_pgd(struct mm_struct *mm, unsigned long address,
+			   unsigned long end, int hugepage)
+{
+	pgd_t *pgd;
+
+	pgd = pgd_offset(mm, address);
+
+	spin_lock(&mm->page_table_lock);
+	while (address <= end) {
+#ifdef __PAGETABLE_PMD_FOLDED
+		if (!hugepage)
+			pt_unshare_pte(mm, (pmd_t *)pgd, address);
+#else
+#ifdef __PAGETABLE_PUD_FOLDED
+		pt_unshare_pmd(mm, (pud_t *)pgd, address, end, hugepage);
+#else
+		pt_unshare_pud(mm, pgd, address, end, hugepage);
+#endif
+#endif
+		pgd++;
+		address += PGDIR_SIZE;
+	}
+	spin_unlock(&mm->page_table_lock);
+}
+
+void pt_unshare_range(struct vm_area_struct *vma, unsigned long address,
+		      unsigned long end)
+{
+	struct address_space *mapping;
+
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	spin_lock(&mapping->i_mmap_lock);
+
+	pt_unshare_pgd(vma->vm_mm, address, end, 0);
+
+	spin_unlock(&mapping->i_mmap_lock);
+
+	/* If we unshare the entire vma it's safe to clear the share flag */
+	if (vma->vm_start == address &&
+	    vma->vm_end == end)
+		vma->vm_flags &= ~VM_PTSHARE;
+}
+
+static struct vm_area_struct *next_shareable_vma(struct vm_area_struct *vma,
+						 struct vm_area_struct *svma,
+						 struct prio_tree_iter *iter)
+{
+	while ((svma = vma_prio_tree_next(svma, iter))) {
+		if ((svma != vma) &&
+		    ((vma->vm_flags&VMFLAG_COMPARE) == (svma->vm_flags&VMFLAG_COMPARE)) &&
+		    (vma->vm_start == svma->vm_start) &&
+		    (vma->vm_end == svma->vm_end) &&
+		    (vma->vm_pgoff == svma->vm_pgoff))
+			break;
+	}
+	return svma;
+}
+
+#ifdef CONFIG_PTSHARE_PTE
+static int pt_shareable_pte(struct vm_area_struct *vma, unsigned long address)
+{
+	unsigned long base = address & PMD_MASK;
+	unsigned long end = base + (PMD_SIZE-1);
+
+	if (pt_shareable_vma(vma) &&
+	   (vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+
+pte_t *pt_share_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd)
+{
+	struct prio_tree_iter iter;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd, spgde;
+	pud_t *spud, spude;
+	pmd_t *spmd, spmde;
+	pte_t *pte;
+	struct page *page;
+	struct address_space *mapping;
+	spinlock_t *ptl;
+
+	pmd_clear(&spmde);
+	page = NULL;
+	if (pmd_none(*pmd) &&
+	    pt_shareable_pte(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pte page at address 0x%lx\n",
+		       address);
+#endif
+		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_pgoff, VM_PGEND(vma));
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			spgde = *spgd;
+			if (pgd_none(spgde))
+				continue;
+
+			spud = pud_offset(&spgde, address);
+			spude = *spud;
+			if (pud_none(spude))
+				continue;
+
+			spmd = pmd_offset(&spude, address);
+			spmde = *spmd;
+			if (pmd_none(spmde))
+				continue;
+
+			/* Found a shareable page */
+			page = pmd_page(spmde);
+			pt_increment_share(page);
+			break;
+		}
+		spin_unlock(&mapping->i_mmap_lock);
+		if (pmd_present(spmde)) {
+			ptl = pmd_lockptr(vma->vm_mm, pmd);
+			spin_lock(ptl);
+			if (pmd_none(*pmd)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Sharing pte page at address 0x%lx\n",
+				       address);
+#endif
+				pmd_populate(vma->vm_mm, pmd, page);
+				/* Both vmas now have shared pt */
+				vma->vm_flags |= VM_PTSHARE;
+				svma->vm_flags |= VM_PTSHARE;
+			} else {
+				/* Oops, already mapped... undo it */
+				pt_decrement_share(page);
+			}
+			spin_unlock(ptl);
+		}
+
+	}
+	pte = pte_alloc_map(vma->vm_mm, pmd, address);
+
+	return pte;
+}
+int pt_check_unshare_pte(struct mm_struct *mm, unsigned long address, pmd_t *pmd)
+{
+	struct page *page;
+	spinlock_t *ptl;
+
+	page = pmd_page(*pmd);
+	ptl = __pt_lockptr(page);
+	spin_lock(ptl);
+	/* Doublecheck now that we hold the lock */
+	if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Unsharing pte at address 0x%lx\n",
+		       address);
+#endif
+		pt_decrement_share(page);
+		spin_unlock(ptl);
+		pmd_clear_flush(mm, address, pmd);
+		return 1;
+	}
+	spin_unlock(ptl);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_PMD
+static int pt_shareable_pmd(struct vm_area_struct *vma,
+		 unsigned long address)
+{
+	unsigned long base = address & PUD_MASK;
+	unsigned long end = base + (PUD_SIZE-1);
+
+	if (pt_shareable_vma(vma) &&
+	    (vma->vm_start <= base) &&
+	    (vma->vm_end >= end))
+		return 1;
+
+	return 0;
+}
+
+pmd_t *pt_share_pmd(struct vm_area_struct *vma, unsigned long address, pud_t *pud)
+{
+	struct prio_tree_iter iter;
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd, spgde;
+	pud_t *spud, spude;
+	pmd_t *pmd;
+	struct page *page;
+	struct address_space *mapping;
+
+	pud_clear(&spude);
+	page = NULL;
+	if (pud_none(*pud) &&
+	    pt_shareable_pmd(vma, address)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Looking for shareable pmd page at address 0x%lx\n",
+		       address);
+#endif
+		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+
+		spin_lock(&mapping->i_mmap_lock);
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_pgoff, VM_PGEND(vma));
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			spgd = pgd_offset(svma->vm_mm, address);
+			spgde = *spgd;
+			if (pgd_none(spgde))
+				continue;
+
+			spud = pud_offset(spgd, address);
+			spude = *spud;
+			if (pud_none(spude))
+				continue;
+
+			/* Found a shareable page */
+			page = pud_page(spude);
+			pt_increment_share(page);
+			break;
+		}
+		spin_unlock(&mapping->i_mmap_lock);
+		if (pud_present(spude)) {
+			spin_lock(&mm->page_table_lock);
+			if (pud_none(*pud)) {
+#ifdef PT_DEBUG
+				printk(KERN_DEBUG "Sharing pmd page at address 0x%lx\n",
+				       address);
+#endif
+				pud_populate(mm, pud,
+					     (pmd_t *)pud_page_kernel(spude));
+				/* These vmas now have shared pt */
+				vma->vm_flags |= VM_PTSHARE;
+				svma->vm_flags |= VM_PTSHARE;
+			} else {
+				/* Oops, already mapped... undo it */
+				pt_decrement_share(page);
+			}
+			spin_unlock(&mm->page_table_lock);
+		}
+	}
+	pmd = pmd_alloc(mm, pud, address);
+
+	return pmd;
+}
+int pt_check_unshare_pmd(struct mm_struct *mm, unsigned long address, pud_t *pud)
+{
+	struct page *page;
+	spinlock_t *ptl;
+
+	page = pud_page(*pud);
+	ptl = __pt_lockptr(page);
+	spin_lock(ptl);
+	/* Doublecheck now that we hold the lock */
+	if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Unsharing pmd at address 0x%lx\n",
+		       address);
+#endif
+		pt_decrement_share(page);
+		spin_unlock(ptl);
+		pud_clear_flush(mm, address, pud);
+		return 1;
+	}
+	spin_unlock(ptl);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE
+
+void pt_unshare_huge_range(struct vm_area_struct *vma, unsigned long address,
+			   unsigned long end)
+{
+	struct address_space *mapping;
+
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	spin_lock(&mapping->i_mmap_lock);
+
+#ifdef CONFIG_PTSHARE_HUGEPAGE_PTE
+	pt_unshare_pgd(vma->vm_mm, address, end, 0);
+#else
+	pt_unshare_pgd(vma->vm_mm, address, end, 1);
+#endif
+
+	spin_unlock(&mapping->i_mmap_lock);
+
+	/* If we unshare the entire vma it's safe to clear the share flag */
+	if (vma->vm_start >= address &&
+	    vma->vm_end <= end)
+		vma->vm_flags &= ~VM_PTSHARE;
+}
+
+pte_t *pt_share_hugepage(struct mm_struct *mm, struct vm_area_struct *vma,
+			 unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset(mm, address);
+	pud = pud_alloc(mm, pgd, address);
+	if (!pud)
+		return NULL;
+	pmd = pt_share_pmd(vma, address, pud);
+	if (!pmd)
+		return NULL;
+#ifdef CONFIG_PTSHARE_HUGEPAGE_PTE
+	pte = pt_share_pte(vma, address, pmd);
+#else
+	pte = (pte_t *)pmd;
+#endif
+	return pte;
+}
+#endif

