[PATCH 2/2] Simple shared page tables

This is the actual shared page table implementation.  It adds a CONFIG_PTSHARE option for i386, x86_64 and s390, the core sharing code in mm/ptshare.c and include/linux/ptshare.h, hooks in the fault, unmap, mmap, mprotect, mremap and remap_file_pages paths, and new SharedPTE (/proc/meminfo), nr_sharedpte (/proc/vmstat) and VmSHPT (/proc/<pid>/status) counters.  A short usage sketch follows, and brief notes on the share-count convention and on reading the new counters appear after the relevant files below.
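
As a rough illustration of the kind of workload this is aimed at, here is a minimal userspace sketch (not part of the patch; the file path and sizes are arbitrary): two processes fault in the same large MAP_SHARED file mapping, and with CONFIG_PTSHARE=y the second mm should be able to attach to pte pages the first mm already populated, for every PMD-sized piece that the vma fully spans.

/* shpt-demo.c -- illustrative only, not part of this patch. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/wait.h>

#define MAP_LEN	(64UL << 20)		/* 64MB of shared file mapping */

int main(void)
{
	int fd = open("/tmp/shpt-demo-file", O_RDWR | O_CREAT, 0600);
	char *p;
	unsigned long off;

	if (fd < 0 || ftruncate(fd, MAP_LEN) < 0) {
		perror("setup");
		return 1;
	}
	p = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (fork() == 0) {
		/* Child: fault in every page of the shared mapping first. */
		for (off = 0; off < MAP_LEN; off += 4096)
			p[off] = 1;
		sleep(60);
		_exit(0);
	}
	/* Parent: fault in the same pages.  The pte pages covering the
	 * PMD-sized pieces fully spanned by this vma are the candidates
	 * for sharing with the child's mm. */
	for (off = 0; off < MAP_LEN; off += 4096)
		p[off] = 1;
	sleep(60);	/* inspect SharedPTE and VmSHPT while both sleep */
	wait(NULL);
	return 0;
}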

------------------------

Diffstat:

 arch/i386/Kconfig             |    7 
 arch/s390/Kconfig             |    7 
 arch/x86_64/Kconfig           |    7 
 fs/proc/proc_misc.c           |    6 
 fs/proc/task_mmu.c            |    7 
 include/asm-generic/pgtable.h |   31 +++
 include/linux/mm.h            |   10 -
 include/linux/mmzone.h        |    3 
 include/linux/ptshare.h       |   69 ++++++++
 include/linux/rmap.h          |    2 
 include/linux/sched.h         |    5 
 mm/Makefile                   |    1 
 mm/filemap_xip.c              |    3 
 mm/fremap.c                   |    3 
 mm/memory.c                   |    8 -
 mm/mmap.c                     |    7 
 mm/mprotect.c                 |    8 -
 mm/mremap.c                   |    3 
 mm/ptshare.c                  |  332 ++++++++++++++++++++++++++++++++++++++++++
 mm/rmap.c                     |   18 +-
 mm/vmstat.c                   |    3 
 21 files changed, 524 insertions(+), 16 deletions(-)

------------------------

--- 2.6.18-rc4-macro/./arch/i386/Kconfig	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./arch/i386/Kconfig	2006-08-07 19:28:56.000000000 -0500
@@ -539,6 +539,13 @@ config X86_PAE
 	default y
 	select RESOURCES_64BIT
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
--- 2.6.18-rc4-macro/./arch/s390/Kconfig	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./arch/s390/Kconfig	2006-08-07 19:28:56.000000000 -0500
@@ -219,6 +219,13 @@ config WARN_STACK_SIZE
 
 source "mm/Kconfig"
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
 comment "I/O subsystem configuration"
 
 config MACHCHK_WARNING
--- 2.6.18-rc4-macro/./arch/x86_64/Kconfig	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./arch/x86_64/Kconfig	2006-08-07 19:28:56.000000000 -0500
@@ -321,6 +321,13 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
+config PTSHARE
+	bool "Share page tables"
+	default y
+	help
+	  Turn on sharing of page tables between processes for large shared
+	  memory regions.
+
 config ARCH_DISCONTIGMEM_ENABLE
        bool
        depends on NUMA
--- 2.6.18-rc4-macro/./fs/proc/proc_misc.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./fs/proc/proc_misc.c	2006-08-15 10:15:09.000000000 -0500
@@ -169,6 +169,9 @@ static int meminfo_read_proc(char *page,
 		"Mapped:       %8lu kB\n"
 		"Slab:         %8lu kB\n"
 		"PageTables:   %8lu kB\n"
+#ifdef CONFIG_PTSHARE
+		"SharedPTE:    %8lu kB\n"
+#endif
 		"NFS Unstable: %8lu kB\n"
 		"Bounce:       %8lu kB\n"
 		"CommitLimit:  %8lu kB\n"
@@ -195,6 +198,9 @@ static int meminfo_read_proc(char *page,
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SLAB)),
 		K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_PTSHARE
+		K(global_page_state(NR_SHAREDPTE)),
+#endif
 		K(global_page_state(NR_UNSTABLE_NFS)),
 		K(global_page_state(NR_BOUNCE)),
 		K(allowed),
--- 2.6.18-rc4-macro/./fs/proc/task_mmu.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./fs/proc/task_mmu.c	2006-08-15 15:41:06.000000000 -0500
@@ -43,6 +43,9 @@ char *task_mem(struct mm_struct *mm, cha
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
+#ifdef CONFIG_PTSHARE
+		"VmSHPT:\t%8lu kB\n"
+#endif
 		"VmPTE:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
@@ -51,7 +54,11 @@ char *task_mem(struct mm_struct *mm, cha
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
+#ifdef CONFIG_PTSHARE
+		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_shpte) >> 10,
+#endif
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+
 	return buffer;
 }
 
--- 2.6.18-rc4-macro/./include/asm-generic/pgtable.h	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./include/asm-generic/pgtable.h	2006-08-07 19:28:56.000000000 -0500
@@ -127,6 +127,16 @@ do {									\
 })
 #endif
 
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH_ALL
+#define ptep_clear_flush_all(__vma, __address, __ptep)			\
+({									\
+	pte_t __pte;							\
+	__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);	\
+	flush_tlb_all();				\
+	__pte;								\
+})
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
 struct mm_struct;
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
@@ -164,6 +174,27 @@ static inline void ptep_set_wrprotect(st
 #endif
 
 /*
+ * Some architectures might need flushes when higher levels of page table
+ * are unshared.
+ */
+
+#ifndef __HAVE_ARCH_PMD_CLEAR_FLUSH
+#define pmd_clear_flush(__mm, __addr, __pmd)				\
+({									\
+	pmd_clear(__pmd);						\
+	flush_tlb_all();						\
+})
+#endif
+
+#ifndef __HAVE_ARCH_PUD_CLEAR_FLUSH
+#define pud_clear_flush(__mm, __addr, __pud)				\
+({									\
+	pud_clear(__pud);						\
+	flush_tlb_all();						\
+})
+#endif
+
+/*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier.  Although no
  * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
--- 2.6.18-rc4-macro/./include/linux/mm.h	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./include/linux/mm.h	2006-08-07 19:28:56.000000000 -0500
@@ -245,7 +245,7 @@ struct page {
 						 * see PAGE_MAPPING_ANON below.
 						 */
 	    };
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 	    spinlock_t ptl;
 #endif
 	};
@@ -826,19 +826,19 @@ static inline pmd_t *pmd_alloc(struct mm
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 /*
  * We tuck a spinlock to guard each pagetable page into its struct page,
  * at page->private, with BUILD_BUG_ON to make sure that this will not
  * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
  * When freeing, reset page->mapping so free_pages_check won't complain.
  */
-#define __pte_lockptr(page)	&((page)->ptl)
+#define __pt_lockptr(page)	&((page)->ptl)
 #define pte_lock_init(_page)	do {					\
-	spin_lock_init(__pte_lockptr(_page));				\
+	spin_lock_init(__pt_lockptr(_page));				\
 } while (0)
 #define pte_lock_deinit(page)	((page)->mapping = NULL)
-#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pt_lockptr(pmd_page(*(pmd)));})
 #else
 /*
  * We use mm->page_table_lock to guard all pagetable pages of the mm.
--- 2.6.18-rc4-macro/./include/linux/mmzone.h	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./include/linux/mmzone.h	2006-08-15 10:14:04.000000000 -0500
@@ -65,6 +65,9 @@ enum zone_stat_item {
 	NUMA_LOCAL,		/* allocation from local node */
 	NUMA_OTHER,		/* allocation from other node */
 #endif
+#ifdef CONFIG_PTSHARE
+	NR_SHAREDPTE,		/* Number of shared page table pages */
+#endif
 	NR_VM_ZONE_STAT_ITEMS };
 
 struct per_cpu_pages {
--- 2.6.18-rc4-macro/./include/linux/rmap.h	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./include/linux/rmap.h	2006-08-07 19:28:56.000000000 -0500
@@ -96,7 +96,7 @@ int try_to_unmap(struct page *, int igno
  * Called from mm/filemap_xip.c to unmap empty zero page
  */
 pte_t *page_check_address(struct page *, struct mm_struct *,
-				unsigned long, spinlock_t **);
+				unsigned long, spinlock_t **, int *);
 
 /*
  * Used by swapoff to help locate where page is expected in vma.
--- 2.6.18-rc4-macro/./include/linux/sched.h	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./include/linux/sched.h	2006-08-15 14:05:41.000000000 -0500
@@ -257,7 +257,7 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) || defined(CONFIG_PTSHARE)
 /*
  * The mm counters are not protected by its page_table_lock,
  * so must be incremented atomically.
@@ -333,6 +333,9 @@ struct mm_struct {
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
+#ifdef CONFIG_PTSHARE
+	unsigned long nr_shpte;
+#endif
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
--- 2.6.18-rc4-macro/./include/linux/ptshare.h	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.18-rc4-shpt/./include/linux/ptshare.h	2006-08-07 19:57:56.000000000 -0500
@@ -0,0 +1,69 @@
+#ifndef _LINUX_PTSHARE_H
+#define _LINUX_PTSHARE_H
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <[email protected]>
+ */
+
+#undef	PT_DEBUG
+
+#ifdef CONFIG_PTSHARE
+static inline int pt_is_shared(struct page *page)
+{
+	return (page_mapcount(page) > 0);
+}
+
+static inline int pt_is_shared_pte(pmd_t pmdval)
+{
+	struct page *page;
+
+	page = pmd_page(pmdval);
+	return pt_is_shared(page);
+}
+
+static inline void pt_increment_share(struct page *page)
+{
+	atomic_inc(&page->_mapcount);
+}
+
+static inline void pt_decrement_share(struct page *page)
+{
+	atomic_dec(&page->_mapcount);
+}
+
+extern pte_t *pt_share_pte(struct vm_area_struct *vma, pmd_t *pmd, 
+			   unsigned long address);
+
+extern void pt_unshare_range(struct mm_struct *mm, unsigned long address,
+			     unsigned long end);
+
+extern int pt_check_unshare_pte(struct mm_struct *mm, unsigned long address,
+				pmd_t *pmd);
+
+#else /* CONFIG_PTSHARE */
+#define pt_is_shared(page)	(0)
+#define pt_is_shared_pte(pmdval)	(0)
+#define pt_increment_share(page)
+#define pt_decrement_share(page)
+#define	pt_share_pte(vma, pmd, address)	pte_alloc_map(vma->vm_mm, pmd, address)
+#define pt_unshare_range(mm, address, end)
+#define pt_check_unshare_pte(mm, address, pmd)	(0)
+#endif /* CONFIG_PTSHARE */
+
+#endif /* _LINUX_PTSHARE_H */
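
A brief note on the counting convention in the helpers above (my reading of the code, not text from the patch): a page table page has no user mappings of its own, so its page->_mapcount field is free to be reused as a share count.  _mapcount starts at -1 and page_mapcount() reports _mapcount + 1, so a pte page used by a single mm reads as "not shared", and each additional mm that attaches through pt_share_pte() pushes the count above zero.  A tiny userspace mock of that arithmetic (the struct and stand-alone helpers here are stand-ins, not kernel code):

/* mapcount-mock.c -- illustrative only. */
#include <stdio.h>

struct mock_page { int _mapcount; };	/* kernel: atomic_t _mapcount */

static int page_mapcount(struct mock_page *p)	{ return p->_mapcount + 1; }
static int pt_is_shared(struct mock_page *p)	{ return page_mapcount(p) > 0; }
static void pt_increment_share(struct mock_page *p)	{ p->_mapcount++; }
static void pt_decrement_share(struct mock_page *p)	{ p->_mapcount--; }

int main(void)
{
	struct mock_page pte_page = { ._mapcount = -1 };  /* freshly allocated */

	printf("single mm:  shared=%d\n", pt_is_shared(&pte_page));	/* 0 */
	pt_increment_share(&pte_page);		/* a second mm attaches */
	printf("two mms:    shared=%d\n", pt_is_shared(&pte_page));	/* 1 */
	pt_decrement_share(&pte_page);		/* one of them unshares */
	printf("single mm:  shared=%d\n", pt_is_shared(&pte_page));	/* 0 */
	return 0;
}
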
--- 2.6.18-rc4-macro/./mm/Makefile	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/Makefile	2006-08-07 19:28:56.000000000 -0500
@@ -23,4 +23,5 @@ obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_PTSHARE) += ptshare.o
 
--- 2.6.18-rc4-macro/./mm/filemap_xip.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/filemap_xip.c	2006-08-07 19:28:56.000000000 -0500
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapp
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	int shared = 0;
 	spinlock_t *ptl;
 	struct page *page;
 
@@ -184,7 +185,7 @@ __xip_unmap (struct address_space * mapp
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		page = ZERO_PAGE(address);
-		pte = page_check_address(page, mm, address, &ptl);
+		pte = page_check_address(page, mm, address, &ptl, &shared);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
--- 2.6.18-rc4-macro/./mm/fremap.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/fremap.c	2006-08-07 19:28:56.000000000 -0500
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -200,6 +201,8 @@ asmlinkage long sys_remap_file_pages(uns
 				has_write_lock = 1;
 				goto retry;
 			}
+			pt_unshare_range(mm, vma->vm_start, vma->vm_end);
+
 			mapping = vma->vm_file->f_mapping;
 			spin_lock(&mapping->i_mmap_lock);
 			flush_dcache_mmap_lock(mapping);
--- 2.6.18-rc4-macro/./mm/memory.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/memory.c	2006-08-15 15:13:51.000000000 -0500
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
+#include <linux/ptshare.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -145,6 +146,8 @@ static inline void free_pmd_range(struct
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
+		if (pt_check_unshare_pte(tlb->mm, addr, pmd))
+			continue;
 		free_pte_range(tlb, pmd);
 	} while (pmd++, addr = next, addr != end);
 
@@ -626,6 +629,9 @@ static unsigned long zap_pte_range(struc
 	int file_rss = 0;
 	int anon_rss = 0;
 
+	if (pt_check_unshare_pte(mm, addr, pmd))
+		return end;
+
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
 		pte_t ptent = *pte;
@@ -2340,7 +2346,7 @@ int __handle_mm_fault(struct mm_struct *
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		return VM_FAULT_OOM;
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pt_share_pte(vma, pmd, address);
 	if (!pte)
 		return VM_FAULT_OOM;
 
--- 2.6.18-rc4-macro/./mm/mmap.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/mmap.c	2006-08-07 19:34:05.000000000 -0500
@@ -25,6 +25,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1039,6 +1040,7 @@ munmap_back:
 			vm_flags |= VM_ACCOUNT;
 		}
 	}
+	pt_unshare_range(mm, addr, addr + len);
 
 	/*
 	 * Can we just expand an old private anonymous mapping?
@@ -1950,6 +1952,9 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long nr_accounted = 0;
 	unsigned long end;
 
+	/* We need this semaphore to protect against page table sharing */
+	down_write(&mm->mmap_sem);
+
 	lru_add_drain();
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
@@ -1959,6 +1964,7 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
+	up_write(&mm->mmap_sem);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
@@ -1967,7 +1973,6 @@ void exit_mmap(struct mm_struct *mm)
 	while (vma)
 		vma = remove_vma(vma);
 
-	BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address
--- 2.6.18-rc4-macro/./mm/mprotect.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/mprotect.c	2006-08-07 19:28:56.000000000 -0500
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/ptshare.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -133,6 +134,8 @@ mprotect_fixup(struct vm_area_struct *vm
 		return 0;
 	}
 
+	pt_unshare_range(mm, start, end);
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
@@ -144,8 +147,9 @@ mprotect_fixup(struct vm_area_struct *vm
 	if (newflags & VM_WRITE) {
 		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
 			charged = nrpages;
-			if (security_vm_enough_memory(charged))
+			if (security_vm_enough_memory(charged)) {
 				return -ENOMEM;
+			}
 			newflags |= VM_ACCOUNT;
 		}
 	}
@@ -182,7 +186,7 @@ success:
 	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
 		mask &= ~VM_SHARED;
 
-	newprot = protection_map[newflags & mask];
+ 	newprot = protection_map[newflags & mask];
 
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
--- 2.6.18-rc4-macro/./mm/mremap.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/mremap.c	2006-08-07 19:28:56.000000000 -0500
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/ptshare.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -178,6 +179,8 @@ static unsigned long move_vma(struct vm_
 	if (!new_vma)
 		return -ENOMEM;
 
+	pt_unshare_range(mm, old_addr, old_addr + old_len);
+
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
 	if (moved_len < old_len) {
 		/*
--- 2.6.18-rc4-macro/./mm/rmap.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/rmap.c	2006-08-07 19:28:56.000000000 -0500
@@ -53,6 +53,7 @@
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
+#include <linux/ptshare.h>
 
 #include <asm/tlbflush.h>
 
@@ -248,7 +249,8 @@ unsigned long page_address_in_vma(struct
  * On success returns with pte mapped and locked.
  */
 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-			  unsigned long address, spinlock_t **ptlp)
+			  unsigned long address, spinlock_t **ptlp,
+			  int *shared)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -275,6 +277,9 @@ pte_t *page_check_address(struct page *p
 		return NULL;
 	}
 
+	if (pt_is_shared_pte(*pmd))
+		(*shared)++;
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
@@ -295,6 +300,7 @@ static int page_referenced_one(struct pa
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *pte;
+	int shared = 0;
 	spinlock_t *ptl;
 	int referenced = 0;
 
@@ -302,7 +308,7 @@ static int page_referenced_one(struct pa
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
+	pte = page_check_address(page, mm, address, &ptl, &shared);
 	if (!pte)
 		goto out;
 
@@ -547,6 +553,7 @@ static int try_to_unmap_one(struct page 
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	int shared = 0;
 	spinlock_t *ptl;
 	int ret = SWAP_AGAIN;
 
@@ -554,7 +561,7 @@ static int try_to_unmap_one(struct page 
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
+	pte = page_check_address(page, mm, address, &ptl, &shared);
 	if (!pte)
 		goto out;
 
@@ -571,7 +578,10 @@ static int try_to_unmap_one(struct page 
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
-	pteval = ptep_clear_flush(vma, address, pte);
+	if (shared)
+		pteval = ptep_clear_flush_all(vma, address, pte);
+	else
+		pteval = ptep_clear_flush(vma, address, pte);
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
--- 2.6.18-rc4-macro/./mm/vmstat.c	2006-08-06 13:20:11.000000000 -0500
+++ 2.6.18-rc4-shpt/./mm/vmstat.c	2006-08-15 16:32:59.000000000 -0500
@@ -402,6 +402,9 @@ static char *vmstat_text[] = {
 	"numa_local",
 	"numa_other",
 #endif
+#ifdef CONFIG_PTSHARE
+	"nr_sharedpte",
+#endif
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
 	"pgpgin",
--- 2.6.18-rc4-macro/./mm/ptshare.c	1969-12-31 18:00:00.000000000 -0600
+++ 2.6.18-rc4-shpt/./mm/ptshare.c	2006-08-15 16:41:49.000000000 -0500
@@ -0,0 +1,332 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Author: Dave McCracken <[email protected]>
+ */
+
+#include <linux/kernel.h>
+#include <linux/prio_tree.h>
+#include <linux/mm.h>
+#include <linux/ptshare.h>
+
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+#define VM_PGEND(vma)	(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) -1)
+
+#define	VMFLAG_COMPARE	(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)
+
+#undef	PT_DEBUG
+
+static inline void pt_unshare_pte(struct mm_struct *mm, pmd_t *pmd,
+				  unsigned long address)
+{
+	struct page *page;
+	spinlock_t *ptl;
+
+	if (pmd_present(*pmd)) {
+		page = pmd_page(*pmd);
+		ptl = __pt_lockptr(page);
+		spin_lock(ptl);
+		if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Force unshare pte for %s[%d] at address 0x%lx\n",
+			       current->comm, current->pid, address);
+#endif
+			pt_decrement_share(page);
+			mm->nr_shpte--;
+			mm->nr_ptes--;
+			if (!pt_is_shared(page))
+				dec_zone_page_state(page, NR_SHAREDPTE);
+			pmd_clear_flush(mm, address, pmd);
+		}
+		spin_unlock(ptl);
+	}
+}
+
+#ifndef __PAGETABLE_PMD_FOLDED
+static void pt_unshare_pmd(struct mm_struct *mm, pud_t *pud, unsigned long address,
+			   unsigned long end)
+{
+	pmd_t *pmd;
+
+	if (!pud_present(*pud))
+	    return;
+
+	pmd = pmd_offset(pud, address);
+	end = pud_addr_end(address, end);
+	while (address < end) {
+		pt_unshare_pte(mm, pmd, address);
+		pmd++;
+		address += PMD_SIZE;
+	}
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+static void pt_unshare_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long address,
+			   unsigned long end)
+{
+	pud_t *pud;
+
+	if (!pgd_present(*pgd))
+	    return;
+
+	pud = pud_offset(pgd, address);
+	end = pgd_addr_end(address, end);
+	while (address < end) {
+		pt_unshare_pmd(mm, pud, address, end);
+		pud++;
+		address += PUD_SIZE;
+	}
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+void pt_unshare_range(struct mm_struct *mm, unsigned long address,
+		      unsigned long end)
+{
+	pgd_t *pgd;
+
+	pgd = pgd_offset(mm, address);
+
+	spin_lock(&mm->page_table_lock);
+	while (address < end) {
+#ifdef __PAGETABLE_PMD_FOLDED
+		pt_unshare_pte(mm, (pmd_t *)pgd, address);
+#else
+#ifdef __PAGETABLE_PUD_FOLDED
+		pt_unshare_pmd(mm, (pud_t *)pgd, address, end);
+#else
+		pt_unshare_pud(mm, pgd, address, end);
+#endif
+#endif
+		pgd++;
+		address += PGDIR_SIZE;
+	}
+	spin_unlock(&mm->page_table_lock);
+}
+
+static int pt_shareable_flags(struct vm_area_struct *vma)
+{
+	/* We can't share anonymous memory */
+	if (!vma->vm_file || vma->anon_vma)
+		return 0;
+
+	/* No sharing of nonlinear areas */
+	if (vma->vm_flags & (VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE|VM_HUGETLB))
+		return 0;
+
+	/* Shared mappings are shareable */
+	if (vma->vm_flags & VM_SHARED)
+		return 1;
+
+	/* We can't share if it's writeable or might have been writeable */
+	if (vma->vm_flags & VM_WRITE)
+		return 0;
+
+	/* What's left is read-only, which is shareable */
+	return 1;
+}
+
+static int pt_shareable_range(struct vm_area_struct *vma, unsigned long address,
+			      unsigned long mask)
+{
+	unsigned long base = address & mask;
+	unsigned long end = base + ~mask;
+	struct vm_area_struct *prev = vma;
+
+	/* We can share if the vma spans the entire page */
+	if ((vma->vm_start <= base) &&
+	    (vma->vm_end > end))
+		return 1;
+
+	/* We can share if the page only contains that vma */
+	while ((vma = vma->vm_next)) {
+		if (vma->vm_start > end)
+			break;
+		/* No access vmas don't count since they don't use the page table */
+		if (vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC))
+			return 0;
+	}
+	while (1) {
+		vma = prev;
+		BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &prev) != vma);
+		if (!prev)
+			break;
+		if (prev->vm_end < base)
+			break;
+		if (prev->vm_flags & (VM_READ|VM_WRITE|VM_EXEC))
+			return 0;
+	}
+	return 1;
+}
+
+static struct vm_area_struct *next_shareable_vma(struct vm_area_struct *vma,
+						 struct vm_area_struct *svma,
+						 struct prio_tree_iter *iter)
+{
+	struct mm_struct *smm;
+
+	while ((svma = vma_prio_tree_next(svma, iter))) {
+		if (svma == vma)
+			continue;
+
+		smm = svma->vm_mm;
+		/* Skip this one if the mm is doing something to its vmas */
+		if (unlikely(!down_read_trylock(&smm->mmap_sem)))
+			continue;
+
+		if ((vma->vm_flags&VMFLAG_COMPARE) != (svma->vm_flags&VMFLAG_COMPARE))
+			goto next;
+
+		if ((vma->vm_start != svma->vm_start) ||
+		    (vma->vm_end != svma->vm_end) ||
+		    (vma->vm_pgoff != svma->vm_pgoff))
+			goto next;
+
+		return svma;
+
+	    next:
+		up_read(&smm->mmap_sem);
+	}
+	return NULL;
+}
+
+pte_t *pt_share_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address)
+{
+	struct prio_tree_iter iter;
+	struct mm_struct *mm = vma->vm_mm, *smm;
+	struct vm_area_struct *svma = NULL;
+	pgd_t *spgd, spgde;
+	pud_t *spud, spude;
+	pmd_t *spmd, spmde;
+	pte_t *pte;
+	spinlock_t *ptl = NULL;
+	struct page *page = NULL;
+	struct address_space *mapping;
+	int was_shared;
+
+	pmd_clear(&spmde);
+	if (pmd_none(*pmd) &&
+	    pt_shareable_flags(vma) &&
+	    pt_shareable_range(vma, address, PMD_MASK)) {
+#ifdef PT_DEBUG2
+		printk(KERN_DEBUG "Looking for shareable pte page at address 0x%lx\n",
+		       address);
+#endif
+		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		prio_tree_iter_init(&iter, &mapping->i_mmap,
+				    vma->vm_pgoff, VM_PGEND(vma));
+
+		while ((svma = next_shareable_vma(vma, svma, &iter))) {
+			smm = svma->vm_mm;
+
+			if (!pt_shareable_range(svma, address, PMD_MASK))
+				goto next;
+
+			spgd = pgd_offset(smm, address);
+			spgde = *spgd;
+			if (!pgd_present(spgde))
+				goto next;
+
+			spud = pud_offset(&spgde, address);
+			spude = *spud;
+			if (!pud_present(spude))
+				goto next;
+
+			spmd = pmd_offset(&spude, address);
+			spmde = *spmd;
+			if (pmd_present(spmde))
+				goto found;
+
+		    next:
+			up_read(&smm->mmap_sem);
+		}
+		spin_unlock(&mapping->i_mmap_lock);
+		goto notfound;
+
+found:
+		/* Found a shareable page */
+		spin_lock(&mm->page_table_lock);
+
+		page = pmd_page(spmde);
+		ptl = __pt_lockptr(page);
+		spin_lock(ptl);
+		was_shared = pt_is_shared(page);
+		pt_increment_share(page);
+
+		/* We have the page.  Now we can release the locks on the other mm */
+		up_read(&smm->mmap_sem);
+		spin_unlock(&mapping->i_mmap_lock);
+
+		/* Check to make sure no one else filled it already */
+		if (pmd_none(*pmd)) {
+#ifdef PT_DEBUG
+			printk(KERN_DEBUG "Sharing pte for %s[%d] at address 0x%lx\n",
+			       current->comm, current->pid, address);
+#endif
+			pmd_populate(mm, pmd, page);
+			mm->nr_shpte++;
+			mm->nr_ptes++;
+			if (!was_shared)
+				inc_zone_page_state(page, NR_SHAREDPTE);
+		} else
+			pt_decrement_share(page);
+
+		spin_unlock(ptl);
+		spin_unlock(&mm->page_table_lock);
+	}
+notfound:
+	pte = pte_alloc_map(mm, pmd, address);
+
+	return pte;
+}
+
+/*
+ *  Check whether this pmd entry points to a shared pte page.  If it does,
+ *  unshare it by removing the entry and decrementing the share count in the
+ *  page.  This effectively gives away the pte page to whoever else is sharing
+ *  it.  This process will then allocate a new pte page on the next fault.
+ */
+int pt_check_unshare_pte(struct mm_struct *mm, unsigned long address, pmd_t *pmd)
+{
+	struct page *page;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	page = pmd_page(*pmd);
+	ptl = __pt_lockptr(page);
+	spin_lock(ptl);
+	/* Check under the lock */
+	if (pt_is_shared(page)) {
+#ifdef PT_DEBUG
+		printk(KERN_DEBUG "Unshare pte for %s[%d] at address 0x%lx\n",
+		       current->comm, current->pid, address);
+#endif
+		pt_decrement_share(page);
+		mm->nr_shpte--;
+		mm->nr_ptes--;
+		if (!pt_is_shared(page))
+			dec_zone_page_state(page, NR_SHAREDPTE);
+		pmd_clear_flush(mm, address, pmd);
+		ret = 1;
+	}
+	spin_unlock(ptl);
+	return ret;
+}
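
Finally, a small sketch for reading back the counters the patch adds (again not part of the patch; it just greps the new SharedPTE line out of /proc/meminfo and the new VmSHPT line out of /proc/self/status, next to the existing PageTables/VmPTE values):

/* proc-shpt.c -- illustrative only. */
#include <stdio.h>
#include <string.h>

static void print_matching(const char *path, const char **keys, int nkeys)
{
	char line[256];
	FILE *f = fopen(path, "r");
	int i;

	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		for (i = 0; i < nkeys; i++)
			if (!strncmp(line, keys[i], strlen(keys[i])))
				printf("%s: %s", path, line);
	fclose(f);
}

int main(void)
{
	const char *meminfo_keys[] = { "PageTables:", "SharedPTE:" };
	const char *status_keys[]  = { "VmPTE:", "VmSHPT:" };

	print_matching("/proc/meminfo", meminfo_keys, 2);
	print_matching("/proc/self/status", status_keys, 2);
	return 0;
}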
