[patch 2.6.13-rc4] fix get_user_pages bug

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

Not sure if this should be fixed for 2.6.13. It can result in
pagecache corruption: so I guess that answers my own question.

This was tested by Robin and appears to solve the problem. Roland
had a quick look and thought the basic idea was sound. I'd like to
get a couple more acks before going forward, and in particular
Robin was contemplating possible efficiency improvements (although
efficiency can wait on correctness).

Feedback please, anyone.

Thanks,
Nick

--
SUSE Labs, Novell Inc.

When get_user_pages for write access races with another process in the page
fault handler that has established the pte for read access, handle_mm_fault
in get_user_pages will return VM_FAULT_MINOR even if it hasn't made the page
correctly writeable (for example, broken COW).

Thus the assumption that get_user_pages has a writeable page at the mapping
after handle_mm_fault returns is incorrect. Fix this by reporting a raced
(uncompleted) fault and retrying the lookup and fault in get_user_pages before
making the assumption that we have a writeable page.

Great work by Robin Holt <[email protected]> to debug the problem.

Signed-off-by: Nick Piggin <[email protected]>

Index: linux-2.6/arch/i386/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/i386/mm/fault.c
+++ linux-2.6/arch/i386/mm/fault.c
@@ -351,6 +351,8 @@ good_area:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 	}
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -625,6 +625,7 @@ static inline int page_mapped(struct pag
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
  */
+#define VM_FAULT_RACE	(-2)
 #define VM_FAULT_OOM	(-1)
 #define VM_FAULT_SIGBUS	0
 #define VM_FAULT_MINOR	1
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -969,6 +969,16 @@ int get_user_pages(struct task_struct *t
 					return i ? i : -EFAULT;
 				case VM_FAULT_OOM:
 					return i ? i : -ENOMEM;
+				case VM_FAULT_RACE:
+					/*
+					 * Someone else got there first.
+					 * Must retry before we can assume
+					 * that we have actually performed
+					 * the write fault (below).
+					 */
+					if (write)
+						continue;
+					break;
 				default:
 					BUG();
 				}
@@ -1229,6 +1239,7 @@ static int do_wp_page(struct mm_struct *
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(pte);
 	pte_t entry;
+	int ret;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1285,7 +1296,9 @@ static int do_wp_page(struct mm_struct *
 	 */
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
+	ret = VM_FAULT_RACE;
 	if (likely(pte_same(*page_table, pte))) {
+		ret = VM_FAULT_MINOR;
 		if (PageAnon(old_page))
 			dec_mm_counter(mm, anon_rss);
 		if (PageReserved(old_page))
@@ -1304,7 +1317,7 @@ static int do_wp_page(struct mm_struct *
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_MINOR;
+	return ret;
 
 no_new_page:
 	page_cache_release(old_page);
@@ -1659,7 +1672,7 @@ static int do_swap_page(struct mm_struct
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			else
-				ret = VM_FAULT_MINOR;
+				ret = VM_FAULT_RACE;
 			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
 			goto out;
@@ -1681,7 +1694,7 @@ static int do_swap_page(struct mm_struct
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (unlikely(!pte_same(*page_table, orig_pte))) {
-		ret = VM_FAULT_MINOR;
+		ret = VM_FAULT_RACE;
 		goto out_nomap;
 	}
 
@@ -1742,6 +1755,7 @@ do_anonymous_page(struct mm_struct *mm, 
 {
 	pte_t entry;
 	struct page * page = ZERO_PAGE(addr);
+	int ret = VM_FAULT_MINOR;
 
 	/* Read-only mapping of ZERO_PAGE. */
 	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
@@ -1765,6 +1779,7 @@ do_anonymous_page(struct mm_struct *mm, 
 			pte_unmap(page_table);
 			page_cache_release(page);
 			spin_unlock(&mm->page_table_lock);
+			ret = VM_FAULT_RACE;
 			goto out;
 		}
 		inc_mm_counter(mm, rss);
@@ -1784,7 +1799,7 @@ do_anonymous_page(struct mm_struct *mm, 
 	lazy_mmu_prot_update(entry);
 	spin_unlock(&mm->page_table_lock);
 out:
-	return VM_FAULT_MINOR;
+	return ret;
 no_mem:
 	return VM_FAULT_OOM;
 }
@@ -1902,6 +1917,7 @@ retry:
 		pte_unmap(page_table);
 		page_cache_release(new_page);
 		spin_unlock(&mm->page_table_lock);
+		ret = VM_FAULT_RACE;
 		goto out;
 	}
 
Index: linux-2.6/arch/alpha/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/alpha/mm/fault.c
+++ linux-2.6/arch/alpha/mm/fault.c
@@ -162,6 +162,8 @@ do_page_fault(unsigned long address, uns
 		goto do_sigbus;
 	      case VM_FAULT_OOM:
 		goto out_of_memory;
+	      case VM_FAULT_RACE:
+		break;
 	      default:
 		BUG();
 	}
Index: linux-2.6/arch/arm/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/arm/mm/fault.c
+++ linux-2.6/arch/arm/mm/fault.c
@@ -195,6 +195,7 @@ survive:
 	case VM_FAULT_MINOR:
 		tsk->min_flt++;
 	case VM_FAULT_SIGBUS:
+	case VM_FAULT_RACE:
 		return fault;
 	}
 
Index: linux-2.6/arch/ia64/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/ia64/mm/fault.c
+++ linux-2.6/arch/ia64/mm/fault.c
@@ -164,6 +164,8 @@ ia64_do_page_fault (unsigned long addres
 		goto bad_area;
 	      case VM_FAULT_OOM:
 		goto out_of_memory;
+	      case VM_FAULT_RACE:
+		break;
 	      default:
 		BUG();
 	}
Index: linux-2.6/arch/m32r/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/m32r/mm/fault.c
+++ linux-2.6/arch/m32r/mm/fault.c
@@ -234,6 +234,8 @@ survive:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 	}
Index: linux-2.6/arch/mips/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/mips/mm/fault.c
+++ linux-2.6/arch/mips/mm/fault.c
@@ -109,6 +109,8 @@ survive:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/ppc/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/ppc/mm/fault.c
+++ linux-2.6/arch/ppc/mm/fault.c
@@ -259,6 +259,8 @@ good_area:
                 goto do_sigbus;
         case VM_FAULT_OOM:
                 goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/ppc64/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/ppc64/mm/fault.c
+++ linux-2.6/arch/ppc64/mm/fault.c
@@ -234,6 +234,8 @@ good_area:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/s390/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/s390/mm/fault.c
+++ linux-2.6/arch/s390/mm/fault.c
@@ -260,6 +260,8 @@ survive:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/sh/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/sh/mm/fault.c
+++ linux-2.6/arch/sh/mm/fault.c
@@ -101,6 +101,8 @@ survive:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 	}
Index: linux-2.6/arch/sparc/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/sparc/mm/fault.c
+++ linux-2.6/arch/sparc/mm/fault.c
@@ -302,8 +302,8 @@ good_area:
 		current->maj_flt++;
 		break;
 	case VM_FAULT_MINOR:
-	default:
 		current->min_flt++;
+	case VM_FAULT_RACE:
 		break;
 	}
 	up_read(&mm->mmap_sem);
Index: linux-2.6/arch/sparc64/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/sparc64/mm/fault.c
+++ linux-2.6/arch/sparc64/mm/fault.c
@@ -454,6 +454,8 @@ good_area:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/um/kernel/trap_kern.c
===================================================================
--- linux-2.6.orig/arch/um/kernel/trap_kern.c
+++ linux-2.6/arch/um/kernel/trap_kern.c
@@ -76,6 +76,8 @@ int handle_page_fault(unsigned long addr
 		case VM_FAULT_OOM:
 			err = -ENOMEM;
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 		}
Index: linux-2.6/arch/xtensa/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/xtensa/mm/fault.c
+++ linux-2.6/arch/xtensa/mm/fault.c
@@ -113,6 +113,8 @@ survive:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]
  Powered by Linux