Re: [PATCH] mm: fix page_mkclean_one (was: 2.6.19 file content corruption on ext3)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 12/24/06, Linus Torvalds <[email protected]> wrote:

How about this particularly stupid diff? (please test with something that
_would_ cause corruption normally).

It is _entirely_ untested, but what it tries to do is to simply serialize
any writeback in progress with any process that tries to re-map a shared
page into its address space and dirty it. I haven't tested it, and maybe
it misses some case, but it looks likea good way to try to avoid races
with marking pages dirty and the writeback phase ..

The apt cache files (/var/cache/apt/*.bin) still get corrupted with
this patch and 2.6.19.

Gordon

diff -Naupr linux-2.6.19.orig/fs/buffer.c linux-2.6.19/fs/buffer.c
--- linux-2.6.19.orig/fs/buffer.c       2006-11-29 14:57:37.000000000 -0700
+++ linux-2.6.19/fs/buffer.c    2006-12-21 01:16:31.000000000 -0700
@@ -2832,7 +2832,7 @@ int try_to_free_buffers(struct page *pag
       int ret = 0;

       BUG_ON(!PageLocked(page));
-       if (PageWriteback(page))
+       if (PageDirty(page) || PageWriteback(page))
               return 0;

       if (mapping == NULL) {          /* can this still happen? */
@@ -2843,17 +2843,6 @@ int try_to_free_buffers(struct page *pag
       spin_lock(&mapping->private_lock);
       ret = drop_buffers(page, &buffers_to_free);
       spin_unlock(&mapping->private_lock);
-       if (ret) {
-               /*
-                * If the filesystem writes its buffers by hand (eg ext3)
-                * then we can have clean buffers against a dirty page.  We
-                * clean the page here; otherwise later reattachment of buffers
-                * could encounter a non-uptodate page, which is unresolvable.
-                * This only applies in the rare case where try_to_free_buffers
-                * succeeds but the page is not freed.
-                */
-               clear_page_dirty(page);
-       }
out:
       if (buffers_to_free) {
               struct buffer_head *bh = buffers_to_free;
diff -Naupr linux-2.6.19.orig/fs/hugetlbfs/inode.c
linux-2.6.19/fs/hugetlbfs/inode.c
--- linux-2.6.19.orig/fs/hugetlbfs/inode.c      2006-11-29
14:57:37.000000000 -0700
+++ linux-2.6.19/fs/hugetlbfs/inode.c   2006-12-21 01:15:21.000000000 -0700
@@ -176,7 +176,7 @@ static int hugetlbfs_commit_write(struct

static void truncate_huge_page(struct page *page)
{
-       clear_page_dirty(page);
+       cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
       ClearPageUptodate(page);
       remove_from_page_cache(page);
       put_page(page);
diff -Naupr linux-2.6.19.orig/include/linux/page-flags.h
linux-2.6.19/include/linux/page-flags.h
--- linux-2.6.19.orig/include/linux/page-flags.h        2006-11-29
14:57:37.000000000 -0700
+++ linux-2.6.19/include/linux/page-flags.h     2006-12-21
01:15:21.000000000 -0700
@@ -253,15 +253,11 @@ static inline void SetPageUptodate(struc

struct page;   /* forward declaration */

-int test_clear_page_dirty(struct page *page);
+extern void cancel_dirty_page(struct page *page, unsigned int account_size);
+
int test_clear_page_writeback(struct page *page);
int test_set_page_writeback(struct page *page);

-static inline void clear_page_dirty(struct page *page)
-{
-       test_clear_page_dirty(page);
-}
-
static inline void set_page_writeback(struct page *page)
{
       test_set_page_writeback(page);
diff -Naupr linux-2.6.19.orig/mm/memory.c linux-2.6.19/mm/memory.c
--- linux-2.6.19.orig/mm/memory.c       2006-11-29 14:57:37.000000000 -0700
+++ linux-2.6.19/mm/memory.c    2006-12-24 11:04:03.000000000 -0700
@@ -1534,6 +1534,7 @@ static int do_wp_page(struct mm_struct *
                       if (!pte_same(*page_table, orig_pte))
                               goto unlock;
               }
+               wait_on_page_writeback(old_page);
               dirty_page = old_page;
               get_page(dirty_page);
               reuse = 1;
@@ -1832,6 +1833,33 @@ void unmap_mapping_range(struct address_
}
EXPORT_SYMBOL(unmap_mapping_range);

+static void check_last_page(struct address_space *mapping, loff_t size)
+{
+       pgoff_t index;
+       unsigned int offset;
+       struct page *page;
+
+       if (!mapping)
+               return;
+       offset = size & ~PAGE_MASK;
+       if (!offset)
+               return;
+       index = size >> PAGE_SHIFT;
+       page = find_lock_page(mapping, index);
+       if (page) {
+               unsigned int check = 0;
+               unsigned char *kaddr = kmap_atomic(page, KM_USER0);
+               do {
+                       check += kaddr[offset++];
+               } while (offset < PAGE_SIZE);
+               kunmap_atomic(kaddr,KM_USER0);
+               unlock_page(page);
+               page_cache_release(page);
+               if (check)
+                       printk("%s: BADNESS: truncate check %u\n",
current->comm, check);
+       }
+}
+
/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
@@ -1865,6 +1893,7 @@ do_expand:
               goto out_sig;
       if (offset > inode->i_sb->s_maxbytes)
               goto out_big;
+       check_last_page(mapping, inode->i_size);
       i_size_write(inode, offset);

out_truncate:
@@ -2206,6 +2235,7 @@ retry:
                               page_cache_release(new_page);
                               return VM_FAULT_SIGBUS;
                       }
+                       wait_on_page_writeback(new_page);
               }
       }

diff -Naupr linux-2.6.19.orig/mm/page-writeback.c
linux-2.6.19/mm/page-writeback.c
--- linux-2.6.19.orig/mm/page-writeback.c       2006-11-29
14:57:37.000000000 -0700
+++ linux-2.6.19/mm/page-writeback.c    2006-12-21 01:26:53.000000000 -0700
@@ -843,39 +843,6 @@ int set_page_dirty_lock(struct page *pag
EXPORT_SYMBOL(set_page_dirty_lock);

/*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
-       struct address_space *mapping = page_mapping(page);
-       unsigned long flags;
-
-       if (mapping) {
-               write_lock_irqsave(&mapping->tree_lock, flags);
-               if (TestClearPageDirty(page)) {
-                       radix_tree_tag_clear(&mapping->page_tree,
-                                               page_index(page),
-                                               PAGECACHE_TAG_DIRTY);
-                       write_unlock_irqrestore(&mapping->tree_lock, flags);
-                       /*
-                        * We can continue to use `mapping' here because the
-                        * page is locked, which pins the address_space
-                        */
-                       if (mapping_cap_account_dirty(mapping)) {
-                               page_mkclean(page);
-                               dec_zone_page_state(page, NR_FILE_DIRTY);
-                       }
-                       return 1;
-               }
-               write_unlock_irqrestore(&mapping->tree_lock, flags);
-               return 0;
-       }
-       return TestClearPageDirty(page);
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
diff -Naupr linux-2.6.19.orig/mm/rmap.c linux-2.6.19/mm/rmap.c
--- linux-2.6.19.orig/mm/rmap.c 2006-11-29 14:57:37.000000000 -0700
+++ linux-2.6.19/mm/rmap.c      2006-12-22 23:25:09.000000000 -0700
@@ -432,7 +432,7 @@ static int page_mkclean_one(struct page
{
       struct mm_struct *mm = vma->vm_mm;
       unsigned long address;
-       pte_t *pte, entry;
+       pte_t *pte;
       spinlock_t *ptl;
       int ret = 0;

@@ -444,17 +444,18 @@ static int page_mkclean_one(struct page
       if (!pte)
               goto out;

-       if (!pte_dirty(*pte) && !pte_write(*pte))
-               goto unlock;
+       if (pte_dirty(*pte) || pte_write(*pte)) {
+               pte_t entry;

-       entry = ptep_get_and_clear(mm, address, pte);
-       entry = pte_mkclean(entry);
-       entry = pte_wrprotect(entry);
-       ptep_establish(vma, address, pte, entry);
-       lazy_mmu_prot_update(entry);
-       ret = 1;
+               flush_cache_page(vma, address, pte_pfn(*pte));
+               entry = ptep_clear_flush(vma, address, pte);
+               entry = pte_wrprotect(entry);
+               entry = pte_mkclean(entry);
+               set_pte_at(vma, address, pte, entry);
+               lazy_mmu_prot_update(entry);
+               ret = 1;
+       }

-unlock:
       pte_unmap_unlock(pte, ptl);
out:
       return ret;
@@ -489,6 +490,8 @@ int page_mkclean(struct page *page)
               if (mapping)
                       ret = page_mkclean_file(mapping, page);
       }
+       if (page_test_and_clear_dirty(page))
+               ret = 1;

       return ret;
}
@@ -587,8 +590,6 @@ void page_remove_rmap(struct page *page)
                * Leaving it set also helps swapoff to reinstate ptes
                * faster for those pages still in swapcache.
                */
-               if (page_test_and_clear_dirty(page))
-                       set_page_dirty(page);
               __dec_zone_page_state(page,
                               PageAnon(page) ? NR_ANON_PAGES :
NR_FILE_MAPPED);
       }
@@ -607,6 +608,7 @@ static int try_to_unmap_one(struct page
       pte_t pteval;
       spinlock_t *ptl;
       int ret = SWAP_AGAIN;
+       struct page *dirty_page = NULL;

       address = vma_address(page, vma);
       if (address == -EFAULT)
@@ -633,7 +635,7 @@ static int try_to_unmap_one(struct page

       /* Move the dirty bit to the physical page now the pte is gone. */
       if (pte_dirty(pteval))
-               set_page_dirty(page);
+               dirty_page = page;

       /* Update high watermark before we lower rss */
       update_hiwater_rss(mm);
@@ -684,6 +686,8 @@ static int try_to_unmap_one(struct page

out_unmap:
       pte_unmap_unlock(pte, ptl);
+       if (dirty_page)
+               set_page_dirty(dirty_page);
out:
       return ret;
}
@@ -915,6 +919,9 @@ int try_to_unmap(struct page *page, int
       else
               ret = try_to_unmap_file(page, migration);

+       if (page_test_and_clear_dirty(page))
+               set_page_dirty(page);
+
       if (!page_mapped(page))
               ret = SWAP_SUCCESS;
       return ret;
diff -Naupr linux-2.6.19.orig/mm/truncate.c linux-2.6.19/mm/truncate.c
--- linux-2.6.19.orig/mm/truncate.c     2006-11-29 14:57:37.000000000 -0700
+++ linux-2.6.19/mm/truncate.c  2006-12-23 13:21:42.000000000 -0700
@@ -50,6 +50,21 @@ static inline void truncate_partial_page
               do_invalidatepage(page, partial);
}

+void cancel_dirty_page(struct page *page, unsigned int account_size)
+{
+       /* If we're cancelling the page, it had better not be mapped
any more */+       if (page_mapped(page)) {
+               static unsigned int warncount;
+
+               WARN_ON(++warncount < 5);
+       }
+
+       if (TestClearPageDirty(page) && account_size &&
+                       mapping_cap_account_dirty(page->mapping))
+               dec_zone_page_state(page, NR_FILE_DIRTY);
+}
+
+
/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes anonymous.  It will be left on the LRU and may even be mapped into
@@ -66,10 +81,11 @@ truncate_complete_page(struct address_sp
       if (page->mapping != mapping)
               return;

+       cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
       if (PagePrivate(page))
               do_invalidatepage(page, 0);

-       clear_page_dirty(page);
       ClearPageUptodate(page);
       ClearPageMappedToDisk(page);
       remove_from_page_cache(page);
@@ -348,7 +364,6 @@ int invalidate_inode_pages2_range(struct
               for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
                       struct page *page = pvec.pages[i];
                       pgoff_t page_index;
-                       int was_dirty;

                       lock_page(page);
                       if (page->mapping != mapping) {
@@ -384,12 +399,8 @@ int invalidate_inode_pages2_range(struct
                                         PAGE_CACHE_SIZE, 0);
                               }
                       }
-                       was_dirty = test_clear_page_dirty(page);
-                       if (!invalidate_complete_page2(mapping, page)) {
-                               if (was_dirty)
-                                       set_page_dirty(page);
+                       if (!invalidate_complete_page2(mapping, page))
                               ret = -EIO;
-                       }
                       unlock_page(page);
               }
               pagevec_release(&pvec);


--
Gordon Farquharson
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux