Fixes for NFS file truncation race condition(s)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Attached are two files: the first is a diff patch file, and the second is a test program that can be used to trigger the problem and to confirm that it has been fixed after the patches are applied.

Explanation of the problem:

A number of places in the kernel appear to suffer from a race condition with truncation of files on NFS-mounted filesystems when the files have dirty pages in the buffer cache. When the attached program is run on a 2.6.15 kernel with the -git12 patch-set applied, it can trigger the problem within 10-15 minutes. This program simulates a transactional database journaling operation, which is where we first encountered the kernel bug. This bug also occurs on a 2.6.15 kernel with the -mm4 patch-set, although the diff file will probably only apply to a source tree with -git12 patches! The BUG_ON() fires in the function radix_tree_tag_set() in "lib/radix-tree.c".

Of the four files patched, the first (fs/buffer.c) is the most "suspect" as to whether it is actually needed to fix the problem. I strongly suspect that it doesn't need to be applied, but given the time required to verify the fix I haven't bothered to check the 15 possible combinations to see whether all patches are necessary! We currently run with all four patches applied and have not seen the problem after several weeks of testing on dual 3GHz Xeon Dell server(s).

A stack trace of the bug generated by "./breaknfs 100 /var/nfs/" follows-

kernel BUG at lib/radix-tree.c:372!
invalid opcode: 0000 [#1]
PREEMPT SMP
Modules linked in: nfsd exportfs parport_pc lp parport ipmi_poweroff bmcsensors i2c_ipmi i2c_core ipmi_si ipmi_devintf ipmi_msghandler binfmt_misc video thermal processor fan button battery ac ehci_hcd usbcore hw_random ide_cd cdrom ext3 jbd dm_mod ata_piix libata sd_mod scsi_mod
CPU:    1
EIP:    0060:[<c01e7bc8>]    Not tainted VLI
EFLAGS: 00010046   (2.6.15-git4)
EIP is at radix_tree_tag_set+0x6c/0x76
eax: 00000000   ebx: 00000001   ecx: f76a95c0   edx: 00000000
esi: 00000000   edi: 00000000   ebp: 00000008   esp: f656bcd0
ds: 007b   es: 007b   ss: 0068
Process breaknfs (pid: 5082, threadinfo=f656a000 task=f7dfea70)
Stack: 00000000 c17f8858 f7d67bec f7d67bfc c014736a f7d67bf0 00000000 00000001 00000213 f73ff8cc f73ff914 f6770480 f73ff780 c01c2f52 c17f8858 00000050 f7dfeb98 00000001 c01332e5 00000002 00000000 00000004 f656bd4c f656bd4c
Call Trace:
[<c014736a>] test_set_page_writeback+0xb5/0x108
[<c01c2f52>] nfs_flush_one+0xf9/0x1f3
[<c01332e5>] prepare_to_wait+0x12/0x4d
[<c01c30a6>] nfs_flush_list+0x5a/0xa8
[<c01c3ac9>] nfs_flush_inode+0x83/0xb5
[<c01c1e72>] nfs_writepages+0x84/0x112
[<c0146dfc>] do_writepages+0x2d/0x50
[<c013ff03>] __filemap_fdatawrite_range+0xc1/0xcc
[<c013ff45>] filemap_fdatawrite+0x37/0x3b
[<c01bb1f4>] nfs_sync_mapping+0x50/0x93
[<c01bc0e0>] nfs_revalidate_mapping+0x77/0xc4
[<c01bbeee>] __nfs_revalidate_inode+0x14b/0x24b
[<c01e7f4d>] radix_tree_gang_lookup_tag+0x56/0x70
[<c01c3b41>] nfs_commit_inode+0x46/0x6e
[<c01c3be2>] nfs_sync_inode+0x79/0x85
[<c01b9a54>] nfs_file_flush+0xc2/0xc4
[<c015ffdc>] filp_close+0x53/0x6e
[<c0160060>] sys_close+0x69/0x84
[<c0102daf>] sysenter_past_esp+0x54/0x75
Code: 0f a3 91 04 01 00 00 19 c0 85 c0 75 07 0f ab 91 04 01 00 00 8b 74 96 04 85 f6 74 0f 83 ef 06 83 eb 01 75 cd 89 f0 5b 5e 5f 5d c3 <0f> 0b 74 01 f8 4b 35 c0 eb e7 55 31 ed 57 56 53 83 ec 44 8b 4c
<6>note: breaknfs[5082] exited with preempt_count 1

diff -urN old/fs/buffer.c new/fs/buffer.c
--- old/fs/buffer.c	2006-02-04 16:40:18.000000000 +1000
+++ new/fs/buffer.c	2006-02-06 10:09:42.000000000 +1000
@@ -860,7 +860,8 @@
 	spin_unlock(&mapping->private_lock);
 
 	if (!TestSetPageDirty(page)) {
-		write_lock_irq(&mapping->tree_lock);
+		unsigned long	flags;
+		write_lock_irqsave(&mapping->tree_lock, flags);
 		if (page->mapping) {	/* Race with truncate? */
 			if (mapping_cap_account_dirty(mapping))
 				inc_page_state(nr_dirty);
@@ -868,7 +869,7 @@
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 		}
-		write_unlock_irq(&mapping->tree_lock);
+		write_unlock_irqrestore(&mapping->tree_lock, flags);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 	
diff -urN old/mm/page-writeback.c new/mm/page-writeback.c
--- old/mm/page-writeback.c	2006-02-04 16:40:20.000000000 +1000
+++ new/mm/page-writeback.c	2006-02-06 11:10:43.000000000 +1000
@@ -712,7 +712,7 @@
 
 	if (mapping) {
 		write_lock_irqsave(&mapping->tree_lock, flags);
-		if (TestClearPageDirty(page)) {
+		if (TestClearPageDirty(page) && (page_mapping(page) == mapping)) { /* Race with truncate? */
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
@@ -768,7 +768,7 @@
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret)
+		if (ret && (page_mapping(page) == mapping))	/* Race with truncate? */
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
@@ -789,11 +789,11 @@
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret)
+		if (!ret && (page_mapping(page) == mapping))	/* Race with truncate? */
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-		if (!PageDirty(page))
+		if (!PageDirty(page) && (page_mapping(page) == mapping))	/* Race with truncate? */
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
diff -urN old/mm/truncate.c new/mm/truncate.c
--- old/mm/truncate.c	2006-02-04 16:40:20.000000000 +1000
+++ new/mm/truncate.c	2006-02-06 11:01:35.000000000 +1000
@@ -68,7 +68,7 @@
 		return 0;
 
 	write_lock_irq(&mapping->tree_lock);
-	if (PageDirty(page)) {
+	if (PageDirty(page) || (page->mapping != mapping)) { /* Race with truncate? */
 		write_unlock_irq(&mapping->tree_lock);
 		return 0;
 	}
diff -urN old/mm/vmscan.c new/mm/vmscan.c
--- old/mm/vmscan.c	2006-02-04 16:40:20.000000000 +1000
+++ new/mm/vmscan.c	2006-02-06 11:14:46.000000000 +1000
@@ -380,6 +380,10 @@
 
 	write_lock_irq(&mapping->tree_lock);
 
+	if (page_mapping(page) != mapping) {	/* Race with truncate? */
+		goto cannot_free;		/* truncate got there first! */
+	}
+
 	/*
 	 * The non-racy check for busy page.  It is critical to check
 	 * PageDirty _after_ making sure that the page is freeable and
// Program:  breaknfs.c
//
// Compile:  cc breaknfs.c -o breaknfs
// Run:      ./breaknfs 100 [<nfs-mounted-dir>]
//
// Args:     arg1 = # of copies of program to run simultaneously
//           arg2 = A directory that is on an NFS-mounted filesystem

/* Default working directory, used when no directory argument is given. */
#define NFS_MOUNT	"/var/nfs"

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>

/* Size in bytes of the pattern buffer; also the largest single write issued. */
#define BUFFER_SIZE (1*1024*1024)
/* Source data for every write(); filled once in main() before forking. */
static char buffer[BUFFER_SIZE];

/*
 * Stress a directory (intended to be on an NFS mount) by repeatedly
 * truncating and rewriting a small, fixed set of files from several
 * processes at once.
 *
 * argv[1] (optional): total number of processes to run; defaults to 1.
 * argv[2] (optional): directory to work in; defaults to NFS_MOUNT.
 *
 * The parent forks count-1 children, and every process (parent included)
 * then loops forever.  The program therefore only ends when a system call
 * fails (exit status 1) or when it is killed.
 */
int main(int argc, char *argv[])
{
	/* All processes fight over the same file names on purpose. */
	static const char *const files[] = {
		"file1", "file2", "file3", "file4", "file5"
	};
	const size_t nfiles = sizeof files / sizeof files[0];

	const char *nfs_dir = (argc > 2) ? argv[2] : NFS_MOUNT;
	int count = (argc > 1) ? atoi(argv[1]) : 1;
	int i;

	/* Work inside the target directory so relative file names land there. */
	if (chdir(nfs_dir) < 0) {
		perror(nfs_dir);
		exit(1);
	}

	/* Fill buffer with printable characters, with a newline every 80 bytes. */
	for (i = 0; i < BUFFER_SIZE; i++) {
		buffer[i] = '0' + (i & 0x3f);
		if (i && ((i % 80) == 0))
			buffer[i] = '\n';
	}

	/* Fork count-1 children; each child leaves the loop immediately. */
	while (count-- > 1) {
		pid_t pid = fork();

		if (pid == 0) {
			/* child */
			break;
		}
		if (pid < 0) {
			perror("fork");
			exit(1);
		}
	}

	/* Seed per process so each one writes different sizes. */
	srandom(getpid());

	/* Forever and a day ... */
	for (;;) {
		size_t f;

		for (f = 0; f < nfiles; f++) {
			/* Write a random amount of bytes, truncating file before output */
			int write_size = (random() % BUFFER_SIZE) + 1;
			int fd = open(files[f], O_WRONLY | O_CREAT | O_TRUNC, 0600);

			if (fd < 0) {
				perror("open");
				exit(1);
			}
			if (write(fd, buffer, write_size) < 0) {
				perror("write");
				exit(1);
			}
			/*
			 * close() can report write-back errors that write() itself
			 * did not, so a failure here must not be ignored.
			 */
			if (close(fd) < 0) {
				perror("close");
				exit(1);
			}
		}
	}

	/* Not reached: the loop above never terminates normally. */
	return 0;
}

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux